""" scoutd/forges.py - scrape self-hosted git forges these people = highest signal. they actually selfhost. supported platforms: - gitea (and forks like forgejo) - gogs - gitlab ce - sourcehut - codeberg (gitea-based) scrapes users AND extracts contact info for outreach. """ import os import re import json import time import requests from typing import List, Dict, Optional, Tuple from datetime import datetime from .signals import analyze_text # rate limiting REQUEST_DELAY = 1.0 # known public instances to scrape # format: (name, url, platform_type) KNOWN_INSTANCES = [ # === PUBLIC INSTANCES === # local/private instances can be added via LOCAL_FORGE_INSTANCES env var # codeberg (largest gitea instance) ('codeberg', 'https://codeberg.org', 'gitea'), # sourcehut ('sourcehut', 'https://sr.ht', 'sourcehut'), # notable gitea/forgejo instances ('gitea.com', 'https://gitea.com', 'gitea'), ('git.disroot.org', 'https://git.disroot.org', 'gitea'), ('git.gay', 'https://git.gay', 'forgejo'), ('git.envs.net', 'https://git.envs.net', 'forgejo'), ('tildegit', 'https://tildegit.org', 'gitea'), ('git.sr.ht', 'https://git.sr.ht', 'sourcehut'), # gitlab ce instances ('framagit', 'https://framagit.org', 'gitlab'), ('gitlab.gnome.org', 'https://gitlab.gnome.org', 'gitlab'), ('invent.kde.org', 'https://invent.kde.org', 'gitlab'), ('salsa.debian.org', 'https://salsa.debian.org', 'gitlab'), ] # headers HEADERS = { 'User-Agent': 'connectd/1.0 (finding builders with aligned values)', 'Accept': 'application/json', } def log(msg): print(f" forges: {msg}") # === GITEA/FORGEJO/GOGS API === # these share the same API structure def scrape_gitea_users(instance_url: str, limit: int = 100) -> List[Dict]: """ scrape users from a gitea/forgejo/gogs instance. uses the explore/users page or API if available. """ users = [] # try API first (gitea 1.x+) try: api_url = f"{instance_url}/api/v1/users/search" params = {'q': '', 'limit': min(limit, 50)} resp = requests.get(api_url, params=params, headers=HEADERS, timeout=15) if resp.status_code == 200: data = resp.json() user_list = data.get('data', []) or data.get('users', []) or data if isinstance(user_list, list): for u in user_list[:limit]: users.append({ 'username': u.get('login') or u.get('username'), 'full_name': u.get('full_name'), 'avatar': u.get('avatar_url'), 'website': u.get('website'), 'location': u.get('location'), 'bio': u.get('description') or u.get('bio'), }) log(f" got {len(users)} users via API") except Exception as e: log(f" API failed: {e}") # fallback: scrape explore page if not users: try: explore_url = f"{instance_url}/explore/users" resp = requests.get(explore_url, headers=HEADERS, timeout=15) if resp.status_code == 200: # parse HTML for usernames usernames = re.findall(r'href="/([^/"]+)"[^>]*class="[^"]*user[^"]*"', resp.text) usernames += re.findall(r']+href="/([^/"]+)"[^>]*title="[^"]*"', resp.text) usernames = list(set(usernames))[:limit] for username in usernames: if username and not username.startswith(('explore', 'api', 'user', 'repo')): users.append({'username': username}) log(f" got {len(users)} users via scrape") except Exception as e: log(f" scrape failed: {e}") return users def get_gitea_user_details(instance_url: str, username: str) -> Optional[Dict]: """get detailed user info from gitea/forgejo/gogs""" try: # API endpoint api_url = f"{instance_url}/api/v1/users/{username}" resp = requests.get(api_url, headers=HEADERS, timeout=10) if resp.status_code == 200: u = resp.json() return { 'username': u.get('login') or u.get('username'), 'full_name': u.get('full_name'), 'email': u.get('email'), # may be hidden 'website': u.get('website'), 'location': u.get('location'), 'bio': u.get('description') or u.get('bio'), 'created': u.get('created'), 'followers': u.get('followers_count', 0), 'following': u.get('following_count', 0), } except: pass return None def get_gitea_user_repos(instance_url: str, username: str, limit: int = 10) -> List[Dict]: """get user's repos from gitea/forgejo/gogs""" repos = [] try: api_url = f"{instance_url}/api/v1/users/{username}/repos" resp = requests.get(api_url, headers=HEADERS, timeout=10) if resp.status_code == 200: for r in resp.json()[:limit]: repos.append({ 'name': r.get('name'), 'full_name': r.get('full_name'), 'description': r.get('description'), 'stars': r.get('stars_count', 0), 'forks': r.get('forks_count', 0), 'language': r.get('language'), 'updated': r.get('updated_at'), }) except: pass return repos # === GITLAB CE API === def scrape_gitlab_users(instance_url: str, limit: int = 100) -> List[Dict]: """scrape users from a gitlab ce instance""" users = [] try: # gitlab API - public users endpoint api_url = f"{instance_url}/api/v4/users" params = {'per_page': min(limit, 100), 'active': True} resp = requests.get(api_url, params=params, headers=HEADERS, timeout=15) if resp.status_code == 200: for u in resp.json()[:limit]: users.append({ 'username': u.get('username'), 'full_name': u.get('name'), 'avatar': u.get('avatar_url'), 'website': u.get('website_url'), 'location': u.get('location'), 'bio': u.get('bio'), 'public_email': u.get('public_email'), }) log(f" got {len(users)} gitlab users") except Exception as e: log(f" gitlab API failed: {e}") return users def get_gitlab_user_details(instance_url: str, username: str) -> Optional[Dict]: """get detailed gitlab user info""" try: api_url = f"{instance_url}/api/v4/users" params = {'username': username} resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10) if resp.status_code == 200: users = resp.json() if users: u = users[0] return { 'username': u.get('username'), 'full_name': u.get('name'), 'email': u.get('public_email'), 'website': u.get('website_url'), 'location': u.get('location'), 'bio': u.get('bio'), 'created': u.get('created_at'), } except: pass return None def get_gitlab_user_projects(instance_url: str, username: str, limit: int = 10) -> List[Dict]: """get user's projects from gitlab""" repos = [] try: # first get user id api_url = f"{instance_url}/api/v4/users" params = {'username': username} resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10) if resp.status_code == 200 and resp.json(): user_id = resp.json()[0].get('id') # get projects proj_url = f"{instance_url}/api/v4/users/{user_id}/projects" resp = requests.get(proj_url, headers=HEADERS, timeout=10) if resp.status_code == 200: for p in resp.json()[:limit]: repos.append({ 'name': p.get('name'), 'full_name': p.get('path_with_namespace'), 'description': p.get('description'), 'stars': p.get('star_count', 0), 'forks': p.get('forks_count', 0), 'updated': p.get('last_activity_at'), }) except: pass return repos # === SOURCEHUT API === def scrape_sourcehut_users(limit: int = 100) -> List[Dict]: """ scrape users from sourcehut. sourcehut doesn't have a public user list, so we scrape from: - recent commits - mailing lists - project pages """ users = [] seen = set() try: # scrape from git.sr.ht explore resp = requests.get('https://git.sr.ht/projects', headers=HEADERS, timeout=15) if resp.status_code == 200: # extract usernames from repo paths like ~username/repo usernames = re.findall(r'href="/~([^/"]+)', resp.text) for username in usernames: if username not in seen: seen.add(username) users.append({'username': username}) if len(users) >= limit: break log(f" got {len(users)} sourcehut users") except Exception as e: log(f" sourcehut scrape failed: {e}") return users def get_sourcehut_user_details(username: str) -> Optional[Dict]: """get sourcehut user details""" try: # scrape profile page profile_url = f"https://sr.ht/~{username}" resp = requests.get(profile_url, headers=HEADERS, timeout=10) if resp.status_code == 200: bio = '' # extract bio from page bio_match = re.search(r'
\s*

([^<]+)

', resp.text) if bio_match: bio = bio_match.group(1).strip() return { 'username': username, 'bio': bio, 'profile_url': profile_url, } except: pass return None def get_sourcehut_user_repos(username: str, limit: int = 10) -> List[Dict]: """get sourcehut user's repos""" repos = [] try: git_url = f"https://git.sr.ht/~{username}" resp = requests.get(git_url, headers=HEADERS, timeout=10) if resp.status_code == 200: # extract repo names repo_matches = re.findall(rf'href="/~{username}/([^"]+)"', resp.text) for repo in repo_matches[:limit]: if repo and not repo.startswith(('refs', 'log', 'tree')): repos.append({ 'name': repo, 'full_name': f"~{username}/{repo}", }) except: pass return repos # === UNIFIED SCRAPER === def scrape_forge(instance_name: str, instance_url: str, platform_type: str, limit: int = 50) -> List[Dict]: """ scrape users from any forge type. returns list of human dicts ready for database. """ log(f"scraping {instance_name} ({platform_type})...") humans = [] # get user list based on platform type if platform_type in ('gitea', 'forgejo', 'gogs'): users = scrape_gitea_users(instance_url, limit) get_details = lambda u: get_gitea_user_details(instance_url, u) get_repos = lambda u: get_gitea_user_repos(instance_url, u) elif platform_type == 'gitlab': users = scrape_gitlab_users(instance_url, limit) get_details = lambda u: get_gitlab_user_details(instance_url, u) get_repos = lambda u: get_gitlab_user_projects(instance_url, u) elif platform_type == 'sourcehut': users = scrape_sourcehut_users(limit) get_details = get_sourcehut_user_details get_repos = get_sourcehut_user_repos else: log(f" unknown platform type: {platform_type}") return [] for user in users: username = user.get('username') if not username: continue time.sleep(REQUEST_DELAY) # get detailed info details = get_details(username) if details: user.update(details) # get repos repos = get_repos(username) # build human record bio = user.get('bio', '') or '' website = user.get('website', '') or '' # analyze signals from bio score, signals, reasons = analyze_text(bio + ' ' + website) # BOOST: self-hosted git = highest signal score += 25 signals.append('selfhosted_git') reasons.append(f'uses self-hosted git ({instance_name})') # extract contact info contact = {} email = user.get('email') or user.get('public_email') if email and '@' in email: contact['email'] = email if website: contact['website'] = website # build human dict human = { 'platform': f'{platform_type}:{instance_name}', 'username': username, 'name': user.get('full_name'), 'bio': bio, 'url': f"{instance_url}/{username}" if platform_type != 'sourcehut' else f"https://sr.ht/~{username}", 'score': score, 'signals': json.dumps(signals), 'reasons': json.dumps(reasons), 'contact': json.dumps(contact), 'extra': json.dumps({ 'instance': instance_name, 'instance_url': instance_url, 'platform_type': platform_type, 'repos': repos[:5], 'followers': user.get('followers', 0), 'email': email, 'website': website, }), 'user_type': 'builder' if repos else 'none', } humans.append(human) log(f" {username}: score={score}, repos={len(repos)}") return humans def scrape_all_forges(limit_per_instance: int = 30) -> List[Dict]: """scrape all known forge instances""" all_humans = [] for instance_name, instance_url, platform_type in KNOWN_INSTANCES: try: humans = scrape_forge(instance_name, instance_url, platform_type, limit_per_instance) all_humans.extend(humans) log(f" {instance_name}: {len(humans)} humans") except Exception as e: log(f" {instance_name} failed: {e}") time.sleep(2) # be nice between instances log(f"total: {len(all_humans)} humans from {len(KNOWN_INSTANCES)} forges") return all_humans # === OUTREACH METHODS === def can_message_on_forge(instance_url: str, platform_type: str) -> bool: """check if we can send messages on this forge""" # gitea/forgejo don't have DMs # gitlab has merge request comments # sourcehut has mailing lists return platform_type in ('gitlab', 'sourcehut') def open_forge_issue(instance_url: str, platform_type: str, owner: str, repo: str, title: str, body: str) -> Tuple[bool, str]: """ open an issue on a forge as outreach method. requires API token for authenticated requests. """ # would need tokens per instance - for now return False # this is a fallback method, email is preferred return False, "forge issue creation not implemented yet" # === DISCOVERY === def discover_forge_instances() -> List[Tuple[str, str, str]]: """ discover new forge instances from: - fediverse (they often announce) - known lists - DNS patterns returns list of (name, url, platform_type) """ # start with known instances instances = list(KNOWN_INSTANCES) # could add discovery logic here: # - scrape https://codeberg.org/forgejo/forgejo/issues for instance mentions # - check fediverse for git.* domains # - crawl gitea/forgejo awesome lists return instances if __name__ == '__main__': # test print("testing forge scrapers...") # test codeberg humans = scrape_forge('codeberg', 'https://codeberg.org', 'gitea', limit=5) print(f"codeberg: {len(humans)} humans") for h in humans[:2]: print(f" {h['username']}: {h['score']} - {h.get('signals')}")