# ha-addons/connectd/scoutd/github.py
# 2025-12-15 11:06:57 -06:00 · 330 lines · 11 KiB · Python

"""
scoutd/github.py - github discovery
scrapes repos, bios, commit patterns to find aligned builders
also detects lost builders - people with potential who haven't started yet
"""
import hashlib
import json
import os
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text, TARGET_TOPICS, ECOSYSTEM_REPOS
from .lost import (
    analyze_github_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
)
from .handles import discover_all_handles
# rate limit: 60/hr unauthenticated, 5000/hr with token
# token is read once at import time; restart the process after changing it
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
# base headers sent on every API call (GitHub v3 JSON media type)
HEADERS = {'Accept': 'application/vnd.github.v3+json'}
if GITHUB_TOKEN:
    # token auth lifts the rate limit to 5000 requests/hour
    HEADERS['Authorization'] = f'token {GITHUB_TOKEN}'
# on-disk response cache: <package parent>/db/cache/github
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'github'
def _api_get(url, params=None):
    """GET a GitHub API endpoint with throttling and a 1-hour disk cache.

    Args:
        url: full API URL to fetch.
        params: optional dict of query parameters (part of the cache key).

    Returns:
        The parsed JSON response (dict or list), the cached copy if it is
        less than an hour old, or None on any request error.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # BUG FIX: built-in hash() is randomized per process (PYTHONHASHSEED),
    # so the old `hash(cache_key) & 0xffffffff` filenames changed on every
    # restart — the cache never persisted and entries could collide.
    # A stable hashlib digest keys the same request to the same file forever.
    digest = hashlib.sha1(cache_key.encode('utf-8')).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # check cache (1 hour expiry)
    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            # unreadable or corrupt cache entry: ignore and refetch
            pass
    # crude throttle: stay under 5000/hr with a token, 60/hr without
    time.sleep(0.5 if GITHUB_TOKEN else 2)
    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        # cache the payload alongside its fetch timestamp
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f" github api error: {e}")
        return None
def search_repos_by_topic(topic, per_page=100):
    """Query the repository search API for one topic tag.

    Returns the matching repos sorted by stars (best first), or an
    empty list when the API call fails.
    """
    query = {
        'q': f'topic:{topic}',
        'sort': 'stars',
        'order': 'desc',
        'per_page': per_page,
    }
    payload = _api_get('https://api.github.com/search/repositories', query)
    if not payload:
        return []
    return payload.get('items', [])
def get_repo_contributors(repo_full_name, per_page=100):
    """Return the top contributors for `owner/name`, or [] on API failure."""
    endpoint = f'https://api.github.com/repos/{repo_full_name}/contributors'
    contributors = _api_get(endpoint, {'per_page': per_page})
    return contributors if contributors else []
def get_github_user(login):
    """Fetch the full profile for a single user; None if the call fails."""
    profile_url = f'https://api.github.com/users/{login}'
    return _api_get(profile_url)
def get_user_repos(login, per_page=100):
    """List a user's repos, most recently pushed first; [] on API failure."""
    endpoint = f'https://api.github.com/users/{login}/repos'
    repos = _api_get(endpoint, {'per_page': per_page, 'sort': 'pushed'})
    return repos if repos else []
def _collect_profile_corpus(user, repos):
    """Gather free text and repo stats from a github profile.

    Returns (text_parts, all_topics, languages, total_stars):
    text_parts is bio/company/location plus every repo description,
    all_topics is every repo topic (duplicates kept), languages maps
    language name -> repo count, total_stars sums stargazer counts.
    """
    text_parts = [user[key] for key in ('bio', 'company', 'location') if user.get(key)]
    all_topics = []
    languages = defaultdict(int)
    total_stars = 0
    for repo in repos:
        if repo.get('description'):
            text_parts.append(repo['description'])
        if repo.get('topics'):
            all_topics.extend(repo['topics'])
        if repo.get('language'):
            languages[repo['language']] += 1
        total_stars += repo.get('stargazers_count', 0)
    return text_parts, all_topics, languages, total_stars


def _builder_score(repo_count):
    """Builder score: repos indicate building, not just talking."""
    if repo_count > 20:
        return 15
    if repo_count > 10:
        return 10
    if repo_count > 5:
        return 5
    return 0


def _estimate_confidence(user, repos, text_parts, total_stars):
    """Heuristic confidence (0.3-1.0) based on how much data was available."""
    confidence = 0.3
    if user.get('bio'):
        confidence += 0.15
    if len(repos) > 5:
        confidence += 0.15
    if len(text_parts) > 5:
        confidence += 0.15
    if user.get('email') or user.get('blog') or user.get('twitter_username'):
        confidence += 0.15
    if total_stars > 100:
        confidence += 0.1
    return min(confidence, 1.0)


def analyze_github_user(login):
    """
    analyze a github user for values alignment
    returns dict with score, confidence, signals, contact info
    (None when the profile cannot be fetched)
    """
    user = get_github_user(login)
    if not user:
        return None
    repos = get_user_repos(login)
    # collect text corpus + per-repo stats
    text_parts, all_topics, languages, total_stars = _collect_profile_corpus(user, repos)
    full_text = ' '.join(text_parts)
    # analyze values signals in the text
    text_score, positive_signals, negative_signals = analyze_text(full_text)
    # topic alignment: 10 pts per target topic present
    aligned_topics = set(all_topics) & set(TARGET_TOPICS)
    topic_score = len(aligned_topics) * 10
    builder_score = _builder_score(len(repos))
    # small bonus for profiles marked hireable
    hireable_score = 5 if user.get('hireable') else 0
    total_score = text_score + topic_score + builder_score + hireable_score
    # === LOST BUILDER DETECTION ===
    # build profile dict for lost analysis
    profile_for_lost = {
        'bio': user.get('bio'),
        'repos': repos,
        'public_repos': user.get('public_repos', len(repos)),
        'followers': user.get('followers', 0),
        'following': user.get('following', 0),
        'extra': {
            'top_repos': repos[:10],
        },
    }
    lost_signals, lost_weight = analyze_github_for_lost_signals(profile_for_lost)
    # also check text for lost language patterns; merge without duplicates
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
    lost_weight += text_lost_weight
    lost_potential_score = lost_weight
    # classify: builder, lost, both, or none
    user_type = classify_user(lost_potential_score, builder_score, total_score)
    confidence = _estimate_confidence(user, repos, text_parts, total_stars)
    # build human-readable reasons backing the score
    reasons = []
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if aligned_topics:
        reasons.append(f"topics: {', '.join(list(aligned_topics)[:5])}")
    if builder_score > 0:
        reasons.append(f"builder ({len(repos)} repos)")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")
    # add lost reasons if applicable
    if user_type in ('lost', 'both'):
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")
    # === DEEP HANDLE DISCOVERY ===
    # follow blog links, scrape websites, find ALL social handles
    handles, discovered_emails = discover_all_handles(user)
    # merge discovered emails with the github profile email.
    # BUG FIX: copy before appending — the old code appended directly to the
    # list returned by discover_all_handles, mutating the caller's data.
    all_emails = list(discovered_emails or [])
    if user.get('email'):
        all_emails.append(user['email'])
    # BUG FIX: dedupe with dict.fromkeys instead of set() so ordering is
    # deterministic — all_emails[0] (the primary contact email) used to be
    # arbitrary from run to run. Discovered emails keep priority over the
    # github profile email; noreply addresses are dropped.
    all_emails = list(dict.fromkeys(
        e for e in all_emails if e and 'noreply' not in e.lower()
    ))
    return {
        'platform': 'github',
        'username': login,
        'url': f"https://github.com/{login}",
        'name': user.get('name'),
        'bio': user.get('bio'),
        'location': user.get('location'),
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'topics': list(aligned_topics),
        'languages': dict(languages),
        'repo_count': len(repos),
        'total_stars': total_stars,
        'reasons': reasons,
        'contact': {
            'email': all_emails[0] if all_emails else None,
            'emails': all_emails,
            'blog': user.get('blog'),
            'twitter': user.get('twitter_username') or handles.get('twitter'),
            'mastodon': handles.get('mastodon'),
            'bluesky': handles.get('bluesky'),
            'matrix': handles.get('matrix'),
            'lemmy': handles.get('lemmy'),
        },
        'extra': {
            'topics': list(aligned_topics),
            'languages': dict(languages),
            'repo_count': len(repos),
            'total_stars': total_stars,
            'hireable': user.get('hireable', False),
            'handles': handles,  # all discovered handles
        },
        'hireable': user.get('hireable', False),
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,  # 'builder', 'lost', 'both', 'none'
    }
def scrape_github(db, limit_per_source=50):
    """
    full github scrape
    returns list of analyzed users
    """
    print("scoutd/github: starting scrape...")
    candidates = set()
    # 1. ecosystem repo contributors
    print(" scraping ecosystem repo contributors...")
    for repo_name in ECOSYSTEM_REPOS:
        contributors = get_repo_contributors(repo_name, per_page=limit_per_source)
        candidates.update(
            contributor['login']
            for contributor in contributors
            if contributor.get('login') and not contributor['login'].endswith('[bot]')
        )
        print(f" {repo_name}: {len(contributors)} contributors")
    # 2. topic repos
    print(" scraping topic repos...")
    for topic in TARGET_TOPICS[:10]:
        found = search_repos_by_topic(topic, per_page=30)
        for repo_info in found:
            owner_login = repo_info.get('owner', {}).get('login')
            if owner_login and not owner_login.endswith('[bot]'):
                candidates.add(owner_login)
        print(f" #{topic}: {len(found)} repos")
    print(f" found {len(candidates)} unique users to analyze")
    # analyze each candidate, tallying builder/lost classifications
    results = []
    builders_found = 0
    lost_found = 0
    for i, login in enumerate(candidates):
        if i % 20 == 0:
            print(f" analyzing... {i}/{len(candidates)}")
        try:
            profile = analyze_github_user(login)
            if profile is None or profile['score'] <= 0:
                continue
            results.append(profile)
            db.save_human(profile)
            user_type = profile.get('user_type', 'none')
            if user_type == 'builder':
                builders_found += 1
                if profile['score'] >= 50:
                    print(f"{login}: {profile['score']} pts, {profile['confidence']:.0%} conf")
            elif user_type == 'lost':
                lost_found += 1
                lost_score = profile.get('lost_potential_score', 0)
                if lost_score >= 40:
                    print(f" 💔 {login}: lost_score={lost_score}, values={profile['score']} pts")
            elif user_type == 'both':
                builders_found += 1
                lost_found += 1
                print(f"{login}: recovering builder (lost={profile.get('lost_potential_score', 0)}, active={profile['score']})")
        except Exception as e:
            # best-effort: one bad profile must not abort the whole scrape
            print(f" error on {login}: {e}")
    print(f"scoutd/github: found {len(results)} aligned humans")
    print(f" - {builders_found} active builders")
    print(f" - {lost_found} lost builders (need encouragement)")
    return results