# mirror of https://github.com/sudoxnym/ha-addons.git
# synced 2026-04-14 11:37:33 +00:00
"""
|
|
scoutd/github.py - github discovery
|
|
scrapes repos, bios, commit patterns to find aligned builders
|
|
also detects lost builders - people with potential who haven't started yet
|
|
"""
|
|
|
|
import hashlib
import json
import os
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import requests

from .handles import discover_all_handles
from .lost import (
    analyze_github_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
)
from .signals import analyze_text, TARGET_TOPICS, ECOSYSTEM_REPOS
|
|
|
|
# rate limit: 60/hr unauthenticated, 5000/hr with token
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')

# default headers for every api call; token auth is appended when available
HEADERS = {'Accept': 'application/vnd.github.v3+json'}
if GITHUB_TOKEN:
    HEADERS['Authorization'] = f'token {GITHUB_TOKEN}'

# on-disk response cache: <package root>/db/cache/github
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'github'
|
|
|
|
|
|
def _api_get(url, params=None):
    """rate-limited GET against the github api with a 1-hour disk cache.

    returns the decoded json payload, or None on any request error.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # use a stable content digest for the cache filename. the builtin
    # hash() on str is randomized per process (PYTHONHASHSEED), so the
    # old `hash(cache_key) & 0xffffffff` never hit the cache after a
    # restart and leaked stale files forever.
    digest = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # check cache (1 hour expiry)
    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (json.JSONDecodeError, OSError, ValueError):
            # corrupt or unreadable cache entry: fall through and refetch
            pass

    # crude client-side rate limit: faster when authenticated
    time.sleep(0.5 if GITHUB_TOKEN else 2)

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()

        # cache the successful response alongside its fetch time
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f"  github api error: {e}")
        return None
|
|
|
|
|
|
def search_repos_by_topic(topic, per_page=100):
    """search github repos carrying a topic tag, most-starred first"""
    query = {
        'q': f'topic:{topic}',
        'sort': 'stars',
        'order': 'desc',
        'per_page': per_page,
    }
    data = _api_get('https://api.github.com/search/repositories', query)
    if not data:
        return []
    return data.get('items', [])
|
|
|
|
|
|
def get_repo_contributors(repo_full_name, per_page=100):
    """list a repo's top contributors (empty list on api failure)"""
    endpoint = f'https://api.github.com/repos/{repo_full_name}/contributors'
    contributors = _api_get(endpoint, {'per_page': per_page})
    return contributors if contributors else []
|
|
|
|
|
|
def get_github_user(login):
    """fetch a user's full profile (None on api failure)"""
    return _api_get(f'https://api.github.com/users/{login}')
|
|
|
|
|
|
def get_user_repos(login, per_page=100):
    """fetch a user's repos, most recently pushed first (empty on failure)"""
    endpoint = f'https://api.github.com/users/{login}/repos'
    repos = _api_get(endpoint, {'per_page': per_page, 'sort': 'pushed'})
    return repos if repos else []
|
|
|
|
|
|
def analyze_github_user(login):
    """
    analyze a github user for values alignment.

    fetches the profile and repos, scores the combined text/topic/builder
    signals, runs lost-builder detection, then does deep handle discovery
    across linked sites.

    returns a dict with score, confidence, signals, contact info, and
    lost-builder fields — or None if the profile fetch fails.
    """
    user = get_github_user(login)
    if not user:
        return None

    repos = get_user_repos(login)

    # collect text corpus: profile fields + every repo description
    text_parts = []
    if user.get('bio'):
        text_parts.append(user['bio'])
    if user.get('company'):
        text_parts.append(user['company'])
    if user.get('location'):
        text_parts.append(user['location'])

    # analyze repos: topics, language mix, star total
    all_topics = []
    languages = defaultdict(int)
    total_stars = 0

    for repo in repos:
        if repo.get('description'):
            text_parts.append(repo['description'])
        if repo.get('topics'):
            all_topics.extend(repo['topics'])
        if repo.get('language'):
            languages[repo['language']] += 1
        total_stars += repo.get('stargazers_count', 0)

    full_text = ' '.join(text_parts)

    # values-alignment text signals
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # topic alignment: 10 pts per target topic the user touches
    aligned_topics = set(all_topics) & set(TARGET_TOPICS)
    topic_score = len(aligned_topics) * 10

    # builder score (repos indicate building, not just talking)
    builder_score = 0
    if len(repos) > 20:
        builder_score = 15
    elif len(repos) > 10:
        builder_score = 10
    elif len(repos) > 5:
        builder_score = 5

    # hireable bonus
    hireable_score = 5 if user.get('hireable') else 0

    # total score
    total_score = text_score + topic_score + builder_score + hireable_score

    # === LOST BUILDER DETECTION ===
    # build profile dict for lost analysis
    profile_for_lost = {
        'bio': user.get('bio'),
        'repos': repos,
        'public_repos': user.get('public_repos', len(repos)),
        'followers': user.get('followers', 0),
        'following': user.get('following', 0),
        'extra': {
            'top_repos': repos[:10],
        },
    }

    # analyze for lost signals
    lost_signals, lost_weight = analyze_github_for_lost_signals(profile_for_lost)

    # also check text for lost language patterns (dedupe against the above)
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
    lost_weight += text_lost_weight

    lost_potential_score = lost_weight

    # classify: builder, lost, both, or none
    user_type = classify_user(lost_potential_score, builder_score, total_score)

    # confidence based on data richness, capped at 1.0
    confidence = 0.3
    if user.get('bio'):
        confidence += 0.15
    if len(repos) > 5:
        confidence += 0.15
    if len(text_parts) > 5:
        confidence += 0.15
    if user.get('email') or user.get('blog') or user.get('twitter_username'):
        confidence += 0.15
    if total_stars > 100:
        confidence += 0.1
    confidence = min(confidence, 1.0)

    # build human-readable reasons
    reasons = []
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if aligned_topics:
        reasons.append(f"topics: {', '.join(list(aligned_topics)[:5])}")
    if builder_score > 0:
        reasons.append(f"builder ({len(repos)} repos)")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    # add lost reasons if applicable
    if user_type == 'lost' or user_type == 'both':
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")

    # === DEEP HANDLE DISCOVERY ===
    # follow blog links, scrape websites, find ALL social handles
    handles, discovered_emails = discover_all_handles(user)

    # merge discovered emails with github email; filter out noreply addrs.
    # copy first: the old `discovered_emails or []` aliased the list
    # returned by discover_all_handles and appended into it.
    all_emails = list(discovered_emails or [])
    if user.get('email'):
        all_emails.append(user['email'])
    all_emails = list(set(e for e in all_emails if e and 'noreply' not in e.lower()))

    return {
        'platform': 'github',
        'username': login,
        'url': f"https://github.com/{login}",
        'name': user.get('name'),
        'bio': user.get('bio'),
        'location': user.get('location'),
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'topics': list(aligned_topics),
        'languages': dict(languages),
        'repo_count': len(repos),
        'total_stars': total_stars,
        'reasons': reasons,
        'contact': {
            'email': all_emails[0] if all_emails else None,
            'emails': all_emails,
            'blog': user.get('blog'),
            'twitter': user.get('twitter_username') or handles.get('twitter'),
            'mastodon': handles.get('mastodon'),
            'bluesky': handles.get('bluesky'),
            'matrix': handles.get('matrix'),
            'lemmy': handles.get('lemmy'),
        },
        'extra': {
            'topics': list(aligned_topics),
            'languages': dict(languages),
            'repo_count': len(repos),
            'total_stars': total_stars,
            'hireable': user.get('hireable', False),
            'handles': handles,  # all discovered handles
        },
        'hireable': user.get('hireable', False),
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,  # 'builder', 'lost', 'both', 'none'
    }
|
|
|
|
|
|
def scrape_github(db, limit_per_source=50):
    """
    full github scrape.

    harvests candidate logins from ecosystem repo contributors and from
    owners of topic-tagged repos, analyzes each candidate, saves aligned
    humans to db, and returns the list of analyzed users.
    """
    print("scoutd/github: starting scrape...")

    candidates = set()

    # 1. ecosystem repo contributors
    print("  scraping ecosystem repo contributors...")
    for repo in ECOSYSTEM_REPOS:
        contributors = get_repo_contributors(repo, per_page=limit_per_source)
        candidates.update(
            c['login']
            for c in contributors
            if c.get('login') and not c['login'].endswith('[bot]')
        )
        print(f"    {repo}: {len(contributors)} contributors")

    # 2. owners of repos carrying target topics
    print("  scraping topic repos...")
    for topic in TARGET_TOPICS[:10]:
        repos = search_repos_by_topic(topic, per_page=30)
        candidates.update(
            r['owner']['login']
            for r in repos
            if r.get('owner', {}).get('login')
            and not r['owner']['login'].endswith('[bot]')
        )
        print(f"    #{topic}: {len(repos)} repos")

    print(f"  found {len(candidates)} unique users to analyze")

    # analyze each candidate, tallying builder/lost counts as we go
    results = []
    builders_found = 0
    lost_found = 0

    for idx, login in enumerate(candidates):
        if idx % 20 == 0:
            print(f"  analyzing... {idx}/{len(candidates)}")

        try:
            result = analyze_github_user(login)
            if not result or result['score'] <= 0:
                continue

            results.append(result)
            db.save_human(result)

            user_type = result.get('user_type', 'none')

            if user_type == 'builder':
                builders_found += 1
                if result['score'] >= 50:
                    print(f"    ★ {login}: {result['score']} pts, {result['confidence']:.0%} conf")

            elif user_type == 'lost':
                lost_found += 1
                lost_score = result.get('lost_potential_score', 0)
                if lost_score >= 40:
                    print(f"    💔 {login}: lost_score={lost_score}, values={result['score']} pts")

            elif user_type == 'both':
                builders_found += 1
                lost_found += 1
                print(f"    ⚡ {login}: recovering builder (lost={result.get('lost_potential_score', 0)}, active={result['score']})")

        except Exception as e:
            # best-effort scrape: one bad profile shouldn't kill the run
            print(f"    error on {login}: {e}")

    print(f"scoutd/github: found {len(results)} aligned humans")
    print(f"  - {builders_found} active builders")
    print(f"  - {lost_found} lost builders (need encouragement)")
    return results
|