# ha-addons/connectd/scoutd/github.py
# 2025-12-15 11:06:57 -06:00 · 330 lines · 11 KiB · Python

"""
scoutd/github.py - github discovery
scrapes repos, bios, commit patterns to find aligned builders
also detects lost builders - people with potential who haven't started yet
"""
import hashlib
import json
import os
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text, TARGET_TOPICS, ECOSYSTEM_REPOS
from .lost import (
    analyze_github_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
)
from .handles import discover_all_handles
# rate limit: 60/hr unauthenticated, 5000/hr with token
# token is read once at import time; restart the process after changing it
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
# base headers sent on every API call (GitHub v3 JSON media type)
HEADERS = {'Accept': 'application/vnd.github.v3+json'}
if GITHUB_TOKEN:
    # token auth lifts the rate limit to 5000 requests/hour
    HEADERS['Authorization'] = f'token {GITHUB_TOKEN}'
# on-disk response cache: <package parent>/db/cache/github
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'github'
def _api_get(url, params=None):
    """GET a GitHub API endpoint with throttling and a 1-hour disk cache.

    Args:
        url: full API URL to fetch.
        params: optional dict of query parameters (part of the cache key).

    Returns:
        The parsed JSON response (dict or list), the cached copy if it is
        less than an hour old, or None on any request error.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # BUG FIX: built-in hash() is randomized per process (PYTHONHASHSEED),
    # so the old `hash(cache_key) & 0xffffffff` filenames changed on every
    # restart — the cache never persisted and entries could collide.
    # A stable hashlib digest keys the same request to the same file forever.
    digest = hashlib.sha1(cache_key.encode('utf-8')).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # check cache (1 hour expiry)
    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            # unreadable or corrupt cache entry: ignore and refetch
            pass
    # crude throttle: stay under 5000/hr with a token, 60/hr without
    time.sleep(0.5 if GITHUB_TOKEN else 2)
    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        # cache the payload alongside its fetch timestamp
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f" github api error: {e}")
        return None
def search_repos_by_topic(topic, per_page=100):
    """Query the repository search API for one topic tag.

    Returns the matching repos sorted by stars (best first), or an
    empty list when the API call fails.
    """
    query = {
        'q': f'topic:{topic}',
        'sort': 'stars',
        'order': 'desc',
        'per_page': per_page,
    }
    payload = _api_get('https://api.github.com/search/repositories', query)
    if not payload:
        return []
    return payload.get('items', [])
def get_repo_contributors(repo_full_name, per_page=100):
    """Return the top contributors for `owner/name`, or [] on API failure."""
    endpoint = f'https://api.github.com/repos/{repo_full_name}/contributors'
    contributors = _api_get(endpoint, {'per_page': per_page})
    return contributors if contributors else []
def get_github_user(login):
    """Fetch the full profile for a single user; None if the call fails."""
    profile_url = f'https://api.github.com/users/{login}'
    return _api_get(profile_url)
def get_user_repos(login, per_page=100):
    """List a user's repos, most recently pushed first; [] on API failure."""
    endpoint = f'https://api.github.com/users/{login}/repos'
    repos = _api_get(endpoint, {'per_page': per_page, 'sort': 'pushed'})
    return repos if repos else []
def _collect_profile_corpus(user, repos):
    """Gather free text and repo stats from a github profile.

    Returns (text_parts, all_topics, languages, total_stars):
    text_parts is bio/company/location plus every repo description,
    all_topics is every repo topic (duplicates kept), languages maps
    language name -> repo count, total_stars sums stargazer counts.
    """
    text_parts = [user[key] for key in ('bio', 'company', 'location') if user.get(key)]
    all_topics = []
    languages = defaultdict(int)
    total_stars = 0
    for repo in repos:
        if repo.get('description'):
            text_parts.append(repo['description'])
        if repo.get('topics'):
            all_topics.extend(repo['topics'])
        if repo.get('language'):
            languages[repo['language']] += 1
        total_stars += repo.get('stargazers_count', 0)
    return text_parts, all_topics, languages, total_stars


def _builder_score(repo_count):
    """Builder score: repos indicate building, not just talking."""
    if repo_count > 20:
        return 15
    if repo_count > 10:
        return 10
    if repo_count > 5:
        return 5
    return 0


def _estimate_confidence(user, repos, text_parts, total_stars):
    """Heuristic confidence (0.3-1.0) based on how much data was available."""
    confidence = 0.3
    if user.get('bio'):
        confidence += 0.15
    if len(repos) > 5:
        confidence += 0.15
    if len(text_parts) > 5:
        confidence += 0.15
    if user.get('email') or user.get('blog') or user.get('twitter_username'):
        confidence += 0.15
    if total_stars > 100:
        confidence += 0.1
    return min(confidence, 1.0)


def analyze_github_user(login):
    """
    analyze a github user for values alignment
    returns dict with score, confidence, signals, contact info
    (None when the profile cannot be fetched)
    """
    user = get_github_user(login)
    if not user:
        return None
    repos = get_user_repos(login)
    # collect text corpus + per-repo stats
    text_parts, all_topics, languages, total_stars = _collect_profile_corpus(user, repos)
    full_text = ' '.join(text_parts)
    # analyze values signals in the text
    text_score, positive_signals, negative_signals = analyze_text(full_text)
    # topic alignment: 10 pts per target topic present
    aligned_topics = set(all_topics) & set(TARGET_TOPICS)
    topic_score = len(aligned_topics) * 10
    builder_score = _builder_score(len(repos))
    # small bonus for profiles marked hireable
    hireable_score = 5 if user.get('hireable') else 0
    total_score = text_score + topic_score + builder_score + hireable_score
    # === LOST BUILDER DETECTION ===
    # build profile dict for lost analysis
    profile_for_lost = {
        'bio': user.get('bio'),
        'repos': repos,
        'public_repos': user.get('public_repos', len(repos)),
        'followers': user.get('followers', 0),
        'following': user.get('following', 0),
        'extra': {
            'top_repos': repos[:10],
        },
    }
    lost_signals, lost_weight = analyze_github_for_lost_signals(profile_for_lost)
    # also check text for lost language patterns; merge without duplicates
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
    lost_weight += text_lost_weight
    lost_potential_score = lost_weight
    # classify: builder, lost, both, or none
    user_type = classify_user(lost_potential_score, builder_score, total_score)
    confidence = _estimate_confidence(user, repos, text_parts, total_stars)
    # build human-readable reasons backing the score
    reasons = []
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if aligned_topics:
        reasons.append(f"topics: {', '.join(list(aligned_topics)[:5])}")
    if builder_score > 0:
        reasons.append(f"builder ({len(repos)} repos)")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")
    # add lost reasons if applicable
    if user_type in ('lost', 'both'):
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")
    # === DEEP HANDLE DISCOVERY ===
    # follow blog links, scrape websites, find ALL social handles
    handles, discovered_emails = discover_all_handles(user)
    # merge discovered emails with the github profile email.
    # BUG FIX: copy before appending — the old code appended directly to the
    # list returned by discover_all_handles, mutating the caller's data.
    all_emails = list(discovered_emails or [])
    if user.get('email'):
        all_emails.append(user['email'])
    # BUG FIX: dedupe with dict.fromkeys instead of set() so ordering is
    # deterministic — all_emails[0] (the primary contact email) used to be
    # arbitrary from run to run. Discovered emails keep priority over the
    # github profile email; noreply addresses are dropped.
    all_emails = list(dict.fromkeys(
        e for e in all_emails if e and 'noreply' not in e.lower()
    ))
    return {
        'platform': 'github',
        'username': login,
        'url': f"https://github.com/{login}",
        'name': user.get('name'),
        'bio': user.get('bio'),
        'location': user.get('location'),
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'topics': list(aligned_topics),
        'languages': dict(languages),
        'repo_count': len(repos),
        'total_stars': total_stars,
        'reasons': reasons,
        'contact': {
            'email': all_emails[0] if all_emails else None,
            'emails': all_emails,
            'blog': user.get('blog'),
            'twitter': user.get('twitter_username') or handles.get('twitter'),
            'mastodon': handles.get('mastodon'),
            'bluesky': handles.get('bluesky'),
            'matrix': handles.get('matrix'),
            'lemmy': handles.get('lemmy'),
        },
        'extra': {
            'topics': list(aligned_topics),
            'languages': dict(languages),
            'repo_count': len(repos),
            'total_stars': total_stars,
            'hireable': user.get('hireable', False),
            'handles': handles,  # all discovered handles
        },
        'hireable': user.get('hireable', False),
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,  # 'builder', 'lost', 'both', 'none'
    }
def scrape_github(db, limit_per_source=50):
    """
    full github scrape
    returns list of analyzed users
    """
    print("scoutd/github: starting scrape...")
    candidates = set()
    # 1. ecosystem repo contributors
    print(" scraping ecosystem repo contributors...")
    for repo_name in ECOSYSTEM_REPOS:
        contributors = get_repo_contributors(repo_name, per_page=limit_per_source)
        candidates.update(
            contributor['login']
            for contributor in contributors
            if contributor.get('login') and not contributor['login'].endswith('[bot]')
        )
        print(f" {repo_name}: {len(contributors)} contributors")
    # 2. topic repos
    print(" scraping topic repos...")
    for topic in TARGET_TOPICS[:10]:
        found = search_repos_by_topic(topic, per_page=30)
        for repo_info in found:
            owner_login = repo_info.get('owner', {}).get('login')
            if owner_login and not owner_login.endswith('[bot]'):
                candidates.add(owner_login)
        print(f" #{topic}: {len(found)} repos")
    print(f" found {len(candidates)} unique users to analyze")
    # analyze each candidate, tallying builder/lost classifications
    results = []
    builders_found = 0
    lost_found = 0
    for i, login in enumerate(candidates):
        if i % 20 == 0:
            print(f" analyzing... {i}/{len(candidates)}")
        try:
            profile = analyze_github_user(login)
            if profile is None or profile['score'] <= 0:
                continue
            results.append(profile)
            db.save_human(profile)
            user_type = profile.get('user_type', 'none')
            if user_type == 'builder':
                builders_found += 1
                if profile['score'] >= 50:
                    print(f"{login}: {profile['score']} pts, {profile['confidence']:.0%} conf")
            elif user_type == 'lost':
                lost_found += 1
                lost_score = profile.get('lost_potential_score', 0)
                if lost_score >= 40:
                    print(f" 💔 {login}: lost_score={lost_score}, values={profile['score']} pts")
            elif user_type == 'both':
                builders_found += 1
                lost_found += 1
                print(f"{login}: recovering builder (lost={profile.get('lost_potential_score', 0)}, active={profile['score']})")
        except Exception as e:
            # best-effort: one bad profile must not abort the whole scrape
            print(f" error on {login}: {e}")
    print(f"scoutd/github: found {len(results)} aligned humans")
    print(f" - {builders_found} active builders")
    print(f" - {lost_found} lost builders (need encouragement)")
    return results