# connectd/scoutd/reddit.py

"""
scoutd/reddit.py - reddit discovery with TAVILY web search
CRITICAL: always quote usernames in tavily searches to avoid fuzzy matching
"""
import hashlib
import json
import os
import re
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text, ALIGNED_SUBREDDITS, NEGATIVE_SUBREDDITS
from .lost import (
    analyze_reddit_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
    STUCK_SUBREDDITS,
)
# Descriptive UA per reddit API etiquette; sent with every reddit request.
HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'}
# On-disk JSON cache for reddit API responses (see _api_get).
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit'
# Optional: raises the GitHub API rate limit when set.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
# SECURITY: a live Tavily key was previously hard-coded here as the default and
# committed to source control. The key must come from the environment only;
# rotate the leaked key. tavily_search() degrades gracefully when this is unset.
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
def _api_get(url, params=None, headers=None):
    """GET *url* as JSON with a one-hour on-disk cache.

    Returns the decoded JSON payload, or None on any network/HTTP/decode
    failure. Sleeps 1s before each real request as crude rate limiting.

    Fix: the cache filename was derived from the builtin ``hash()`` of the
    key string, which is randomized per process (PYTHONHASHSEED), so cached
    entries were never found again after a restart. Use a stable hashlib
    digest instead.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # Stable across processes, unlike hash(str).
    digest = hashlib.sha256(cache_key.encode('utf-8')).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            # unreadable or corrupt cache entry: fall through to a fresh fetch
            pass
    time.sleep(1)  # basic politeness delay for the reddit API
    req_headers = {**HEADERS, **(headers or {})}
    try:
        resp = requests.get(url, headers=req_headers, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except (requests.RequestException, ValueError, OSError):
        # network error, non-2xx, bad JSON, or cache-write failure:
        # callers treat None as "no data"
        return None
def tavily_search(query, max_results=10):
    """Run one Tavily web search and return its result list.

    Returns [] when no API key is configured, on a non-200 response,
    or on any request/decode error (which is printed, not raised).
    """
    if not TAVILY_API_KEY:
        return []
    payload = {'api_key': TAVILY_API_KEY, 'query': query, 'max_results': max_results}
    try:
        response = requests.post('https://api.tavily.com/search', json=payload, timeout=30)
        if response.status_code == 200:
            return response.json().get('results', [])
    except Exception as exc:
        print(f" tavily error: {exc}")
    return []
def extract_links_from_text(text, username=None):
    """Scan free text for contact/profile links.

    Returns a dict with any of the keys: 'email', 'github', 'mastodon',
    'bluesky', 'twitter', 'linkedin', 'twitch', 'itch', 'website'.
    When *username* is given, matches containing it are preferred (and for
    github, required) to cut down on false positives.
    """
    links = {}
    if not text:
        return links
    uname = username.lower() if username else None

    # email: prefer one containing the username, otherwise keep the first
    # plausible hit; role/placeholder addresses are skipped entirely
    for candidate in re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text):
        lowered = candidate.lower()
        if any(tok in lowered for tok in ['noreply', 'example', '@reddit', 'info@', 'support@', 'contact@', 'admin@']):
            continue
        if uname and uname in lowered:
            links['email'] = candidate
            break
        links.setdefault('email', candidate)

    # github: only trust a handle that exactly matches the username
    for handle in re.findall(r'github\.com/([a-zA-Z0-9_-]+)', text):
        if handle.lower() in ('topics', 'explore', 'trending', 'sponsors', 'orgs'):
            continue
        if uname and handle.lower() == uname:
            links['github'] = handle
            break

    # mastodon: generic @user@domain first, then well-known instance URLs
    # (an instance-URL hit overrides the generic match)
    fedi = re.search(r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
    if fedi:
        links['mastodon'] = f"@{fedi.group(1)}@{fedi.group(2)}"
    for instance in ('mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt'):
        profile = re.search(f'{instance}/@([a-zA-Z0-9_]+)', text)
        if profile:
            links['mastodon'] = f"@{profile.group(1)}@{instance}"
            break

    # bluesky profile URL
    sky = re.search(r'bsky\.app/profile/([a-zA-Z0-9_.-]+)', text)
    if sky:
        links['bluesky'] = sky.group(1)

    # twitter / x, ignoring site-navigation paths
    tweet = re.search(r'(?:twitter|x)\.com/([a-zA-Z0-9_]+)', text)
    if tweet and tweet.group(1).lower() not in ('home', 'explore', 'search'):
        links['twitter'] = tweet.group(1)

    # linkedin profile
    pro = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', text)
    if pro:
        links['linkedin'] = f"https://linkedin.com/in/{pro.group(1)}"

    # twitch channel
    stream = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)', text)
    if stream:
        links['twitch'] = f"https://twitch.tv/{stream.group(1)}"

    # itch.io profile
    games = re.search(r'itch\.io/profile/([a-zA-Z0-9_-]+)', text)
    if games:
        links['itch'] = f"https://itch.io/profile/{games.group(1)}"

    # personal website: first non-big-platform URL, preferring one that
    # contains the username
    for bare in re.findall(r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)', text):
        blocked = ('reddit', 'imgur', 'google', 'facebook', 'twitter', 'youtube', 'wikipedia', 'amazon')
        if any(tok in bare.lower() for tok in blocked):
            continue
        if uname and uname in bare.lower():
            links['website'] = f"https://{bare}"
            break
        links.setdefault('website', f"https://{bare}")
    return links
def cross_platform_discovery(username, full_text=''):
    """
    search the ENTIRE internet using TAVILY.
    CRITICAL: always quote username to avoid fuzzy matching!

    username: reddit handle to look for on other platforms.
    full_text: optional blob of the user's reddit text; link extraction runs
        over this plus everything the searches return.
    Returns a dict of discovered profiles/contacts keyed by platform
    ('github', 'twitch', 'itch', 'linkedin', 'email', 'mastodon',
    'bluesky', 'website', ...). Values discovered first win; later
    sources never overwrite an existing key.
    """
    found = {}
    all_content = full_text
    username_lower = username.lower()
    print(f" 🔍 cross-platform search for {username}...")
    # ALWAYS QUOTE THE USERNAME - critical for exact matching
    searches = [
        f'"{username}"', # just username, quoted
        f'"{username}" github', # github
        f'"{username}" developer programmer', # dev context
        f'"{username}" email contact', # contact
        f'"{username}" mastodon', # fediverse
    ]
    for query in searches:
        print(f" 🌐 tavily: {query}")
        results = tavily_search(query, max_results=5)
        for result in results:
            # url is lowercased here, so the substring checks below are
            # case-insensitive on the URL side
            url = result.get('url', '').lower()
            title = result.get('title', '')
            content = result.get('content', '')
            combined = f"{url} {title} {content}"
            # validate username appears somewhere in the hit; otherwise the
            # result is noise from fuzzy matching and is skipped entirely
            if username_lower not in combined.lower():
                continue
            # accumulate hit text so the regex pass below sees everything
            all_content += f" {combined}"
            # extract platform handles directly from the result URL
            if f'github.com/{username_lower}' in url and not found.get('github'):
                found['github'] = username
                print(f" ✓ github: {username}")
            if f'twitch.tv/{username_lower}' in url and not found.get('twitch'):
                found['twitch'] = f"https://twitch.tv/{username}"
                print(f" ✓ twitch")
            if 'itch.io/profile/' in url and username_lower in url and not found.get('itch'):
                found['itch'] = url if url.startswith('http') else f"https://{url}"
                print(f" ✓ itch.io")
            # linkedin slugs rarely match reddit usernames, so any profile
            # URL in a validated hit is accepted here
            if 'linkedin.com/in/' in url and not found.get('linkedin'):
                li = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', url)
                if li:
                    found['linkedin'] = f"https://linkedin.com/in/{li.group(1)}"
                    print(f" ✓ linkedin")
        # regex pass over all accumulated text (reddit text + every hit so far)
        extracted = extract_links_from_text(all_content, username)
        for k, v in extracted.items():
            if k not in found:
                found[k] = v
                print(f"{k}")
        # good contact found? stop issuing further tavily queries
        if found.get('email') or found.get('github') or found.get('mastodon') or found.get('twitch'):
            break
    # === API CHECKS === direct lookups for anything the searches missed
    if not found.get('github'):
        headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}
        try:
            resp = requests.get(f'https://api.github.com/users/{username}', headers=headers, timeout=10)
            if resp.status_code == 200:
                # a 200 means a github account with this exact name exists;
                # also harvest public email/blog from the profile if present
                data = resp.json()
                found['github'] = username
                print(f" ✓ github API")
                if data.get('email') and 'email' not in found:
                    found['email'] = data['email']
                if data.get('blog') and 'website' not in found:
                    found['website'] = data['blog'] if data['blog'].startswith('http') else f"https://{data['blog']}"
        except:
            pass
    if not found.get('mastodon'):
        # probe a handful of big instances for an account with this name
        for inst in ['mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt']:
            try:
                resp = requests.get(f'https://{inst}/api/v1/accounts/lookup', params={'acct': username}, timeout=5)
                if resp.status_code == 200:
                    found['mastodon'] = f"@{username}@{inst}"
                    print(f" ✓ mastodon: {found['mastodon']}")
                    break
            except:
                continue
    if not found.get('bluesky'):
        # only checks the default <username>.bsky.social handle, not custom domains
        try:
            resp = requests.get('https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile',
                                params={'actor': f'{username}.bsky.social'}, timeout=10)
            if resp.status_code == 200:
                found['bluesky'] = resp.json().get('handle')
                print(f" ✓ bluesky")
        except:
            pass
    return found
def get_user_profile(username):
    """Fetch a redditor's public about-page.

    Returns a small dict (username, bio, title, total_karma) or None when
    the request fails or the payload is malformed.
    """
    payload = _api_get(f'https://www.reddit.com/user/{username}/about.json')
    if not payload or 'data' not in payload:
        return None
    info = payload['data']
    sub = info.get('subreddit', {})
    return {
        'username': username,
        'bio': sub.get('public_description', ''),
        'title': sub.get('title', ''),
        'total_karma': info.get('total_karma', 0),
    }
def get_subreddit_users(subreddit, limit=100):
    """Collect distinct recent author names from a subreddit.

    Pulls both the 'new' posts feed and the 'comments' feed; deleted
    accounts and AutoModerator are excluded. Returns a set of usernames.
    """
    authors = set()
    excluded = ('[deleted]', 'AutoModerator')
    for feed in ('new', 'comments'):
        payload = _api_get(f'https://www.reddit.com/r/{subreddit}/{feed}.json', {'limit': limit})
        if not payload or 'data' not in payload:
            continue
        for child in payload['data'].get('children', []):
            name = child['data'].get('author')
            if name and name not in excluded:
                authors.add(name)
    return authors
def get_user_activity(username):
    """Return a user's recent posts and comments as a flat list of dicts.

    Each entry has: type ('post'/'comment'), subreddit, title, body, score.
    Feeds that fail to load are skipped, so the list may be partial or empty.
    """
    items = []
    for feed in ('submitted', 'comments'):
        payload = _api_get(f'https://www.reddit.com/user/{username}/{feed}.json', {'limit': 100})
        if not payload or 'data' not in payload:
            continue
        kind = 'post' if feed == 'submitted' else 'comment'
        for child in payload['data'].get('children', []):
            entry = child['data']
            items.append({
                'type': kind,
                'subreddit': entry.get('subreddit'),
                'title': entry.get('title', ''),
                # posts carry selftext, comments carry body; take whichever is set
                'body': entry.get('selftext', '') or entry.get('body', ''),
                'score': entry.get('score', 0),
            })
    return items
def analyze_reddit_user(username):
    """Build a scored discovery record for one reddit user.

    Returns None when the user has no visible activity; otherwise a dict
    combining subreddit-alignment scoring, text-signal analysis
    (scoutd.signals), cross-platform contact discovery (Tavily + APIs),
    and lost-builder classification (scoutd.lost).
    """
    activity = get_user_activity(username)
    if not activity:
        return None
    profile = get_user_profile(username)
    # per-subreddit activity counts, all text, and summed karma in one pass
    sub_activity = defaultdict(int)
    text_parts = []
    total_karma = 0
    for item in activity:
        sub = item.get('subreddit', '').lower()
        if sub:
            sub_activity[sub] += 1
        if item.get('title'):
            text_parts.append(item['title'])
        if item.get('body'):
            text_parts.append(item['body'])
        total_karma += item.get('score', 0)
    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)
    # contact links: profile bio/title first, then post/comment text
    external_links = {}
    if profile:
        external_links.update(extract_links_from_text(f"{profile.get('bio', '')} {profile.get('title', '')}", username))
    external_links.update(extract_links_from_text(full_text, username))
    # TAVILY search (web-wide; may be slow — one search per query plus API probes)
    discovered = cross_platform_discovery(username, full_text)
    external_links.update(discovered)
    # scoring: weighted aligned-subreddit activity, capped at 5 items per sub
    sub_score = 0
    aligned_subs = []
    for sub, count in sub_activity.items():
        weight = ALIGNED_SUBREDDITS.get(sub, 0)
        if weight > 0:
            sub_score += weight * min(count, 5)
            aligned_subs.append(sub)
    # breadth bonus for being active across several aligned subs
    if len(aligned_subs) >= 5:
        sub_score += 30
    elif len(aligned_subs) >= 3:
        sub_score += 15
    # heavy penalty per negative subreddit (sub is already lowercased above;
    # NEGATIVE_SUBREDDITS is lowercased defensively here)
    for sub in sub_activity:
        if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]:
            sub_score -= 50
            negative_signals.append(f"r/{sub}")
    total_score = text_score + sub_score
    # reachability bonuses: each discovered contact channel adds points
    if external_links.get('github'):
        total_score += 10
        positive_signals.append('github')
    if external_links.get('mastodon'):
        total_score += 10
        positive_signals.append('mastodon')
    if external_links.get('email'):
        total_score += 15
        positive_signals.append('email')
    if external_links.get('twitch'):
        total_score += 5
        positive_signals.append('twitch')
    # lost builder: merge activity-based and text-based lost signals
    subreddits_list = list(sub_activity.keys())
    lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list)
    text_lost_signals, _ = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
    # a github presence is treated as evidence of builder activity
    builder_activity = 20 if external_links.get('github') else 0
    user_type = classify_user(lost_weight, builder_activity, total_score)
    # heuristic confidence, capped at 0.95
    confidence = min(0.95, 0.3 + (0.2 if len(activity) > 20 else 0) + (0.2 if len(aligned_subs) >= 2 else 0) + (0.1 if external_links else 0))
    # reddit_only: no off-reddit contact channel was found at all
    reddit_only = not any([external_links.get(k) for k in ['github', 'mastodon', 'bluesky', 'email', 'matrix', 'linkedin', 'twitch', 'itch']])
    return {
        'platform': 'reddit',
        'username': username,
        'url': f"https://reddit.com/u/{username}",
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'subreddits': aligned_subs,
        'activity_count': len(activity),
        'karma': total_karma,
        'reasons': [f"contact: {', '.join(external_links.keys())}"] if external_links else [],
        'scraped_at': datetime.now().isoformat(),
        'external_links': external_links,
        'reddit_only': reddit_only,
        # 'extra' duplicates external_links — presumably a legacy consumer
        # reads this key; verify before removing
        'extra': external_links,
        'lost_potential_score': lost_weight,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }
def scrape_reddit(db, limit_per_sub=50):
    """Discover candidate humans across seed subreddits and persist them.

    Only users seen in at least two seed subreddits get the (expensive) full
    analysis; results with a positive score are saved via db.save_human and
    returned as a list. Per-user failures are printed and skipped.
    """
    print("scoutd/reddit: scraping (TAVILY enabled)...")
    seed_subs = ['intentionalcommunity', 'cohousing', 'selfhosted', 'homeassistant', 'solarpunk', 'cooperatives', 'privacy', 'localllama', 'homelab', 'learnprogramming']
    seen_in = defaultdict(set)
    for sub in seed_subs:
        for user in get_subreddit_users(sub, limit=limit_per_sub):
            seen_in[user].add(sub)
    candidates = {user: subs for user, subs in seen_in.items() if len(subs) >= 2}
    print(f" {len(candidates)} users in 2+ subs")
    humans = []
    for username in candidates:
        try:
            analysis = analyze_reddit_user(username)
            if analysis and analysis['score'] > 0:
                humans.append(analysis)
                db.save_human(analysis)
        except Exception as exc:
            print(f" error: {username}: {exc}")
    print(f"scoutd/reddit: {len(humans)} humans")
    return humans
def _add_to_manual_queue(result):
    """Append a discovered user to the manual-review queue file.

    Idempotent per username: an entry already queued is not re-added.
    Fix: a corrupt or unreadable manual_queue.json previously raised out of
    json.loads and crashed the caller; now the queue is rebuilt from scratch.
    """
    queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json'
    queue_file.parent.mkdir(parents=True, exist_ok=True)
    queue = []
    if queue_file.exists():
        try:
            queue = json.loads(queue_file.read_text())
        except (OSError, ValueError):
            # corrupt/unreadable queue: start fresh rather than abort the scrape
            queue = []
    if not any(q.get('username') == result['username'] for q in queue):
        queue.append({'platform': 'reddit', 'username': result['username'], 'url': result['url'], 'score': result['score'], 'queued_at': datetime.now().isoformat()})
        queue_file.write_text(json.dumps(queue, indent=2))