connectd/scoutd/reddit.py

"""
scoutd/reddit.py - reddit discovery (DISCOVERY ONLY, NOT OUTREACH)

reddit is a SIGNAL SOURCE, not a contact channel.
flow:
1. scrape reddit for users active in target subs
2. extract their reddit profile
3. look for links TO other platforms (github, mastodon, website, etc.)
4. add to scout database with reddit as signal source
5. reach out via their OTHER platforms, never reddit

if reddit user has no external links:
   - add to manual_queue with note "reddit-only, needs manual review"

also detects lost builders - stuck in learnprogramming for years, imposter syndrome, etc.
"""

import requests
import json
import time
import re
from datetime import datetime
from pathlib import Path
from collections import defaultdict

from .signals import analyze_text, ALIGNED_SUBREDDITS, NEGATIVE_SUBREDDITS
from .lost import (
    analyze_reddit_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
    STUCK_SUBREDDITS,
)

HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'}
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit'

# patterns for extracting external platform links
PLATFORM_PATTERNS = {
    'github': [
        r'github\.com/([a-zA-Z0-9_-]+)',
        r'gh:\s*@?([a-zA-Z0-9_-]+)',
    ],
    'mastodon': [
        r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
        r'mastodon\.social/@([a-zA-Z0-9_]+)',
        r'fosstodon\.org/@([a-zA-Z0-9_]+)',
        r'hachyderm\.io/@([a-zA-Z0-9_]+)',
        r'tech\.lgbt/@([a-zA-Z0-9_]+)',
    ],
    'twitter': [
        r'twitter\.com/([a-zA-Z0-9_]+)',
        r'x\.com/([a-zA-Z0-9_]+)',
        r'(?:^|\s)@([a-zA-Z0-9_]{1,15})(?:\s|$)',  # bare @handle
    ],
    'bluesky': [
        r'bsky\.app/profile/([a-zA-Z0-9_.-]+)',
        r'([a-zA-Z0-9_-]+)\.bsky\.social',
    ],
    'website': [
        r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)',
    ],
    'matrix': [
        r'@([a-zA-Z0-9_-]+):([a-zA-Z0-9.-]+)',
    ],
}


def _api_get(url, params=None):
    """rate-limited request"""
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except:
            pass

    time.sleep(2)  # reddit rate limit

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f"  reddit api error: {e}")
        return None


def extract_external_links(text):
    """extract links to other platforms from text"""
    links = {}

    if not text:
        return links

    for platform, patterns in PLATFORM_PATTERNS.items():
        for pattern in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                if platform == 'mastodon' and isinstance(matches[0], tuple):
                    # full fediverse handle
                    links[platform] = f"@{matches[0][0]}@{matches[0][1]}"
                elif platform == 'matrix' and isinstance(matches[0], tuple):
                    links[platform] = f"@{matches[0][0]}:{matches[0][1]}"
                elif platform == 'website':
                    # skip reddit/imgur/etc
                    for match in matches:
                        if not any(x in match.lower() for x in ['reddit', 'imgur', 'redd.it', 'i.redd']):
                            links[platform] = f"https://{match}"
                            break
                else:
                    links[platform] = matches[0]
                break

    return links


def get_user_profile(username):
    """get user profile including bio/description"""
    url = f'https://www.reddit.com/user/{username}/about.json'
    data = _api_get(url)

    if not data or 'data' not in data:
        return None

    profile = data['data']
    return {
        'username': username,
        'name': profile.get('name'),
        'bio': profile.get('subreddit', {}).get('public_description', ''),
        'title': profile.get('subreddit', {}).get('title', ''),
        'icon': profile.get('icon_img'),
        'created_utc': profile.get('created_utc'),
        'total_karma': profile.get('total_karma', 0),
        'link_karma': profile.get('link_karma', 0),
        'comment_karma': profile.get('comment_karma', 0),
    }


def get_subreddit_users(subreddit, limit=100):
    """get recent posters/commenters from a subreddit"""
    users = set()

    # posts
    url = f'https://www.reddit.com/r/{subreddit}/new.json'
    data = _api_get(url, {'limit': limit})
    if data and 'data' in data:
        for post in data['data'].get('children', []):
            author = post['data'].get('author')
            if author and author not in ['[deleted]', 'AutoModerator']:
                users.add(author)

    # comments
    url = f'https://www.reddit.com/r/{subreddit}/comments.json'
    data = _api_get(url, {'limit': limit})
    if data and 'data' in data:
        for comment in data['data'].get('children', []):
            author = comment['data'].get('author')
            if author and author not in ['[deleted]', 'AutoModerator']:
                users.add(author)

    return users


def get_user_activity(username):
    """get user's posts and comments"""
    activity = []

    # posts
    url = f'https://www.reddit.com/user/{username}/submitted.json'
    data = _api_get(url, {'limit': 100})
    if data and 'data' in data:
        for post in data['data'].get('children', []):
            activity.append({
                'type': 'post',
                'subreddit': post['data'].get('subreddit'),
                'title': post['data'].get('title', ''),
                'body': post['data'].get('selftext', ''),
                'score': post['data'].get('score', 0),
            })

    # comments
    url = f'https://www.reddit.com/user/{username}/comments.json'
    data = _api_get(url, {'limit': 100})
    if data and 'data' in data:
        for comment in data['data'].get('children', []):
            activity.append({
                'type': 'comment',
                'subreddit': comment['data'].get('subreddit'),
                'body': comment['data'].get('body', ''),
                'score': comment['data'].get('score', 0),
            })

    return activity


def analyze_reddit_user(username):
    """
    analyze a reddit user for alignment and extract external platform links.

    reddit is DISCOVERY ONLY - we find users here but contact them elsewhere.
    """
    activity = get_user_activity(username)
    if not activity:
        return None

    # get profile for bio
    profile = get_user_profile(username)

    # count subreddit activity
    sub_activity = defaultdict(int)
    text_parts = []
    total_karma = 0

    for item in activity:
        sub = item.get('subreddit', '').lower()
        if sub:
            sub_activity[sub] += 1
        if item.get('title'):
            text_parts.append(item['title'])
        if item.get('body'):
            text_parts.append(item['body'])
        total_karma += item.get('score', 0)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # EXTRACT EXTERNAL LINKS - this is the key part
    # check profile bio first
    external_links = {}
    if profile:
        bio_text = f"{profile.get('bio', '')} {profile.get('title', '')}"
        external_links.update(extract_external_links(bio_text))

    # also scan posts/comments for links (people often share their github etc)
    activity_links = extract_external_links(full_text)
    for platform, link in activity_links.items():
        if platform not in external_links:
            external_links[platform] = link

    # subreddit scoring
    sub_score = 0
    aligned_subs = []
    for sub, count in sub_activity.items():
        weight = ALIGNED_SUBREDDITS.get(sub, 0)
        if weight > 0:
            sub_score += weight * min(count, 5)
            aligned_subs.append(sub)

    # multi-sub bonus
    if len(aligned_subs) >= 5:
        sub_score += 30
    elif len(aligned_subs) >= 3:
        sub_score += 15

    # negative sub penalty
    for sub in sub_activity:
        if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]:
            sub_score -= 50
            negative_signals.append(f"r/{sub}")

    total_score = text_score + sub_score

    # bonus if they have external links (we can actually contact them)
    if external_links.get('github'):
        total_score += 10
        positive_signals.append('has github')
    if external_links.get('mastodon'):
        total_score += 10
        positive_signals.append('has mastodon')
    if external_links.get('website'):
        total_score += 5
        positive_signals.append('has website')

    # === LOST BUILDER DETECTION ===
    # reddit is HIGH SIGNAL for lost builders - stuck in learnprogramming,
    # imposter syndrome posts, "i wish i could" language, etc.
    subreddits_list = list(sub_activity.keys())
    lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list)

    # also check full text for lost patterns (already done partially in analyze_reddit_for_lost_signals)
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
            lost_weight += text_lost_weight

    lost_potential_score = lost_weight

    # classify: builder, lost, both, or none
    # for reddit, builder_score is based on having external links + high karma
    builder_activity = 0
    if external_links.get('github'):
        builder_activity += 20
    if total_karma > 1000:
        builder_activity += 15
    elif total_karma > 500:
        builder_activity += 10

    user_type = classify_user(lost_potential_score, builder_activity, total_score)

    # confidence
    confidence = 0.3
    if len(activity) > 20:
        confidence += 0.2
    if len(aligned_subs) >= 2:
        confidence += 0.2
    if len(text_parts) > 10:
        confidence += 0.2
    # higher confidence if we have contact methods
    if external_links:
        confidence += 0.1
    confidence = min(confidence, 0.95)

    reasons = []
    if aligned_subs:
        reasons.append(f"active in: {', '.join(aligned_subs[:5])}")
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")
    if external_links:
        reasons.append(f"external: {', '.join(external_links.keys())}")

    # add lost reasons if applicable
    if user_type == 'lost' or user_type == 'both':
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")

    # determine if this is reddit-only (needs manual review)
    reddit_only = len(external_links) == 0
    if reddit_only:
        reasons.append("REDDIT-ONLY: needs manual review for outreach")

    return {
        'platform': 'reddit',
        'username': username,
        'url': f"https://reddit.com/u/{username}",
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'subreddits': aligned_subs,
        'activity_count': len(activity),
        'karma': total_karma,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
        # external platform links for outreach
        'external_links': external_links,
        'reddit_only': reddit_only,
        'extra': {
            'github': external_links.get('github'),
            'mastodon': external_links.get('mastodon'),
            'twitter': external_links.get('twitter'),
            'bluesky': external_links.get('bluesky'),
            'website': external_links.get('website'),
            'matrix': external_links.get('matrix'),
            'reddit_karma': total_karma,
            'reddit_activity': len(activity),
        },
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }


def scrape_reddit(db, limit_per_sub=50):
    """
    full reddit scrape - DISCOVERY ONLY

    finds aligned users, extracts external links for outreach.
    reddit-only users go to manual queue.
    """
    print("scoutd/reddit: starting scrape (discovery only, not outreach)...")

    # find users in multiple aligned subs
    user_subs = defaultdict(set)

    # aligned subs - active builders
    priority_subs = ['intentionalcommunity', 'cohousing', 'selfhosted',
                     'homeassistant', 'solarpunk', 'cooperatives', 'privacy',
                     'localllama', 'homelab', 'degoogle', 'pihole', 'unraid']

    # lost builder subs - people who need encouragement
    # these folks might be stuck, but they have aligned interests
    lost_subs = ['learnprogramming', 'findapath', 'getdisciplined',
                 'careerguidance', 'cscareerquestions', 'decidingtobebetter']

    # scrape both - we want to find lost builders with aligned interests
    all_subs = priority_subs + lost_subs

    for sub in all_subs:
        print(f"  scraping r/{sub}...")
        users = get_subreddit_users(sub, limit=limit_per_sub)
        for user in users:
            user_subs[user].add(sub)
        print(f"    found {len(users)} users")

    # filter for multi-sub users
    multi_sub = {u: subs for u, subs in user_subs.items() if len(subs) >= 2}
    print(f"  {len(multi_sub)} users in 2+ aligned subs")

    # analyze
    results = []
    reddit_only_count = 0
    external_link_count = 0
    builders_found = 0
    lost_found = 0

    for username in multi_sub:
        try:
            result = analyze_reddit_user(username)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                user_type = result.get('user_type', 'none')

                # track lost builders - reddit is high signal for these
                if user_type == 'lost':
                    lost_found += 1
                    lost_score = result.get('lost_potential_score', 0)
                    if lost_score >= 40:
                        print(f"    💔 u/{username}: lost_score={lost_score}, values={result['score']} pts")
                        # lost builders also go to manual queue if reddit-only
                        if result.get('reddit_only'):
                            _add_to_manual_queue(result)

                elif user_type == 'builder':
                    builders_found += 1

                elif user_type == 'both':
                    builders_found += 1
                    lost_found += 1
                    print(f"    ⚡ u/{username}: recovering builder")

                # track external links
                if result.get('reddit_only'):
                    reddit_only_count += 1
                    # add high-value users to manual queue for review
                    if result['score'] >= 50 and user_type != 'lost':  # lost already added above
                        _add_to_manual_queue(result)
                        print(f"    📋 u/{username}: {result['score']} pts (reddit-only → manual queue)")
                else:
                    external_link_count += 1
                    if result['score'] >= 50 and user_type == 'builder':
                        links = list(result.get('external_links', {}).keys())
                        print(f"    ★ u/{username}: {result['score']} pts → {', '.join(links)}")

        except Exception as e:
            print(f"    error on {username}: {e}")

    print(f"scoutd/reddit: found {len(results)} aligned humans")
    print(f"  - {builders_found} active builders")
    print(f"  - {lost_found} lost builders (need encouragement)")
    print(f"  - {external_link_count} with external links (reachable)")
    print(f"  - {reddit_only_count} reddit-only (manual queue)")
    return results


def _add_to_manual_queue(result):
    """add reddit-only user to manual queue for review"""
    from pathlib import Path
    import json

    queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json'
    queue_file.parent.mkdir(parents=True, exist_ok=True)

    queue = []
    if queue_file.exists():
        try:
            queue = json.loads(queue_file.read_text())
        except:
            pass

    # check if already in queue
    existing = [q for q in queue if q.get('username') == result['username'] and q.get('platform') == 'reddit']
    if existing:
        return

    queue.append({
        'platform': 'reddit',
        'username': result['username'],
        'url': result['url'],
        'score': result['score'],
        'subreddits': result.get('subreddits', []),
        'signals': result.get('signals', []),
        'reasons': result.get('reasons', []),
        'note': 'reddit-only user - no external links found. DM manually if promising.',
        'queued_at': datetime.now().isoformat(),
        'status': 'pending',
    })

    queue_file.write_text(json.dumps(queue, indent=2))