# connectd/scoutd/mastodon.py

"""
scoutd/mastodon.py - fediverse discovery
scrapes high-signal instances: tech.lgbt, social.coop, fosstodon, hackers.town
also detects lost builders - social isolation, imposter syndrome, struggling folks
"""

import hashlib
import html
import json
import re
import time
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text, ALIGNED_INSTANCES
from .lost import (
    analyze_social_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
)

# headers passed to requests.get by _api_get
HEADERS = {
    'User-Agent': 'connectd/1.0',
    'Accept': 'application/json',
}

# directory holding cached api responses: <two levels above this file>/db/cache/mastodon
CACHE_DIR = Path(__file__).parent.parent.joinpath('db', 'cache', 'mastodon')

# hashtags whose timelines can be scanned for accounts
TARGET_HASHTAGS = [
    'selfhosted',
    'homelab',
    'homeassistant',
    'foss',
    'opensource',
    'privacy',
    'solarpunk',
    'cooperative',
    'cohousing',
    'mutualaid',
    'intentionalcommunity',
    'degoogle',
    'fediverse',
    'indieweb',
]


def _api_get(url, params=None):
    """GET a JSON endpoint, rate-limited and backed by a one-hour disk cache.

    Args:
        url: full endpoint URL.
        params: optional dict of query parameters; also part of the cache key.

    Returns:
        The decoded JSON payload (from cache when a fresh entry exists), or
        None when the request fails or the response body is not JSON.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # builtin hash() of a str is salted per interpreter process, so it cannot
    # name a file that a later run must find again; use a stable digest.
    digest = hashlib.sha256(cache_key.encode('utf-8')).hexdigest()
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    try:
        cached = json.loads(cache_file.read_text())
        if time.time() - cached.get('_cached_at', 0) < 3600:
            return cached.get('_data')
    except (OSError, ValueError, AttributeError, TypeError):
        # missing, unreadable or malformed cache entry: fall back to the network
        pass

    # crude rate limit: wait one second before every real request
    time.sleep(1)

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # decode error is not a RequestException subclass
        print(f"  mastodon api error: {e}")
        return None

    try:
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
    except OSError as e:
        # caching is best-effort; a failed write must not discard the response
        print(f"  mastodon cache write failed: {e}")
    return result


def strip_html(text):
    """Turn an HTML fragment into plain text.

    Every tag is replaced by a single space, then character references
    (``&amp;``, ``&#39;`` ...) are decoded so that downstream text matching
    sees the characters the author actually typed.

    Args:
        text: HTML string; None or an empty string is accepted.

    Returns:
        The plain-text string, or '' for falsy input.
    """
    if not text:
        return ''
    # strip tags first so that an escaped "&lt;b&gt;" survives as literal text
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    return html.unescape(without_tags)


def get_instance_directory(instance, limit=40):
    """fetch accounts from an instance's profile directory (local accounts only)

    returns an empty list when the api call yields nothing.
    """
    query = {'limit': limit, 'local': 'true'}
    accounts = _api_get(f'https://{instance}/api/v1/directory', query)
    if not accounts:
        return []
    return accounts


def get_hashtag_timeline(instance, hashtag, limit=40):
    """fetch posts from an instance's timeline for one hashtag

    returns an empty list when the api call yields nothing.
    """
    endpoint = f'https://{instance}/api/v1/timelines/tag/{hashtag}'
    posts = _api_get(endpoint, {'limit': limit})
    if not posts:
        return []
    return posts


def get_user_statuses(instance, user_id, limit=30):
    """fetch an account's recent posts, asking the api to exclude reblogs

    returns an empty list when the api call yields nothing.
    """
    endpoint = f'https://{instance}/api/v1/accounts/{user_id}/statuses'
    query = {'limit': limit, 'exclude_reblogs': 'true'}
    statuses = _api_get(endpoint, query)
    if not statuses:
        return []
    return statuses


def analyze_mastodon_user(account, instance):
    """Score one Mastodon account for alignment and lost-builder signals.

    Args:
        account: account dict from the Mastodon API. Reads ``acct``, ``note``,
            ``display_name``, ``fields``, ``id``, ``statuses_count``,
            ``followers_count`` and ``url``.
        instance: hostname the account dict was fetched from. The statuses
            request is sent there, using the ``id`` from that same response.

    Returns:
        A dict describing the account: identity fields, ``score``,
        ``confidence``, positive/negative ``signals``, human-readable
        ``reasons``, plus ``lost_potential_score``, ``lost_signals`` and
        ``user_type``. ``instance`` in the result is the account's home
        domain (taken from ``acct``), which differs from the *instance*
        argument for remote accounts.
    """
    acct = account.get('acct', '')
    if '@' not in acct:
        acct = f"{acct}@{instance}"
    # A remote account carries its own domain in ``acct``. The bonus and the
    # "on <instance>" reason must describe where the user lives, not the
    # instance through which the account happened to be discovered.
    home_instance = acct.rsplit('@', 1)[-1] or instance

    # collect text
    text_parts = []
    bio = strip_html(account.get('note', ''))
    if bio:
        text_parts.append(bio)

    display_name = account.get('display_name', '')
    if display_name:
        text_parts.append(display_name)

    # profile fields (tolerate an explicit null)
    for field in account.get('fields') or []:
        if field.get('name'):
            text_parts.append(field['name'])
        if field.get('value'):
            text_parts.append(strip_html(field['value']))

    # recent posts: fetched once and reused for both analyses below
    user_id = account.get('id')
    statuses = get_user_statuses(instance, user_id) if user_id else []
    post_texts = [strip_html(status.get('content', '')) for status in statuses]
    text_parts.extend(text for text in post_texts if text)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # instance bonus
    total_score = text_score + ALIGNED_INSTANCES.get(home_instance, 0)

    # pronouns bonus
    if re.search(r'\b(they/them|she/her|he/him|xe/xem)\b', full_text, re.I):
        total_score += 10
        positive_signals.append('pronouns')

    # activity level (``or 0`` guards against null counters)
    statuses_count = account.get('statuses_count') or 0
    followers = account.get('followers_count') or 0
    if statuses_count > 100:
        total_score += 5

    # === LOST BUILDER DETECTION ===
    profile_for_lost = {
        'bio': bio,
        'note': account.get('note'),
    }
    posts_for_lost = [
        {'content': text, 'reblog': status.get('reblog')}
        for status, text in zip(statuses, post_texts)
    ]

    lost_signals, lost_weight = analyze_social_for_lost_signals(profile_for_lost, posts_for_lost)

    # also check combined text for lost patterns
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
            # NOTE(review): this adds the whole text weight once per newly
            # seen signal. If text_lost_weight is an aggregate over all text
            # signals this over-counts - confirm against scoutd/lost.py.
            lost_weight += text_lost_weight

    lost_potential_score = lost_weight

    # classify: builder, lost, both, or none
    # for mastodon, statuses_count is the proxy for builder activity
    if statuses_count > 100:
        builder_activity = 10
    elif statuses_count > 50:
        builder_activity = 5
    else:
        builder_activity = 0
    user_type = classify_user(lost_potential_score, builder_activity, total_score)

    # confidence: base 0.3, +0.2 per supporting condition, capped at 0.9
    confidence = 0.3
    if len(text_parts) > 5:
        confidence += 0.2
    if statuses_count > 50:
        confidence += 0.2
    if len(positive_signals) > 3:
        confidence += 0.2
    confidence = min(confidence, 0.9)

    reasons = []
    if home_instance in ALIGNED_INSTANCES:
        reasons.append(f"on {home_instance}")
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    # add lost reasons if applicable
    if user_type in ('lost', 'both'):
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")

    return {
        'platform': 'mastodon',
        'username': acct,
        'url': account.get('url'),
        'name': display_name,
        'bio': bio,
        'instance': home_instance,
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'statuses_count': statuses_count,
        'followers': followers,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }


def _qualified_acct(account, instance):
    """Return the account handle as ``user@domain``, qualifying local handles with *instance*."""
    handle = account.get('acct') or str(account.get('id', ''))
    return handle if '@' in handle else f"{handle}@{instance}"


def scrape_mastodon(db, limit_per_instance=40):
    """Run a full Mastodon scrape and persist every positively scored account.

    Candidates come from the profile directory of each instance in
    ALIGNED_INSTANCES and from the first eight TARGET_HASHTAGS timelines on
    three hard-coded instances. Each unique account is analyzed; those with a
    score above zero are saved via ``db.save_human`` and returned.

    Args:
        db: object exposing ``save_human(result_dict)``.
        limit_per_instance: how many directory entries to request per instance.

    Returns:
        List of result dicts from analyze_mastodon_user with ``score`` > 0.
    """
    print("scoutd/mastodon: starting scrape...")

    # Candidates are keyed by fully qualified handle so that same-named local
    # users on different instances stay distinct; first sighting wins.
    candidates = {}

    # 1. instance directories
    print("  scraping instance directories...")
    for instance in ALIGNED_INSTANCES:
        accounts = get_instance_directory(instance, limit=limit_per_instance)
        for account in accounts:
            key = _qualified_acct(account, instance)
            if key not in candidates:
                account['_instance'] = instance
                candidates[key] = account
        print(f"    {instance}: {len(accounts)} users")

    # 2. hashtag timelines
    print("  scraping hashtags...")
    hashtag_instances = ['fosstodon.org', 'tech.lgbt', 'social.coop']
    for tag in TARGET_HASHTAGS[:8]:
        for instance in hashtag_instances:
            for post in get_hashtag_timeline(instance, tag, limit=20):
                account = post.get('account') or {}
                if not account:
                    # a post without author data gives nothing to analyze
                    continue
                key = _qualified_acct(account, instance)
                if key not in candidates:
                    account['_instance'] = instance
                    candidates[key] = account

    print(f"  {len(candidates)} unique accounts to analyze")

    # analyze
    results = []
    builders_found = 0
    lost_found = 0

    for acct_data in candidates.values():
        instance = acct_data.get('_instance', 'mastodon.social')
        try:
            result = analyze_mastodon_user(acct_data, instance)
            if not result or result['score'] <= 0:
                continue

            results.append(result)
            db.save_human(result)

            user_type = result.get('user_type', 'none')

            if user_type == 'builder':
                builders_found += 1
                if result['score'] >= 40:
                    print(f"    ★ @{result['username']}: {result['score']} pts")

            elif user_type == 'lost':
                lost_found += 1
                lost_score = result.get('lost_potential_score', 0)
                if lost_score >= 40:
                    print(f"    💔 @{result['username']}: lost_score={lost_score}, values={result['score']} pts")

            elif user_type == 'both':
                builders_found += 1
                lost_found += 1
                print(f"    ⚡ @{result['username']}: recovering builder")

        except Exception as e:
            # one bad account must not abort the whole scrape
            print(f"    error: {e}")

    print(f"scoutd/mastodon: found {len(results)} aligned humans")
    print(f"  - {builders_found} active builders")
    print(f"  - {lost_found} lost builders (need encouragement)")
    return results
autonomous daemon with platform-native contact detection - determine_contact_method now recognizes mastodon/bluesky users by platform - username IS the handle for platform-native users - fixed orphaned matches table issue - wave 1 intros sent successfully 2025-12-16 09:22:58 +00:00			`"""`
			`scoutd/mastodon.py - fediverse discovery`
			`scrapes high-signal instances: tech.lgbt, social.coop, fosstodon, hackers.town`
			`also detects lost builders - social isolation, imposter syndrome, struggling folks`
			`"""`

			`import requests`
			`import json`
			`import time`
			`import re`
			`from datetime import datetime`
			`from pathlib import Path`

			`from .signals import analyze_text, ALIGNED_INSTANCES`
			`from .lost import (`
			`analyze_social_for_lost_signals,`
			`analyze_text_for_lost_signals,`
			`classify_user,`
			`get_signal_descriptions,`
			`)`

			`HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'}`
			`CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'mastodon'`

			`TARGET_HASHTAGS = [`
			`'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource',`
			`'privacy', 'solarpunk', 'cooperative', 'cohousing', 'mutualaid',`
			`'intentionalcommunity', 'degoogle', 'fediverse', 'indieweb',`
			`]`


			`def _api_get(url, params=None):`
			`"""rate-limited request"""`
			`cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"`
			`cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json"`
			`CACHE_DIR.mkdir(parents=True, exist_ok=True)`

			`if cache_file.exists():`
			`try:`
			`data = json.loads(cache_file.read_text())`
			`if time.time() - data.get('_cached_at', 0) < 3600:`
			`return data.get('_data')`
			`except:`
			`pass`

			`time.sleep(1)`

			`try:`
			`resp = requests.get(url, headers=HEADERS, params=params, timeout=30)`
			`resp.raise_for_status()`
			`result = resp.json()`
			`cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))`
			`return result`
			`except requests.exceptions.RequestException as e:`
			`print(f" mastodon api error: {e}")`
			`return None`


			`def strip_html(text):`
			`"""strip html tags"""`
			`return re.sub(r'<[^>]+>', ' ', text) if text else ''`


			`def get_instance_directory(instance, limit=40):`
			`"""get users from instance directory"""`
			`url = f'https://{instance}/api/v1/directory'`
			`return _api_get(url, {'limit': limit, 'local': 'true'}) or []`


			`def get_hashtag_timeline(instance, hashtag, limit=40):`
			`"""get posts from hashtag"""`
			`url = f'https://{instance}/api/v1/timelines/tag/{hashtag}'`
			`return _api_get(url, {'limit': limit}) or []`


			`def get_user_statuses(instance, user_id, limit=30):`
			`"""get user's recent posts"""`
			`url = f'https://{instance}/api/v1/accounts/{user_id}/statuses'`
			`return _api_get(url, {'limit': limit, 'exclude_reblogs': 'true'}) or []`


			`def analyze_mastodon_user(account, instance):`
			`"""analyze a mastodon account"""`
			`acct = account.get('acct', '')`
			`if '@' not in acct:`
			`acct = f"{acct}@{instance}"`

			`# collect text`
			`text_parts = []`
			`bio = strip_html(account.get('note', ''))`
			`if bio:`
			`text_parts.append(bio)`

			`display_name = account.get('display_name', '')`
			`if display_name:`
			`text_parts.append(display_name)`

			`# profile fields`
			`for field in account.get('fields', []):`
			`if field.get('name'):`
			`text_parts.append(field['name'])`
			`if field.get('value'):`
			`text_parts.append(strip_html(field['value']))`

			`# get recent posts`
			`user_id = account.get('id')`
			`if user_id:`
			`statuses = get_user_statuses(instance, user_id)`
			`for status in statuses:`
			`content = strip_html(status.get('content', ''))`
			`if content:`
			`text_parts.append(content)`

			`full_text = ' '.join(text_parts)`
			`text_score, positive_signals, negative_signals = analyze_text(full_text)`

			`# instance bonus`
			`instance_bonus = ALIGNED_INSTANCES.get(instance, 0)`
			`total_score = text_score + instance_bonus`

			`# pronouns bonus`
			`if re.search(r'\b(they/them\|she/her\|he/him\|xe/xem)\b', full_text, re.I):`
			`total_score += 10`
			`positive_signals.append('pronouns')`

			`# activity level`
			`statuses_count = account.get('statuses_count', 0)`
			`followers = account.get('followers_count', 0)`
			`if statuses_count > 100:`
			`total_score += 5`

			`# === LOST BUILDER DETECTION ===`
			`# build profile and posts for lost analysis`
			`profile_for_lost = {`
			`'bio': bio,`
			`'note': account.get('note'),`
			`}`

			`# convert statuses to posts format for analyze_social_for_lost_signals`
			`posts_for_lost = []`
			`if user_id:`
			`statuses = get_user_statuses(instance, user_id)`
			`for status in statuses:`
			`posts_for_lost.append({`
			`'content': strip_html(status.get('content', '')),`
			`'reblog': status.get('reblog'),`
			`})`

			`# analyze for lost signals`
			`lost_signals, lost_weight = analyze_social_for_lost_signals(profile_for_lost, posts_for_lost)`

			`# also check combined text for lost patterns`
			`text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)`
			`for sig in text_lost_signals:`
			`if sig not in lost_signals:`
			`lost_signals.append(sig)`
			`lost_weight += text_lost_weight`

			`lost_potential_score = lost_weight`

			`# classify: builder, lost, both, or none`
			`# for mastodon, we use statuses_count as a proxy for builder activity`
			`builder_activity = 10 if statuses_count > 100 else 5 if statuses_count > 50 else 0`
			`user_type = classify_user(lost_potential_score, builder_activity, total_score)`

			`# confidence`
			`confidence = 0.3`
			`if len(text_parts) > 5:`
			`confidence += 0.2`
			`if statuses_count > 50:`
			`confidence += 0.2`
			`if len(positive_signals) > 3:`
			`confidence += 0.2`
			`confidence = min(confidence, 0.9)`

			`reasons = []`
			`if instance in ALIGNED_INSTANCES:`
			`reasons.append(f"on {instance}")`
			`if positive_signals:`
			`reasons.append(f"signals: {', '.join(positive_signals[:5])}")`
			`if negative_signals:`
			`reasons.append(f"WARNING: {', '.join(negative_signals)}")`

			`# add lost reasons if applicable`
			`if user_type == 'lost' or user_type == 'both':`
			`lost_descriptions = get_signal_descriptions(lost_signals)`
			`if lost_descriptions:`
			`reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")`

			`return {`
			`'platform': 'mastodon',`
			`'username': acct,`
			`'url': account.get('url'),`
			`'name': display_name,`
			`'bio': bio,`
			`'instance': instance,`
			`'score': total_score,`
			`'confidence': confidence,`
			`'signals': positive_signals,`
			`'negative_signals': negative_signals,`
			`'statuses_count': statuses_count,`
			`'followers': followers,`
			`'reasons': reasons,`
			`'scraped_at': datetime.now().isoformat(),`
			`# lost builder fields`
			`'lost_potential_score': lost_potential_score,`
			`'lost_signals': lost_signals,`
			`'user_type': user_type,`
			`}`


			`def scrape_mastodon(db, limit_per_instance=40):`
			`"""full mastodon scrape"""`
			`print("scoutd/mastodon: starting scrape...")`

			`all_accounts = []`

			`# 1. instance directories`
			`print(" scraping instance directories...")`
			`for instance in ALIGNED_INSTANCES:`
			`accounts = get_instance_directory(instance, limit=limit_per_instance)`
			`for acct in accounts:`
			`acct['_instance'] = instance`
			`all_accounts.append(acct)`
			`print(f" {instance}: {len(accounts)} users")`

			`# 2. hashtag timelines`
			`print(" scraping hashtags...")`
			`seen = set()`
			`for tag in TARGET_HASHTAGS[:8]:`
			`for instance in ['fosstodon.org', 'tech.lgbt', 'social.coop']:`
			`posts = get_hashtag_timeline(instance, tag, limit=20)`
			`for post in posts:`
			`account = post.get('account', {})`
			`acct = account.get('acct', '')`
			`if '@' not in acct:`
			`acct = f"{acct}@{instance}"`

			`if acct not in seen:`
			`seen.add(acct)`
			`account['_instance'] = instance`
			`all_accounts.append(account)`

			`# dedupe`
			`unique = {}`
			`for acct in all_accounts:`
			`key = acct.get('acct', acct.get('id', ''))`
			`if key not in unique:`
			`unique[key] = acct`

			`print(f" {len(unique)} unique accounts to analyze")`

			`# analyze`
			`results = []`
			`builders_found = 0`
			`lost_found = 0`

			`for acct_data in unique.values():`
			`instance = acct_data.get('_instance', 'mastodon.social')`
			`try:`
			`result = analyze_mastodon_user(acct_data, instance)`
			`if result and result['score'] > 0:`
			`results.append(result)`
			`db.save_human(result)`

			`user_type = result.get('user_type', 'none')`

			`if user_type == 'builder':`
			`builders_found += 1`
			`if result['score'] >= 40:`
			`print(f" ★ @{result['username']}: {result['score']} pts")`

			`elif user_type == 'lost':`
			`lost_found += 1`
			`lost_score = result.get('lost_potential_score', 0)`
			`if lost_score >= 40:`
			`print(f" 💔 @{result['username']}: lost_score={lost_score}, values={result['score']} pts")`

			`elif user_type == 'both':`
			`builders_found += 1`
			`lost_found += 1`
			`print(f" ⚡ @{result['username']}: recovering builder")`

			`except Exception as e:`
			`print(f" error: {e}")`

			`print(f"scoutd/mastodon: found {len(results)} aligned humans")`
			`print(f" - {builders_found} active builders")`
			`print(f" - {lost_found} lost builders (need encouragement)")`
			`return results`