"""
scoutd/mastodon.py - fediverse discovery

scrapes high-signal instances: tech.lgbt, social.coop, fosstodon, hackers.town

also detects lost builders - social isolation, imposter syndrome,
struggling folks
"""

import hashlib
import json
import re
import time
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text, ALIGNED_INSTANCES
from .lost import (
    analyze_social_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
)

HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'}
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'mastodon'

# how long a cached api response stays valid
CACHE_TTL_SECONDS = 3600

TARGET_HASHTAGS = [
    'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource',
    'privacy', 'solarpunk', 'cooperative', 'cohousing', 'mutualaid',
    'intentionalcommunity', 'degoogle', 'fediverse', 'indieweb',
]

_HTML_TAG_RE = re.compile(r'<[^>]+>')
_PRONOUNS_RE = re.compile(r'\b(they/them|she/her|he/him|xe/xem)\b', re.I)


def _cache_path(url, params):
    """build the cache file path for a (url, params) request

    uses a sha256 digest rather than the builtin hash(): str hashes are
    salted per process, so hash()-based file names never match between runs
    and the on-disk cache would never be hit again.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    digest = hashlib.sha256(cache_key.encode('utf-8')).hexdigest()
    return CACHE_DIR / f"{digest[:16]}.json"


def _read_cache(cache_file):
    """return (hit, payload) for a cache file

    hit is False when the file is missing, unreadable, malformed or older
    than CACHE_TTL_SECONDS.
    """
    try:
        data = json.loads(cache_file.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # missing file or corrupt json - treat as a cache miss
        return False, None
    if not isinstance(data, dict):
        return False, None
    cached_at = data.get('_cached_at', 0)
    if not isinstance(cached_at, (int, float)):
        return False, None
    if time.time() - cached_at >= CACHE_TTL_SECONDS:
        return False, None
    return True, data.get('_data')


def _as_list(result):
    """coerce an api result to a list of dicts

    failed requests (None) and non-list payloads such as error objects
    become an empty list so callers can iterate safely.
    """
    if not isinstance(result, list):
        return []
    return [item for item in result if isinstance(item, dict)]


def _qualified_acct(account, instance):
    """return the account handle as user@instance

    local accounts come back from the api without a domain, so the instance
    the account was fetched from is appended. falls back to the account id
    when there is no acct value.
    """
    handle = account.get('acct') or str(account.get('id') or '')
    if '@' not in handle:
        handle = f"{handle}@{instance}"
    return handle


def _api_get(url, params=None):
    """rate-limited, disk-cached GET request

    args:
        url: full api url
        params: optional query parameters

    returns:
        the decoded json payload, or None when the request fails or the
        response body is not valid json.
    """
    cache_file = _cache_path(url, params)
    hit, payload = _read_cache(cache_file)
    if hit:
        return payload

    # crude rate limit: one second pause before every real network request
    time.sleep(1)

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-json body on requests versions where the
        # decode error is not a RequestException
        print(f" mastodon api error: {e}")
        return None

    # caching is best-effort: a failed write must not lose the response
    try:
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(
            json.dumps({'_cached_at': time.time(), '_data': result}),
            encoding='utf-8',
        )
    except OSError as e:
        print(f" mastodon cache write failed: {e}")

    return result


def strip_html(text):
    """strip html tags, replacing each with a space; falsy input gives ''"""
    return _HTML_TAG_RE.sub(' ', text) if text else ''


def get_instance_directory(instance, limit=40):
    """get local users from an instance's profile directory"""
    url = f'https://{instance}/api/v1/directory'
    return _as_list(_api_get(url, {'limit': limit, 'local': 'true'}))


def get_hashtag_timeline(instance, hashtag, limit=40):
    """get recent posts for a hashtag as seen by an instance"""
    url = f'https://{instance}/api/v1/timelines/tag/{hashtag}'
    return _as_list(_api_get(url, {'limit': limit}))


def get_user_statuses(instance, user_id, limit=30):
    """get a user's recent posts, excluding reblogs"""
    url = f'https://{instance}/api/v1/accounts/{user_id}/statuses'
    return _as_list(_api_get(url, {'limit': limit, 'exclude_reblogs': 'true'}))


def analyze_mastodon_user(account, instance):
    """analyze a mastodon account

    args:
        account: account dict as returned by the mastodon api
        instance: hostname of the instance the account was fetched from;
            also the instance queried for the account's statuses

    returns:
        dict with the values score, confidence, signals, human readable
        reasons and the lost-builder fields (lost_potential_score,
        lost_signals, user_type).
    """
    acct = _qualified_acct(account, instance)

    # collect text
    text_parts = []

    bio = strip_html(account.get('note', ''))
    if bio:
        text_parts.append(bio)

    display_name = account.get('display_name') or ''
    if display_name:
        text_parts.append(display_name)

    # profile fields ('fields' may be present but null)
    for field in account.get('fields') or []:
        if field.get('name'):
            text_parts.append(field['name'])
        if field.get('value'):
            text_parts.append(strip_html(field['value']))

    # get recent posts once; reused below for the lost-builder analysis
    user_id = account.get('id')
    statuses = get_user_statuses(instance, user_id) if user_id else []
    post_texts = [strip_html(status.get('content', '')) for status in statuses]
    text_parts.extend(text for text in post_texts if text)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # instance bonus
    # NOTE(review): `instance` is where the account was seen, which for
    # remote accounts in hashtag timelines is not their home instance
    instance_bonus = ALIGNED_INSTANCES.get(instance, 0)
    total_score = text_score + instance_bonus

    # pronouns bonus
    if _PRONOUNS_RE.search(full_text):
        total_score += 10
        positive_signals.append('pronouns')

    # activity level
    statuses_count = account.get('statuses_count') or 0
    followers = account.get('followers_count') or 0
    if statuses_count > 100:
        total_score += 5

    # === LOST BUILDER DETECTION ===
    # build profile and posts for lost analysis
    profile_for_lost = {
        'bio': bio,
        'note': account.get('note'),
    }

    # convert statuses to posts format for analyze_social_for_lost_signals
    posts_for_lost = [
        {'content': text, 'reblog': status.get('reblog')}
        for status, text in zip(statuses, post_texts)
    ]

    # analyze for lost signals
    lost_signals, lost_weight = analyze_social_for_lost_signals(
        profile_for_lost, posts_for_lost
    )

    # also check combined text for lost patterns
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
            # NOTE(review): the whole text weight is added once per newly
            # seen signal - confirm this is the intended weighting
            lost_weight += text_lost_weight

    lost_potential_score = lost_weight

    # classify: builder, lost, both, or none
    # for mastodon, we use statuses_count as a proxy for builder activity
    if statuses_count > 100:
        builder_activity = 10
    elif statuses_count > 50:
        builder_activity = 5
    else:
        builder_activity = 0
    user_type = classify_user(lost_potential_score, builder_activity, total_score)

    # confidence: starts at 0.3, +0.2 per supporting condition, capped at 0.9
    confidence = 0.3
    if len(text_parts) > 5:
        confidence += 0.2
    if statuses_count > 50:
        confidence += 0.2
    if len(positive_signals) > 3:
        confidence += 0.2
    confidence = min(confidence, 0.9)

    reasons = []
    if instance in ALIGNED_INSTANCES:
        reasons.append(f"on {instance}")
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    # add lost reasons if applicable
    if user_type in ('lost', 'both'):
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")

    return {
        'platform': 'mastodon',
        'username': acct,
        'url': account.get('url'),
        'name': display_name,
        'bio': bio,
        'instance': instance,
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'statuses_count': statuses_count,
        'followers': followers,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }


def scrape_mastodon(db, limit_per_instance=40):
    """full mastodon scrape

    collects accounts from the directories of every aligned instance and
    from a set of hashtag timelines, analyzes each unique account and saves
    those with a positive score through db.save_human().

    args:
        db: object exposing save_human(result)
        limit_per_instance: max directory entries requested per instance

    returns:
        list of result dicts for accounts with a score above zero.
    """
    print("scoutd/mastodon: starting scrape...")

    # accounts keyed by user@instance so the same local username on two
    # instances stays distinct and one account seen twice is analyzed once
    unique = {}

    def remember(account, instance):
        key = _qualified_acct(account, instance)
        if key not in unique:
            account['_instance'] = instance
            unique[key] = account

    # 1. instance directories
    print(" scraping instance directories...")
    for instance in ALIGNED_INSTANCES:
        accounts = get_instance_directory(instance, limit=limit_per_instance)
        for account in accounts:
            remember(account, instance)
        print(f" {instance}: {len(accounts)} users")

    # 2. hashtag timelines
    print(" scraping hashtags...")
    for tag in TARGET_HASHTAGS[:8]:
        for instance in ['fosstodon.org', 'tech.lgbt', 'social.coop']:
            for post in get_hashtag_timeline(instance, tag, limit=20):
                account = post.get('account')
                if isinstance(account, dict):
                    remember(account, instance)

    print(f" {len(unique)} unique accounts to analyze")

    # analyze
    results = []
    builders_found = 0
    lost_found = 0

    for acct_data in unique.values():
        instance = acct_data.get('_instance', 'mastodon.social')
        try:
            result = analyze_mastodon_user(acct_data, instance)
            if not result or result['score'] <= 0:
                continue

            results.append(result)
            db.save_human(result)

            user_type = result.get('user_type', 'none')
            if user_type == 'builder':
                builders_found += 1
                if result['score'] >= 40:
                    print(f" ★ @{result['username']}: {result['score']} pts")
            elif user_type == 'lost':
                lost_found += 1
                lost_score = result.get('lost_potential_score', 0)
                if lost_score >= 40:
                    print(f" 💔 @{result['username']}: lost_score={lost_score}, values={result['score']} pts")
            elif user_type == 'both':
                builders_found += 1
                lost_found += 1
                print(f" ⚡ @{result['username']}: recovering builder")
        except Exception as e:
            # one bad account must not abort the whole scrape
            print(f" error: {e}")

    print(f"scoutd/mastodon: found {len(results)} aligned humans")
    print(f" - {builders_found} active builders")
    print(f" - {lost_found} lost builders (need encouragement)")
    return results