# connectd/scoutd/reddit.py

"""
scoutd/reddit.py - reddit discovery with TAVILY web search
CRITICAL: always quote usernames in tavily searches to avoid fuzzy matching
"""
import hashlib
import json
import os
import re
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text, ALIGNED_SUBREDDITS, NEGATIVE_SUBREDDITS
from .lost import (
    analyze_reddit_for_lost_signals,
    analyze_text_for_lost_signals,
    classify_user,
    get_signal_descriptions,
    STUCK_SUBREDDITS,
)
# Descriptive UA per reddit API etiquette; sent with every reddit request.
HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'}
# On-disk JSON cache for reddit API responses (see _api_get).
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit'
# Optional: raises the GitHub API rate limit when set.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
# SECURITY: a live Tavily key was previously hard-coded here as the default and
# committed to source control. The key must come from the environment only;
# rotate the leaked key. tavily_search() degrades gracefully when this is unset.
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
def _api_get(url, params=None, headers=None):
    """GET *url* as JSON with a one-hour on-disk cache.

    Returns the decoded JSON payload, or None on any network/HTTP/decode
    failure. Sleeps 1s before each real request as crude rate limiting.

    Fix: the cache filename was derived from the builtin ``hash()`` of the
    key string, which is randomized per process (PYTHONHASHSEED), so cached
    entries were never found again after a restart. Use a stable hashlib
    digest instead.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # Stable across processes, unlike hash(str).
    digest = hashlib.sha256(cache_key.encode('utf-8')).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            # unreadable or corrupt cache entry: fall through to a fresh fetch
            pass
    time.sleep(1)  # basic politeness delay for the reddit API
    req_headers = {**HEADERS, **(headers or {})}
    try:
        resp = requests.get(url, headers=req_headers, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except (requests.RequestException, ValueError, OSError):
        # network error, non-2xx, bad JSON, or cache-write failure:
        # callers treat None as "no data"
        return None
def tavily_search(query, max_results=10):
    """Run one Tavily web search and return its result list.

    Returns [] when no API key is configured, on a non-200 response,
    or on any request/decode error (which is printed, not raised).
    """
    if not TAVILY_API_KEY:
        return []
    payload = {'api_key': TAVILY_API_KEY, 'query': query, 'max_results': max_results}
    try:
        response = requests.post('https://api.tavily.com/search', json=payload, timeout=30)
        if response.status_code == 200:
            return response.json().get('results', [])
    except Exception as exc:
        print(f" tavily error: {exc}")
    return []
def extract_links_from_text(text, username=None):
    """Scan free text for contact/profile links.

    Returns a dict with any of the keys: 'email', 'github', 'mastodon',
    'bluesky', 'twitter', 'linkedin', 'twitch', 'itch', 'website'.
    When *username* is given, matches containing it are preferred (and for
    github, required) to cut down on false positives.
    """
    links = {}
    if not text:
        return links
    uname = username.lower() if username else None

    # email: prefer one containing the username, otherwise keep the first
    # plausible hit; role/placeholder addresses are skipped entirely
    for candidate in re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text):
        lowered = candidate.lower()
        if any(tok in lowered for tok in ['noreply', 'example', '@reddit', 'info@', 'support@', 'contact@', 'admin@']):
            continue
        if uname and uname in lowered:
            links['email'] = candidate
            break
        links.setdefault('email', candidate)

    # github: only trust a handle that exactly matches the username
    for handle in re.findall(r'github\.com/([a-zA-Z0-9_-]+)', text):
        if handle.lower() in ('topics', 'explore', 'trending', 'sponsors', 'orgs'):
            continue
        if uname and handle.lower() == uname:
            links['github'] = handle
            break

    # mastodon: generic @user@domain first, then well-known instance URLs
    # (an instance-URL hit overrides the generic match)
    fedi = re.search(r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
    if fedi:
        links['mastodon'] = f"@{fedi.group(1)}@{fedi.group(2)}"
    for instance in ('mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt'):
        profile = re.search(f'{instance}/@([a-zA-Z0-9_]+)', text)
        if profile:
            links['mastodon'] = f"@{profile.group(1)}@{instance}"
            break

    # bluesky profile URL
    sky = re.search(r'bsky\.app/profile/([a-zA-Z0-9_.-]+)', text)
    if sky:
        links['bluesky'] = sky.group(1)

    # twitter / x, ignoring site-navigation paths
    tweet = re.search(r'(?:twitter|x)\.com/([a-zA-Z0-9_]+)', text)
    if tweet and tweet.group(1).lower() not in ('home', 'explore', 'search'):
        links['twitter'] = tweet.group(1)

    # linkedin profile
    pro = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', text)
    if pro:
        links['linkedin'] = f"https://linkedin.com/in/{pro.group(1)}"

    # twitch channel
    stream = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)', text)
    if stream:
        links['twitch'] = f"https://twitch.tv/{stream.group(1)}"

    # itch.io profile
    games = re.search(r'itch\.io/profile/([a-zA-Z0-9_-]+)', text)
    if games:
        links['itch'] = f"https://itch.io/profile/{games.group(1)}"

    # personal website: first non-big-platform URL, preferring one that
    # contains the username
    for bare in re.findall(r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)', text):
        blocked = ('reddit', 'imgur', 'google', 'facebook', 'twitter', 'youtube', 'wikipedia', 'amazon')
        if any(tok in bare.lower() for tok in blocked):
            continue
        if uname and uname in bare.lower():
            links['website'] = f"https://{bare}"
            break
        links.setdefault('website', f"https://{bare}")
    return links
def cross_platform_discovery(username, full_text=''):
    """
    search the ENTIRE internet using TAVILY.
    CRITICAL: always quote username to avoid fuzzy matching!

    username: reddit handle to look for on other platforms.
    full_text: optional blob of the user's reddit text; link extraction runs
        over this plus everything the searches return.
    Returns a dict of discovered profiles/contacts keyed by platform
    ('github', 'twitch', 'itch', 'linkedin', 'email', 'mastodon',
    'bluesky', 'website', ...). Values discovered first win; later
    sources never overwrite an existing key.
    """
    found = {}
    all_content = full_text
    username_lower = username.lower()
    print(f" 🔍 cross-platform search for {username}...")
    # ALWAYS QUOTE THE USERNAME - critical for exact matching
    searches = [
        f'"{username}"', # just username, quoted
        f'"{username}" github', # github
        f'"{username}" developer programmer', # dev context
        f'"{username}" email contact', # contact
        f'"{username}" mastodon', # fediverse
    ]
    for query in searches:
        print(f" 🌐 tavily: {query}")
        results = tavily_search(query, max_results=5)
        for result in results:
            # url is lowercased here, so the substring checks below are
            # case-insensitive on the URL side
            url = result.get('url', '').lower()
            title = result.get('title', '')
            content = result.get('content', '')
            combined = f"{url} {title} {content}"
            # validate username appears somewhere in the hit; otherwise the
            # result is noise from fuzzy matching and is skipped entirely
            if username_lower not in combined.lower():
                continue
            # accumulate hit text so the regex pass below sees everything
            all_content += f" {combined}"
            # extract platform handles directly from the result URL
            if f'github.com/{username_lower}' in url and not found.get('github'):
                found['github'] = username
                print(f" ✓ github: {username}")
            if f'twitch.tv/{username_lower}' in url and not found.get('twitch'):
                found['twitch'] = f"https://twitch.tv/{username}"
                print(f" ✓ twitch")
            if 'itch.io/profile/' in url and username_lower in url and not found.get('itch'):
                found['itch'] = url if url.startswith('http') else f"https://{url}"
                print(f" ✓ itch.io")
            # linkedin slugs rarely match reddit usernames, so any profile
            # URL in a validated hit is accepted here
            if 'linkedin.com/in/' in url and not found.get('linkedin'):
                li = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', url)
                if li:
                    found['linkedin'] = f"https://linkedin.com/in/{li.group(1)}"
                    print(f" ✓ linkedin")
        # regex pass over all accumulated text (reddit text + every hit so far)
        extracted = extract_links_from_text(all_content, username)
        for k, v in extracted.items():
            if k not in found:
                found[k] = v
                print(f"{k}")
        # good contact found? stop issuing further tavily queries
        if found.get('email') or found.get('github') or found.get('mastodon') or found.get('twitch'):
            break
    # === API CHECKS === direct lookups for anything the searches missed
    if not found.get('github'):
        headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}
        try:
            resp = requests.get(f'https://api.github.com/users/{username}', headers=headers, timeout=10)
            if resp.status_code == 200:
                # a 200 means a github account with this exact name exists;
                # also harvest public email/blog from the profile if present
                data = resp.json()
                found['github'] = username
                print(f" ✓ github API")
                if data.get('email') and 'email' not in found:
                    found['email'] = data['email']
                if data.get('blog') and 'website' not in found:
                    found['website'] = data['blog'] if data['blog'].startswith('http') else f"https://{data['blog']}"
        except:
            pass
    if not found.get('mastodon'):
        # probe a handful of big instances for an account with this name
        for inst in ['mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt']:
            try:
                resp = requests.get(f'https://{inst}/api/v1/accounts/lookup', params={'acct': username}, timeout=5)
                if resp.status_code == 200:
                    found['mastodon'] = f"@{username}@{inst}"
                    print(f" ✓ mastodon: {found['mastodon']}")
                    break
            except:
                continue
    if not found.get('bluesky'):
        # only checks the default <username>.bsky.social handle, not custom domains
        try:
            resp = requests.get('https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile',
                                params={'actor': f'{username}.bsky.social'}, timeout=10)
            if resp.status_code == 200:
                found['bluesky'] = resp.json().get('handle')
                print(f" ✓ bluesky")
        except:
            pass
    return found
def get_user_profile(username):
    """Fetch a redditor's public about-page.

    Returns a small dict (username, bio, title, total_karma) or None when
    the request fails or the payload is malformed.
    """
    payload = _api_get(f'https://www.reddit.com/user/{username}/about.json')
    if not payload or 'data' not in payload:
        return None
    info = payload['data']
    sub = info.get('subreddit', {})
    return {
        'username': username,
        'bio': sub.get('public_description', ''),
        'title': sub.get('title', ''),
        'total_karma': info.get('total_karma', 0),
    }
def get_subreddit_users(subreddit, limit=100):
    """Collect distinct recent author names from a subreddit.

    Pulls both the 'new' posts feed and the 'comments' feed; deleted
    accounts and AutoModerator are excluded. Returns a set of usernames.
    """
    authors = set()
    excluded = ('[deleted]', 'AutoModerator')
    for feed in ('new', 'comments'):
        payload = _api_get(f'https://www.reddit.com/r/{subreddit}/{feed}.json', {'limit': limit})
        if not payload or 'data' not in payload:
            continue
        for child in payload['data'].get('children', []):
            name = child['data'].get('author')
            if name and name not in excluded:
                authors.add(name)
    return authors
def get_user_activity(username):
    """Return a user's recent posts and comments as a flat list of dicts.

    Each entry has: type ('post'/'comment'), subreddit, title, body, score.
    Feeds that fail to load are skipped, so the list may be partial or empty.
    """
    items = []
    for feed in ('submitted', 'comments'):
        payload = _api_get(f'https://www.reddit.com/user/{username}/{feed}.json', {'limit': 100})
        if not payload or 'data' not in payload:
            continue
        kind = 'post' if feed == 'submitted' else 'comment'
        for child in payload['data'].get('children', []):
            entry = child['data']
            items.append({
                'type': kind,
                'subreddit': entry.get('subreddit'),
                'title': entry.get('title', ''),
                # posts carry selftext, comments carry body; take whichever is set
                'body': entry.get('selftext', '') or entry.get('body', ''),
                'score': entry.get('score', 0),
            })
    return items
def analyze_reddit_user(username):
    """Build a scored discovery record for one reddit user.

    Returns None when the user has no visible activity; otherwise a dict
    combining subreddit-alignment scoring, text-signal analysis
    (scoutd.signals), cross-platform contact discovery (Tavily + APIs),
    and lost-builder classification (scoutd.lost).
    """
    activity = get_user_activity(username)
    if not activity:
        return None
    profile = get_user_profile(username)
    # per-subreddit activity counts, all text, and summed karma in one pass
    sub_activity = defaultdict(int)
    text_parts = []
    total_karma = 0
    for item in activity:
        sub = item.get('subreddit', '').lower()
        if sub:
            sub_activity[sub] += 1
        if item.get('title'):
            text_parts.append(item['title'])
        if item.get('body'):
            text_parts.append(item['body'])
        total_karma += item.get('score', 0)
    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)
    # contact links: profile bio/title first, then post/comment text
    external_links = {}
    if profile:
        external_links.update(extract_links_from_text(f"{profile.get('bio', '')} {profile.get('title', '')}", username))
    external_links.update(extract_links_from_text(full_text, username))
    # TAVILY search (web-wide; may be slow — one search per query plus API probes)
    discovered = cross_platform_discovery(username, full_text)
    external_links.update(discovered)
    # scoring: weighted aligned-subreddit activity, capped at 5 items per sub
    sub_score = 0
    aligned_subs = []
    for sub, count in sub_activity.items():
        weight = ALIGNED_SUBREDDITS.get(sub, 0)
        if weight > 0:
            sub_score += weight * min(count, 5)
            aligned_subs.append(sub)
    # breadth bonus for being active across several aligned subs
    if len(aligned_subs) >= 5:
        sub_score += 30
    elif len(aligned_subs) >= 3:
        sub_score += 15
    # heavy penalty per negative subreddit (sub is already lowercased above;
    # NEGATIVE_SUBREDDITS is lowercased defensively here)
    for sub in sub_activity:
        if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]:
            sub_score -= 50
            negative_signals.append(f"r/{sub}")
    total_score = text_score + sub_score
    # reachability bonuses: each discovered contact channel adds points
    if external_links.get('github'):
        total_score += 10
        positive_signals.append('github')
    if external_links.get('mastodon'):
        total_score += 10
        positive_signals.append('mastodon')
    if external_links.get('email'):
        total_score += 15
        positive_signals.append('email')
    if external_links.get('twitch'):
        total_score += 5
        positive_signals.append('twitch')
    # lost builder: merge activity-based and text-based lost signals
    subreddits_list = list(sub_activity.keys())
    lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list)
    text_lost_signals, _ = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
    # a github presence is treated as evidence of builder activity
    builder_activity = 20 if external_links.get('github') else 0
    user_type = classify_user(lost_weight, builder_activity, total_score)
    # heuristic confidence, capped at 0.95
    confidence = min(0.95, 0.3 + (0.2 if len(activity) > 20 else 0) + (0.2 if len(aligned_subs) >= 2 else 0) + (0.1 if external_links else 0))
    # reddit_only: no off-reddit contact channel was found at all
    reddit_only = not any([external_links.get(k) for k in ['github', 'mastodon', 'bluesky', 'email', 'matrix', 'linkedin', 'twitch', 'itch']])
    return {
        'platform': 'reddit',
        'username': username,
        'url': f"https://reddit.com/u/{username}",
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'subreddits': aligned_subs,
        'activity_count': len(activity),
        'karma': total_karma,
        'reasons': [f"contact: {', '.join(external_links.keys())}"] if external_links else [],
        'scraped_at': datetime.now().isoformat(),
        'external_links': external_links,
        'reddit_only': reddit_only,
        # 'extra' duplicates external_links — presumably a legacy consumer
        # reads this key; verify before removing
        'extra': external_links,
        'lost_potential_score': lost_weight,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }
def scrape_reddit(db, limit_per_sub=50):
    """Discover candidate humans across seed subreddits and persist them.

    Only users seen in at least two seed subreddits get the (expensive) full
    analysis; results with a positive score are saved via db.save_human and
    returned as a list. Per-user failures are printed and skipped.
    """
    print("scoutd/reddit: scraping (TAVILY enabled)...")
    seed_subs = ['intentionalcommunity', 'cohousing', 'selfhosted', 'homeassistant', 'solarpunk', 'cooperatives', 'privacy', 'localllama', 'homelab', 'learnprogramming']
    seen_in = defaultdict(set)
    for sub in seed_subs:
        for user in get_subreddit_users(sub, limit=limit_per_sub):
            seen_in[user].add(sub)
    candidates = {user: subs for user, subs in seen_in.items() if len(subs) >= 2}
    print(f" {len(candidates)} users in 2+ subs")
    humans = []
    for username in candidates:
        try:
            analysis = analyze_reddit_user(username)
            if analysis and analysis['score'] > 0:
                humans.append(analysis)
                db.save_human(analysis)
        except Exception as exc:
            print(f" error: {username}: {exc}")
    print(f"scoutd/reddit: {len(humans)} humans")
    return humans
def _add_to_manual_queue(result):
    """Append a discovered user to the manual-review queue file.

    Idempotent per username: an entry already queued is not re-added.
    Fix: a corrupt or unreadable manual_queue.json previously raised out of
    json.loads and crashed the caller; now the queue is rebuilt from scratch.
    """
    queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json'
    queue_file.parent.mkdir(parents=True, exist_ok=True)
    queue = []
    if queue_file.exists():
        try:
            queue = json.loads(queue_file.read_text())
        except (OSError, ValueError):
            # corrupt/unreadable queue: start fresh rather than abort the scrape
            queue = []
    if not any(q.get('username') == result['username'] for q in queue):
        queue.append({'platform': 'reddit', 'username': result['username'], 'url': result['url'], 'score': result['score'], 'queued_at': datetime.now().isoformat()})
        queue_file.write_text(json.dumps(queue, indent=2))