"""
scoutd/reddit.py - reddit discovery with TAVILY web search

CRITICAL: always quote usernames in tavily searches to avoid fuzzy matching
"""
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import re
|
2025-12-16 21:30:05 +00:00
|
|
|
import os
|
2025-12-16 09:22:58 +00:00
|
|
|
from datetime import datetime
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
|
|
from .signals import analyze_text, ALIGNED_SUBREDDITS, NEGATIVE_SUBREDDITS
|
|
|
|
|
from .lost import (
|
|
|
|
|
analyze_reddit_for_lost_signals,
|
|
|
|
|
analyze_text_for_lost_signals,
|
|
|
|
|
classify_user,
|
|
|
|
|
get_signal_descriptions,
|
|
|
|
|
STUCK_SUBREDDITS,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# identify ourselves to reddit; anonymous user agents get throttled hard
HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'}
# on-disk JSON cache used by _api_get (db/cache/reddit next to the package)
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit'

# credentials come from the environment only.
# SECURITY: a live Tavily key used to be hard-coded here as a fallback;
# never commit secrets — rotate that key and supply it via TAVILY_API_KEY.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _api_get(url, params=None, headers=None):
    """GET *url* as JSON with a 1-hour on-disk cache and a 1s politeness delay.

    Returns the decoded JSON payload, or None on any request/decode failure
    (callers treat None as "no data" — this is deliberate best-effort).
    """
    import hashlib

    # Stable cache filename: builtin hash() is randomized per process
    # (PYTHONHASHSEED), so the old `hash(key) & 0xffffffff` scheme produced
    # different filenames each run and the cache never hit across restarts.
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    digest = hashlib.sha1(cache_key.encode('utf-8')).hexdigest()[:16]
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            # serve the cached payload while it is younger than one hour
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            # unreadable or corrupt cache entry: fall through and refetch
            pass

    time.sleep(1)  # crude rate limit for reddit's public JSON endpoints
    req_headers = {**HEADERS, **(headers or {})}
    try:
        resp = requests.get(url, headers=req_headers, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except Exception:
        # network / HTTP / JSON errors all degrade to "no data";
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt escape
        return None
|
|
|
|
|
|
|
|
|
|
|
2025-12-16 21:30:05 +00:00
|
|
|
def tavily_search(query, max_results=10):
    """POST *query* to the Tavily search API and return its result list.

    Returns an empty list when no API key is configured, on any non-200
    response, or when the request/decode raises.
    """
    if not TAVILY_API_KEY:
        return []

    payload = {'api_key': TAVILY_API_KEY, 'query': query, 'max_results': max_results}
    try:
        response = requests.post('https://api.tavily.com/search', json=payload, timeout=30)
        if response.status_code == 200:
            return response.json().get('results', [])
    except Exception as e:
        print(f" tavily error: {e}")
    return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_links_from_text(text, username=None):
    """Scan free text for contact/profile links and return them as a dict.

    Keys (present only when found): email, github, mastodon, bluesky,
    twitter, linkedin, twitch, itch, website.

    When *username* is given, matches containing it are preferred (and for
    github, required) to reduce false positives from quoted third parties.
    Fix: removed the dead local `text_lower`, which was computed but never
    read (all comparisons lowercase their own operands).
    """
    found = {}
    if not text:
        return found

    username_lower = username.lower() if username else None

    # email: prefer an address containing the username, else keep the first
    # plausible one; blacklist obvious role/placeholder addresses
    for email in re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text):
        if any(x in email.lower() for x in ['noreply', 'example', '@reddit', 'info@', 'support@', 'contact@', 'admin@']):
            continue
        if username_lower and username_lower in email.lower():
            found['email'] = email
            break
        if 'email' not in found:
            found['email'] = email

    # github: only accept an exact username match — drive-by repo links are
    # too common to trust anything else
    for gh in re.findall(r'github\.com/([a-zA-Z0-9_-]+)', text):
        if gh.lower() in ['topics', 'explore', 'trending', 'sponsors', 'orgs']:
            continue
        if username_lower and gh.lower() == username_lower:
            found['github'] = gh
            break

    # mastodon: @user@instance form first; an instance/@user URL on a known
    # instance overrides it
    masto = re.search(r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
    if masto:
        found['mastodon'] = f"@{masto.group(1)}@{masto.group(2)}"
    for inst in ['mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt']:
        m = re.search(f'{inst}/@([a-zA-Z0-9_]+)', text)
        if m:
            found['mastodon'] = f"@{m.group(1)}@{inst}"
            break

    # bluesky
    bsky = re.search(r'bsky\.app/profile/([a-zA-Z0-9_.-]+)', text)
    if bsky:
        found['bluesky'] = bsky.group(1)

    # twitter / x
    tw = re.search(r'(?:twitter|x)\.com/([a-zA-Z0-9_]+)', text)
    if tw and tw.group(1).lower() not in ['home', 'explore', 'search']:
        found['twitter'] = tw.group(1)

    # linkedin
    li = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', text)
    if li:
        found['linkedin'] = f"https://linkedin.com/in/{li.group(1)}"

    # twitch
    twitch = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)', text)
    if twitch:
        found['twitch'] = f"https://twitch.tv/{twitch.group(1)}"

    # itch.io
    itch = re.search(r'itch\.io/profile/([a-zA-Z0-9_-]+)', text)
    if itch:
        found['itch'] = f"https://itch.io/profile/{itch.group(1)}"

    # personal website: skip major platforms; prefer a URL containing the
    # username, else keep the first acceptable one
    for url in re.findall(r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)', text):
        skip = ['reddit', 'imgur', 'google', 'facebook', 'twitter', 'youtube', 'wikipedia', 'amazon']
        if not any(x in url.lower() for x in skip):
            if username_lower and username_lower in url.lower():
                found['website'] = f"https://{url}"
                break
            if 'website' not in found:
                found['website'] = f"https://{url}"

    return found
|
2025-12-16 09:22:58 +00:00
|
|
|
|
2025-12-16 21:30:05 +00:00
|
|
|
|
|
|
|
|
def cross_platform_discovery(username, full_text=''):
    """
    search the ENTIRE internet using TAVILY.

    CRITICAL: always quote username to avoid fuzzy matching!

    Runs quoted Tavily queries, harvests profile links from result URLs and
    content, then falls back to direct API probes (github, a few mastodon
    instances, bluesky).  Returns a dict shaped like the output of
    extract_links_from_text().  Fix: the three bare `except:` clauses were
    narrowed to `except Exception` so Ctrl-C / SystemExit still propagate.
    """
    found = {}
    all_content = full_text
    username_lower = username.lower()

    print(f" 🔍 cross-platform search for {username}...")

    # ALWAYS QUOTE THE USERNAME - critical for exact matching
    searches = [
        f'"{username}"',  # just username, quoted
        f'"{username}" github',  # github
        f'"{username}" developer programmer',  # dev context
        f'"{username}" email contact',  # contact
        f'"{username}" mastodon',  # fediverse
    ]

    for query in searches:
        print(f" 🌐 tavily: {query}")
        results = tavily_search(query, max_results=5)

        for result in results:
            url = result.get('url', '').lower()
            title = result.get('title', '')
            content = result.get('content', '')
            combined = f"{url} {title} {content}"

            # require the username to appear in the hit before trusting it
            if username_lower not in combined.lower():
                continue

            all_content += f" {combined}"

            # extract well-known profile patterns from the URL directly
            if f'github.com/{username_lower}' in url and not found.get('github'):
                found['github'] = username
                print(f" ✓ github: {username}")

            if f'twitch.tv/{username_lower}' in url and not found.get('twitch'):
                found['twitch'] = f"https://twitch.tv/{username}"
                print(f" ✓ twitch")

            if 'itch.io/profile/' in url and username_lower in url and not found.get('itch'):
                found['itch'] = url if url.startswith('http') else f"https://{url}"
                print(f" ✓ itch.io")

            if 'linkedin.com/in/' in url and not found.get('linkedin'):
                li = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', url)
                if li:
                    found['linkedin'] = f"https://linkedin.com/in/{li.group(1)}"
                    print(f" ✓ linkedin")

        # extract from all accumulated content (bio text + search snippets)
        extracted = extract_links_from_text(all_content, username)
        for k, v in extracted.items():
            if k not in found:
                found[k] = v
                print(f" ✓ {k}")

        # good contact found? stop searching
        if found.get('email') or found.get('github') or found.get('mastodon') or found.get('twitch'):
            break

    # === API CHECKS ===
    # github: authenticated when GITHUB_TOKEN is set (higher rate limit);
    # a hit may also yield a public email and blog URL
    if not found.get('github'):
        headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}
        try:
            resp = requests.get(f'https://api.github.com/users/{username}', headers=headers, timeout=10)
            if resp.status_code == 200:
                data = resp.json()
                found['github'] = username
                print(f" ✓ github API")
                if data.get('email') and 'email' not in found:
                    found['email'] = data['email']
                if data.get('blog') and 'website' not in found:
                    found['website'] = data['blog'] if data['blog'].startswith('http') else f"https://{data['blog']}"
        except Exception:
            pass  # best-effort probe; any failure just means "not found"

    # mastodon: probe a handful of common instances for the same handle
    if not found.get('mastodon'):
        for inst in ['mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt']:
            try:
                resp = requests.get(f'https://{inst}/api/v1/accounts/lookup', params={'acct': username}, timeout=5)
                if resp.status_code == 200:
                    found['mastodon'] = f"@{username}@{inst}"
                    print(f" ✓ mastodon: {found['mastodon']}")
                    break
            except Exception:
                continue  # instance down/unreachable: try the next one

    # bluesky: the default handle is <username>.bsky.social
    if not found.get('bluesky'):
        try:
            resp = requests.get('https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile',
                                params={'actor': f'{username}.bsky.social'}, timeout=10)
            if resp.status_code == 200:
                found['bluesky'] = resp.json().get('handle')
                print(f" ✓ bluesky")
        except Exception:
            pass  # best-effort probe

    return found
|
2025-12-16 09:22:58 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_profile(username):
    """Fetch a reddit user's about.json and return a compact profile dict.

    Returns None when the endpoint yields nothing usable.
    """
    payload = _api_get(f'https://www.reddit.com/user/{username}/about.json')
    if not payload or 'data' not in payload:
        return None

    info = payload['data']
    subreddit_meta = info.get('subreddit', {})
    return {
        'username': username,
        'bio': subreddit_meta.get('public_description', ''),
        'title': subreddit_meta.get('title', ''),
        'total_karma': info.get('total_karma', 0),
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_subreddit_users(subreddit, limit=100):
    """Collect distinct author names from a subreddit's recent posts and
    comments, excluding [deleted] and AutoModerator.  Returns a set."""
    authors = set()
    excluded = ['[deleted]', 'AutoModerator']
    for feed in ('new', 'comments'):
        data = _api_get(f'https://www.reddit.com/r/{subreddit}/{feed}.json', {'limit': limit})
        if not data or 'data' not in data:
            continue
        for child in data['data'].get('children', []):
            name = child['data'].get('author')
            if name and name not in excluded:
                authors.add(name)
    return authors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_activity(username):
    """Return a user's recent posts and comments as a flat list of dicts
    with keys: type, subreddit, title, body, score."""
    items = []
    for feed in ('submitted', 'comments'):
        kind = 'post' if feed == 'submitted' else 'comment'
        data = _api_get(f'https://www.reddit.com/user/{username}/{feed}.json', {'limit': 100})
        if not data or 'data' not in data:
            continue
        for child in data['data'].get('children', []):
            payload = child['data']
            items.append({
                'type': kind,
                'subreddit': payload.get('subreddit'),
                'title': payload.get('title', ''),
                'body': payload.get('selftext', '') or payload.get('body', ''),
                'score': payload.get('score', 0),
            })
    return items
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_reddit_user(username):
    """Build a scored candidate record for one reddit user.

    Pulls recent activity, scores text and subreddit alignment, extracts and
    web-discovers cross-platform contact links, applies "lost builder"
    heuristics, and classifies the user.  Returns None when the user has no
    visible activity; otherwise a dict ready for db.save_human().
    Fix: the lowercased NEGATIVE_SUBREDDITS list was rebuilt on every loop
    iteration — it is now hoisted into a set computed once.
    """
    activity = get_user_activity(username)
    if not activity:
        return None

    profile = get_user_profile(username)
    sub_activity = defaultdict(int)
    text_parts = []
    total_karma = 0

    # tally per-subreddit activity and accumulate all authored text
    for item in activity:
        sub = item.get('subreddit', '').lower()
        if sub:
            sub_activity[sub] += 1
        if item.get('title'):
            text_parts.append(item['title'])
        if item.get('body'):
            text_parts.append(item['body'])
        total_karma += item.get('score', 0)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # contact links from bio + own posts, then tavily web discovery on top
    external_links = {}
    if profile:
        external_links.update(extract_links_from_text(f"{profile.get('bio', '')} {profile.get('title', '')}", username))
    external_links.update(extract_links_from_text(full_text, username))

    discovered = cross_platform_discovery(username, full_text)
    external_links.update(discovered)

    # scoring: weighted aligned-subreddit activity, capped at 5 items per sub
    sub_score = 0
    aligned_subs = []
    for sub, count in sub_activity.items():
        weight = ALIGNED_SUBREDDITS.get(sub, 0)
        if weight > 0:
            sub_score += weight * min(count, 5)
            aligned_subs.append(sub)

    # breadth bonus for activity across several aligned subs
    if len(aligned_subs) >= 5:
        sub_score += 30
    elif len(aligned_subs) >= 3:
        sub_score += 15

    # hoisted: previously this lowercased list was rebuilt per subreddit
    negative_set = {n.lower() for n in NEGATIVE_SUBREDDITS}
    for sub in sub_activity:
        if sub.lower() in negative_set:
            sub_score -= 50
            negative_signals.append(f"r/{sub}")

    total_score = text_score + sub_score

    # contactability bonuses
    if external_links.get('github'):
        total_score += 10
        positive_signals.append('github')
    if external_links.get('mastodon'):
        total_score += 10
        positive_signals.append('mastodon')
    if external_links.get('email'):
        total_score += 15
        positive_signals.append('email')
    if external_links.get('twitch'):
        total_score += 5
        positive_signals.append('twitch')

    # lost builder signals, from structured activity and from raw text
    subreddits_list = list(sub_activity.keys())
    lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list)
    text_lost_signals, _ = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)

    builder_activity = 20 if external_links.get('github') else 0
    user_type = classify_user(lost_weight, builder_activity, total_score)

    # heuristic confidence, capped at 0.95
    confidence = min(0.95, 0.3 + (0.2 if len(activity) > 20 else 0) + (0.2 if len(aligned_subs) >= 2 else 0) + (0.1 if external_links else 0))

    # no off-platform contact at all → harder to reach out
    reddit_only = not any([external_links.get(k) for k in ['github', 'mastodon', 'bluesky', 'email', 'matrix', 'linkedin', 'twitch', 'itch']])

    return {
        'platform': 'reddit',
        'username': username,
        'url': f"https://reddit.com/u/{username}",
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'subreddits': aligned_subs,
        'activity_count': len(activity),
        'karma': total_karma,
        'reasons': [f"contact: {', '.join(external_links.keys())}"] if external_links else [],
        'scraped_at': datetime.now().isoformat(),
        'external_links': external_links,
        'reddit_only': reddit_only,
        'extra': external_links,  # NOTE(review): duplicate of external_links — presumably a legacy alias; confirm before removing
        'lost_potential_score': lost_weight,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scrape_reddit(db, limit_per_sub=50):
    """Discover reddit users active in 2+ target subreddits, analyze each,
    and persist every positive-scoring record via db.save_human()."""
    print("scoutd/reddit: scraping (TAVILY enabled)...")

    target_subs = ['intentionalcommunity', 'cohousing', 'selfhosted', 'homeassistant', 'solarpunk', 'cooperatives', 'privacy', 'localllama', 'homelab', 'learnprogramming']

    # map each author to the set of target subs they were seen in
    seen_in = defaultdict(set)
    for sub in target_subs:
        for user in get_subreddit_users(sub, limit=limit_per_sub):
            seen_in[user].add(sub)

    multi_sub = {u: subs for u, subs in seen_in.items() if len(subs) >= 2}
    print(f" {len(multi_sub)} users in 2+ subs")

    results = []
    for username in multi_sub:
        try:
            record = analyze_reddit_user(username)
            if record and record['score'] > 0:
                results.append(record)
                db.save_human(record)
        except Exception as e:
            print(f" error: {username}: {e}")

    print(f"scoutd/reddit: {len(results)} humans")
    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _add_to_manual_queue(result):
    """Append *result* to data/manual_queue.json, deduplicated by username.

    Fix: a corrupt or truncated queue file previously crashed json.loads;
    it now falls back to an empty queue (the file is rewritten on append).
    """
    queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json'
    queue_file.parent.mkdir(parents=True, exist_ok=True)

    try:
        queue = json.loads(queue_file.read_text()) if queue_file.exists() else []
    except (OSError, ValueError):
        queue = []  # unreadable/corrupt queue: start fresh rather than crash

    if not any(q.get('username') == result['username'] for q in queue):
        queue.append({
            'platform': 'reddit',
            'username': result['username'],
            'url': result['url'],
            'score': result['score'],
            'queued_at': datetime.now().isoformat(),
        })
        queue_file.write_text(json.dumps(queue, indent=2))