""" scoutd/twitter.py - twitter/x discovery via nitter instances scrapes nitter (twitter frontend) to find users posting about aligned topics without needing twitter API access nitter instances rotate to avoid rate limits """ import requests import json import time import re from datetime import datetime from pathlib import Path from bs4 import BeautifulSoup from .signals import analyze_text HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0'} CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'twitter' # nitter instances (rotate through these) NITTER_INSTANCES = [ 'nitter.privacydev.net', 'nitter.poast.org', 'nitter.woodland.cafe', 'nitter.esmailelbob.xyz', ] # hashtags to search ALIGNED_HASHTAGS = [ 'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource', 'privacy', 'solarpunk', 'cooperative', 'mutualaid', 'localfirst', 'indieweb', 'smallweb', 'permacomputing', 'degrowth', 'techworkers', ] _current_instance_idx = 0 def get_nitter_instance(): """get current nitter instance, rotate on failure""" global _current_instance_idx return NITTER_INSTANCES[_current_instance_idx % len(NITTER_INSTANCES)] def rotate_instance(): """switch to next nitter instance""" global _current_instance_idx _current_instance_idx += 1 def _scrape_page(url, retries=3): """scrape a nitter page with instance rotation""" for attempt in range(retries): instance = get_nitter_instance() full_url = url.replace('{instance}', instance) # check cache cache_key = f"{full_url}" cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" CACHE_DIR.mkdir(parents=True, exist_ok=True) if cache_file.exists(): try: data = json.loads(cache_file.read_text()) if time.time() - data.get('_cached_at', 0) < 3600: return data.get('_html') except: pass time.sleep(2) # rate limit try: resp = requests.get(full_url, headers=HEADERS, timeout=30) if resp.status_code == 200: cache_file.write_text(json.dumps({ '_cached_at': time.time(), '_html': resp.text })) return resp.text elif resp.status_code in [429, 503]: print(f" nitter {instance} rate limited, rotating...") rotate_instance() else: print(f" nitter error: {resp.status_code}") return None except Exception as e: print(f" nitter {instance} error: {e}") rotate_instance() return None def search_hashtag(hashtag): """search for tweets with hashtag""" url = f"https://{{instance}}/search?q=%23{hashtag}&f=tweets" html = _scrape_page(url) if not html: return [] soup = BeautifulSoup(html, 'html.parser') tweets = [] for tweet_div in soup.select('.timeline-item'): try: username_elem = tweet_div.select_one('.username') content_elem = tweet_div.select_one('.tweet-content') fullname_elem = tweet_div.select_one('.fullname') if username_elem and content_elem: username = username_elem.text.strip().lstrip('@') tweets.append({ 'username': username, 'name': fullname_elem.text.strip() if fullname_elem else username, 'content': content_elem.text.strip(), }) except Exception as e: continue return tweets def get_user_profile(username): """get user profile from nitter""" url = f"https://{{instance}}/{username}" html = _scrape_page(url) if not html: return None soup = BeautifulSoup(html, 'html.parser') try: bio_elem = soup.select_one('.profile-bio') bio = bio_elem.text.strip() if bio_elem else '' location_elem = soup.select_one('.profile-location') location = location_elem.text.strip() if location_elem else '' website_elem = soup.select_one('.profile-website a') website = website_elem.get('href') if website_elem else '' # get recent tweets for more signal 

def search_hashtag(hashtag):
    """search nitter for tweets with the given hashtag"""
    url = f"https://{{instance}}/search?q=%23{hashtag}&f=tweets"
    html = _scrape_page(url)
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    tweets = []
    for tweet_div in soup.select('.timeline-item'):
        try:
            username_elem = tweet_div.select_one('.username')
            content_elem = tweet_div.select_one('.tweet-content')
            fullname_elem = tweet_div.select_one('.fullname')
            if username_elem and content_elem:
                username = username_elem.text.strip().lstrip('@')
                tweets.append({
                    'username': username,
                    'name': fullname_elem.text.strip() if fullname_elem else username,
                    'content': content_elem.text.strip(),
                })
        except Exception:
            continue
    return tweets


def get_user_profile(username):
    """get a user's profile (bio, location, website, recent tweets) from nitter"""
    url = f"https://{{instance}}/{username}"
    html = _scrape_page(url)
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    try:
        bio_elem = soup.select_one('.profile-bio')
        bio = bio_elem.text.strip() if bio_elem else ''
        location_elem = soup.select_one('.profile-location')
        location = location_elem.text.strip() if location_elem else ''
        website_elem = soup.select_one('.profile-website a')
        website = website_elem.get('href') if website_elem else ''

        # get recent tweets for more signal
        tweets = []
        for tweet_div in soup.select('.timeline-item')[:10]:
            content_elem = tweet_div.select_one('.tweet-content')
            if content_elem:
                tweets.append(content_elem.text.strip())

        return {
            'username': username,
            'bio': bio,
            'location': location,
            'website': website,
            'recent_tweets': tweets,
        }
    except Exception as e:
        print(f" error parsing {username}: {e}")
        return None


def analyze_twitter_user(username, profile=None):
    """analyze a twitter user for alignment"""
    if not profile:
        profile = get_user_profile(username)
    if not profile:
        return None

    # collect text from the bio and recent tweets
    text_parts = [profile.get('bio', '')]
    text_parts.extend(profile.get('recent_tweets', []))
    full_text = ' '.join(text_parts)

    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # twitter is noisy, so start from a lower base confidence
    confidence = 0.25
    if len(positive_signals) >= 3:
        confidence += 0.2
    if profile.get('website'):
        confidence += 0.1
    if len(profile.get('recent_tweets', [])) >= 5:
        confidence += 0.1
    confidence = min(confidence, 0.7)  # cap lower for twitter

    reasons = []
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    return {
        'platform': 'twitter',
        'username': username,
        'url': f"https://twitter.com/{username}",
        # get_user_profile sets no display name, so fall back to the handle
        'name': profile.get('name', username),
        'bio': profile.get('bio'),
        'location': profile.get('location'),
        'score': text_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'reasons': reasons,
        'contact': {
            'twitter': username,
            'website': profile.get('website'),
        },
        'scraped_at': datetime.now().isoformat(),
    }


def scrape_twitter(db, limit_per_hashtag=50):
    """full twitter scrape via nitter"""
    print("scoutd/twitter: starting scrape via nitter...")
    all_users = {}

    for hashtag in ALIGNED_HASHTAGS:
        print(f" #{hashtag}...")
        tweets = search_hashtag(hashtag)
        for tweet in tweets[:limit_per_hashtag]:
            username = tweet.get('username')
            if username and username not in all_users:
                all_users[username] = {
                    'username': username,
                    'name': tweet.get('name'),
                    'hashtags': [hashtag],
                }
            elif username and hashtag not in all_users[username]['hashtags']:
                # don't double-count a hashtag when a user tweets it repeatedly
                all_users[username]['hashtags'].append(hashtag)
        print(f" found {len(tweets)} tweets")

    # prioritize users seen in multiple aligned hashtags
    multi_hashtag = {u: d for u, d in all_users.items() if len(d.get('hashtags', [])) >= 2}
    print(f" {len(multi_hashtag)} users in 2+ aligned hashtags")

    # analyze
    results = []
    for username in list(multi_hashtag)[:100]:  # limit to prevent rate limits
        try:
            result = analyze_twitter_user(username)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)
                if result['score'] >= 30:
                    print(f" ★ @{username}: {result['score']} pts")
        except Exception as e:
            print(f" error on {username}: {e}")

    print(f"scoutd/twitter: found {len(results)} aligned humans")
    return results
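

# minimal usage sketch (illustrative): scrape_twitter only needs a db object
# exposing save_human(dict). the StubDB below is an assumed stand-in for
# scoutd's real database layer, not its actual interface.
if __name__ == '__main__':
    class StubDB:
        """stand-in db; only save_human() is called by scrape_twitter"""
        def save_human(self, human):
            print(f" would save @{human['username']} ({human['score']} pts)")

    scrape_twitter(StubDB(), limit_per_hashtag=10)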