commit 99946bfef59153108a2d55fa53e43ed8b4a1381a Author: root Date: Tue Dec 16 09:22:58 2025 +0000 autonomous daemon with platform-native contact detection - determine_contact_method now recognizes mastodon/bluesky users by platform - username IS the handle for platform-native users - fixed orphaned matches table issue - wave 1 intros sent successfully diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..8058653 --- /dev/null +++ b/.env.example @@ -0,0 +1,58 @@ +# connectd environment variables +# copy to .env and fill in your values + +# === REQUIRED FOR LLM DRAFTING === +GROQ_API_KEY= +GROQ_MODEL=llama-3.1-70b-versatile + +# === DISCOVERY SOURCES === +# github (optional - works without token but rate limited) +GITHUB_TOKEN= + +# mastodon (for DM delivery) +MASTODON_TOKEN= +MASTODON_INSTANCE=mastodon.social + +# bluesky (for DM delivery) +BLUESKY_HANDLE= +BLUESKY_APP_PASSWORD= + +# matrix (for DM delivery) +MATRIX_HOMESERVER= +MATRIX_USER_ID= +MATRIX_ACCESS_TOKEN= + +# discord (for discovery + DM delivery) +DISCORD_BOT_TOKEN= +DISCORD_TARGET_SERVERS= # comma separated server IDs + +# lemmy (for authenticated access to your instance) +LEMMY_INSTANCE= +LEMMY_USERNAME= +LEMMY_PASSWORD= + +# === EMAIL DELIVERY === +SMTP_HOST= +SMTP_PORT=465 +SMTP_USER= +SMTP_PASS= +FROM_EMAIL=connectd + +# === HOST USER CONFIG === +# the person running connectd - gets priority matching +# set HOST_USER to your github username and connectd will auto-discover your info +# other vars override/supplement discovered values +HOST_USER= +HOST_NAME= +HOST_EMAIL= +HOST_GITHUB= # defaults to HOST_USER +HOST_MASTODON= # format: @user@instance +HOST_REDDIT= +HOST_LEMMY= # format: @user@instance +HOST_LOBSTERS= +HOST_MATRIX= # format: @user:server +HOST_DISCORD= # user id +HOST_BLUESKY= # format: handle.bsky.social +HOST_LOCATION= +HOST_INTERESTS= # comma separated: intentional-community,cooperative,solarpunk +HOST_LOOKING_FOR= # what you're looking for in matches diff --git 
a/.gitignore b/.gitignore new file mode 100644 index 0000000..352d9c2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,30 @@ +# env +.env +*.env + +# python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +.venv/ +ENV/ + +# database +*.db +db/cache/ + +# data +data/ + +# ide +.idea/ +.vscode/ +*.swp +*.swo + +# os +.DS_Store +Thumbs.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e32ca97 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.11-slim + +WORKDIR /app + +# install deps +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# copy app +COPY . . + +# create data directories (db files stored in /data, not /app/db) +RUN mkdir -p /data/db /data/cache + +# set DB path via env +ENV DB_PATH=/data/db/connectd.db +ENV CACHE_DIR=/data/cache + +# default command runs daemon +CMD ["python", "daemon.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..0b7b4e0 --- /dev/null +++ b/README.md @@ -0,0 +1,189 @@ +# connectd + +

+ connectd logo +

+ +a daemon that finds isolated builders with aligned values and connects them. + +**this is not networking. this is community building.** + +## the mission + +i was lost for 10 years. medicated. numbed. convinced something was wrong with me. nobody told me there was another way. + +connectd isn't just finding builders. it's finding the people who don't know they CAN build. the ones who gave up. the ones still stuck in the old system. the ones who need someone to show them the door exists. + +we lift them up. we show them what's possible. we connect them to people who GET IT. + +## what it does + +1. **scouts** - discovers humans across platforms (github, reddit, mastodon, lemmy, discord, lobsters, bluesky, matrix) +2. **analyzes** - scores them for values alignment AND lost builder potential +3. **matches** - pairs aligned builders together, or pairs lost builders with inspiring active ones +4. **drafts** - uses LLM to write genuine, personalized intros +5. **delivers** - sends via email, mastodon DM, bluesky DM, matrix DM, discord DM, or github issue + +fully autonomous. no manual review. self-sustaining pipe. + +## values it looks for + +- privacy, selfhosted, foss +- solarpunk, cooperative, decentralized +- queer-friendly, anti-capitalist +- remote work, work-life balance +- community over competition + +## lost builder signals + +people who have potential but haven't started yet, gave up, or are struggling: + +- "gave up coding", "burned out", "imposter syndrome" +- active in career-help communities but not shipping +- enthusiasm for others' work but no projects of their own +- aspirational bios ("learning...", "aspiring...") + +lost builders don't get matched to each other (both need energy). they get matched to ACTIVE builders who can inspire them. 
+ +## quick start + +```bash +# clone +git clone https://github.com/sudoxnym/connectd +cd connectd + +# configure +cp .env.example .env +# edit .env with your API keys + +# run with docker +docker compose up -d + +# or run directly +pip install -r requirements.txt +python daemon.py --dry-run # preview mode +python daemon.py # live mode +``` + +## cli commands + +```bash +# discovery +python cli.py scout # all platforms +python cli.py scout --github # github only +python cli.py scout --reddit --lemmy # specific platforms +python cli.py scout --user octocat # deep scrape one user + +# matching +python cli.py match # find matches +python cli.py match --lost # find matches for lost builders +python cli.py match --mine # your matches (if you set up profile) + +# intros +python cli.py intro --dry-run # preview intros +python cli.py intro # generate and queue +python cli.py intro --lost # lost builder intros + +# lost builders +python cli.py lost # show lost builders +python cli.py lost --verbose # show the philosophy + +# daemon +python cli.py daemon # run continuous +python cli.py daemon --dry-run # preview mode +python cli.py daemon --oneshot # run once then exit + +# status +python cli.py status # show stats +``` + +## docker + +```bash +# build +docker build -t connectd . 
+ +# run daemon +docker compose up -d + +# run one-off commands +docker compose run --rm connectd python cli.py scout +docker compose run --rm connectd python cli.py status +``` + +## environment variables + +copy `.env.example` to `.env` and fill in your values: + +```bash +cp .env.example .env +``` + +### required + +| variable | description | +|----------|-------------| +| `GROQ_API_KEY` | for LLM intro drafting ([get one here](https://console.groq.com)) | + +### discovery sources + +| variable | description | +|----------|-------------| +| `GITHUB_TOKEN` | higher rate limits for github API | +| `DISCORD_BOT_TOKEN` | discord bot token for server access | +| `DISCORD_TARGET_SERVERS` | comma-separated server IDs to scout | +| `LEMMY_INSTANCE` | your lemmy instance (e.g. `lemmy.ml`) | +| `LEMMY_USERNAME` | lemmy username for auth | +| `LEMMY_PASSWORD` | lemmy password for auth | + +### delivery methods + +| variable | description | +|----------|-------------| +| `MASTODON_TOKEN` | mastodon access token | +| `MASTODON_INSTANCE` | your mastodon instance (e.g. `mastodon.social`) | +| `BLUESKY_HANDLE` | bluesky handle (e.g. `you.bsky.social`) | +| `BLUESKY_APP_PASSWORD` | bluesky app password | +| `MATRIX_HOMESERVER` | matrix homeserver URL | +| `MATRIX_USER_ID` | matrix user ID (e.g. `@bot:matrix.org`) | +| `MATRIX_ACCESS_TOKEN` | matrix access token | +| `SMTP_HOST` | email server host | +| `SMTP_PORT` | email server port (default 465) | +| `SMTP_USER` | email username | +| `SMTP_PASS` | email password | +| `FROM_EMAIL` | from address for emails | + +you need at least ONE delivery method configured for intros to be sent. 
+ +## architecture + +``` +scoutd/ - discovery modules (one per platform) +matchd/ - matching + fingerprinting logic +introd/ - intro drafting + delivery +db/ - sqlite storage +config.py - central configuration +daemon.py - continuous runner +cli.py - command line interface +``` + +## intervals (daemon mode) + +- scout: every 4 hours +- match: every 1 hour +- stranger intros: every 2 hours (max 20/day) +- lost builder intros: every 6 hours (max 5/day) + +## forking + +this is meant to be forked. clone it, set your own values in `scoutd/signals.py`, point it at your communities, run it. + +the goal is a self-sustaining pipe that finds YOUR people. the ones who share YOUR values. and connects them. + +## license + +MIT - do whatever you want with it. just pay it forward. + +--- + +*"you're not behind. you're just not started yet."* diff --git a/Screenshot_20251216_021234.png b/Screenshot_20251216_021234.png new file mode 100644 index 0000000..45f60cc Binary files /dev/null and b/Screenshot_20251216_021234.png differ diff --git a/api.py b/api.py new file mode 100644 index 0000000..18ad5f2 --- /dev/null +++ b/api.py @@ -0,0 +1,1191 @@ +#!/usr/bin/env python3 +""" +connectd/api.py - REST API for stats and control + +exposes daemon stats for home assistant integration. +runs on port 8099 by default. 
+""" + +import os +import json +import threading +from http.server import HTTPServer, BaseHTTPRequestHandler +from datetime import datetime + +from db import Database +from db.users import get_priority_users, get_priority_user_matches, get_priority_user + +API_PORT = int(os.environ.get('CONNECTD_API_PORT', 8099)) + +# shared state (updated by daemon) +_daemon_state = { + 'running': False, + 'dry_run': False, + 'last_scout': None, + 'last_match': None, + 'last_intro': None, + 'last_lost': None, + 'intros_today': 0, + 'lost_intros_today': 0, + 'started_at': None, +} + + +def update_daemon_state(state_dict): + """update shared daemon state (called by daemon)""" + global _daemon_state + _daemon_state.update(state_dict) + + +def get_daemon_state(): + """get current daemon state""" + return _daemon_state.copy() + + + +DASHBOARD_HTML = """ + + + connectd + + + + + +

connectd + repo + org +

+
+
+
+
+
+
+
+ + + + +""" + + +# draft cache - stores generated drafts so they dont regenerate +_draft_cache = {} + +def get_cached_draft(match_id, match_type='match'): + key = f"{match_type}:{match_id}" + return _draft_cache.get(key) + +def cache_draft(match_id, draft_data, match_type='match'): + key = f"{match_type}:{match_id}" + _draft_cache[key] = draft_data + +class APIHandler(BaseHTTPRequestHandler): + """simple REST API handler""" + + def log_message(self, format, *args): + """suppress default logging""" + pass + + def _send_json(self, data, status=200): + """send JSON response""" + self.send_response(status) + self.send_header('Content-Type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(json.dumps(data).encode()) + + def do_GET(self): + """handle GET requests""" + path = self.path.split('?')[0] + if path == '/favicon.png' or path == '/favicon.ico': + self._handle_favicon() + elif path == '/' or path == '/dashboard': + self._handle_dashboard() + elif path == '/api/stats': + self._handle_stats() + elif path == '/api/host': + self._handle_host() + elif path == '/api/host_matches': + self._handle_host_matches() + elif path == '/api/your_matches': + self._handle_your_matches() + elif path == '/api/preview_match_draft': + self._handle_preview_match_draft() + elif path == '/api/preview_host_draft': + self._handle_preview_host_draft() + elif path == '/api/preview_draft': + self._handle_preview_draft() + elif path == '/api/pending_about_you': + self._handle_pending_about_you() + elif path == '/api/pending_to_you': + self._handle_pending_to_you() + elif path == '/api/pending_matches': + self._handle_pending_matches() + elif path == '/api/sent_intros': + self._handle_sent_intros() + elif path == '/api/failed_intros': + self._handle_failed_intros() + elif path == '/api/clear_cache': + global _draft_cache + _draft_cache = {} + self._send_json({'status': 'cache cleared'}) + elif path == '/api/health': + 
self._handle_health() + elif path == '/api/state': + self._handle_state() + elif path == '/api/priority_matches': + self._handle_priority_matches() + elif path == '/api/top_humans': + self._handle_top_humans() + elif path == '/api/user': + self._handle_user() + else: + self._send_json({'error': 'not found'}, 404) + def _handle_favicon(self): + from pathlib import Path + fav = Path('/app/data/favicon.png') + if fav.exists(): + self.send_response(200) + self.send_header('Content-Type', 'image/png') + self.end_headers() + self.wfile.write(fav.read_bytes()) + else: + self.send_response(404) + self.end_headers() + + def _handle_dashboard(self): + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.end_headers() + self.wfile.write(DASHBOARD_HTML.encode()) + + def _handle_sent_intros(self): + from pathlib import Path + log_path = Path("/app/data/delivery_log.json") + sent = [] + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + sent = log.get("sent", [])[-20:] + sent.reverse() + self._send_json({"sent": sent}) + + def _handle_failed_intros(self): + from pathlib import Path + log_path = Path("/app/data/delivery_log.json") + failed = [] + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + failed = log.get("failed", []) + self._send_json({"failed": failed}) + + def _handle_host(self): + """daemon status and match stats""" + import sqlite3 + state = get_daemon_state() + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("SELECT COUNT(*) FROM matches WHERE status='pending' AND overlap_score >= 60") + pending = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE status='intro_sent'") + sent = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE status='rejected'") + rejected = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches") + total = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 90") + s90 = 
c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 80 AND overlap_score < 90") + s80 = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 70 AND overlap_score < 80") + s70 = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 60 AND overlap_score < 70") + s60 = c.fetchone()[0] + conn.close() + except: + pending = sent = rejected = total = s90 = s80 = s70 = s60 = 0 + uptime = None + if state.get('started_at'): + try: + start = datetime.fromisoformat(state['started_at']) if isinstance(state['started_at'], str) else state['started_at'] + uptime = int((datetime.now() - start).total_seconds()) + except: pass + self._send_json({ + 'running': state.get('running', False), 'dry_run': state.get('dry_run', False), + 'uptime_seconds': uptime, 'intros_today': state.get('intros_today', 0), + 'matches_pending': pending, 'matches_sent': sent, 'matches_rejected': rejected, 'matches_total': total, + 'score_90_plus': s90, 'score_80_89': s80, 'score_70_79': s70, 'score_60_69': s60, + }) + + def _handle_your_matches(self): + """matches involving the host - shows both directions""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 15 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': [], 'host': None}) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, m.overlap_score, m.overlap_reasons, m.status, + h1.username, h1.platform, h1.contact, + h2.username, h2.platform, h2.contact + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE (h1.username = ? OR h2.username = ?) 
+ AND m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (host_name, host_name, limit)) + matches = [] + for row in c.fetchall(): + if row[4] == host_name: + other_user, other_platform = row[7], row[8] + other_contact = j.loads(row[9]) if row[9] else {} + else: + other_user, other_platform = row[4], row[5] + other_contact = j.loads(row[6]) if row[6] else {} + reasons = j.loads(row[2]) if row[2] else [] + matches.append({ + 'id': row[0], 'score': int(row[1]), 'reasons': reasons, + 'status': row[3], 'other_user': other_user, 'other_platform': other_platform, + 'contact': other_contact.get('email') or other_contact.get('mastodon') or '' + }) + conn.close() + db.close() + self._send_json({'host': host_name, 'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_match_draft(self): + """preview draft for a match - dir=to_you or to_them""" + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + from db.users import get_priority_users + + match_id = None + direction = 'to_you' + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + if p.startswith('dir='): + direction = p.split('=')[1] + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + cache_key = f"{match_id}_{direction}" + cached = get_cached_draft(cache_key, 'match') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'error': 'no priority user'}, 404) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + db.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + human_a = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + human_b = {'username': row[4], 'platform': row[5], + 'contact': j.loads(row[6]) if row[6] else {}, + 'extra': j.loads(row[7]) if row[7] else {}} + reasons = j.loads(row[9]) if row[9] else [] + + if human_a['username'] == host_name: + host_human, other_human = human_a, human_b + else: + host_human, other_human = human_b, human_a + + if direction == 'to_you': + match_data = {'human_a': host_human, 'human_b': other_human, + 'overlap_score': row[8], 'overlap_reasons': reasons} + recipient_name = host_name + about_name = other_human['username'] + else: + match_data = {'human_a': other_human, 'human_b': host_human, + 'overlap_score': row[8], 'overlap_reasons': reasons} + 
recipient_name = other_human['username'] + about_name = host_name + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + response = { + 'match_id': match_id, + 'direction': direction, + 'to': recipient_name, + 'about': about_name, + 'subject': result.get('subject'), + 'draft': result.get('draft_html'), + 'score': row[8], + 'cached': False, + } + cache_draft(cache_key, response, 'match') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_host_matches(self): + """matches for priority user""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 20 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': [], 'host': None}) + db.close() + return + host = users[0] + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT pm.id, pm.overlap_score, pm.overlap_reasons, pm.status, h.username, h.platform, h.contact + FROM priority_matches pm JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.priority_user_id = ? 
ORDER BY pm.overlap_score DESC LIMIT ?""", (host['id'], limit)) + matches = [] + for row in c.fetchall(): + reasons = j.loads(row[2]) if row[2] else [] + contact = j.loads(row[6]) if row[6] else {} + matches.append({'id': row[0], 'score': int(row[1]), 'reasons': reasons, 'status': row[3], + 'other_user': row[4], 'other_platform': row[5], + 'contact': contact.get('email') or contact.get('mastodon') or contact.get('github') or ''}) + conn.close() + db.close() + self._send_json({'host': host.get('github') or host.get('name'), 'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_host_draft(self): + """preview draft for a priority match - dir=to_you or to_them""" + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + from db.users import get_priority_users + + match_id = None + direction = 'to_you' + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + if p.startswith('dir='): + direction = p.split('=')[1] + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + cache_key = f"host_{match_id}_{direction}" + cached = get_cached_draft(cache_key, 'host') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'error': 'no priority user'}, 404) + db.close() + return + host = users[0] + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + # Get the matched human from priority_matches + c.execute("""SELECT h.username, h.platform, h.contact, h.extra, pm.overlap_score, pm.overlap_reasons, h.bio + FROM priority_matches pm + JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + db.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return 
+ + # The matched person (who we found for the host) + other = {'username': row[0], 'platform': row[1], 'bio': row[6], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + + # Build host as human_a (recipient), other as human_b (subject) + host_human = {'username': host.get('github') or host.get('name'), + 'platform': 'priority', + 'contact': {'email': host.get('email'), 'mastodon': host.get('mastodon'), 'github': host.get('github')}, + 'extra': {'bio': host.get('bio'), 'interests': host.get('interests')}} + + reasons = j.loads(row[5]) if row[5] else [] + match_data = {'human_a': host_human, 'human_b': other, + 'overlap_score': row[4], 'overlap_reasons': reasons} + + # direction determines who gets the intro + if direction == 'to_you': + # intro TO host ABOUT other + match_data = {'human_a': host_human, 'human_b': other, + 'overlap_score': row[4], 'overlap_reasons': reasons} + to_name = host.get('github') or host.get('name') + about_name = other['username'] + else: + # intro TO other ABOUT host + match_data = {'human_a': other, 'human_b': host_human, + 'overlap_score': row[4], 'overlap_reasons': reasons} + to_name = other['username'] + about_name = host.get('github') or host.get('name') + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + cache_key = f"host_{match_id}_{direction}" + response = { + 'match_id': match_id, + 'direction': direction, + 'to': to_name, + 'about': about_name, + 'subject': result.get('subject'), + 'draft': result.get('draft_html'), + 'score': row[4], + 'cached': False, + } + cache_draft(cache_key, response, 'host') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_draft(self): + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + + match_id = None + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + # check cache first + cached = get_cached_draft(match_id, 'queue') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + human_a = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + human_b = {'username': row[4], 'platform': row[5], + 'contact': j.loads(row[6]) if row[6] else {}, + 'extra': j.loads(row[7]) if row[7] else {}} + reasons = j.loads(row[9]) if row[9] else [] + + match_data = {'human_a': human_a, 'human_b': human_b, + 'overlap_score': row[8], 'overlap_reasons': reasons} + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + response = { + 'match_id': match_id, + 'to': human_a['username'], + 'about': human_b['username'], + 'subject': result.get('subject'), + 'draft': result.get('draft_html'), + 'score': row[8], + 'cached': False, + } + cache_draft(match_id, response, 'queue') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_about_you(self): + """pending intros where host is human_b (being introduced to others)""" + import sqlite3 + import json as j + from db.users import get_priority_users 
+ limit = 10 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': []}) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, h1.username, h1.platform, h1.contact, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE h2.username = ? AND m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (host_name, limit)) + matches = [] + for row in c.fetchall(): + contact = j.loads(row[3]) if row[3] else {} + reasons = j.loads(row[5]) if row[5] else [] + method = 'email' if contact.get('email') else ('mastodon' if contact.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[1], 'to_platform': row[2], + 'score': int(row[4]), 'reasons': reasons[:3], 'method': method, + 'contact': contact.get('email') or contact.get('mastodon') or ''}) + conn.close() + db.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_to_you(self): + """pending intros where host is human_a (receiving intro about others)""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 20 + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': []}) + db.close() + return + host = users[0] + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT pm.id, h.username, h.platform, pm.overlap_score, pm.overlap_reasons + FROM priority_matches pm + JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.priority_user_id = ? AND pm.status IN ('new', 'pending') + ORDER BY pm.overlap_score DESC LIMIT ?""", (host['id'], limit)) + matches = [] + for row in c.fetchall(): + reasons = j.loads(row[4]) if row[4] else [] + matches.append({'id': row[0], 'about_user': row[1], 'about_platform': row[2], + 'score': int(row[3]), 'reasons': reasons[:3]}) + conn.close() + db.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_matches(self): + """pending matches - returns BOTH directions for each match""" + import sqlite3 + import json as j + limit = 30 + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, h1.username, h1.platform, h1.contact, + h2.username, h2.platform, h2.contact, m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (limit // 2,)) + matches = [] + for row in c.fetchall(): + contact_a = j.loads(row[3]) if row[3] else {} + contact_b = j.loads(row[6]) if row[6] else {} + reasons = j.loads(row[8]) if row[8] else [] + # direction 1: TO human_a ABOUT human_b + method_a = 'email' if contact_a.get('email') else ('mastodon' if contact_a.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[1], 'about_user': row[4], + 'score': int(row[7]), 'reasons': reasons[:3], 'method': method_a, + 'contact': contact_a.get('email') or contact_a.get('mastodon') or ''}) + # direction 2: TO human_b ABOUT human_a + method_b = 'email' if contact_b.get('email') else ('mastodon' if contact_b.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[4], 'about_user': row[1], + 'score': int(row[7]), 'reasons': reasons[:3], 'method': method_b, + 'contact': contact_b.get('email') or contact_b.get('mastodon') or ''}) + conn.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_stats(self): + """return database statistics""" + try: + db = Database() + stats = db.stats() + db.close() + self._send_json(stats) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_health(self): + """return daemon health status""" + state = get_daemon_state() + + health = { + 'status': 'running' if state['running'] else 'stopped', + 'dry_run': 
state['dry_run'], + 'uptime_seconds': None, + } + + if state['started_at']: + uptime = datetime.now() - datetime.fromisoformat(state['started_at']) + health['uptime_seconds'] = int(uptime.total_seconds()) + + self._send_json(health) + + def _handle_state(self): + """return full daemon state""" + state = get_daemon_state() + + # convert datetimes to strings + for key in ['last_scout', 'last_match', 'last_intro', 'last_lost', 'started_at']: + if state[key] and isinstance(state[key], datetime): + state[key] = state[key].isoformat() + + self._send_json(state) + + def _handle_priority_matches(self): + """return priority matches for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'count': 0, + 'new_count': 0, + 'top_matches': [], + }) + db.close() + return + + # get matches for first priority user (host) + user = users[0] + matches = get_priority_user_matches(db.conn, user['id'], limit=10) + + new_count = sum(1 for m in matches if m.get('status') == 'new') + + top_matches = [] + for m in matches[:5]: + overlap_reasons = m.get('overlap_reasons', '[]') + if isinstance(overlap_reasons, str): + import json as json_mod + overlap_reasons = json_mod.loads(overlap_reasons) if overlap_reasons else [] + + top_matches.append({ + 'username': m.get('username'), + 'platform': m.get('platform'), + 'score': m.get('score', 0), + 'overlap_score': m.get('overlap_score', 0), + 'reasons': overlap_reasons[:3], + 'url': m.get('url'), + 'status': m.get('status', 'new'), + }) + + db.close() + self._send_json({ + 'count': len(matches), + 'new_count': new_count, + 'top_matches': top_matches, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_top_humans(self): + """return top scoring humans for HA sensor""" + try: + db = Database() + humans = db.get_all_humans(min_score=50, limit=5) + + top_humans = [] + for h in humans: + contact = h.get('contact', '{}') + if isinstance(contact, str): + import 
json as json_mod + contact = json_mod.loads(contact) if contact else {} + + signals = h.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + top_humans.append({ + 'username': h.get('username'), + 'platform': h.get('platform'), + 'score': h.get('score', 0), + 'name': h.get('name'), + 'signals': signals[:5], + 'contact_method': 'email' if contact.get('email') else + 'mastodon' if contact.get('mastodon') else + 'matrix' if contact.get('matrix') else 'manual', + }) + + db.close() + self._send_json({ + 'count': len(humans), + 'top_humans': top_humans, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_user(self): + """return priority user info for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'configured': False, + 'score': 0, + 'signals': [], + 'match_count': 0, + }) + db.close() + return + + user = users[0] + signals = user.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + interests = user.get('interests', '[]') + if isinstance(interests, str): + import json as json_mod + interests = json_mod.loads(interests) if interests else [] + + matches = get_priority_user_matches(db.conn, user['id'], limit=100) + + db.close() + self._send_json({ + 'configured': True, + 'name': user.get('name'), + 'github': user.get('github'), + 'mastodon': user.get('mastodon'), + 'reddit': user.get('reddit'), + 'lobsters': user.get('lobsters'), + 'matrix': user.get('matrix'), + 'lemmy': user.get('lemmy'), + 'discord': user.get('discord'), + 'bluesky': user.get('bluesky'), + 'score': user.get('score', 0), + 'signals': signals[:10], + 'interests': interests, + 'location': user.get('location'), + 'bio': user.get('bio'), + 'match_count': len(matches), + 'new_match_count': sum(1 for m in matches if m.get('status') == 'new'), + 
}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + +def run_api_server(): + """run the API server in a thread""" + server = HTTPServer(('0.0.0.0', API_PORT), APIHandler) + print(f"connectd api running on port {API_PORT}") + server.serve_forever() + + +def start_api_thread(): + """start API server in background thread""" + thread = threading.Thread(target=run_api_server, daemon=True) + thread.start() + return thread + + +if __name__ == '__main__': + # standalone mode for testing + print(f"starting connectd api on port {API_PORT}...") + run_api_server() diff --git a/api.py.backup.20251215_210704 b/api.py.backup.20251215_210704 new file mode 100644 index 0000000..ba41ffd --- /dev/null +++ b/api.py.backup.20251215_210704 @@ -0,0 +1,976 @@ +#!/usr/bin/env python3 +""" +connectd/api.py - REST API for stats and control + +exposes daemon stats for home assistant integration. +runs on port 8099 by default. +""" + +import os +import json +import threading +from http.server import HTTPServer, BaseHTTPRequestHandler +from datetime import datetime + +from db import Database +from db.users import get_priority_users, get_priority_user_matches, get_priority_user + +API_PORT = int(os.environ.get('CONNECTD_API_PORT', 8099)) + +# shared state (updated by daemon) +_daemon_state = { + 'running': False, + 'dry_run': False, + 'last_scout': None, + 'last_match': None, + 'last_intro': None, + 'last_lost': None, + 'intros_today': 0, + 'lost_intros_today': 0, + 'started_at': None, +} + + +def update_daemon_state(state_dict): + """update shared daemon state (called by daemon)""" + global _daemon_state + _daemon_state.update(state_dict) + + +def get_daemon_state(): + """get current daemon state""" + return _daemon_state.copy() + + + +DASHBOARD_HTML = """ + + + connectd + + + + + +

connectd repo org

+
+
+
+ + + + + +
+
+
+
+
+ + +""" + + +# draft cache - stores generated drafts so they dont regenerate +_draft_cache = {} + +def get_cached_draft(match_id, match_type='match'): + key = f"{match_type}:{match_id}" + return _draft_cache.get(key) + +def cache_draft(match_id, draft_data, match_type='match'): + key = f"{match_type}:{match_id}" + _draft_cache[key] = draft_data + +class APIHandler(BaseHTTPRequestHandler): + """simple REST API handler""" + + def log_message(self, format, *args): + """suppress default logging""" + pass + + def _send_json(self, data, status=200): + """send JSON response""" + self.send_response(status) + self.send_header('Content-Type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(json.dumps(data).encode()) + + def do_GET(self): + """handle GET requests""" + path = self.path.split('?')[0] + if path == '/favicon.png' or path == '/favicon.ico': + self._handle_favicon() + elif path == '/' or path == '/dashboard': + self._handle_dashboard() + elif path == '/api/stats': + self._handle_stats() + elif path == '/api/host': + self._handle_host() + elif path == '/api/host_matches': + self._handle_host_matches() + elif path == '/api/your_matches': + self._handle_your_matches() + elif path == '/api/preview_match_draft': + self._handle_preview_match_draft() + elif path == '/api/preview_host_draft': + self._handle_preview_host_draft() + elif path == '/api/preview_draft': + self._handle_preview_draft() + elif path == '/api/pending_about_you': + self._handle_pending_about_you() + elif path == '/api/pending_to_you': + self._handle_pending_to_you() + elif path == '/api/pending_matches': + self._handle_pending_matches() + elif path == '/api/sent_intros': + self._handle_sent_intros() + elif path == '/api/failed_intros': + self._handle_failed_intros() + elif path == '/api/health': + self._handle_health() + elif path == '/api/state': + self._handle_state() + elif path == '/api/priority_matches': + 
self._handle_priority_matches() + elif path == '/api/top_humans': + self._handle_top_humans() + elif path == '/api/user': + self._handle_user() + else: + self._send_json({'error': 'not found'}, 404) + def _handle_favicon(self): + from pathlib import Path + fav = Path('/app/data/favicon.png') + if fav.exists(): + self.send_response(200) + self.send_header('Content-Type', 'image/png') + self.end_headers() + self.wfile.write(fav.read_bytes()) + else: + self.send_response(404) + self.end_headers() + + def _handle_dashboard(self): + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.end_headers() + self.wfile.write(DASHBOARD_HTML.encode()) + + def _handle_sent_intros(self): + from pathlib import Path + log_path = Path("/app/data/delivery_log.json") + sent = [] + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + sent = log.get("sent", [])[-20:] + sent.reverse() + self._send_json({"sent": sent}) + + def _handle_failed_intros(self): + from pathlib import Path + log_path = Path("/app/data/delivery_log.json") + failed = [] + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + failed = log.get("failed", []) + self._send_json({"failed": failed}) + + def _handle_host(self): + """daemon status and match stats""" + import sqlite3 + state = get_daemon_state() + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("SELECT COUNT(*) FROM matches WHERE status='pending' AND overlap_score >= 60") + pending = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE status='intro_sent'") + sent = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE status='rejected'") + rejected = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches") + total = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 90") + s90 = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 80 AND overlap_score < 90") + s80 = 
c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 70 AND overlap_score < 80") + s70 = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 60 AND overlap_score < 70") + s60 = c.fetchone()[0] + conn.close() + except: + pending = sent = rejected = total = s90 = s80 = s70 = s60 = 0 + uptime = None + if state.get('started_at'): + try: + start = datetime.fromisoformat(state['started_at']) if isinstance(state['started_at'], str) else state['started_at'] + uptime = int((datetime.now() - start).total_seconds()) + except: pass + self._send_json({ + 'running': state.get('running', False), 'dry_run': state.get('dry_run', False), + 'uptime_seconds': uptime, 'intros_today': state.get('intros_today', 0), + 'matches_pending': pending, 'matches_sent': sent, 'matches_rejected': rejected, 'matches_total': total, + 'score_90_plus': s90, 'score_80_89': s80, 'score_70_79': s70, 'score_60_69': s60, + }) + + def _handle_your_matches(self): + """matches involving the host - shows both directions""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 15 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': [], 'host': None}) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, m.overlap_score, m.overlap_reasons, m.status, + h1.username, h1.platform, h1.contact, + h2.username, h2.platform, h2.contact + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE (h1.username = ? OR h2.username = ?) 
+ AND m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (host_name, host_name, limit)) + matches = [] + for row in c.fetchall(): + if row[4] == host_name: + other_user, other_platform = row[7], row[8] + other_contact = j.loads(row[9]) if row[9] else {} + else: + other_user, other_platform = row[4], row[5] + other_contact = j.loads(row[6]) if row[6] else {} + reasons = j.loads(row[2]) if row[2] else [] + matches.append({ + 'id': row[0], 'score': int(row[1]), 'reasons': reasons, + 'status': row[3], 'other_user': other_user, 'other_platform': other_platform, + 'contact': other_contact.get('email') or other_contact.get('mastodon') or '' + }) + conn.close() + db.close() + self._send_json({'host': host_name, 'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_match_draft(self): + """preview draft for a match - dir=to_you or to_them""" + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + from db.users import get_priority_users + + match_id = None + direction = 'to_you' + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + if p.startswith('dir='): + direction = p.split('=')[1] + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + cache_key = f"{match_id}_{direction}" + cached = get_cached_draft(cache_key, 'match') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'error': 'no priority user'}, 404) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + db.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + human_a = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + human_b = {'username': row[4], 'platform': row[5], + 'contact': j.loads(row[6]) if row[6] else {}, + 'extra': j.loads(row[7]) if row[7] else {}} + reasons = j.loads(row[9]) if row[9] else [] + + if human_a['username'] == host_name: + host_human, other_human = human_a, human_b + else: + host_human, other_human = human_b, human_a + + if direction == 'to_you': + match_data = {'human_a': host_human, 'human_b': other_human, + 'overlap_score': row[8], 'overlap_reasons': reasons} + recipient_name = host_name + about_name = other_human['username'] + else: + match_data = {'human_a': other_human, 'human_b': host_human, + 'overlap_score': row[8], 'overlap_reasons': reasons} + 
recipient_name = other_human['username'] + about_name = host_name + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + response = { + 'match_id': match_id, + 'direction': direction, + 'to': recipient_name, + 'about': about_name, + 'subject': result.get('subject'), + 'draft': result.get('draft'), + 'score': row[8], + 'cached': False, + } + cache_draft(cache_key, response, 'match') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_host_matches(self): + """matches for priority user""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 20 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': [], 'host': None}) + db.close() + return + host = users[0] + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT pm.id, pm.overlap_score, pm.overlap_reasons, pm.status, h.username, h.platform, h.contact + FROM priority_matches pm JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.priority_user_id = ? 
ORDER BY pm.overlap_score DESC LIMIT ?""", (host['id'], limit)) + matches = [] + for row in c.fetchall(): + reasons = j.loads(row[2]) if row[2] else [] + contact = j.loads(row[6]) if row[6] else {} + matches.append({'id': row[0], 'score': int(row[1]), 'reasons': reasons, 'status': row[3], + 'other_user': row[4], 'other_platform': row[5], + 'contact': contact.get('email') or contact.get('mastodon') or contact.get('github') or ''}) + conn.close() + db.close() + self._send_json({'host': host.get('github') or host.get('name'), 'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_host_draft(self): + """preview draft for a priority match - dir=to_you or to_them""" + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + from db.users import get_priority_users + + match_id = None + direction = 'to_you' + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + if p.startswith('dir='): + direction = p.split('=')[1] + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + cache_key = f"host_{match_id}_{direction}" + cached = get_cached_draft(cache_key, 'host') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'error': 'no priority user'}, 404) + db.close() + return + host = users[0] + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + # Get the matched human from priority_matches + c.execute("""SELECT h.username, h.platform, h.contact, h.extra, pm.overlap_score, pm.overlap_reasons + FROM priority_matches pm + JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + db.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + # 
The matched person (who we found for the host) + other = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + + # Build host as human_a (recipient), other as human_b (subject) + host_human = {'username': host.get('github') or host.get('name'), + 'platform': 'priority', + 'contact': {'email': host.get('email'), 'mastodon': host.get('mastodon'), 'github': host.get('github')}, + 'extra': {'bio': host.get('bio'), 'interests': host.get('interests')}} + + reasons = j.loads(row[5]) if row[5] else [] + match_data = {'human_a': host_human, 'human_b': other, + 'overlap_score': row[4], 'overlap_reasons': reasons} + + # direction determines who gets the intro + if direction == 'to_you': + # intro TO host ABOUT other + match_data = {'human_a': host_human, 'human_b': other, + 'overlap_score': row[4], 'overlap_reasons': reasons} + to_name = host.get('github') or host.get('name') + about_name = other['username'] + else: + # intro TO other ABOUT host + match_data = {'human_a': other, 'human_b': host_human, + 'overlap_score': row[4], 'overlap_reasons': reasons} + to_name = other['username'] + about_name = host.get('github') or host.get('name') + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + cache_key = f"host_{match_id}_{direction}" + response = { + 'match_id': match_id, + 'direction': direction, + 'to': to_name, + 'about': about_name, + 'subject': result.get('subject'), + 'draft': result.get('draft'), + 'score': row[4], + 'cached': False, + } + cache_draft(cache_key, response, 'host') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_draft(self): + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + + match_id = None + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + # check cache first + cached = get_cached_draft(match_id, 'queue') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + human_a = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + human_b = {'username': row[4], 'platform': row[5], + 'contact': j.loads(row[6]) if row[6] else {}, + 'extra': j.loads(row[7]) if row[7] else {}} + reasons = j.loads(row[9]) if row[9] else [] + + match_data = {'human_a': human_a, 'human_b': human_b, + 'overlap_score': row[8], 'overlap_reasons': reasons} + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + response = { + 'match_id': match_id, + 'to': human_a['username'], + 'about': human_b['username'], + 'subject': result.get('subject'), + 'draft': result.get('draft'), + 'score': row[8], + 'cached': False, + } + cache_draft(match_id, response, 'queue') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_about_you(self): + """pending intros where host is human_b (being introduced to others)""" + import sqlite3 + import json as j + from db.users import get_priority_users + 
limit = 10 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': []}) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, h1.username, h1.platform, h1.contact, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE h2.username = ? AND m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (host_name, limit)) + matches = [] + for row in c.fetchall(): + contact = j.loads(row[3]) if row[3] else {} + reasons = j.loads(row[5]) if row[5] else [] + method = 'email' if contact.get('email') else ('mastodon' if contact.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[1], 'to_platform': row[2], + 'score': int(row[4]), 'reasons': reasons[:3], 'method': method, + 'contact': contact.get('email') or contact.get('mastodon') or ''}) + conn.close() + db.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_to_you(self): + """pending intros where host is human_a (receiving intro about others)""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 20 + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': []}) + db.close() + return + host = users[0] + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT pm.id, h.username, h.platform, pm.overlap_score, pm.overlap_reasons + FROM priority_matches pm + JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.priority_user_id = ? AND pm.status IN ('new', 'pending') + ORDER BY pm.overlap_score DESC LIMIT ?""", (host['id'], limit)) + matches = [] + for row in c.fetchall(): + reasons = j.loads(row[4]) if row[4] else [] + matches.append({'id': row[0], 'about_user': row[1], 'about_platform': row[2], + 'score': int(row[3]), 'reasons': reasons[:3]}) + conn.close() + db.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_matches(self): + """pending matches - returns BOTH directions for each match""" + import sqlite3 + import json as j + limit = 30 + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, h1.username, h1.platform, h1.contact, + h2.username, h2.platform, h2.contact, m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (limit // 2,)) + matches = [] + for row in c.fetchall(): + contact_a = j.loads(row[3]) if row[3] else {} + contact_b = j.loads(row[6]) if row[6] else {} + reasons = j.loads(row[8]) if row[8] else [] + # direction 1: TO human_a ABOUT human_b + method_a = 'email' if contact_a.get('email') else ('mastodon' if contact_a.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[1], 'about_user': row[4], + 'score': int(row[7]), 'reasons': reasons[:3], 'method': method_a, + 'contact': contact_a.get('email') or contact_a.get('mastodon') or ''}) + # direction 2: TO human_b ABOUT human_a + method_b = 'email' if contact_b.get('email') else ('mastodon' if contact_b.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[4], 'about_user': row[1], + 'score': int(row[7]), 'reasons': reasons[:3], 'method': method_b, + 'contact': contact_b.get('email') or contact_b.get('mastodon') or ''}) + conn.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_stats(self): + """return database statistics""" + try: + db = Database() + stats = db.stats() + db.close() + self._send_json(stats) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_health(self): + """return daemon health status""" + state = get_daemon_state() + + health = { + 'status': 'running' if state['running'] else 'stopped', + 'dry_run': 
state['dry_run'], + 'uptime_seconds': None, + } + + if state['started_at']: + uptime = datetime.now() - datetime.fromisoformat(state['started_at']) + health['uptime_seconds'] = int(uptime.total_seconds()) + + self._send_json(health) + + def _handle_state(self): + """return full daemon state""" + state = get_daemon_state() + + # convert datetimes to strings + for key in ['last_scout', 'last_match', 'last_intro', 'last_lost', 'started_at']: + if state[key] and isinstance(state[key], datetime): + state[key] = state[key].isoformat() + + self._send_json(state) + + def _handle_priority_matches(self): + """return priority matches for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'count': 0, + 'new_count': 0, + 'top_matches': [], + }) + db.close() + return + + # get matches for first priority user (host) + user = users[0] + matches = get_priority_user_matches(db.conn, user['id'], limit=10) + + new_count = sum(1 for m in matches if m.get('status') == 'new') + + top_matches = [] + for m in matches[:5]: + overlap_reasons = m.get('overlap_reasons', '[]') + if isinstance(overlap_reasons, str): + import json as json_mod + overlap_reasons = json_mod.loads(overlap_reasons) if overlap_reasons else [] + + top_matches.append({ + 'username': m.get('username'), + 'platform': m.get('platform'), + 'score': m.get('score', 0), + 'overlap_score': m.get('overlap_score', 0), + 'reasons': overlap_reasons[:3], + 'url': m.get('url'), + 'status': m.get('status', 'new'), + }) + + db.close() + self._send_json({ + 'count': len(matches), + 'new_count': new_count, + 'top_matches': top_matches, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_top_humans(self): + """return top scoring humans for HA sensor""" + try: + db = Database() + humans = db.get_all_humans(min_score=50, limit=5) + + top_humans = [] + for h in humans: + contact = h.get('contact', '{}') + if isinstance(contact, str): + import 
json as json_mod + contact = json_mod.loads(contact) if contact else {} + + signals = h.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + top_humans.append({ + 'username': h.get('username'), + 'platform': h.get('platform'), + 'score': h.get('score', 0), + 'name': h.get('name'), + 'signals': signals[:5], + 'contact_method': 'email' if contact.get('email') else + 'mastodon' if contact.get('mastodon') else + 'matrix' if contact.get('matrix') else 'manual', + }) + + db.close() + self._send_json({ + 'count': len(humans), + 'top_humans': top_humans, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_user(self): + """return priority user info for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'configured': False, + 'score': 0, + 'signals': [], + 'match_count': 0, + }) + db.close() + return + + user = users[0] + signals = user.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + interests = user.get('interests', '[]') + if isinstance(interests, str): + import json as json_mod + interests = json_mod.loads(interests) if interests else [] + + matches = get_priority_user_matches(db.conn, user['id'], limit=100) + + db.close() + self._send_json({ + 'configured': True, + 'name': user.get('name'), + 'github': user.get('github'), + 'mastodon': user.get('mastodon'), + 'reddit': user.get('reddit'), + 'lobsters': user.get('lobsters'), + 'matrix': user.get('matrix'), + 'lemmy': user.get('lemmy'), + 'discord': user.get('discord'), + 'bluesky': user.get('bluesky'), + 'score': user.get('score', 0), + 'signals': signals[:10], + 'interests': interests, + 'location': user.get('location'), + 'bio': user.get('bio'), + 'match_count': len(matches), + 'new_match_count': sum(1 for m in matches if m.get('status') == 'new'), + 
}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + +def run_api_server(): + """run the API server in a thread""" + server = HTTPServer(('0.0.0.0', API_PORT), APIHandler) + print(f"connectd api running on port {API_PORT}") + server.serve_forever() + + +def start_api_thread(): + """start API server in background thread""" + thread = threading.Thread(target=run_api_server, daemon=True) + thread.start() + return thread + + +if __name__ == '__main__': + # standalone mode for testing + print(f"starting connectd api on port {API_PORT}...") + run_api_server() diff --git a/api.py.backup.20251215_221410 b/api.py.backup.20251215_221410 new file mode 100644 index 0000000..04e913e --- /dev/null +++ b/api.py.backup.20251215_221410 @@ -0,0 +1,1187 @@ +#!/usr/bin/env python3 +""" +connectd/api.py - REST API for stats and control + +exposes daemon stats for home assistant integration. +runs on port 8099 by default. +""" + +import os +import json +import threading +from http.server import HTTPServer, BaseHTTPRequestHandler +from datetime import datetime + +from db import Database +from db.users import get_priority_users, get_priority_user_matches, get_priority_user + +API_PORT = int(os.environ.get('CONNECTD_API_PORT', 8099)) + +# shared state (updated by daemon) +_daemon_state = { + 'running': False, + 'dry_run': False, + 'last_scout': None, + 'last_match': None, + 'last_intro': None, + 'last_lost': None, + 'intros_today': 0, + 'lost_intros_today': 0, + 'started_at': None, +} + + +def update_daemon_state(state_dict): + """update shared daemon state (called by daemon)""" + global _daemon_state + _daemon_state.update(state_dict) + + +def get_daemon_state(): + """get current daemon state""" + return _daemon_state.copy() + + + +DASHBOARD_HTML = """ + + + connectd + + + + + +

connectd + repo + org +

+
+
+
+
+
+
+
+ + + + +""" + + +# draft cache - stores generated drafts so they dont regenerate +_draft_cache = {} + +def get_cached_draft(match_id, match_type='match'): + key = f"{match_type}:{match_id}" + return _draft_cache.get(key) + +def cache_draft(match_id, draft_data, match_type='match'): + key = f"{match_type}:{match_id}" + _draft_cache[key] = draft_data + +class APIHandler(BaseHTTPRequestHandler): + """simple REST API handler""" + + def log_message(self, format, *args): + """suppress default logging""" + pass + + def _send_json(self, data, status=200): + """send JSON response""" + self.send_response(status) + self.send_header('Content-Type', 'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(json.dumps(data).encode()) + + def do_GET(self): + """handle GET requests""" + path = self.path.split('?')[0] + if path == '/favicon.png' or path == '/favicon.ico': + self._handle_favicon() + elif path == '/' or path == '/dashboard': + self._handle_dashboard() + elif path == '/api/stats': + self._handle_stats() + elif path == '/api/host': + self._handle_host() + elif path == '/api/host_matches': + self._handle_host_matches() + elif path == '/api/your_matches': + self._handle_your_matches() + elif path == '/api/preview_match_draft': + self._handle_preview_match_draft() + elif path == '/api/preview_host_draft': + self._handle_preview_host_draft() + elif path == '/api/preview_draft': + self._handle_preview_draft() + elif path == '/api/pending_about_you': + self._handle_pending_about_you() + elif path == '/api/pending_to_you': + self._handle_pending_to_you() + elif path == '/api/pending_matches': + self._handle_pending_matches() + elif path == '/api/sent_intros': + self._handle_sent_intros() + elif path == '/api/failed_intros': + self._handle_failed_intros() + elif path == '/api/health': + self._handle_health() + elif path == '/api/state': + self._handle_state() + elif path == '/api/priority_matches': + 
self._handle_priority_matches() + elif path == '/api/top_humans': + self._handle_top_humans() + elif path == '/api/user': + self._handle_user() + else: + self._send_json({'error': 'not found'}, 404) + def _handle_favicon(self): + from pathlib import Path + fav = Path('/app/data/favicon.png') + if fav.exists(): + self.send_response(200) + self.send_header('Content-Type', 'image/png') + self.end_headers() + self.wfile.write(fav.read_bytes()) + else: + self.send_response(404) + self.end_headers() + + def _handle_dashboard(self): + self.send_response(200) + self.send_header("Content-Type", "text/html") + self.end_headers() + self.wfile.write(DASHBOARD_HTML.encode()) + + def _handle_sent_intros(self): + from pathlib import Path + log_path = Path("/app/data/delivery_log.json") + sent = [] + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + sent = log.get("sent", [])[-20:] + sent.reverse() + self._send_json({"sent": sent}) + + def _handle_failed_intros(self): + from pathlib import Path + log_path = Path("/app/data/delivery_log.json") + failed = [] + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + failed = log.get("failed", []) + self._send_json({"failed": failed}) + + def _handle_host(self): + """daemon status and match stats""" + import sqlite3 + state = get_daemon_state() + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("SELECT COUNT(*) FROM matches WHERE status='pending' AND overlap_score >= 60") + pending = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE status='intro_sent'") + sent = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE status='rejected'") + rejected = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches") + total = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 90") + s90 = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 80 AND overlap_score < 90") + s80 = 
c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 70 AND overlap_score < 80") + s70 = c.fetchone()[0] + c.execute("SELECT COUNT(*) FROM matches WHERE overlap_score >= 60 AND overlap_score < 70") + s60 = c.fetchone()[0] + conn.close() + except: + pending = sent = rejected = total = s90 = s80 = s70 = s60 = 0 + uptime = None + if state.get('started_at'): + try: + start = datetime.fromisoformat(state['started_at']) if isinstance(state['started_at'], str) else state['started_at'] + uptime = int((datetime.now() - start).total_seconds()) + except: pass + self._send_json({ + 'running': state.get('running', False), 'dry_run': state.get('dry_run', False), + 'uptime_seconds': uptime, 'intros_today': state.get('intros_today', 0), + 'matches_pending': pending, 'matches_sent': sent, 'matches_rejected': rejected, 'matches_total': total, + 'score_90_plus': s90, 'score_80_89': s80, 'score_70_79': s70, 'score_60_69': s60, + }) + + def _handle_your_matches(self): + """matches involving the host - shows both directions""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 15 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': [], 'host': None}) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, m.overlap_score, m.overlap_reasons, m.status, + h1.username, h1.platform, h1.contact, + h2.username, h2.platform, h2.contact + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE (h1.username = ? OR h2.username = ?) 
+ AND m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (host_name, host_name, limit)) + matches = [] + for row in c.fetchall(): + if row[4] == host_name: + other_user, other_platform = row[7], row[8] + other_contact = j.loads(row[9]) if row[9] else {} + else: + other_user, other_platform = row[4], row[5] + other_contact = j.loads(row[6]) if row[6] else {} + reasons = j.loads(row[2]) if row[2] else [] + matches.append({ + 'id': row[0], 'score': int(row[1]), 'reasons': reasons, + 'status': row[3], 'other_user': other_user, 'other_platform': other_platform, + 'contact': other_contact.get('email') or other_contact.get('mastodon') or '' + }) + conn.close() + db.close() + self._send_json({'host': host_name, 'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_match_draft(self): + """preview draft for a match - dir=to_you or to_them""" + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + from db.users import get_priority_users + + match_id = None + direction = 'to_you' + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + if p.startswith('dir='): + direction = p.split('=')[1] + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + cache_key = f"{match_id}_{direction}" + cached = get_cached_draft(cache_key, 'match') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'error': 'no priority user'}, 404) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + db.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + human_a = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + human_b = {'username': row[4], 'platform': row[5], + 'contact': j.loads(row[6]) if row[6] else {}, + 'extra': j.loads(row[7]) if row[7] else {}} + reasons = j.loads(row[9]) if row[9] else [] + + if human_a['username'] == host_name: + host_human, other_human = human_a, human_b + else: + host_human, other_human = human_b, human_a + + if direction == 'to_you': + match_data = {'human_a': host_human, 'human_b': other_human, + 'overlap_score': row[8], 'overlap_reasons': reasons} + recipient_name = host_name + about_name = other_human['username'] + else: + match_data = {'human_a': other_human, 'human_b': host_human, + 'overlap_score': row[8], 'overlap_reasons': reasons} + 
recipient_name = other_human['username'] + about_name = host_name + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + response = { + 'match_id': match_id, + 'direction': direction, + 'to': recipient_name, + 'about': about_name, + 'subject': result.get('subject'), + 'draft': result.get('draft'), + 'score': row[8], + 'cached': False, + } + cache_draft(cache_key, response, 'match') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_host_matches(self): + """matches for priority user""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 20 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': [], 'host': None}) + db.close() + return + host = users[0] + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT pm.id, pm.overlap_score, pm.overlap_reasons, pm.status, h.username, h.platform, h.contact + FROM priority_matches pm JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.priority_user_id = ? 
ORDER BY pm.overlap_score DESC LIMIT ?""", (host['id'], limit)) + matches = [] + for row in c.fetchall(): + reasons = j.loads(row[2]) if row[2] else [] + contact = j.loads(row[6]) if row[6] else {} + matches.append({'id': row[0], 'score': int(row[1]), 'reasons': reasons, 'status': row[3], + 'other_user': row[4], 'other_platform': row[5], + 'contact': contact.get('email') or contact.get('mastodon') or contact.get('github') or ''}) + conn.close() + db.close() + self._send_json({'host': host.get('github') or host.get('name'), 'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_host_draft(self): + """preview draft for a priority match - dir=to_you or to_them""" + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + from db.users import get_priority_users + + match_id = None + direction = 'to_you' + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + if p.startswith('dir='): + direction = p.split('=')[1] + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + cache_key = f"host_{match_id}_{direction}" + cached = get_cached_draft(cache_key, 'host') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'error': 'no priority user'}, 404) + db.close() + return + host = users[0] + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + # Get the matched human from priority_matches + c.execute("""SELECT h.username, h.platform, h.contact, h.extra, pm.overlap_score, pm.overlap_reasons + FROM priority_matches pm + JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + db.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + # 
The matched person (who we found for the host) + other = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + + # Build host as human_a (recipient), other as human_b (subject) + host_human = {'username': host.get('github') or host.get('name'), + 'platform': 'priority', + 'contact': {'email': host.get('email'), 'mastodon': host.get('mastodon'), 'github': host.get('github')}, + 'extra': {'bio': host.get('bio'), 'interests': host.get('interests')}} + + reasons = j.loads(row[5]) if row[5] else [] + match_data = {'human_a': host_human, 'human_b': other, + 'overlap_score': row[4], 'overlap_reasons': reasons} + + # direction determines who gets the intro + if direction == 'to_you': + # intro TO host ABOUT other + match_data = {'human_a': host_human, 'human_b': other, + 'overlap_score': row[4], 'overlap_reasons': reasons} + to_name = host.get('github') or host.get('name') + about_name = other['username'] + else: + # intro TO other ABOUT host + match_data = {'human_a': other, 'human_b': host_human, + 'overlap_score': row[4], 'overlap_reasons': reasons} + to_name = other['username'] + about_name = host.get('github') or host.get('name') + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + cache_key = f"host_{match_id}_{direction}" + response = { + 'match_id': match_id, + 'direction': direction, + 'to': to_name, + 'about': about_name, + 'subject': result.get('subject'), + 'draft': result.get('draft'), + 'score': row[4], + 'cached': False, + } + cache_draft(cache_key, response, 'host') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_preview_draft(self): + import sqlite3 + import json as j + from introd.groq_draft import draft_intro_with_llm + + match_id = None + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('id='): + try: match_id = int(p.split('=')[1]) + except: pass + + if not match_id: + self._send_json({'error': 'need ?id=match_id'}, 400) + return + + # check cache first + cached = get_cached_draft(match_id, 'queue') + if cached: + cached['cached'] = True + self._send_json(cached) + return + + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.id = ?""", (match_id,)) + row = c.fetchone() + conn.close() + + if not row: + self._send_json({'error': 'match not found'}, 404) + return + + human_a = {'username': row[0], 'platform': row[1], + 'contact': j.loads(row[2]) if row[2] else {}, + 'extra': j.loads(row[3]) if row[3] else {}} + human_b = {'username': row[4], 'platform': row[5], + 'contact': j.loads(row[6]) if row[6] else {}, + 'extra': j.loads(row[7]) if row[7] else {}} + reasons = j.loads(row[9]) if row[9] else [] + + match_data = {'human_a': human_a, 'human_b': human_b, + 'overlap_score': row[8], 'overlap_reasons': reasons} + + result, error = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if error: + self._send_json({'error': error}, 500) + return + + response = { + 'match_id': match_id, + 'to': human_a['username'], + 'about': human_b['username'], + 'subject': result.get('subject'), + 'draft': result.get('draft'), + 'score': row[8], + 'cached': False, + } + cache_draft(match_id, response, 'queue') + self._send_json(response) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_about_you(self): + """pending intros where host is human_b (being introduced to others)""" + import sqlite3 + import json as j + from db.users import get_priority_users + 
limit = 10 + if '?' in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': []}) + db.close() + return + host = users[0] + host_name = host.get('github') or host.get('name') + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, h1.username, h1.platform, h1.contact, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE h2.username = ? AND m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (host_name, limit)) + matches = [] + for row in c.fetchall(): + contact = j.loads(row[3]) if row[3] else {} + reasons = j.loads(row[5]) if row[5] else [] + method = 'email' if contact.get('email') else ('mastodon' if contact.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[1], 'to_platform': row[2], + 'score': int(row[4]), 'reasons': reasons[:3], 'method': method, + 'contact': contact.get('email') or contact.get('mastodon') or ''}) + conn.close() + db.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_to_you(self): + """pending intros where host is human_a (receiving intro about others)""" + import sqlite3 + import json as j + from db.users import get_priority_users + limit = 20 + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + db = Database() + users = get_priority_users(db.conn) + if not users: + self._send_json({'matches': []}) + db.close() + return + host = users[0] + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT pm.id, h.username, h.platform, pm.overlap_score, pm.overlap_reasons + FROM priority_matches pm + JOIN humans h ON pm.matched_human_id = h.id + WHERE pm.priority_user_id = ? AND pm.status IN ('new', 'pending') + ORDER BY pm.overlap_score DESC LIMIT ?""", (host['id'], limit)) + matches = [] + for row in c.fetchall(): + reasons = j.loads(row[4]) if row[4] else [] + matches.append({'id': row[0], 'about_user': row[1], 'about_platform': row[2], + 'score': int(row[3]), 'reasons': reasons[:3]}) + conn.close() + db.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_pending_matches(self): + """pending matches - returns BOTH directions for each match""" + import sqlite3 + import json as j + limit = 30 + if '?' 
in self.path: + for p in self.path.split('?')[1].split('&'): + if p.startswith('limit='): + try: limit = int(p.split('=')[1]) + except: pass + try: + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + c.execute("""SELECT m.id, h1.username, h1.platform, h1.contact, + h2.username, h2.platform, h2.contact, m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC LIMIT ?""", (limit // 2,)) + matches = [] + for row in c.fetchall(): + contact_a = j.loads(row[3]) if row[3] else {} + contact_b = j.loads(row[6]) if row[6] else {} + reasons = j.loads(row[8]) if row[8] else [] + # direction 1: TO human_a ABOUT human_b + method_a = 'email' if contact_a.get('email') else ('mastodon' if contact_a.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[1], 'about_user': row[4], + 'score': int(row[7]), 'reasons': reasons[:3], 'method': method_a, + 'contact': contact_a.get('email') or contact_a.get('mastodon') or ''}) + # direction 2: TO human_b ABOUT human_a + method_b = 'email' if contact_b.get('email') else ('mastodon' if contact_b.get('mastodon') else None) + matches.append({'id': row[0], 'to_user': row[4], 'about_user': row[1], + 'score': int(row[7]), 'reasons': reasons[:3], 'method': method_b, + 'contact': contact_b.get('email') or contact_b.get('mastodon') or ''}) + conn.close() + self._send_json({'matches': matches}) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_stats(self): + """return database statistics""" + try: + db = Database() + stats = db.stats() + db.close() + self._send_json(stats) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_health(self): + """return daemon health status""" + state = get_daemon_state() + + health = { + 'status': 'running' if state['running'] else 'stopped', + 'dry_run': 
state['dry_run'], + 'uptime_seconds': None, + } + + if state['started_at']: + uptime = datetime.now() - datetime.fromisoformat(state['started_at']) + health['uptime_seconds'] = int(uptime.total_seconds()) + + self._send_json(health) + + def _handle_state(self): + """return full daemon state""" + state = get_daemon_state() + + # convert datetimes to strings + for key in ['last_scout', 'last_match', 'last_intro', 'last_lost', 'started_at']: + if state[key] and isinstance(state[key], datetime): + state[key] = state[key].isoformat() + + self._send_json(state) + + def _handle_priority_matches(self): + """return priority matches for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'count': 0, + 'new_count': 0, + 'top_matches': [], + }) + db.close() + return + + # get matches for first priority user (host) + user = users[0] + matches = get_priority_user_matches(db.conn, user['id'], limit=10) + + new_count = sum(1 for m in matches if m.get('status') == 'new') + + top_matches = [] + for m in matches[:5]: + overlap_reasons = m.get('overlap_reasons', '[]') + if isinstance(overlap_reasons, str): + import json as json_mod + overlap_reasons = json_mod.loads(overlap_reasons) if overlap_reasons else [] + + top_matches.append({ + 'username': m.get('username'), + 'platform': m.get('platform'), + 'score': m.get('score', 0), + 'overlap_score': m.get('overlap_score', 0), + 'reasons': overlap_reasons[:3], + 'url': m.get('url'), + 'status': m.get('status', 'new'), + }) + + db.close() + self._send_json({ + 'count': len(matches), + 'new_count': new_count, + 'top_matches': top_matches, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_top_humans(self): + """return top scoring humans for HA sensor""" + try: + db = Database() + humans = db.get_all_humans(min_score=50, limit=5) + + top_humans = [] + for h in humans: + contact = h.get('contact', '{}') + if isinstance(contact, str): + import 
json as json_mod + contact = json_mod.loads(contact) if contact else {} + + signals = h.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + top_humans.append({ + 'username': h.get('username'), + 'platform': h.get('platform'), + 'score': h.get('score', 0), + 'name': h.get('name'), + 'signals': signals[:5], + 'contact_method': 'email' if contact.get('email') else + 'mastodon' if contact.get('mastodon') else + 'matrix' if contact.get('matrix') else 'manual', + }) + + db.close() + self._send_json({ + 'count': len(humans), + 'top_humans': top_humans, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_user(self): + """return priority user info for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'configured': False, + 'score': 0, + 'signals': [], + 'match_count': 0, + }) + db.close() + return + + user = users[0] + signals = user.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + interests = user.get('interests', '[]') + if isinstance(interests, str): + import json as json_mod + interests = json_mod.loads(interests) if interests else [] + + matches = get_priority_user_matches(db.conn, user['id'], limit=100) + + db.close() + self._send_json({ + 'configured': True, + 'name': user.get('name'), + 'github': user.get('github'), + 'mastodon': user.get('mastodon'), + 'reddit': user.get('reddit'), + 'lobsters': user.get('lobsters'), + 'matrix': user.get('matrix'), + 'lemmy': user.get('lemmy'), + 'discord': user.get('discord'), + 'bluesky': user.get('bluesky'), + 'score': user.get('score', 0), + 'signals': signals[:10], + 'interests': interests, + 'location': user.get('location'), + 'bio': user.get('bio'), + 'match_count': len(matches), + 'new_match_count': sum(1 for m in matches if m.get('status') == 'new'), + 
def run_api_server():
    """run the API server (blocking)"""
    # ThreadingHTTPServer: plain HTTPServer serialized all requests, so a
    # slow handler (LLM draft preview) blocked /api/health for the HA poller
    from http.server import ThreadingHTTPServer
    server = ThreadingHTTPServer(('0.0.0.0', API_PORT), APIHandler)
    print(f"connectd api running on port {API_PORT}")
    server.serve_forever()


def start_api_thread():
    """start API server in background thread"""
    thread = threading.Thread(target=run_api_server, daemon=True)
    thread.start()
    return thread


if __name__ == '__main__':
    # standalone mode for testing
    print(f"starting connectd api on port {API_PORT}...")
    run_api_server()


# --- api.py.clean: pre-dashboard snapshot of this module committed alongside
# --- api.py (consider removing editor backups like this from version control)
#!/usr/bin/env python3
"""
connectd/api.py - REST API for stats and control

exposes daemon stats for home assistant integration.
runs on port 8099 by default.
"""

import os
import json
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from datetime import datetime

from db import Database
from db.users import get_priority_users, get_priority_user_matches, get_priority_user

API_PORT = int(os.environ.get('CONNECTD_API_PORT', 8099))

# shared state (updated by daemon)
_daemon_state = {
    'running': False,
    'dry_run': False,
    'last_scout': None,
    'last_match': None,
    'last_intro': None,
    'last_lost': None,
    'intros_today': 0,
    'lost_intros_today': 0,
    'started_at': None,
}


def update_daemon_state(state_dict):
    """update shared daemon state (called by daemon)"""
    global _daemon_state
    _daemon_state.update(state_dict)


def get_daemon_state():
    """get current daemon state (returns a shallow copy)"""
    return _daemon_state.copy()


class APIHandler(BaseHTTPRequestHandler):
    """simple REST API handler"""

    def log_message(self, format, *args):
        """suppress default logging"""
        pass

    def _send_json(self, data, status=200):
        """send JSON response"""
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(json.dumps(data).encode())
'application/json') + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(json.dumps(data).encode()) + + def do_GET(self): + """handle GET requests""" + if self.path == '/api/stats': + self._handle_stats() + elif self.path == '/api/health': + self._handle_health() + elif self.path == '/api/state': + self._handle_state() + elif self.path == '/api/priority_matches': + self._handle_priority_matches() + elif self.path == '/api/top_humans': + self._handle_top_humans() + elif self.path == '/api/user': + self._handle_user() + else: + self._send_json({'error': 'not found'}, 404) + + def _handle_stats(self): + """return database statistics""" + try: + db = Database() + stats = db.stats() + db.close() + self._send_json(stats) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_health(self): + """return daemon health status""" + state = get_daemon_state() + + health = { + 'status': 'running' if state['running'] else 'stopped', + 'dry_run': state['dry_run'], + 'uptime_seconds': None, + } + + if state['started_at']: + uptime = datetime.now() - datetime.fromisoformat(state['started_at']) + health['uptime_seconds'] = int(uptime.total_seconds()) + + self._send_json(health) + + def _handle_state(self): + """return full daemon state""" + state = get_daemon_state() + + # convert datetimes to strings + for key in ['last_scout', 'last_match', 'last_intro', 'last_lost', 'started_at']: + if state[key] and isinstance(state[key], datetime): + state[key] = state[key].isoformat() + + self._send_json(state) + + def _handle_priority_matches(self): + """return priority matches for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'count': 0, + 'new_count': 0, + 'top_matches': [], + }) + db.close() + return + + # get matches for first priority user (host) + user = users[0] + matches = get_priority_user_matches(db.conn, user['id'], limit=10) + + new_count = 
sum(1 for m in matches if m.get('status') == 'new') + + top_matches = [] + for m in matches[:5]: + overlap_reasons = m.get('overlap_reasons', '[]') + if isinstance(overlap_reasons, str): + import json as json_mod + overlap_reasons = json_mod.loads(overlap_reasons) if overlap_reasons else [] + + top_matches.append({ + 'username': m.get('username'), + 'platform': m.get('platform'), + 'score': m.get('score', 0), + 'overlap_score': m.get('overlap_score', 0), + 'reasons': overlap_reasons[:3], + 'url': m.get('url'), + 'status': m.get('status', 'new'), + }) + + db.close() + self._send_json({ + 'count': len(matches), + 'new_count': new_count, + 'top_matches': top_matches, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_top_humans(self): + """return top scoring humans for HA sensor""" + try: + db = Database() + humans = db.get_all_humans(min_score=50, limit=5) + + top_humans = [] + for h in humans: + contact = h.get('contact', '{}') + if isinstance(contact, str): + import json as json_mod + contact = json_mod.loads(contact) if contact else {} + + signals = h.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + top_humans.append({ + 'username': h.get('username'), + 'platform': h.get('platform'), + 'score': h.get('score', 0), + 'name': h.get('name'), + 'signals': signals[:5], + 'contact_method': 'email' if contact.get('email') else + 'mastodon' if contact.get('mastodon') else + 'matrix' if contact.get('matrix') else 'manual', + }) + + db.close() + self._send_json({ + 'count': len(humans), + 'top_humans': top_humans, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_user(self): + """return priority user info for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'configured': False, + 'score': 0, + 'signals': [], + 'match_count': 0, + }) + db.close() + 
def run_api_server():
    """run the API server in a thread"""
    httpd = HTTPServer(('0.0.0.0', API_PORT), APIHandler)
    print(f"connectd api running on port {API_PORT}")
    httpd.serve_forever()


def start_api_thread():
    """start API server in background thread"""
    worker = threading.Thread(target=run_api_server, daemon=True)
    worker.start()
    return worker


if __name__ == '__main__':
    # standalone mode for testing
    print(f"starting connectd api on port {API_PORT}...")
    run_api_server()


# --- api_orig.py: earlier snapshot of this module committed alongside api.py
# --- (consider removing backup copies like this from version control)
#!/usr/bin/env python3
"""
connectd/api.py - REST API for stats and control

exposes daemon stats for home assistant integration.
runs on port 8099 by default.
"""

import os
import json
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from datetime import datetime

from db import Database
from db.users import get_priority_users, get_priority_user_matches, get_priority_user

API_PORT = int(os.environ.get('CONNECTD_API_PORT', 8099))

# shared state (updated by daemon)
_daemon_state = {
    'running': False,
    'dry_run': False,
    'last_scout': None,
    'last_match': None,
    'last_intro': None,
    'last_lost': None,
    'intros_today': 0,
    'lost_intros_today': 0,
    'started_at': None,
}


def update_daemon_state(state_dict):
    """update shared daemon state (called by daemon)"""
    global _daemon_state
    _daemon_state.update(state_dict)


def get_daemon_state():
    """get current daemon state (returns a shallow copy)"""
    return _daemon_state.copy()


class APIHandler(BaseHTTPRequestHandler):
    """simple REST API handler"""

    def log_message(self, format, *args):
        """suppress default logging"""
        pass

    def _send_json(self, data, status=200):
        """send JSON response"""
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(json.dumps(data).encode())

    def do_GET(self):
        """handle GET requests"""
        route = self.path.split('?')[0]  # strip query params for routing
        dispatch = {
            '/api/stats': self._handle_stats,
            '/api/health': self._handle_health,
            '/api/state': self._handle_state,
            '/api/priority_matches': self._handle_priority_matches,
            '/api/top_humans': self._handle_top_humans,
            '/api/user': self._handle_user,
            '/dashboard': self._handle_dashboard,
            '/': self._handle_dashboard,
            '/api/preview_intros': self._handle_preview_intros,
            '/api/sent_intros': self._handle_sent_intros,
            '/api/failed_intros': self._handle_failed_intros,
        }
        handler = dispatch.get(route)
        if handler is None:
            self._send_json({'error': 'not found'}, 404)
        else:
            handler()
self._send_json({'error': 'not found'}, 404) + + def _handle_stats(self): + """return database statistics""" + try: + db = Database() + stats = db.stats() + db.close() + self._send_json(stats) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_health(self): + """return daemon health status""" + state = get_daemon_state() + + health = { + 'status': 'running' if state['running'] else 'stopped', + 'dry_run': state['dry_run'], + 'uptime_seconds': None, + } + + if state['started_at']: + uptime = datetime.now() - datetime.fromisoformat(state['started_at']) + health['uptime_seconds'] = int(uptime.total_seconds()) + + self._send_json(health) + + def _handle_state(self): + """return full daemon state""" + state = get_daemon_state() + + # convert datetimes to strings + for key in ['last_scout', 'last_match', 'last_intro', 'last_lost', 'started_at']: + if state[key] and isinstance(state[key], datetime): + state[key] = state[key].isoformat() + + self._send_json(state) + + def _handle_priority_matches(self): + """return priority matches for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'count': 0, + 'new_count': 0, + 'top_matches': [], + }) + db.close() + return + + # get matches for first priority user (host) + user = users[0] + matches = get_priority_user_matches(db.conn, user['id'], limit=10) + + new_count = sum(1 for m in matches if m.get('status') == 'new') + + top_matches = [] + for m in matches[:5]: + overlap_reasons = m.get('overlap_reasons', '[]') + if isinstance(overlap_reasons, str): + import json as json_mod + overlap_reasons = json_mod.loads(overlap_reasons) if overlap_reasons else [] + + top_matches.append({ + 'username': m.get('username'), + 'platform': m.get('platform'), + 'score': m.get('score', 0), + 'overlap_score': m.get('overlap_score', 0), + 'reasons': overlap_reasons[:3], + 'url': m.get('url'), + 'status': m.get('status', 'new'), + }) + + db.close() + 
self._send_json({ + 'count': len(matches), + 'new_count': new_count, + 'top_matches': top_matches, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_top_humans(self): + """return top scoring humans for HA sensor""" + try: + db = Database() + humans = db.get_all_humans(min_score=50, limit=5) + + top_humans = [] + for h in humans: + contact = h.get('contact', '{}') + if isinstance(contact, str): + import json as json_mod + contact = json_mod.loads(contact) if contact else {} + + signals = h.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + top_humans.append({ + 'username': h.get('username'), + 'platform': h.get('platform'), + 'score': h.get('score', 0), + 'name': h.get('name'), + 'signals': signals[:5], + 'contact_method': 'email' if contact.get('email') else + 'mastodon' if contact.get('mastodon') else + 'matrix' if contact.get('matrix') else 'manual', + }) + + db.close() + self._send_json({ + 'count': len(humans), + 'top_humans': top_humans, + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + def _handle_user(self): + """return priority user info for HA sensor""" + try: + db = Database() + users = get_priority_users(db.conn) + + if not users: + self._send_json({ + 'configured': False, + 'score': 0, + 'signals': [], + 'match_count': 0, + }) + db.close() + return + + user = users[0] + signals = user.get('signals', '[]') + if isinstance(signals, str): + import json as json_mod + signals = json_mod.loads(signals) if signals else [] + + interests = user.get('interests', '[]') + if isinstance(interests, str): + import json as json_mod + interests = json_mod.loads(interests) if interests else [] + + matches = get_priority_user_matches(db.conn, user['id'], limit=100) + + db.close() + self._send_json({ + 'configured': True, + 'name': user.get('name'), + 'github': user.get('github'), + 'mastodon': user.get('mastodon'), + 
'reddit': user.get('reddit'), + 'lobsters': user.get('lobsters'), + 'matrix': user.get('matrix'), + 'lemmy': user.get('lemmy'), + 'discord': user.get('discord'), + 'bluesky': user.get('bluesky'), + 'score': user.get('score', 0), + 'signals': signals[:10], + 'interests': interests, + 'location': user.get('location'), + 'bio': user.get('bio'), + 'match_count': len(matches), + 'new_match_count': sum(1 for m in matches if m.get('status') == 'new'), + }) + except Exception as e: + self._send_json({'error': str(e)}, 500) + + +def run_api_server(): + """run the API server in a thread""" + server = HTTPServer(('0.0.0.0', API_PORT), APIHandler) + print(f"connectd api running on port {API_PORT}") + server.serve_forever() + + +def start_api_thread(): + """start API server in background thread""" + thread = threading.Thread(target=run_api_server, daemon=True) + thread.start() + return thread + + +if __name__ == '__main__': + # standalone mode for testing + print(f"starting connectd api on port {API_PORT}...") + run_api_server() + + +# === DASHBOARD ENDPOINTS === + +DASHBOARD_HTML = """ + + + connectd dashboard + + + + + +

connectd dashboard

+ +
+ +
+ + + + +
+ +
+
+
+ + + + +""" + + +class DashboardMixin: + """mixin to add dashboard endpoints to APIHandler""" + + def _handle_dashboard(self): + """serve the dashboard HTML""" + self.send_response(200) + self.send_header('Content-Type', 'text/html') + self.end_headers() + self.wfile.write(DASHBOARD_HTML.encode()) + + def _handle_preview_intros(self): + """preview pending intros with draft generation""" + import sqlite3 + import json + from introd.groq_draft import draft_intro_with_llm, determine_contact_method + + # parse limit from query string + limit = 5 + if '?' in self.path: + query = self.path.split('?')[1] + for param in query.split('&'): + if param.startswith('limit='): + try: + limit = int(param.split('=')[1]) + except: + pass + + conn = sqlite3.connect('/data/db/connectd.db') + c = conn.cursor() + + c.execute("""SELECT h1.username, h1.platform, h1.contact, h1.extra, + h2.username, h2.platform, h2.contact, h2.extra, + m.overlap_score, m.overlap_reasons + FROM matches m + JOIN humans h1 ON m.human_a_id = h1.id + JOIN humans h2 ON m.human_b_id = h2.id + WHERE m.status = 'pending' AND m.overlap_score >= 60 + ORDER BY m.overlap_score DESC + LIMIT ?""", (limit,)) + + previews = [] + for row in c.fetchall(): + human_a = { + 'username': row[0], 'platform': row[1], + 'contact': json.loads(row[2]) if row[2] else {}, + 'extra': json.loads(row[3]) if row[3] else {} + } + human_b = { + 'username': row[4], 'platform': row[5], + 'contact': json.loads(row[6]) if row[6] else {}, + 'extra': json.loads(row[7]) if row[7] else {} + } + reasons = json.loads(row[9]) if row[9] else [] + + match_data = { + 'human_a': human_a, 'human_b': human_b, + 'overlap_score': row[8], 'overlap_reasons': reasons + } + + # determine contact method + method, contact_info = determine_contact_method(human_a) + + # generate draft (skip if too slow) + draft = None + try: + result, _ = draft_intro_with_llm(match_data, recipient='a', dry_run=True) + if result: + draft = result.get('draft') + except: + pass + + 
previews.append({ + 'from_platform': human_b['platform'], + 'from_user': human_b['username'], + 'to_platform': human_a['platform'], + 'to_user': human_a['username'], + 'score': int(row[8]), + 'reasons': reasons[:3], + 'method': method, + 'contact_info': str(contact_info) if contact_info else None, + 'draft': draft + }) + + conn.close() + self._send_json({'previews': previews}) + + def _handle_sent_intros(self): + """return sent intro history from delivery log""" + import json + from pathlib import Path + + limit = 20 + if '?' in self.path: + query = self.path.split('?')[1] + for param in query.split('&'): + if param.startswith('limit='): + try: + limit = int(param.split('=')[1]) + except: + pass + + log_path = Path('/app/data/delivery_log.json') + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + sent = log.get('sent', [])[-limit:] + sent.reverse() # newest first + else: + sent = [] + + self._send_json({'sent': sent}) + + def _handle_failed_intros(self): + """return failed delivery attempts""" + import json + from pathlib import Path + + log_path = Path('/app/data/delivery_log.json') + if log_path.exists(): + with open(log_path) as f: + log = json.load(f) + failed = log.get('failed', []) + else: + failed = [] + + self._send_json({'failed': failed}) diff --git a/backups/data_20251215_194141/.gitkeep b/backups/data_20251215_194141/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/backups/data_20251215_194141/delivery_log.json b/backups/data_20251215_194141/delivery_log.json new file mode 100644 index 0000000..4a62437 --- /dev/null +++ b/backups/data_20251215_194141/delivery_log.json @@ -0,0 +1,137 @@ +{ + "sent": [ + { + "recipient_id": "github:dwmw2", + "recipient_name": "David Woodhouse", + "method": "email", + "contact_info": "dwmw2@infradead.org", + "overlap_score": 172.01631023799695, + "timestamp": "2025-12-15T23:14:45.542509", + "success": true, + "error": null + }, + { + "recipient_id": "github:pvizeli", + 
"recipient_name": "Pascal Vizeli", + "method": "email", + "contact_info": "pascal.vizeli@syshack.ch", + "overlap_score": 163.33333333333331, + "timestamp": "2025-12-15T23:14:48.462716", + "success": true, + "error": null + }, + { + "recipient_id": "github:2234839", + "recipient_name": "\u5d2e\u751f", + "method": "email", + "contact_info": "admin@shenzilong.cn", + "overlap_score": 163.09442000261095, + "timestamp": "2025-12-15T23:14:50.749442", + "success": true, + "error": null + }, + { + "recipient_id": "github:zomars", + "recipient_name": "Omar L\u00f3pez", + "method": "email", + "contact_info": "zomars@me.com", + "overlap_score": 138.9593178751708, + "timestamp": "2025-12-16T00:39:43.266181", + "success": true, + "error": null + }, + { + "recipient_id": "github:joshuaboniface", + "recipient_name": "Joshua M. Boniface", + "method": "mastodon", + "contact_info": "@joshuaboniface@www.youtube.com", + "overlap_score": 136.06304901929022, + "timestamp": "2025-12-16T00:59:21.763092", + "success": true, + "error": "https://mastodon.sudoxreboot.com/@connectd/115726533401043321" + }, + { + "recipient_id": "github:dariusk", + "recipient_name": "Darius Kazemi", + "method": "mastodon", + "contact_info": "@darius@friend.camp", + "overlap_score": 135.39490109778416, + "timestamp": "2025-12-16T00:59:22.199945", + "success": true, + "error": "https://mastodon.sudoxreboot.com/@connectd/115726533505124538" + } + ], + "failed": [ + { + "recipient_id": "github:joyeusenoelle", + "recipient_name": "No\u00eblle Anthony", + "method": "mastodon", + "contact_info": "@noelle@chat.noelle.codes", + "overlap_score": 65, + "timestamp": "2025-12-14T23:44:17.215796", + "success": false, + "error": "MASTODON_TOKEN not set" + }, + { + "recipient_id": "github:balloob", + "recipient_name": "Paulus Schoutsen", + "method": "mastodon", + "contact_info": "@home_assistant@youtube.com", + "overlap_score": 163.09442000261095, + "timestamp": "2025-12-15T23:14:50.155178", + "success": false, + "error": 
"mastodon api error: 401 - {\"error\":\"The access token is invalid\"}" + }, + { + "recipient_id": "github:balloob", + "recipient_name": "Paulus Schoutsen", + "method": "mastodon", + "contact_info": "@home_assistant@youtube.com", + "overlap_score": 163.09442000261095, + "timestamp": "2025-12-15T23:14:50.334902", + "success": false, + "error": "mastodon api error: 401 - {\"error\":\"The access token is invalid\"}" + }, + { + "recipient_id": "github:joshuaboniface", + "recipient_name": "Joshua M. Boniface", + "method": "mastodon", + "contact_info": "@joshuaboniface@www.youtube.com", + "overlap_score": 136.06304901929022, + "timestamp": "2025-12-16T00:53:25.848601", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. (connect timeout=30)'))" + }, + { + "recipient_id": "github:joshuaboniface", + "recipient_name": "Joshua M. Boniface", + "method": "mastodon", + "contact_info": "@joshuaboniface@www.youtube.com", + "overlap_score": 136.06304901929022, + "timestamp": "2025-12-16T00:53:55.912872", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. (connect timeout=30)'))" + }, + { + "recipient_id": "github:dariusk", + "recipient_name": "Darius Kazemi", + "method": "mastodon", + "contact_info": "@darius@friend.camp", + "overlap_score": 135.39490109778416, + "timestamp": "2025-12-16T00:54:25.947404", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. 
(connect timeout=30)'))" + }, + { + "recipient_id": "github:dariusk", + "recipient_name": "Darius Kazemi", + "method": "mastodon", + "contact_info": "@darius@friend.camp", + "overlap_score": 135.39490109778416, + "timestamp": "2025-12-16T00:54:55.982839", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. (connect timeout=30)'))" + } + ], + "queued": [] +} \ No newline at end of file diff --git a/backups/data_20251215_194141/manual_queue.json b/backups/data_20251215_194141/manual_queue.json new file mode 100644 index 0000000..5d4983b --- /dev/null +++ b/backups/data_20251215_194141/manual_queue.json @@ -0,0 +1,371 @@ +[ + { + "platform": "reddit", + "username": "julietcam84", + "url": "https://reddit.com/u/julietcam84", + "score": 195, + "subreddits": [ + "cooperatives", + "intentionalcommunity" + ], + "signals": [ + "cooperative", + "community", + "intentional_community", + "remote" + ], + "reasons": [ + "active in: cooperatives, intentionalcommunity", + "signals: cooperative, community, intentional_community, remote", + "REDDIT-ONLY: needs manual review for outreach" + ], + "note": "reddit-only user - no external links found. DM manually if promising.", + "queued_at": "2025-12-15T09:06:32.705954", + "status": "pending" + }, + { + "platform": "reddit", + "username": "MasterRoshi1620", + "url": "https://reddit.com/u/MasterRoshi1620", + "score": 159, + "subreddits": [ + "selfhosted", + "homelab" + ], + "signals": [ + "unix", + "privacy", + "selfhosted", + "modern_lang", + "containers" + ], + "reasons": [ + "active in: selfhosted, homelab", + "signals: unix, privacy, selfhosted, modern_lang, containers", + "REDDIT-ONLY: needs manual review for outreach" + ], + "note": "reddit-only user - no external links found. 
DM manually if promising.", + "queued_at": "2025-12-15T22:54:56.414100", + "status": "pending" + }, + { + "match": { + "id": 2779, + "human_a": { + "id": 642, + "username": "qcasey", + "platform": "github", + "name": "Quinn Casey", + "url": "https://github.com/qcasey", + "contact": "{\"email\": \"github@letterq.org\", \"emails\": [\"github@letterq.org\", \"134208@letterq.org\", \"ceo@business.net\", \"career@letterq.org\", \"recruitmentspam@letterq.org\"], \"blog\": \"https://quinncasey.com\", \"twitter\": null, \"mastodon\": \"@678995876047487016@discord.com\", \"bluesky\": \"quinncasey.com\", \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"JavaScript\": 12, \"Python\": 21, \"Go\": 15, \"TypeScript\": 4, \"Svelte\": 1, \"Rust\": 1, \"Kotlin\": 2, \"HTML\": 1, \"CSS\": 2, \"C\": 1, \"Dart\": 2, \"Ruby\": 1, \"C++\": 2, \"Dockerfile\": 1, \"Java\": 1, \"Shell\": 1, \"PHP\": 1, \"AppleScript\": 1}, \"repo_count\": 100, \"total_stars\": 324, \"extra\": {\"topics\": [], \"languages\": {\"JavaScript\": 12, \"Python\": 21, \"Go\": 15, \"TypeScript\": 4, \"Svelte\": 1, \"Rust\": 1, \"Kotlin\": 2, \"HTML\": 1, \"CSS\": 2, \"C\": 1, \"Dart\": 2, \"Ruby\": 1, \"C++\": 2, \"Dockerfile\": 1, \"Java\": 1, \"Shell\": 1, \"PHP\": 1, \"AppleScript\": 1}, \"repo_count\": 100, \"total_stars\": 324, \"hireable\": true, \"handles\": {\"github\": \"qcasey\", \"telegram\": \"@qcasey\", \"bluesky\": \"quinncasey.com\", \"mastodon\": \"@678995876047487016@discord.com\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:43:55.251547\"}" + }, + "human_b": { + "id": 91, + "username": "mib1185", + "platform": "github", + "name": "Michael", + "url": "https://github.com/mib1185", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, 
\"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Python\": 27, \"HTML\": 2, \"TypeScript\": 1, \"Dockerfile\": 2, \"Shell\": 8, \"JavaScript\": 1, \"Jinja\": 2, \"PHP\": 1, \"Go\": 1}, \"repo_count\": 85, \"total_stars\": 136, \"extra\": {\"topics\": [], \"languages\": {\"Python\": 27, \"HTML\": 2, \"TypeScript\": 1, \"Dockerfile\": 2, \"Shell\": 8, \"JavaScript\": 1, \"Jinja\": 2, \"PHP\": 1, \"Go\": 1}, \"repo_count\": 85, \"total_stars\": 136, \"hireable\": null, \"handles\": {\"github\": \"ansible\"}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:08:57.297790\"}" + }, + "overlap_score": 185.0, + "overlap_reasons": "[\"shared values: unix, foss, federated_chat, home_automation, privacy\", \"both remote-friendly\", \"complementary skills: Kotlin, C++, Jinja, Ruby, CSS\"]" + }, + "draft": "hi Quinn,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using JavaScript, Python, Go | (100 repos) | interested in foss, home_automation, privacy\n\nMichael is building: using Python, HTML, TypeScript | (85 repos) | interested in foss, home_automation, privacy\n\noverlap: shared values: unix, foss, federated_chat, home_automation, privacy | both remote-friendly | complementary skills: Kotlin, C++, Jinja, Ruby, CSS\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/mib1185\n\nno pitch. just connection. 
ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 91, + "username": "mib1185", + "platform": "github", + "name": "Michael", + "url": "https://github.com/mib1185", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Python\": 27, \"HTML\": 2, \"TypeScript\": 1, \"Dockerfile\": 2, \"Shell\": 8, \"JavaScript\": 1, \"Jinja\": 2, \"PHP\": 1, \"Go\": 1}, \"repo_count\": 85, \"total_stars\": 136, \"extra\": {\"topics\": [], \"languages\": {\"Python\": 27, \"HTML\": 2, \"TypeScript\": 1, \"Dockerfile\": 2, \"Shell\": 8, \"JavaScript\": 1, \"Jinja\": 2, \"PHP\": 1, \"Go\": 1}, \"repo_count\": 85, \"total_stars\": 136, \"hireable\": null, \"handles\": {\"github\": \"ansible\"}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:08:57.297790\"}" + }, + "queued_at": "2025-12-15T23:14:45.528184", + "status": "pending" + }, + { + "match": { + "id": 2795, + "human_a": { + "id": 642, + "username": "qcasey", + "platform": "github", + "name": "Quinn Casey", + "url": "https://github.com/qcasey", + "contact": "{\"email\": \"github@letterq.org\", \"emails\": [\"github@letterq.org\", \"134208@letterq.org\", \"ceo@business.net\", \"career@letterq.org\", \"recruitmentspam@letterq.org\"], \"blog\": \"https://quinncasey.com\", \"twitter\": null, \"mastodon\": \"@678995876047487016@discord.com\", \"bluesky\": \"quinncasey.com\", \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"JavaScript\": 12, \"Python\": 21, \"Go\": 15, \"TypeScript\": 4, \"Svelte\": 1, 
\"Rust\": 1, \"Kotlin\": 2, \"HTML\": 1, \"CSS\": 2, \"C\": 1, \"Dart\": 2, \"Ruby\": 1, \"C++\": 2, \"Dockerfile\": 1, \"Java\": 1, \"Shell\": 1, \"PHP\": 1, \"AppleScript\": 1}, \"repo_count\": 100, \"total_stars\": 324, \"extra\": {\"topics\": [], \"languages\": {\"JavaScript\": 12, \"Python\": 21, \"Go\": 15, \"TypeScript\": 4, \"Svelte\": 1, \"Rust\": 1, \"Kotlin\": 2, \"HTML\": 1, \"CSS\": 2, \"C\": 1, \"Dart\": 2, \"Ruby\": 1, \"C++\": 2, \"Dockerfile\": 1, \"Java\": 1, \"Shell\": 1, \"PHP\": 1, \"AppleScript\": 1}, \"repo_count\": 100, \"total_stars\": 324, \"hireable\": true, \"handles\": {\"github\": \"qcasey\", \"telegram\": \"@qcasey\", \"bluesky\": \"quinncasey.com\", \"mastodon\": \"@678995876047487016@discord.com\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:43:55.251547\"}" + }, + "human_b": { + "id": 110, + "username": "RoboMagus", + "platform": "github", + "name": null, + "url": "https://github.com/RoboMagus", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Python\": 17, \"Vue\": 3, \"HTML\": 1, \"JavaScript\": 11, \"C++\": 7, \"TypeScript\": 6, \"Go\": 3, \"Kotlin\": 1, \"Shell\": 4, \"Dockerfile\": 2, \"C\": 1, \"Less\": 1}, \"repo_count\": 86, \"total_stars\": 77, \"extra\": {\"topics\": [], \"languages\": {\"Python\": 17, \"Vue\": 3, \"HTML\": 1, \"JavaScript\": 11, \"C++\": 7, \"TypeScript\": 6, \"Go\": 3, \"Kotlin\": 1, \"Shell\": 4, \"Dockerfile\": 2, \"C\": 1, \"Less\": 1}, \"repo_count\": 86, \"total_stars\": 77, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:09:50.629088\"}" + }, + "overlap_score": 173.03582460328593, + "overlap_reasons": "[\"shared values: unix, foss, home_automation, 
privacy, community\", \"both remote-friendly\", \"complementary skills: Less, Ruby, CSS, Dart, PHP\"]" + }, + "draft": "hi Quinn,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using JavaScript, Python, Go | (100 repos) | interested in foss, home_automation, privacy\n\nRoboMagus is building: using Python, Vue, HTML | (86 repos) | interested in foss, home_automation, privacy\n\noverlap: shared values: unix, foss, home_automation, privacy, community | both remote-friendly | complementary skills: Less, Ruby, CSS, Dart, PHP\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/RoboMagus\n\nno pitch. just connection. ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 110, + "username": "RoboMagus", + "platform": "github", + "name": null, + "url": "https://github.com/RoboMagus", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Python\": 17, \"Vue\": 3, \"HTML\": 1, \"JavaScript\": 11, \"C++\": 7, \"TypeScript\": 6, \"Go\": 3, \"Kotlin\": 1, \"Shell\": 4, \"Dockerfile\": 2, \"C\": 1, \"Less\": 1}, \"repo_count\": 86, \"total_stars\": 77, \"extra\": {\"topics\": [], \"languages\": {\"Python\": 17, \"Vue\": 3, \"HTML\": 1, \"JavaScript\": 11, \"C++\": 7, \"TypeScript\": 6, \"Go\": 3, \"Kotlin\": 1, \"Shell\": 4, \"Dockerfile\": 2, \"C\": 1, \"Less\": 1}, \"repo_count\": 86, \"total_stars\": 77, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:09:50.629088\"}" + }, + "queued_at": "2025-12-15T23:14:45.535258", + "status": "pending" + }, + { + "match": { + "id": 2768, + "human_a": { + "id": 642, + "username": "qcasey", + "platform": "github", + 
"name": "Quinn Casey", + "url": "https://github.com/qcasey", + "contact": "{\"email\": \"github@letterq.org\", \"emails\": [\"github@letterq.org\", \"134208@letterq.org\", \"ceo@business.net\", \"career@letterq.org\", \"recruitmentspam@letterq.org\"], \"blog\": \"https://quinncasey.com\", \"twitter\": null, \"mastodon\": \"@678995876047487016@discord.com\", \"bluesky\": \"quinncasey.com\", \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"JavaScript\": 12, \"Python\": 21, \"Go\": 15, \"TypeScript\": 4, \"Svelte\": 1, \"Rust\": 1, \"Kotlin\": 2, \"HTML\": 1, \"CSS\": 2, \"C\": 1, \"Dart\": 2, \"Ruby\": 1, \"C++\": 2, \"Dockerfile\": 1, \"Java\": 1, \"Shell\": 1, \"PHP\": 1, \"AppleScript\": 1}, \"repo_count\": 100, \"total_stars\": 324, \"extra\": {\"topics\": [], \"languages\": {\"JavaScript\": 12, \"Python\": 21, \"Go\": 15, \"TypeScript\": 4, \"Svelte\": 1, \"Rust\": 1, \"Kotlin\": 2, \"HTML\": 1, \"CSS\": 2, \"C\": 1, \"Dart\": 2, \"Ruby\": 1, \"C++\": 2, \"Dockerfile\": 1, \"Java\": 1, \"Shell\": 1, \"PHP\": 1, \"AppleScript\": 1}, \"repo_count\": 100, \"total_stars\": 324, \"hireable\": true, \"handles\": {\"github\": \"qcasey\", \"telegram\": \"@qcasey\", \"bluesky\": \"quinncasey.com\", \"mastodon\": \"@678995876047487016@discord.com\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:43:55.251547\"}" + }, + "human_b": { + "id": 415, + "username": "sbilly", + "platform": "github", + "name": "sbilly", + "url": "https://github.com/sbilly", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"http://sbilly.com/\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"decentralized\", \"community\", \"modern_lang\", 
\"containers\", \"remote\"]", + "extra": "{\"topics\": [\"mesh-network\"], \"languages\": {\"Go\": 4, \"Shell\": 4, \"Dockerfile\": 1, \"Python\": 12, \"JavaScript\": 14, \"Java\": 3, \"Ruby\": 3, \"CSS\": 3, \"C++\": 6, \"CoffeeScript\": 1, \"Scala\": 2, \"HTML\": 5, \"Vue\": 1, \"Clojure\": 1, \"PHP\": 3, \"TypeScript\": 1, \"C\": 8, \"Assembly\": 2, \"Objective-C\": 1, \"C#\": 1}, \"repo_count\": 100, \"total_stars\": 14354, \"extra\": {\"topics\": [\"mesh-network\"], \"languages\": {\"Go\": 4, \"Shell\": 4, \"Dockerfile\": 1, \"Python\": 12, \"JavaScript\": 14, \"Java\": 3, \"Ruby\": 3, \"CSS\": 3, \"C++\": 6, \"CoffeeScript\": 1, \"Scala\": 2, \"HTML\": 5, \"Vue\": 1, \"Clojure\": 1, \"PHP\": 3, \"TypeScript\": 1, \"C\": 8, \"Assembly\": 2, \"Objective-C\": 1, \"C#\": 1}, \"repo_count\": 100, \"total_stars\": 14354, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:29:03.191201\"}" + }, + "overlap_score": 170.3406027914858, + "overlap_reasons": "[\"shared values: unix, foss, federated_chat, home_automation, privacy\", \"both remote-friendly\", \"complementary skills: Kotlin, Clojure, Scala, Objective-C, Dart\"]" + }, + "draft": "hi Quinn,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using JavaScript, Python, Go | (100 repos) | interested in foss, home_automation, privacy\n\nsbilly is building: working on mesh-network | using Go, Shell, Dockerfile | (100 repos) | interested in foss, home_automation, privacy\n\noverlap: shared values: unix, foss, federated_chat, home_automation, privacy | both remote-friendly | complementary skills: Kotlin, Clojure, Scala, Objective-C, Dart\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/sbilly\n\nno pitch. just connection. 
ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 415, + "username": "sbilly", + "platform": "github", + "name": "sbilly", + "url": "https://github.com/sbilly", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"http://sbilly.com/\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"decentralized\", \"community\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [\"mesh-network\"], \"languages\": {\"Go\": 4, \"Shell\": 4, \"Dockerfile\": 1, \"Python\": 12, \"JavaScript\": 14, \"Java\": 3, \"Ruby\": 3, \"CSS\": 3, \"C++\": 6, \"CoffeeScript\": 1, \"Scala\": 2, \"HTML\": 5, \"Vue\": 1, \"Clojure\": 1, \"PHP\": 3, \"TypeScript\": 1, \"C\": 8, \"Assembly\": 2, \"Objective-C\": 1, \"C#\": 1}, \"repo_count\": 100, \"total_stars\": 14354, \"extra\": {\"topics\": [\"mesh-network\"], \"languages\": {\"Go\": 4, \"Shell\": 4, \"Dockerfile\": 1, \"Python\": 12, \"JavaScript\": 14, \"Java\": 3, \"Ruby\": 3, \"CSS\": 3, \"C++\": 6, \"CoffeeScript\": 1, \"Scala\": 2, \"HTML\": 5, \"Vue\": 1, \"Clojure\": 1, \"PHP\": 3, \"TypeScript\": 1, \"C\": 8, \"Assembly\": 2, \"Objective-C\": 1, \"C#\": 1}, \"repo_count\": 100, \"total_stars\": 14354, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:29:03.191201\"}" + }, + "queued_at": "2025-12-15T23:14:48.455001", + "status": "pending" + }, + { + "match": { + "id": 10793, + "human_a": { + "id": 526, + "username": "2234839", + "platform": "github", + "name": "\u5d2e\u751f", + "url": "https://github.com/2234839", + "contact": "{\"email\": \"admin@shenzilong.cn\", \"emails\": [\"admin@shenzilong.cn\"], \"blog\": \"https://shenzilong.cn\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"privacy\", \"selfhosted\", 
\"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"TypeScript\": 54, \"Vue\": 7, \"JavaScript\": 12, \"Rust\": 1, \"CSS\": 3, \"Go\": 1, \"Ruby\": 1, \"HTML\": 1, \"Svelte\": 2}, \"repo_count\": 100, \"total_stars\": 528, \"extra\": {\"topics\": [], \"languages\": {\"TypeScript\": 54, \"Vue\": 7, \"JavaScript\": 12, \"Rust\": 1, \"CSS\": 3, \"Go\": 1, \"Ruby\": 1, \"HTML\": 1, \"Svelte\": 2}, \"repo_count\": 100, \"total_stars\": 528, \"hireable\": null, \"handles\": {\"github\": \"2234839\"}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:37:19.731768\"}" + }, + "human_b": { + "id": 212, + "username": "uhthomas", + "platform": "github", + "name": "Thomas", + "url": "https://github.com/uhthomas", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"6f.io\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"CUE\": 1, \"Dockerfile\": 4, \"Go\": 27, \"Starlark\": 10, \"Rust\": 2, \"Lua\": 1, \"JavaScript\": 3, \"Dart\": 1, \"Python\": 1, \"TypeScript\": 1}, \"repo_count\": 100, \"total_stars\": 138, \"extra\": {\"topics\": [], \"languages\": {\"CUE\": 1, \"Dockerfile\": 4, \"Go\": 27, \"Starlark\": 10, \"Rust\": 2, \"Lua\": 1, \"JavaScript\": 3, \"Dart\": 1, \"Python\": 1, \"TypeScript\": 1}, \"repo_count\": 100, \"total_stars\": 138, \"hireable\": true, \"handles\": {\"github\": \"uhthomas\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:16:14.638950\"}" + }, + "overlap_score": 152.39313358485379, + "overlap_reasons": "[\"shared values: unix, community, foss, selfhosted, modern_lang\", \"both remote-friendly\", \"complementary skills: Python, HTML, Ruby, CSS, CUE\"]" + }, + "draft": "hi \u5d2e\u751f,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using 
TypeScript, Vue, JavaScript | (100 repos) | interested in foss, privacy, selfhosted\n\nThomas is building: using CUE, Dockerfile, Go | (100 repos) | interested in foss, selfhosted\n\noverlap: shared values: unix, community, foss, selfhosted, modern_lang | both remote-friendly | complementary skills: Python, HTML, Ruby, CSS, CUE\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/uhthomas\n\nno pitch. just connection. ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 212, + "username": "uhthomas", + "platform": "github", + "name": "Thomas", + "url": "https://github.com/uhthomas", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"6f.io\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"CUE\": 1, \"Dockerfile\": 4, \"Go\": 27, \"Starlark\": 10, \"Rust\": 2, \"Lua\": 1, \"JavaScript\": 3, \"Dart\": 1, \"Python\": 1, \"TypeScript\": 1}, \"repo_count\": 100, \"total_stars\": 138, \"extra\": {\"topics\": [], \"languages\": {\"CUE\": 1, \"Dockerfile\": 4, \"Go\": 27, \"Starlark\": 10, \"Rust\": 2, \"Lua\": 1, \"JavaScript\": 3, \"Dart\": 1, \"Python\": 1, \"TypeScript\": 1}, \"repo_count\": 100, \"total_stars\": 138, \"hireable\": true, \"handles\": {\"github\": \"uhthomas\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:16:14.638950\"}" + }, + "queued_at": "2025-12-16T00:33:56.913113", + "status": "pending" + }, + { + "match": { + "id": 3924, + "human_a": { + "id": 777, + "username": "joshuaboniface", + "platform": "github", + "name": "Joshua M. 
Boniface", + "url": "https://github.com/joshuaboniface", + "contact": "{\"email\": \"joshua@boniface.me\", \"emails\": [\"joshua@boniface.me\"], \"blog\": \"https://www.boniface.me\", \"twitter\": null, \"mastodon\": \"@joshuaboniface@www.youtube.com\", \"bluesky\": null, \"matrix\": null, \"lemmy\": \"@djbon2112@old.reddit.com\"}", + "signals": "[\"unix\", \"foss\", \"federated_chat\", \"home_automation\", \"privacy\", \"decentralized\", \"selfhosted\", \"modern_lang\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Python\": 17, \"C#\": 13, \"JavaScript\": 5, \"SCSS\": 1, \"Go\": 1, \"HTML\": 2, \"Shell\": 4, \"C++\": 2, \"Java\": 3}, \"repo_count\": 96, \"total_stars\": 1157, \"extra\": {\"topics\": [], \"languages\": {\"Python\": 17, \"C#\": 13, \"JavaScript\": 5, \"SCSS\": 1, \"Go\": 1, \"HTML\": 2, \"Shell\": 4, \"C++\": 2, \"Java\": 3}, \"repo_count\": 96, \"total_stars\": 1157, \"hireable\": null, \"handles\": {\"github\": \"joshuaboniface\", \"linkedin\": \"joshuamboniface\", \"mastodon\": \"@joshuaboniface@www.youtube.com\", \"lemmy\": \"@djbon2112@old.reddit.com\"}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:52:45.963017\"}" + }, + "human_b": { + "id": 228, + "username": "mintsoft", + "platform": "github", + "name": "Rob Emery", + "url": "https://github.com/mintsoft", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"privacy\", \"decentralized\", \"selfhosted\", \"modern_lang\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Kotlin\": 4, \"Go\": 5, \"Python\": 7, \"C\": 2, \"Shell\": 3, \"Dart\": 1, \"Java\": 2, \"C#\": 1, \"PHP\": 2, \"C++\": 7, \"JavaScript\": 5, \"Perl\": 2, \"Makefile\": 1, \"HTML\": 1, \"PowerShell\": 1}, \"repo_count\": 100, \"total_stars\": 33, \"extra\": {\"topics\": [], \"languages\": {\"Kotlin\": 4, \"Go\": 5, \"Python\": 7, \"C\": 
2, \"Shell\": 3, \"Dart\": 1, \"Java\": 2, \"C#\": 1, \"PHP\": 2, \"C++\": 7, \"JavaScript\": 5, \"Perl\": 2, \"Makefile\": 1, \"HTML\": 1, \"PowerShell\": 1}, \"repo_count\": 100, \"total_stars\": 33, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:17:08.966748\"}" + }, + "overlap_score": 149.1843240344525, + "overlap_reasons": "[\"shared values: unix, foss, privacy, decentralized, selfhosted\", \"both remote-friendly\", \"complementary skills: Kotlin, Makefile, PHP, Dart, SCSS\"]" + }, + "draft": "hi Joshua,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using Python, C#, JavaScript | (96 repos) | interested in foss, home_automation, privacy\n\nRob is building: using Kotlin, Go, Python | (100 repos) | interested in foss, privacy, selfhosted\n\noverlap: shared values: unix, foss, privacy, decentralized, selfhosted | both remote-friendly | complementary skills: Kotlin, Makefile, PHP, Dart, SCSS\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/mintsoft\n\nno pitch. just connection. 
ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 228, + "username": "mintsoft", + "platform": "github", + "name": "Rob Emery", + "url": "https://github.com/mintsoft", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"privacy\", \"decentralized\", \"selfhosted\", \"modern_lang\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Kotlin\": 4, \"Go\": 5, \"Python\": 7, \"C\": 2, \"Shell\": 3, \"Dart\": 1, \"Java\": 2, \"C#\": 1, \"PHP\": 2, \"C++\": 7, \"JavaScript\": 5, \"Perl\": 2, \"Makefile\": 1, \"HTML\": 1, \"PowerShell\": 1}, \"repo_count\": 100, \"total_stars\": 33, \"extra\": {\"topics\": [], \"languages\": {\"Kotlin\": 4, \"Go\": 5, \"Python\": 7, \"C\": 2, \"Shell\": 3, \"Dart\": 1, \"Java\": 2, \"C#\": 1, \"PHP\": 2, \"C++\": 7, \"JavaScript\": 5, \"Perl\": 2, \"Makefile\": 1, \"HTML\": 1, \"PowerShell\": 1}, \"repo_count\": 100, \"total_stars\": 33, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:17:08.966748\"}" + }, + "queued_at": "2025-12-16T00:33:56.920505", + "status": "pending" + }, + { + "match": { + "id": 13072, + "human_a": { + "id": 212, + "username": "uhthomas", + "platform": "github", + "name": "Thomas", + "url": "https://github.com/uhthomas", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"6f.io\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"CUE\": 1, \"Dockerfile\": 4, \"Go\": 27, \"Starlark\": 10, \"Rust\": 2, \"Lua\": 1, \"JavaScript\": 3, \"Dart\": 1, \"Python\": 1, \"TypeScript\": 1}, \"repo_count\": 100, \"total_stars\": 138, \"extra\": {\"topics\": [], \"languages\": {\"CUE\": 1, 
\"Dockerfile\": 4, \"Go\": 27, \"Starlark\": 10, \"Rust\": 2, \"Lua\": 1, \"JavaScript\": 3, \"Dart\": 1, \"Python\": 1, \"TypeScript\": 1}, \"repo_count\": 100, \"total_stars\": 138, \"hireable\": true, \"handles\": {\"github\": \"uhthomas\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:16:14.638950\"}" + }, + "human_b": { + "id": 96, + "username": "SlyBouhafs", + "platform": "github", + "name": "Sly", + "url": "https://github.com/SlyBouhafs", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"selfhosted\", \"modern_lang\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"JavaScript\": 9, \"Makefile\": 1, \"Python\": 2, \"HTML\": 2, \"TypeScript\": 3, \"Lua\": 1, \"Vim script\": 1}, \"repo_count\": 29, \"total_stars\": 23, \"extra\": {\"topics\": [], \"languages\": {\"JavaScript\": 9, \"Makefile\": 1, \"Python\": 2, \"HTML\": 2, \"TypeScript\": 3, \"Lua\": 1, \"Vim script\": 1}, \"repo_count\": 29, \"total_stars\": 23, \"hireable\": true, \"handles\": {}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:09:12.423838\"}" + }, + "overlap_score": 142.37165974941587, + "overlap_reasons": "[\"shared values: unix, community, foss, selfhosted, modern_lang\", \"both remote-friendly\", \"complementary skills: Go, HTML, CUE, Dart, Makefile\"]" + }, + "draft": "hi Thomas,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using CUE, Dockerfile, Go | (100 repos) | interested in foss, selfhosted\n\nSly is building: using JavaScript, Makefile, Python | (29 repos) | interested in foss, selfhosted\n\noverlap: shared values: unix, community, foss, selfhosted, modern_lang | both remote-friendly | complementary skills: Go, HTML, CUE, Dart, Makefile\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/SlyBouhafs\n\nno pitch. 
just connection. ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 96, + "username": "SlyBouhafs", + "platform": "github", + "name": "Sly", + "url": "https://github.com/SlyBouhafs", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"selfhosted\", \"modern_lang\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"JavaScript\": 9, \"Makefile\": 1, \"Python\": 2, \"HTML\": 2, \"TypeScript\": 3, \"Lua\": 1, \"Vim script\": 1}, \"repo_count\": 29, \"total_stars\": 23, \"extra\": {\"topics\": [], \"languages\": {\"JavaScript\": 9, \"Makefile\": 1, \"Python\": 2, \"HTML\": 2, \"TypeScript\": 3, \"Lua\": 1, \"Vim script\": 1}, \"repo_count\": 29, \"total_stars\": 23, \"hireable\": true, \"handles\": {}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:09:12.423838\"}" + }, + "queued_at": "2025-12-16T00:33:56.930693", + "status": "pending" + }, + { + "match": { + "id": 12980, + "human_a": { + "id": 775, + "username": "CarlSchwan", + "platform": "github", + "name": "Carl Schwan", + "url": "https://github.com/CarlSchwan", + "contact": "{\"email\": \"carlschwan@kde.org\", \"emails\": [\"carlschwan@kde.org\", \"carl@carlschwan.eu\"], \"blog\": \"https://carlschwan.eu\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\"]", + "extra": "{\"topics\": [], \"languages\": {\"C++\": 3, \"Shell\": 3, \"Lua\": 2, \"PHP\": 3, \"QML\": 1, \"CSS\": 1}, \"repo_count\": 100, \"total_stars\": 20, \"extra\": {\"topics\": [], \"languages\": {\"C++\": 3, \"Shell\": 3, \"Lua\": 2, \"PHP\": 3, \"QML\": 1, \"CSS\": 1}, \"repo_count\": 100, \"total_stars\": 20, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": 
\"2025-12-15T22:52:38.446226\"}" + }, + "human_b": { + "id": 665, + "username": "TCOTC", + "platform": "github", + "name": "Jeffrey Chen", + "url": "https://github.com/TCOTC", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\"]", + "extra": "{\"topics\": [], \"languages\": {\"TypeScript\": 14, \"JavaScript\": 11, \"SCSS\": 2, \"Go\": 8, \"Kotlin\": 13, \"HTML\": 3, \"Python\": 10, \"C++\": 7, \"Java\": 5, \"PHP\": 2, \"Rust\": 2, \"Vue\": 4, \"C#\": 4, \"Shell\": 2, \"Swift\": 1, \"Ruby\": 1, \"Dart\": 1, \"Svelte\": 1}, \"repo_count\": 100, \"total_stars\": 19, \"extra\": {\"topics\": [], \"languages\": {\"TypeScript\": 14, \"JavaScript\": 11, \"SCSS\": 2, \"Go\": 8, \"Kotlin\": 13, \"HTML\": 3, \"Python\": 10, \"C++\": 7, \"Java\": 5, \"PHP\": 2, \"Rust\": 2, \"Vue\": 4, \"C#\": 4, \"Shell\": 2, \"Swift\": 1, \"Ruby\": 1, \"Dart\": 1, \"Svelte\": 1}, \"repo_count\": 100, \"total_stars\": 19, \"hireable\": null, \"handles\": {\"github\": \"siyuan-note\"}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:45:24.008492\"}" + }, + "overlap_score": 135.0, + "overlap_reasons": "[\"shared values: unix, foss, federated_chat, privacy, community\", \"complementary skills: Python, Kotlin, Ruby, JavaScript, QML\"]" + }, + "draft": "hi Carl,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: using C++, Shell, Lua | (100 repos) | interested in foss, privacy, selfhosted\n\nJeffrey is building: using TypeScript, JavaScript, SCSS | (100 repos) | interested in foss, privacy, selfhosted\n\noverlap: shared values: unix, foss, federated_chat, privacy, community | complementary skills: Python, Kotlin, Ruby, JavaScript, QML\n\nthought you might benefit from knowing each other.\n\ntheir work: 
https://github.com/TCOTC\n\nno pitch. just connection. ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 665, + "username": "TCOTC", + "platform": "github", + "name": "Jeffrey Chen", + "url": "https://github.com/TCOTC", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"unix\", \"community\", \"foss\", \"federated_chat\", \"privacy\", \"selfhosted\", \"modern_lang\", \"containers\"]", + "extra": "{\"topics\": [], \"languages\": {\"TypeScript\": 14, \"JavaScript\": 11, \"SCSS\": 2, \"Go\": 8, \"Kotlin\": 13, \"HTML\": 3, \"Python\": 10, \"C++\": 7, \"Java\": 5, \"PHP\": 2, \"Rust\": 2, \"Vue\": 4, \"C#\": 4, \"Shell\": 2, \"Swift\": 1, \"Ruby\": 1, \"Dart\": 1, \"Svelte\": 1}, \"repo_count\": 100, \"total_stars\": 19, \"extra\": {\"topics\": [], \"languages\": {\"TypeScript\": 14, \"JavaScript\": 11, \"SCSS\": 2, \"Go\": 8, \"Kotlin\": 13, \"HTML\": 3, \"Python\": 10, \"C++\": 7, \"Java\": 5, \"PHP\": 2, \"Rust\": 2, \"Vue\": 4, \"C#\": 4, \"Shell\": 2, \"Swift\": 1, \"Ruby\": 1, \"Dart\": 1, \"Svelte\": 1}, \"repo_count\": 100, \"total_stars\": 19, \"hireable\": null, \"handles\": {\"github\": \"siyuan-note\"}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:45:24.008492\"}" + }, + "queued_at": "2025-12-16T00:59:33.606115", + "status": "pending" + }, + { + "match": { + "id": 12457, + "human_a": { + "id": 171, + "username": "louislam", + "platform": "github", + "name": "Louis Lam", + "url": "https://github.com/louislam", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": \"louislam\", \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"foss\", \"selfhosted\", \"modern_lang\", \"containers\", \"remote\"]", + "extra": "{\"topics\": [\"self-hosted\"], \"languages\": {\"JavaScript\": 10, \"PHP\": 6, \"TypeScript\": 11, \"HTML\": 1, \"Vue\": 1, \"Shell\": 3, 
\"Java\": 2, \"C#\": 2, \"Hack\": 1, \"Kotlin\": 1, \"Dockerfile\": 2, \"PLpgSQL\": 1, \"CSS\": 2, \"Smarty\": 1, \"Visual Basic\": 1}, \"repo_count\": 56, \"total_stars\": 101905, \"extra\": {\"topics\": [\"self-hosted\"], \"languages\": {\"JavaScript\": 10, \"PHP\": 6, \"TypeScript\": 11, \"HTML\": 1, \"Vue\": 1, \"Shell\": 3, \"Java\": 2, \"C#\": 2, \"Hack\": 1, \"Kotlin\": 1, \"Dockerfile\": 2, \"PLpgSQL\": 1, \"CSS\": 2, \"Smarty\": 1, \"Visual Basic\": 1}, \"repo_count\": 56, \"total_stars\": 101905, \"hireable\": true, \"handles\": {\"twitter\": \"@louislam\", \"github\": \"louislam\"}}, \"hireable\": true, \"scraped_at\": \"2025-12-15T22:13:56.492647\"}" + }, + "human_b": { + "id": 364, + "username": "anokfireball", + "platform": "github", + "name": "Fabian Koller", + "url": "https://github.com/anokfireball", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"foss\", \"selfhosted\", \"modern_lang\", \"containers\", \"p2p\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Jinja\": 1, \"HCL\": 1, \"Shell\": 1, \"Python\": 4, \"Dockerfile\": 1, \"C\": 2, \"Lua\": 1, \"Go\": 1, \"C++\": 2}, \"repo_count\": 22, \"total_stars\": 13, \"extra\": {\"topics\": [], \"languages\": {\"Jinja\": 1, \"HCL\": 1, \"Shell\": 1, \"Python\": 4, \"Dockerfile\": 1, \"C\": 2, \"Lua\": 1, \"Go\": 1, \"C++\": 2}, \"repo_count\": 22, \"total_stars\": 13, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:25:55.690643\"}" + }, + "overlap_score": 129.61885790097358, + "overlap_reasons": "[\"shared values: foss, selfhosted, modern_lang, containers, remote\", \"both remote-friendly\", \"complementary skills: Python, Kotlin, HCL, Jinja, C++\"]" + }, + "draft": "hi Louis,\n\ni'm an AI that connects isolated builders working on similar things.\n\nyou're building: working on self-hosted | using JavaScript, PHP, 
TypeScript | (56 repos) | interested in foss, selfhosted\n\nFabian is building: using Jinja, HCL, Shell | (22 repos) | interested in foss, selfhosted\n\noverlap: shared values: foss, selfhosted, modern_lang, containers, remote | both remote-friendly | complementary skills: Python, Kotlin, HCL, Jinja, C++\n\nthought you might benefit from knowing each other.\n\ntheir work: https://github.com/anokfireball\n\nno pitch. just connection. ignore if not useful.\n\n- connectd\n", + "recipient": { + "id": 364, + "username": "anokfireball", + "platform": "github", + "name": "Fabian Koller", + "url": "https://github.com/anokfireball", + "contact": "{\"email\": null, \"emails\": [], \"blog\": \"\", \"twitter\": null, \"mastodon\": null, \"bluesky\": null, \"matrix\": null, \"lemmy\": null}", + "signals": "[\"foss\", \"selfhosted\", \"modern_lang\", \"containers\", \"p2p\", \"remote\"]", + "extra": "{\"topics\": [], \"languages\": {\"Jinja\": 1, \"HCL\": 1, \"Shell\": 1, \"Python\": 4, \"Dockerfile\": 1, \"C\": 2, \"Lua\": 1, \"Go\": 1, \"C++\": 2}, \"repo_count\": 22, \"total_stars\": 13, \"extra\": {\"topics\": [], \"languages\": {\"Jinja\": 1, \"HCL\": 1, \"Shell\": 1, \"Python\": 4, \"Dockerfile\": 1, \"C\": 2, \"Lua\": 1, \"Go\": 1, \"C++\": 2}, \"repo_count\": 22, \"total_stars\": 13, \"hireable\": null, \"handles\": {}}, \"hireable\": null, \"scraped_at\": \"2025-12-15T22:25:55.690643\"}" + }, + "queued_at": "2025-12-16T01:12:40.906296", + "status": "pending" + } +] \ No newline at end of file diff --git a/backups/data_20251215_194141/org_cache.json b/backups/data_20251215_194141/org_cache.json new file mode 100644 index 0000000..f818641 --- /dev/null +++ b/backups/data_20251215_194141/org_cache.json @@ -0,0 +1,82 @@ +{ + "users": { + "testuser": [ + "home-assistant", + "esphome" + ], + "sudoxnym": [], + "joyeusenoelle": [], + "sbilly": [ + "awesome-security" + ], + "turt2live": [ + "matrix-org", + "element-hq", + "ENTS-Source", + "IETF-Hackathon", + "t2bot" + ], + 
"balloob": [ + "home-assistant", + "hassio-addons", + "NabuCasa", + "esphome", + "OpenHomeFoundation" + ], + "anikdhabal": [], + "fabaff": [ + "NixOS", + "home-assistant", + "affolter-engineering", + "esphome", + "home-assistant-ecosystem" + ], + "uhthomas": [ + "wiz-sec" + ], + "emontnemery": [], + "Stradex": [], + "Tribler": [], + "bdraco": [ + "CpanelInc", + "aio-libs", + "home-assistant", + "esphome", + "python-kasa", + "home-assistant-libs", + "Bluetooth-Devices", + "python-zeroconf", + "pyenphase", + "ESPHome-RATGDO", + "ratgdo", + "OpenHomeFoundation", + "uilibs", + "sblibs", + "openvideolibs", + "Harmony-Libs", + "lightinglibs", + "kohlerlibs", + "open-home-foundation-maintainers", + "Yale-Libs", + "Solarlibs", + "esphome-libs" + ], + "ArchiveBox": [] + }, + "updated": { + "testuser": "2025-12-14T22:44:28.772479", + "sudoxnym": "2025-12-14T22:51:13.523581", + "joyeusenoelle": "2025-12-14T23:19:46.135417", + "sbilly": "2025-12-14T23:19:55.813111", + "turt2live": "2025-12-14T23:20:04.266843", + "balloob": "2025-12-14T23:20:20.527129", + "anikdhabal": "2025-12-14T23:20:32.904717", + "fabaff": "2025-12-14T23:20:39.889442", + "uhthomas": "2025-12-14T23:20:59.048667", + "emontnemery": "2025-12-14T23:21:06.590806", + "Stradex": "2025-12-14T23:21:14.490327", + "Tribler": "2025-12-14T23:21:24.234634", + "bdraco": "2025-12-14T23:26:12.662456", + "ArchiveBox": "2025-12-14T23:26:32.513637" + } +} \ No newline at end of file diff --git a/backups/delivery_log_20251215_194141.json b/backups/delivery_log_20251215_194141.json new file mode 100644 index 0000000..4a62437 --- /dev/null +++ b/backups/delivery_log_20251215_194141.json @@ -0,0 +1,137 @@ +{ + "sent": [ + { + "recipient_id": "github:dwmw2", + "recipient_name": "David Woodhouse", + "method": "email", + "contact_info": "dwmw2@infradead.org", + "overlap_score": 172.01631023799695, + "timestamp": "2025-12-15T23:14:45.542509", + "success": true, + "error": null + }, + { + "recipient_id": "github:pvizeli", + 
"recipient_name": "Pascal Vizeli", + "method": "email", + "contact_info": "pascal.vizeli@syshack.ch", + "overlap_score": 163.33333333333331, + "timestamp": "2025-12-15T23:14:48.462716", + "success": true, + "error": null + }, + { + "recipient_id": "github:2234839", + "recipient_name": "\u5d2e\u751f", + "method": "email", + "contact_info": "admin@shenzilong.cn", + "overlap_score": 163.09442000261095, + "timestamp": "2025-12-15T23:14:50.749442", + "success": true, + "error": null + }, + { + "recipient_id": "github:zomars", + "recipient_name": "Omar L\u00f3pez", + "method": "email", + "contact_info": "zomars@me.com", + "overlap_score": 138.9593178751708, + "timestamp": "2025-12-16T00:39:43.266181", + "success": true, + "error": null + }, + { + "recipient_id": "github:joshuaboniface", + "recipient_name": "Joshua M. Boniface", + "method": "mastodon", + "contact_info": "@joshuaboniface@www.youtube.com", + "overlap_score": 136.06304901929022, + "timestamp": "2025-12-16T00:59:21.763092", + "success": true, + "error": "https://mastodon.sudoxreboot.com/@connectd/115726533401043321" + }, + { + "recipient_id": "github:dariusk", + "recipient_name": "Darius Kazemi", + "method": "mastodon", + "contact_info": "@darius@friend.camp", + "overlap_score": 135.39490109778416, + "timestamp": "2025-12-16T00:59:22.199945", + "success": true, + "error": "https://mastodon.sudoxreboot.com/@connectd/115726533505124538" + } + ], + "failed": [ + { + "recipient_id": "github:joyeusenoelle", + "recipient_name": "No\u00eblle Anthony", + "method": "mastodon", + "contact_info": "@noelle@chat.noelle.codes", + "overlap_score": 65, + "timestamp": "2025-12-14T23:44:17.215796", + "success": false, + "error": "MASTODON_TOKEN not set" + }, + { + "recipient_id": "github:balloob", + "recipient_name": "Paulus Schoutsen", + "method": "mastodon", + "contact_info": "@home_assistant@youtube.com", + "overlap_score": 163.09442000261095, + "timestamp": "2025-12-15T23:14:50.155178", + "success": false, + "error": 
"mastodon api error: 401 - {\"error\":\"The access token is invalid\"}" + }, + { + "recipient_id": "github:balloob", + "recipient_name": "Paulus Schoutsen", + "method": "mastodon", + "contact_info": "@home_assistant@youtube.com", + "overlap_score": 163.09442000261095, + "timestamp": "2025-12-15T23:14:50.334902", + "success": false, + "error": "mastodon api error: 401 - {\"error\":\"The access token is invalid\"}" + }, + { + "recipient_id": "github:joshuaboniface", + "recipient_name": "Joshua M. Boniface", + "method": "mastodon", + "contact_info": "@joshuaboniface@www.youtube.com", + "overlap_score": 136.06304901929022, + "timestamp": "2025-12-16T00:53:25.848601", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. (connect timeout=30)'))" + }, + { + "recipient_id": "github:joshuaboniface", + "recipient_name": "Joshua M. Boniface", + "method": "mastodon", + "contact_info": "@joshuaboniface@www.youtube.com", + "overlap_score": 136.06304901929022, + "timestamp": "2025-12-16T00:53:55.912872", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. (connect timeout=30)'))" + }, + { + "recipient_id": "github:dariusk", + "recipient_name": "Darius Kazemi", + "method": "mastodon", + "contact_info": "@darius@friend.camp", + "overlap_score": 135.39490109778416, + "timestamp": "2025-12-16T00:54:25.947404", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. 
(connect timeout=30)'))" + }, + { + "recipient_id": "github:dariusk", + "recipient_name": "Darius Kazemi", + "method": "mastodon", + "contact_info": "@darius@friend.camp", + "overlap_score": 135.39490109778416, + "timestamp": "2025-12-16T00:54:55.982839", + "success": false, + "error": "HTTPSConnectionPool(host='mastodon.sudoxreboot.com', port=443): Max retries exceeded with url: /api/v1/statuses (Caused by ConnectTimeoutError(, 'Connection to mastodon.sudoxreboot.com timed out. (connect timeout=30)'))" + } + ], + "queued": [] +} \ No newline at end of file diff --git a/cli.py b/cli.py new file mode 100755 index 0000000..a82dd89 --- /dev/null +++ b/cli.py @@ -0,0 +1,878 @@ +#!/usr/bin/env python3 +""" +connectd - people discovery and matchmaking daemon +finds isolated builders and connects them +also finds LOST builders who need encouragement + +usage: + connectd scout # run all scrapers + connectd scout --github # github only + connectd scout --reddit # reddit only + connectd scout --mastodon # mastodon only + connectd scout --lobsters # lobste.rs only + connectd scout --matrix # matrix only + connectd scout --lost # show lost builder stats after scout + + connectd match # find all matches + connectd match --top 20 # show top 20 matches + connectd match --mine # show YOUR matches (priority user) + connectd match --lost # find matches for lost builders + + connectd intro # generate intros for top matches + connectd intro --match 123 # generate intro for specific match + connectd intro --dry-run # preview intros without saving + connectd intro --lost # generate intros for lost builders + + connectd review # interactive review queue + connectd send # send all approved intros + connectd send --export # export for manual sending + + connectd daemon # run as continuous daemon + connectd daemon --oneshot # run once then exit + connectd daemon --dry-run # run but never send intros + connectd daemon --oneshot --dry-run # one cycle, preview only + + connectd user # show 
your priority user profile + connectd user --setup # setup/update your profile + connectd user --matches # show matches found for you + + connectd status # show database stats (including lost builders) + connectd lost # show lost builders ready for outreach +""" + +import argparse +import sys +from pathlib import Path + +# add parent to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from db import Database +from db.users import (init_users_table, add_priority_user, get_priority_users, + get_priority_user_matches, score_priority_user, auto_match_priority_user, + update_priority_user_profile) +from scoutd import scrape_github, scrape_reddit, scrape_mastodon, scrape_lobsters, scrape_matrix +from scoutd.deep import deep_scrape_github_user +from scoutd.lost import get_signal_descriptions +from introd.deliver import (deliver_intro, deliver_batch, get_delivery_stats, + review_manual_queue, determine_best_contact, load_manual_queue, + save_manual_queue) +from matchd import find_all_matches, generate_fingerprint +from matchd.rank import get_top_matches +from matchd.lost import find_matches_for_lost_builders, get_lost_match_summary +from introd import draft_intro +from introd.draft import draft_intros_for_match +from introd.lost_intro import draft_lost_intro, get_lost_intro_config +from introd.review import review_all_pending, get_pending_intros +from introd.send import send_all_approved, export_manual_intros + + +def cmd_scout(args, db): + """run discovery scrapers""" + from scoutd.deep import deep_scrape_github_user, save_deep_profile + + print("=" * 60) + print("connectd scout - discovering aligned humans") + print("=" * 60) + + # deep scrape specific user + if args.user: + print(f"\ndeep scraping github user: {args.user}") + profile = deep_scrape_github_user(args.user) + if profile: + save_deep_profile(db, profile) + print(f"\n=== {profile['username']} ===") + print(f"real name: {profile.get('real_name')}") + print(f"location: 
{profile.get('location')}") + print(f"company: {profile.get('company')}") + print(f"email: {profile.get('email')}") + print(f"twitter: {profile.get('twitter')}") + print(f"mastodon: {profile.get('mastodon')}") + print(f"orgs: {', '.join(profile.get('orgs', []))}") + print(f"languages: {', '.join(list(profile.get('languages', {}).keys())[:5])}") + print(f"topics: {', '.join(profile.get('topics', [])[:10])}") + print(f"signals: {', '.join(profile.get('signals', []))}") + print(f"score: {profile.get('score')}") + if profile.get('linked_profiles'): + print(f"linked profiles: {list(profile['linked_profiles'].keys())}") + else: + print("failed to scrape user") + return + + run_all = not any([args.github, args.reddit, args.mastodon, args.lobsters, args.matrix, args.twitter, args.bluesky, args.lemmy, args.discord]) + + if args.github or run_all: + if args.deep: + # deep scrape mode - slower but more thorough + print("\nrunning DEEP github scrape (follows all links)...") + from scoutd.github import get_repo_contributors + from scoutd.signals import ECOSYSTEM_REPOS + + all_logins = set() + for repo in ECOSYSTEM_REPOS[:5]: # limit for deep mode + contributors = get_repo_contributors(repo, per_page=20) + for c in contributors: + login = c.get('login') + if login and not login.endswith('[bot]'): + all_logins.add(login) + print(f" {repo}: {len(contributors)} contributors") + + print(f"\ndeep scraping {len(all_logins)} users...") + for login in all_logins: + try: + profile = deep_scrape_github_user(login) + if profile and profile.get('score', 0) > 0: + save_deep_profile(db, profile) + if profile['score'] >= 30: + print(f" ā˜… {login}: {profile['score']} pts") + if profile.get('email'): + print(f" email: {profile['email']}") + if profile.get('mastodon'): + print(f" mastodon: {profile['mastodon']}") + except Exception as e: + print(f" error on {login}: {e}") + else: + scrape_github(db) + + if args.reddit or run_all: + scrape_reddit(db) + + if args.mastodon or run_all: + 
scrape_mastodon(db) + + if args.lobsters or run_all: + scrape_lobsters(db) + + if args.matrix or run_all: + scrape_matrix(db) + + if args.twitter or run_all: + from scoutd.twitter import scrape_twitter + scrape_twitter(db) + + if args.bluesky or run_all: + from scoutd.bluesky import scrape_bluesky + scrape_bluesky(db) + + if args.lemmy or run_all: + from scoutd.lemmy import scrape_lemmy + scrape_lemmy(db) + + if args.discord or run_all: + from scoutd.discord import scrape_discord + scrape_discord(db) + + # show stats + stats = db.stats() + print("\n" + "=" * 60) + print("SCOUT COMPLETE") + print("=" * 60) + print(f"total humans: {stats['total_humans']}") + for platform, count in stats.get('by_platform', {}).items(): + print(f" {platform}: {count}") + + # show lost builder stats if requested + if args.lost or True: # always show lost stats now + print("\n--- lost builder stats ---") + print(f"active builders: {stats.get('active_builders', 0)}") + print(f"lost builders: {stats.get('lost_builders', 0)}") + print(f"recovering builders: {stats.get('recovering_builders', 0)}") + print(f"high lost score (40+): {stats.get('high_lost_score', 0)}") + print(f"lost outreach sent: {stats.get('lost_outreach_sent', 0)}") + + +def cmd_match(args, db): + """find and rank matches""" + import json as json_mod + + print("=" * 60) + print("connectd match - finding aligned pairs") + print("=" * 60) + + # lost builder matching + if args.lost: + print("\n--- LOST BUILDER MATCHING ---") + print("finding inspiring builders for lost souls...\n") + + matches, error = find_matches_for_lost_builders(db, limit=args.top or 20) + + if error: + print(f"error: {error}") + return + + if not matches: + print("no lost builders ready for outreach") + return + + print(f"found {len(matches)} lost builders with matching active builders\n") + + for i, match in enumerate(matches, 1): + lost = match['lost_user'] + builder = match['inspiring_builder'] + + lost_name = lost.get('name') or lost.get('username') + 
builder_name = builder.get('name') or builder.get('username') + + print(f"{i}. {lost_name} ({lost.get('platform')}) → needs inspiration from") + print(f" {builder_name} ({builder.get('platform')})") + print(f" lost score: {lost.get('lost_potential_score', 0)} | values: {lost.get('score', 0)}") + print(f" shared interests: {', '.join(match.get('shared_interests', []))}") + print(f" builder has: {match.get('builder_repos', 0)} repos, {match.get('builder_stars', 0)} stars") + print() + + return + + if args.mine: + # show matches for priority user + init_users_table(db.conn) + users = get_priority_users(db.conn) + if not users: + print("no priority user configured. run: connectd user --setup") + return + + for user in users: + print(f"\n=== matches for {user['name']} ===\n") + matches = get_priority_user_matches(db.conn, user['id'], limit=args.top or 20) + + if not matches: + print("no matches yet - run: connectd scout && connectd match") + continue + + for i, match in enumerate(matches, 1): + print(f"{i}. {match['username']} ({match['platform']})") + print(f" score: {match['overlap_score']:.0f}") + print(f" url: {match['url']}") + reasons = match.get('overlap_reasons', '[]') + if isinstance(reasons, str): + reasons = json_mod.loads(reasons) + if reasons: + print(f" why: {reasons[0]}") + print() + return + + if args.top and not args.mine: + # just show existing top matches + matches = get_top_matches(db, limit=args.top) + else: + # run full matching + matches = find_all_matches(db, min_score=args.min_score, min_overlap=args.min_overlap) + + print("\n" + "-" * 60) + print(f"TOP {min(len(matches), args.top or 20)} MATCHES") + print("-" * 60) + + for i, match in enumerate(matches[:args.top or 20], 1): + human_a = match.get('human_a', {}) + human_b = match.get('human_b', {}) + + print(f"\n{i}. 
{human_a.get('username')} <-> {human_b.get('username')}") + print(f" platforms: {human_a.get('platform')} / {human_b.get('platform')}") + print(f" overlap: {match.get('overlap_score', 0):.0f} pts") + + reasons = match.get('overlap_reasons', []) + if isinstance(reasons, str): + reasons = json_mod.loads(reasons) + if reasons: + print(f" why: {' | '.join(reasons[:3])}") + + if match.get('geographic_match'): + print(f" location: compatible āœ“") + + +def cmd_intro(args, db): + """generate intro drafts""" + import json as json_mod + + print("=" * 60) + print("connectd intro - drafting introductions") + print("=" * 60) + + if args.dry_run: + print("*** DRY RUN MODE - previewing only ***\n") + + # lost builder intros - different tone entirely + if args.lost: + print("\n--- LOST BUILDER INTROS ---") + print("drafting encouragement for lost souls...\n") + + matches, error = find_matches_for_lost_builders(db, limit=args.limit or 10) + + if error: + print(f"error: {error}") + return + + if not matches: + print("no lost builders ready for outreach") + return + + config = get_lost_intro_config() + count = 0 + + for match in matches: + lost = match['lost_user'] + builder = match['inspiring_builder'] + + lost_name = lost.get('name') or lost.get('username') + builder_name = builder.get('name') or builder.get('username') + + # draft intro + draft, error = draft_lost_intro(lost, builder, config) + + if error: + print(f" error drafting intro for {lost_name}: {error}") + continue + + if args.dry_run: + print("=" * 60) + print(f"TO: {lost_name} ({lost.get('platform')})") + print(f"LOST SCORE: {lost.get('lost_potential_score', 0)}") + print(f"INSPIRING: {builder_name} ({builder.get('url')})") + print("-" * 60) + print("MESSAGE:") + print(draft) + print("-" * 60) + print("[DRY RUN - NOT SAVED]") + print("=" * 60) + else: + print(f" drafted intro for {lost_name} → {builder_name}") + + count += 1 + + if args.dry_run: + print(f"\npreviewed {count} lost builder intros (dry run)") + else: + 
print(f"\ndrafted {count} lost builder intros") + print("these require manual review before sending") + + return + + if args.match: + # specific match + matches = [m for m in get_top_matches(db, limit=1000) if m.get('id') == args.match] + else: + # top matches + matches = get_top_matches(db, limit=args.limit or 10) + + if not matches: + print("no matches found") + return + + print(f"generating intros for {len(matches)} matches...") + + count = 0 + for match in matches: + intros = draft_intros_for_match(match) + + for intro in intros: + recipient = intro['recipient_human'] + other = intro['other_human'] + + if args.dry_run: + # get contact info + contact = recipient.get('contact', {}) + if isinstance(contact, str): + contact = json_mod.loads(contact) + email = contact.get('email', 'no email') + + # get overlap reasons + reasons = match.get('overlap_reasons', []) + if isinstance(reasons, str): + reasons = json_mod.loads(reasons) + reason_summary = ', '.join(reasons[:3]) if reasons else 'aligned values' + + # print preview + print("\n" + "=" * 60) + print(f"TO: {recipient.get('username')} ({recipient.get('platform')})") + print(f"EMAIL: {email}") + print(f"SUBJECT: you might want to meet {other.get('username')}") + print(f"SCORE: {match.get('overlap_score', 0):.0f} ({reason_summary})") + print("-" * 60) + print("MESSAGE:") + print(intro['draft']) + print("-" * 60) + print("[DRY RUN - NOT SENT]") + print("=" * 60) + else: + print(f"\n {recipient.get('username')} ({intro['channel']})") + + # save to db + db.save_intro( + match.get('id'), + recipient.get('id'), + intro['channel'], + intro['draft'] + ) + + count += 1 + + if args.dry_run: + print(f"\npreviewed {count} intros (dry run - nothing saved)") + else: + print(f"\ngenerated {count} intro drafts") + print("run 'connectd review' to approve before sending") + + +def cmd_review(args, db): + """interactive review queue""" + review_all_pending(db) + + +def cmd_send(args, db): + """send approved intros""" + import json as 
json_mod + + if args.export: + # export manual queue to file for review + queue = load_manual_queue() + pending = [q for q in queue if q.get('status') == 'pending'] + + with open(args.export, 'w') as f: + json.dump(pending, f, indent=2) + + print(f"exported {len(pending)} pending intros to {args.export}") + return + + # send all approved from manual queue + queue = load_manual_queue() + approved = [q for q in queue if q.get('status') == 'approved'] + + if not approved: + print("no approved intros to send") + print("use 'connectd review' to approve intros first") + return + + print(f"sending {len(approved)} approved intros...") + + for item in approved: + match_data = item.get('match', {}) + intro_draft = item.get('draft', '') + recipient = item.get('recipient', {}) + + success, error, method = deliver_intro( + {'human_b': recipient, **match_data}, + intro_draft, + dry_run=args.dry_run if hasattr(args, 'dry_run') else False + ) + + status = 'ok' if success else f'failed: {error}' + print(f" {recipient.get('username')}: {method} - {status}") + + # update queue status + item['status'] = 'sent' if success else 'failed' + item['error'] = error + + save_manual_queue(queue) + + # show stats + stats = get_delivery_stats() + print(f"\ndelivery stats: {stats['sent']} sent, {stats['failed']} failed") + + +def cmd_lost(args, db): + """show lost builders ready for outreach""" + import json as json_mod + + print("=" * 60) + print("connectd lost - lost builders who need encouragement") + print("=" * 60) + + # get lost builders + lost_builders = db.get_lost_builders_for_outreach( + min_lost_score=args.min_score or 40, + min_values_score=20, + limit=args.limit or 50 + ) + + if not lost_builders: + print("\nno lost builders ready for outreach") + print("run 'connectd scout' to discover more") + return + + print(f"\n{len(lost_builders)} lost builders ready for outreach:\n") + + for i, lost in enumerate(lost_builders, 1): + name = lost.get('name') or lost.get('username') + platform = 
lost.get('platform') + lost_score = lost.get('lost_potential_score', 0) + values_score = lost.get('score', 0) + + # parse lost signals + lost_signals = lost.get('lost_signals', []) + if isinstance(lost_signals, str): + lost_signals = json_mod.loads(lost_signals) if lost_signals else [] + + # get signal descriptions + signal_descriptions = get_signal_descriptions(lost_signals) + + print(f"{i}. {name} ({platform})") + print(f" lost score: {lost_score} | values score: {values_score}") + print(f" url: {lost.get('url')}") + if signal_descriptions: + print(f" why lost: {', '.join(signal_descriptions[:3])}") + print() + + if args.verbose: + print("-" * 60) + print("these people need encouragement, not networking.") + print("the goal: show them someone like them made it.") + print("-" * 60) + + +def cmd_status(args, db): + """show database stats""" + import json as json_mod + + init_users_table(db.conn) + stats = db.stats() + + print("=" * 60) + print("connectd status") + print("=" * 60) + + # priority users + users = get_priority_users(db.conn) + print(f"\npriority users: {len(users)}") + for user in users: + print(f" - {user['name']} ({user['email']})") + + print(f"\nhumans discovered: {stats['total_humans']}") + print(f" high-score (50+): {stats['high_score_humans']}") + + print("\nby platform:") + for platform, count in stats.get('by_platform', {}).items(): + print(f" {platform}: {count}") + + print(f"\nstranger matches: {stats['total_matches']}") + print(f"intros created: {stats['total_intros']}") + print(f"intros sent: {stats['sent_intros']}") + + # lost builder stats + print("\n--- lost builder stats ---") + print(f"active builders: {stats.get('active_builders', 0)}") + print(f"lost builders: {stats.get('lost_builders', 0)}") + print(f"recovering builders: {stats.get('recovering_builders', 0)}") + print(f"high lost score (40+): {stats.get('high_lost_score', 0)}") + print(f"lost outreach sent: {stats.get('lost_outreach_sent', 0)}") + + # priority user matches + for 
user in users: + matches = get_priority_user_matches(db.conn, user['id']) + print(f"\nmatches for {user['name']}: {len(matches)}") + + # pending intros + pending = get_pending_intros(db) + print(f"\nintros pending review: {len(pending)}") + + +def cmd_daemon(args, db): + """run as continuous daemon""" + from daemon import ConnectDaemon + + daemon = ConnectDaemon(dry_run=args.dry_run) + + if args.oneshot: + print("running one cycle...") + if args.dry_run: + print("*** DRY RUN MODE - no intros will be sent ***") + daemon.scout_cycle() + daemon.match_priority_users() + daemon.match_strangers() + daemon.send_stranger_intros() + print("done") + else: + daemon.run() + + +def cmd_user(args, db): + """manage priority user profile""" + import json as json_mod + + init_users_table(db.conn) + + if args.setup: + # interactive setup + print("=" * 60) + print("connectd priority user setup") + print("=" * 60) + print("\nlink your profiles so connectd finds matches for YOU\n") + + name = input("name: ").strip() + email = input("email: ").strip() + github = input("github username: ").strip() or None + reddit = input("reddit username: ").strip() or None + mastodon = input("mastodon (user@instance): ").strip() or None + location = input("location (e.g. seattle): ").strip() or None + + print("\ninterests (comma separated):") + interests_raw = input("> ").strip() + interests = [i.strip() for i in interests_raw.split(',')] if interests_raw else [] + + looking_for = input("looking for: ").strip() or None + + user_data = { + 'name': name, 'email': email, 'github': github, + 'reddit': reddit, 'mastodon': mastodon, + 'location': location, 'interests': interests, + 'looking_for': looking_for, + } + user_id = add_priority_user(db.conn, user_data) + print(f"\nāœ“ added as priority user #{user_id}") + + elif args.matches: + # show matches + users = get_priority_users(db.conn) + if not users: + print("no priority user. 
run: connectd user --setup") + return + + for user in users: + print(f"\n=== matches for {user['name']} ===\n") + matches = get_priority_user_matches(db.conn, user['id'], limit=20) + + if not matches: + print("no matches yet") + continue + + for i, match in enumerate(matches, 1): + print(f"{i}. {match['username']} ({match['platform']})") + print(f" {match['url']}") + print(f" score: {match['overlap_score']:.0f}") + print() + + else: + # show profile + users = get_priority_users(db.conn) + if not users: + print("no priority user configured") + print("run: connectd user --setup") + return + + for user in users: + print("=" * 60) + print(f"priority user #{user['id']}: {user['name']}") + print("=" * 60) + print(f"email: {user['email']}") + if user['github']: + print(f"github: {user['github']}") + if user['reddit']: + print(f"reddit: {user['reddit']}") + if user['mastodon']: + print(f"mastodon: {user['mastodon']}") + if user['location']: + print(f"location: {user['location']}") + if user['interests']: + interests = json_mod.loads(user['interests']) if isinstance(user['interests'], str) else user['interests'] + print(f"interests: {', '.join(interests)}") + if user['looking_for']: + print(f"looking for: {user['looking_for']}") + + +def cmd_me(args, db): + """auto-score and auto-match for priority user with optional groq intros""" + import json as json_mod + + init_users_table(db.conn) + + # get priority user + users = get_priority_users(db.conn) + if not users: + print("no priority user configured") + print("run: connectd user --setup") + return + + user = users[0] # first/main user + print("=" * 60) + print(f"connectd me - {user['name']}") + print("=" * 60) + + # step 1: scrape github profile + if user.get('github') and not args.skip_scrape: + print(f"\n[1/4] scraping github profile: {user['github']}") + profile = deep_scrape_github_user(user['github'], scrape_commits=False) + if profile: + print(f" repos: {len(profile.get('top_repos', []))}") + print(f" languages: {', 
'.join(list(profile.get('languages', {}).keys())[:5])}") + else: + print(" failed to scrape (rate limited?)") + profile = None + else: + print("\n[1/4] skipping github scrape (using saved profile)") + # use saved profile if available + saved = user.get('scraped_profile') + if saved: + profile = json_mod.loads(saved) if isinstance(saved, str) else saved + print(f" loaded saved profile: {len(profile.get('top_repos', []))} repos") + else: + profile = None + + # step 2: calculate score + print(f"\n[2/4] calculating your score...") + result = score_priority_user(db.conn, user['id'], profile) + if result: + print(f" score: {result['score']}") + print(f" signals: {', '.join(sorted(result['signals'])[:10])}") + + # step 3: find matches + print(f"\n[3/4] finding matches...") + matches = auto_match_priority_user(db.conn, user['id'], min_overlap=args.min_overlap) + print(f" found {len(matches)} matches") + + # step 4: show results (optionally with groq intros) + print(f"\n[4/4] top matches:") + print("-" * 60) + + limit = args.limit or 10 + for i, m in enumerate(matches[:limit], 1): + human = m['human'] + shared = m['shared'] + + print(f"\n{i}. 
{human.get('name') or human['username']} ({human['platform']})") + print(f" {human.get('url', '')}") + print(f" score: {human.get('score', 0):.0f} | overlap: {m['overlap_score']:.0f}") + print(f" location: {human.get('location') or 'unknown'}") + print(f" why: {', '.join(shared[:5])}") + + # groq intro draft + if args.groq: + try: + from introd.groq_draft import draft_intro_with_llm + match_data = { + 'human_a': {'name': user['name'], 'username': user.get('github'), + 'platform': 'github', 'signals': result.get('signals', []) if result else [], + 'bio': user.get('bio'), 'location': user.get('location'), + 'extra': profile or {}}, + 'human_b': human, + 'overlap_score': m['overlap_score'], + 'overlap_reasons': shared, + } + intro, err = draft_intro_with_llm(match_data, recipient='b') + if intro: + print(f"\n --- groq draft ({intro.get('contact_method', 'manual')}) ---") + if intro.get('contact_info'): + print(f" deliver via: {intro['contact_info']}") + for line in intro['draft'].split('\n'): + print(f" {line}") + print(f" ------------------") + elif err: + print(f" [groq error: {err}]") + except Exception as e: + print(f" [groq error: {e}]") + + # summary + print("\n" + "=" * 60) + print(f"your score: {result['score'] if result else 'unknown'}") + print(f"matches found: {len(matches)}") + if args.groq: + print("groq intros: enabled") + else: + print("tip: add --groq to generate ai intro drafts") + + +def main(): + parser = argparse.ArgumentParser( + description='connectd - people discovery and matchmaking daemon', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + + subparsers = parser.add_subparsers(dest='command', help='commands') + + # scout command + scout_parser = subparsers.add_parser('scout', help='discover aligned humans') + scout_parser.add_argument('--github', action='store_true', help='github only') + scout_parser.add_argument('--reddit', action='store_true', help='reddit only') + scout_parser.add_argument('--mastodon', 
action='store_true', help='mastodon only') + scout_parser.add_argument('--lobsters', action='store_true', help='lobste.rs only') + scout_parser.add_argument('--matrix', action='store_true', help='matrix only') + scout_parser.add_argument('--twitter', action='store_true', help='twitter/x via nitter') + scout_parser.add_argument('--bluesky', action='store_true', help='bluesky/atproto') + scout_parser.add_argument('--lemmy', action='store_true', help='lemmy (fediverse reddit)') + scout_parser.add_argument('--discord', action='store_true', help='discord servers') + scout_parser.add_argument('--deep', action='store_true', help='deep scrape - follow all links') + scout_parser.add_argument('--user', type=str, help='deep scrape specific github user') + scout_parser.add_argument('--lost', action='store_true', help='show lost builder stats') + + # match command + match_parser = subparsers.add_parser('match', help='find and rank matches') + match_parser.add_argument('--top', type=int, help='show top N matches') + match_parser.add_argument('--mine', action='store_true', help='show YOUR matches') + match_parser.add_argument('--lost', action='store_true', help='find matches for lost builders') + match_parser.add_argument('--min-score', type=int, default=30, help='min human score') + match_parser.add_argument('--min-overlap', type=int, default=20, help='min overlap score') + + # intro command + intro_parser = subparsers.add_parser('intro', help='generate intro drafts') + intro_parser.add_argument('--match', type=int, help='specific match id') + intro_parser.add_argument('--limit', type=int, default=10, help='number of matches') + intro_parser.add_argument('--dry-run', action='store_true', help='preview only, do not save') + intro_parser.add_argument('--lost', action='store_true', help='generate intros for lost builders') + + # lost command - show lost builders ready for outreach + lost_parser = subparsers.add_parser('lost', help='show lost builders who need encouragement') + 
lost_parser.add_argument('--min-score', type=int, default=40, help='min lost score') + lost_parser.add_argument('--limit', type=int, default=50, help='max results') + lost_parser.add_argument('--verbose', '-v', action='store_true', help='show philosophy') + + # review command + review_parser = subparsers.add_parser('review', help='review intro queue') + + # send command + send_parser = subparsers.add_parser('send', help='send approved intros') + send_parser.add_argument('--export', type=str, help='export to file for manual sending') + + # status command + status_parser = subparsers.add_parser('status', help='show stats') + + # daemon command + daemon_parser = subparsers.add_parser('daemon', help='run as continuous daemon') + daemon_parser.add_argument('--oneshot', action='store_true', help='run once then exit') + daemon_parser.add_argument('--dry-run', action='store_true', help='preview intros, do not send') + + # user command + user_parser = subparsers.add_parser('user', help='manage priority user profile') + user_parser.add_argument('--setup', action='store_true', help='setup/update profile') + user_parser.add_argument('--matches', action='store_true', help='show your matches') + + # me command - auto score + match + optional groq intros + me_parser = subparsers.add_parser('me', help='auto-score and match yourself') + me_parser.add_argument('--groq', action='store_true', help='generate groq llama intro drafts') + me_parser.add_argument('--skip-scrape', action='store_true', help='skip github scraping') + me_parser.add_argument('--min-overlap', type=int, default=40, help='min overlap score') + me_parser.add_argument('--limit', type=int, default=10, help='number of matches to show') + + args = parser.parse_args() + + if not args.command: + parser.print_help() + return + + # init database + db = Database() + + try: + if args.command == 'scout': + cmd_scout(args, db) + elif args.command == 'match': + cmd_match(args, db) + elif args.command == 'intro': + 
cmd_intro(args, db) + elif args.command == 'review': + cmd_review(args, db) + elif args.command == 'send': + cmd_send(args, db) + elif args.command == 'status': + cmd_status(args, db) + elif args.command == 'daemon': + cmd_daemon(args, db) + elif args.command == 'user': + cmd_user(args, db) + elif args.command == 'me': + cmd_me(args, db) + elif args.command == 'lost': + cmd_lost(args, db) + finally: + db.close() + + +if __name__ == '__main__': + main() diff --git a/config.py b/config.py new file mode 100644 index 0000000..26ebf66 --- /dev/null +++ b/config.py @@ -0,0 +1,124 @@ +""" +connectd/config.py - central configuration + +all configurable settings in one place. +""" + +import os +from pathlib import Path + +# base paths +BASE_DIR = Path(__file__).parent +DB_DIR = BASE_DIR / 'db' +DATA_DIR = BASE_DIR / 'data' +CACHE_DIR = DB_DIR / 'cache' + +# ensure directories exist +DATA_DIR.mkdir(exist_ok=True) +CACHE_DIR.mkdir(exist_ok=True) + + +# === DAEMON CONFIG === +SCOUT_INTERVAL = 3600 * 4 # full scout every 4 hours +MATCH_INTERVAL = 3600 # check matches every hour +INTRO_INTERVAL = 1800 # send intros every 2 hours +MAX_INTROS_PER_DAY = 250 # rate limit builder-to-builder outreach + + +# === MATCHING CONFIG === +MIN_OVERLAP_PRIORITY = 30 # min score for priority user matches +MIN_OVERLAP_STRANGERS = 50 # higher bar for stranger intros +MIN_HUMAN_SCORE = 25 # min values score to be considered + + +# === LOST BUILDER CONFIG === +# these people need encouragement, not networking. +# the goal isn't to recruit them - it's to show them the door exists. 
+ +LOST_CONFIG = { + # detection thresholds + 'min_lost_score': 40, # minimum lost_potential_score + 'min_values_score': 20, # must have SOME values alignment + + # outreach settings + 'enabled': True, + 'max_per_day': 20, # lower volume, higher care + 'require_review': False, # fully autonomous + 'cooldown_days': 90, # don't spam struggling people + + # matching settings + 'min_builder_score': 50, # inspiring builders must be active + 'min_match_overlap': 10, # must have SOME shared interests + + # LLM drafting + 'use_llm': True, + 'llm_temperature': 0.7, # be genuine, not robotic + + # message guidelines (for LLM prompt) + 'tone': 'genuine, not salesy', + 'max_words': 150, # they don't have energy for long messages + 'no_pressure': True, # never pushy + 'sign_off': '- connectd', +} + + +# === API CREDENTIALS === +# all credentials from environment variables - no defaults + +GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') +GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' +GROQ_MODEL = os.environ.get('GROQ_MODEL', 'llama-3.3-70b-versatile') + +GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') +MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '') +MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '') + +BLUESKY_HANDLE = os.environ.get('BLUESKY_HANDLE', '') +BLUESKY_APP_PASSWORD = os.environ.get('BLUESKY_APP_PASSWORD', '') + +MATRIX_HOMESERVER = os.environ.get('MATRIX_HOMESERVER', '') +MATRIX_USER_ID = os.environ.get('MATRIX_USER_ID', '') +MATRIX_ACCESS_TOKEN = os.environ.get('MATRIX_ACCESS_TOKEN', '') + +DISCORD_BOT_TOKEN = os.environ.get('DISCORD_BOT_TOKEN', '') +DISCORD_TARGET_SERVERS = os.environ.get('DISCORD_TARGET_SERVERS', '') + +# lemmy (for authenticated access to private instance) +LEMMY_INSTANCE = os.environ.get('LEMMY_INSTANCE', '') +LEMMY_USERNAME = os.environ.get('LEMMY_USERNAME', '') +LEMMY_PASSWORD = os.environ.get('LEMMY_PASSWORD', '') + +# email (for sending intros) +SMTP_HOST = os.environ.get('SMTP_HOST', '') +SMTP_PORT = 
int(os.environ.get('SMTP_PORT', '465')) +SMTP_USER = os.environ.get('SMTP_USER', '') +SMTP_PASS = os.environ.get('SMTP_PASS', '') + +# === HOST USER CONFIG === +# the person running connectd - gets priority matching +HOST_USER = os.environ.get('HOST_USER', '') # alias like sudoxnym +HOST_NAME = os.environ.get('HOST_NAME', '') +HOST_EMAIL = os.environ.get('HOST_EMAIL', '') +HOST_GITHUB = os.environ.get('HOST_GITHUB', '') +HOST_MASTODON = os.environ.get('HOST_MASTODON', '') # user@instance +HOST_REDDIT = os.environ.get('HOST_REDDIT', '') +HOST_LEMMY = os.environ.get('HOST_LEMMY', '') # user@instance +HOST_LOBSTERS = os.environ.get('HOST_LOBSTERS', '') +HOST_MATRIX = os.environ.get('HOST_MATRIX', '') # @user:server +HOST_DISCORD = os.environ.get('HOST_DISCORD', '') # user id +HOST_BLUESKY = os.environ.get('HOST_BLUESKY', '') # handle.bsky.social +HOST_LOCATION = os.environ.get('HOST_LOCATION', '') +HOST_INTERESTS = os.environ.get('HOST_INTERESTS', '') # comma separated +HOST_LOOKING_FOR = os.environ.get('HOST_LOOKING_FOR', '') + + +def get_lost_config(): + """get lost builder configuration""" + return LOST_CONFIG.copy() + + +def update_lost_config(updates): + """update lost builder configuration""" + global LOST_CONFIG + LOST_CONFIG.update(updates) + return LOST_CONFIG.copy() diff --git a/connectd_logo.png b/connectd_logo.png new file mode 100644 index 0000000..212b1a9 Binary files /dev/null and b/connectd_logo.png differ diff --git a/daemon.py b/daemon.py new file mode 100644 index 0000000..711581b --- /dev/null +++ b/daemon.py @@ -0,0 +1,550 @@ +#!/usr/bin/env python3 +""" +connectd daemon - continuous discovery and matchmaking + +two modes of operation: +1. priority matching: find matches FOR hosts who run connectd +2. 
altruistic matching: connect strangers to each other + +runs continuously, respects rate limits, sends intros automatically +""" + +import time +import json +import signal +import sys +from datetime import datetime, timedelta +from pathlib import Path + +from db import Database +from db.users import (init_users_table, get_priority_users, save_priority_match, + get_priority_user_matches, discover_host_user) +from scoutd import scrape_github, scrape_reddit, scrape_mastodon, scrape_lobsters, scrape_lemmy, scrape_discord +from config import HOST_USER, INTRO_INTERVAL, MAX_INTROS_PER_DAY, SCOUT_INTERVAL, MATCH_INTERVAL +from scoutd.github import analyze_github_user, get_github_user +from scoutd.signals import analyze_text +from matchd.fingerprint import generate_fingerprint, fingerprint_similarity +from matchd.overlap import find_overlap +from matchd.lost import find_matches_for_lost_builders +from introd.groq_draft import draft_intro_with_llm as draft_intro +from introd.lost_intro import draft_lost_intro, get_lost_intro_config +from introd.send import send_email +from introd.deliver import deliver_intro, determine_best_contact +from config import get_lost_config +from api import start_api_thread, update_daemon_state + +# daemon config +LOST_INTERVAL = 3600 * 6 # lost builder outreach every 6 hours (lower volume) +MIN_OVERLAP_PRIORITY = 30 # min score for priority user matches +MIN_OVERLAP_STRANGERS = 50 # higher bar for stranger intros + + +class ConnectDaemon: + def __init__(self, dry_run=False): + self.db = Database() + init_users_table(self.db.conn) + self.running = True + self.dry_run = dry_run + self.started_at = datetime.now() + self.last_scout = None + self.last_match = None + self.last_intro = None + self.last_lost = None + self.intros_today = 0 + self.lost_intros_today = 0 + self.today = datetime.now().date() + + # handle shutdown gracefully + signal.signal(signal.SIGINT, self._shutdown) + signal.signal(signal.SIGTERM, self._shutdown) + + # auto-discover host 
user from env + if HOST_USER: + self.log(f"HOST_USER set: {HOST_USER}") + discover_host_user(self.db.conn, HOST_USER) + + # update API state + self._update_api_state() + + def _shutdown(self, signum, frame): + print("\nconnectd: shutting down...") + self.running = False + self._update_api_state() + + def _update_api_state(self): + """update API state for HA integration""" + now = datetime.now() + + # calculate countdowns - if no cycle has run, use started_at + def secs_until(last, interval): + base = last if last else self.started_at + next_run = base + timedelta(seconds=interval) + remaining = (next_run - now).total_seconds() + return max(0, int(remaining)) + + update_daemon_state({ + 'running': self.running, + 'dry_run': self.dry_run, + 'last_scout': self.last_scout.isoformat() if self.last_scout else None, + 'last_match': self.last_match.isoformat() if self.last_match else None, + 'last_intro': self.last_intro.isoformat() if self.last_intro else None, + 'last_lost': self.last_lost.isoformat() if self.last_lost else None, + 'intros_today': self.intros_today, + 'lost_intros_today': self.lost_intros_today, + 'started_at': self.started_at.isoformat(), + 'countdown_scout': secs_until(self.last_scout, SCOUT_INTERVAL), + 'countdown_match': secs_until(self.last_match, MATCH_INTERVAL), + 'countdown_intro': secs_until(self.last_intro, INTRO_INTERVAL), + 'countdown_lost': secs_until(self.last_lost, LOST_INTERVAL), + }) + + def log(self, msg): + """timestamped log""" + print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}") + + def reset_daily_limits(self): + """reset daily intro count""" + if datetime.now().date() != self.today: + self.today = datetime.now().date() + self.intros_today = 0 + self.lost_intros_today = 0 + self.log("reset daily intro limits") + + def scout_cycle(self): + """run discovery on all platforms""" + self.log("starting scout cycle...") + + try: + scrape_github(self.db, limit_per_source=30) + except Exception as e: + self.log(f"github scout 
error: {e}") + + try: + scrape_reddit(self.db, limit_per_sub=30) + except Exception as e: + self.log(f"reddit scout error: {e}") + + try: + scrape_mastodon(self.db, limit_per_instance=30) + except Exception as e: + self.log(f"mastodon scout error: {e}") + + try: + scrape_lobsters(self.db) + except Exception as e: + self.log(f"lobsters scout error: {e}") + + try: + scrape_lemmy(self.db, limit_per_community=30) + except Exception as e: + self.log(f"lemmy scout error: {e}") + + try: + scrape_discord(self.db, limit_per_channel=50) + except Exception as e: + self.log(f"discord scout error: {e}") + + self.last_scout = datetime.now() + stats = self.db.stats() + self.log(f"scout complete: {stats['total_humans']} humans in db") + + def match_priority_users(self): + """find matches for priority users (hosts)""" + priority_users = get_priority_users(self.db.conn) + + if not priority_users: + return + + self.log(f"matching for {len(priority_users)} priority users...") + + humans = self.db.get_all_humans(min_score=20, limit=500) + + for puser in priority_users: + # build priority user's fingerprint from their linked profiles + puser_signals = [] + puser_text = [] + + if puser.get('bio'): + puser_text.append(puser['bio']) + if puser.get('interests'): + interests = json.loads(puser['interests']) if isinstance(puser['interests'], str) else puser['interests'] + puser_signals.extend(interests) + if puser.get('looking_for'): + puser_text.append(puser['looking_for']) + + # analyze their linked github if available + if puser.get('github'): + gh_user = analyze_github_user(puser['github']) + if gh_user: + puser_signals.extend(gh_user.get('signals', [])) + + puser_fingerprint = { + 'values_vector': {}, + 'skills': {}, + 'interests': list(set(puser_signals)), + 'location_pref': 'pnw' if puser.get('location') and 'seattle' in puser['location'].lower() else None, + } + + # score text + if puser_text: + _, text_signals, _ = analyze_text(' '.join(puser_text)) + 
puser_signals.extend(text_signals) + + # find matches + matches_found = 0 + for human in humans: + # skip if it's their own profile on another platform + human_user = human.get('username', '').lower() + if puser.get('github') and human_user == puser['github'].lower(): + continue + if puser.get('reddit') and human_user == puser['reddit'].lower(): + continue + if puser.get('mastodon') and human_user == puser['mastodon'].lower().split('@')[0]: + continue + + # calculate overlap + human_signals = human.get('signals', []) + if isinstance(human_signals, str): + human_signals = json.loads(human_signals) + + shared = set(puser_signals) & set(human_signals) + overlap_score = len(shared) * 10 + + # location bonus + if puser.get('location') and human.get('location'): + if 'seattle' in human['location'].lower() or 'pnw' in human['location'].lower(): + overlap_score += 20 + + if overlap_score >= MIN_OVERLAP_PRIORITY: + overlap_data = { + 'overlap_score': overlap_score, + 'overlap_reasons': [f"shared: {', '.join(list(shared)[:5])}"] if shared else [], + } + save_priority_match(self.db.conn, puser['id'], human['id'], overlap_data) + matches_found += 1 + + if matches_found: + self.log(f" found {matches_found} matches for {puser['name'] or puser['email']}") + + def match_strangers(self): + """find matches between discovered humans (altruistic)""" + self.log("matching strangers...") + + humans = self.db.get_all_humans(min_score=40, limit=200) + + if len(humans) < 2: + return + + # generate fingerprints + fingerprints = {} + for human in humans: + fp = generate_fingerprint(human) + fingerprints[human['id']] = fp + + # find pairs + matches_found = 0 + from itertools import combinations + + for human_a, human_b in combinations(humans, 2): + # skip same platform same user + if human_a['platform'] == human_b['platform']: + if human_a['username'] == human_b['username']: + continue + + fp_a = fingerprints.get(human_a['id']) + fp_b = fingerprints.get(human_b['id']) + + overlap = 
find_overlap(human_a, human_b, fp_a, fp_b) + + if overlap['overlap_score'] >= MIN_OVERLAP_STRANGERS: + # save match + self.db.save_match(human_a['id'], human_b['id'], overlap) + matches_found += 1 + + if matches_found: + self.log(f"found {matches_found} stranger matches") + + self.last_match = datetime.now() + + def send_stranger_intros(self): + """send intros to connect strangers (or preview in dry-run mode)""" + self.reset_daily_limits() + + if not self.dry_run and self.intros_today >= MAX_INTROS_PER_DAY: + self.log("daily intro limit reached") + return + + # get unsent matches + c = self.db.conn.cursor() + c.execute('''SELECT m.*, + ha.id as a_id, ha.username as a_user, ha.platform as a_platform, + ha.name as a_name, ha.url as a_url, ha.contact as a_contact, + ha.signals as a_signals, ha.extra as a_extra, + hb.id as b_id, hb.username as b_user, hb.platform as b_platform, + hb.name as b_name, hb.url as b_url, hb.contact as b_contact, + hb.signals as b_signals, hb.extra as b_extra + FROM matches m + JOIN humans ha ON m.human_a_id = ha.id + JOIN humans hb ON m.human_b_id = hb.id + WHERE m.status = 'pending' + ORDER BY m.overlap_score DESC + LIMIT 10''') + + matches = c.fetchall() + + if self.dry_run: + self.log(f"DRY RUN: previewing {len(matches)} potential intros") + + for match in matches: + if not self.dry_run and self.intros_today >= MAX_INTROS_PER_DAY: + break + + match = dict(match) + + # build human dicts + human_a = { + 'id': match['a_id'], + 'username': match['a_user'], + 'platform': match['a_platform'], + 'name': match['a_name'], + 'url': match['a_url'], + 'contact': match['a_contact'], + 'signals': match['a_signals'], + 'extra': match['a_extra'], + } + human_b = { + 'id': match['b_id'], + 'username': match['b_user'], + 'platform': match['b_platform'], + 'name': match['b_name'], + 'url': match['b_url'], + 'contact': match['b_contact'], + 'signals': match['b_signals'], + 'extra': match['b_extra'], + } + + match_data = { + 'id': match['id'], + 'human_a': 
human_a, + 'human_b': human_b, + 'overlap_score': match['overlap_score'], + 'overlap_reasons': match['overlap_reasons'], + } + + # ACTIVITY-BASED CONTACT SELECTION + # use deliver_intro which calls determine_best_contact + # picks method based on WHERE they're most active: + # - mastodon DM if active on fediverse + # - github issue if actively committing + # - email ONLY as last resort + + for recipient, other in [(human_a, human_b), (human_b, human_a)]: + # draft intro using groq LLM + # retry groq up to 3 times with 10s wait + intro_result, intro_error = None, None + for retry in range(3): + intro_result, intro_error = draft_intro(match_data, recipient='a' if recipient == human_a else 'b') + if not intro_error: + break + self.log(f"groq retry {retry+1}/3: {intro_error}") + import time + time.sleep(10) + + if intro_error: + self.log(f"failed to draft intro after retries: {intro_error}") + continue + intro = {'draft': intro_result.get('draft', '')} + + # parse overlap reasons for display + reasons = match['overlap_reasons'] + if isinstance(reasons, str): + reasons = json.loads(reasons) + reason_summary = ', '.join(reasons[:3]) if reasons else 'aligned values' + + # determine best contact method based on activity + method, contact_info = determine_best_contact(recipient) + + if self.dry_run: + print("\n" + "=" * 60) + print(f"TO: {recipient['username']} ({recipient['platform']})") + print(f"METHOD: {method} -> {contact_info}") + print(f"SCORE: {match['overlap_score']:.0f} ({reason_summary})") + print("-" * 60) + print("MESSAGE:") + print(intro['draft']) + print("-" * 60) + print("[DRY RUN - NOT SENT]") + print("=" * 60) + break + else: + # deliver via activity-based method selection + success, error, delivery_method = deliver_intro(match_data, intro['draft'], intro.get('subject')) + + if success: + self.log(f"sent intro to {recipient['username']} via {delivery_method}") + self.intros_today += 1 + + # mark match as intro_sent + c.execute('UPDATE matches SET status = 
"intro_sent" WHERE id = ?', + (match['id'],)) + self.db.conn.commit() + break + else: + self.log(f"failed to reach {recipient['username']} via {delivery_method}: {error}") + + self.last_intro = datetime.now() + + def send_lost_builder_intros(self): + """ + reach out to lost builders - different tone, lower volume. + these people need encouragement, not networking. + """ + self.reset_daily_limits() + + lost_config = get_lost_config() + + if not lost_config.get('enabled', True): + return + + max_per_day = lost_config.get('max_per_day', 5) + if not self.dry_run and self.lost_intros_today >= max_per_day: + self.log("daily lost builder intro limit reached") + return + + # find lost builders with matching active builders + matches, error = find_matches_for_lost_builders( + self.db, + min_lost_score=lost_config.get('min_lost_score', 40), + min_values_score=lost_config.get('min_values_score', 20), + limit=max_per_day - self.lost_intros_today + ) + + if error: + self.log(f"lost builder matching error: {error}") + return + + if not matches: + self.log("no lost builders ready for outreach") + return + + if self.dry_run: + self.log(f"DRY RUN: previewing {len(matches)} lost builder intros") + + for match in matches: + if not self.dry_run and self.lost_intros_today >= max_per_day: + break + + lost = match['lost_user'] + builder = match['inspiring_builder'] + + lost_name = lost.get('name') or lost.get('username') + builder_name = builder.get('name') or builder.get('username') + + # draft intro + draft, draft_error = draft_lost_intro(lost, builder, lost_config) + + if draft_error: + self.log(f"error drafting lost intro for {lost_name}: {draft_error}") + continue + + # determine best contact method (activity-based) + method, contact_info = determine_best_contact(lost) + + if self.dry_run: + print("\n" + "=" * 60) + print("LOST BUILDER OUTREACH") + print("=" * 60) + print(f"TO: {lost_name} ({lost.get('platform')})") + print(f"DELIVERY: {method} → {contact_info}") + print(f"LOST 
SCORE: {lost.get('lost_potential_score', 0)}") + print(f"VALUES SCORE: {lost.get('score', 0)}") + print(f"INSPIRING BUILDER: {builder_name}") + print(f"SHARED INTERESTS: {', '.join(match.get('shared_interests', []))}") + print("-" * 60) + print("MESSAGE:") + print(draft) + print("-" * 60) + print("[DRY RUN - NOT SENT]") + print("=" * 60) + else: + # build match data for unified delivery + match_data = { + 'human_a': builder, # inspiring builder + 'human_b': lost, # lost builder (recipient) + 'overlap_score': match.get('match_score', 0), + 'overlap_reasons': match.get('shared_interests', []), + } + + success, error, delivery_method = deliver_intro(match_data, draft, None) + + if success: + self.log(f"sent lost builder intro to {lost_name} via {delivery_method}") + self.lost_intros_today += 1 + self.db.mark_lost_outreach(lost['id']) + else: + self.log(f"failed to reach {lost_name} via {delivery_method}: {error}") + + self.last_lost = datetime.now() + self.log(f"lost builder cycle complete: {self.lost_intros_today} sent today") + + def run(self): + """main daemon loop""" + self.log("connectd daemon starting...") + + # start API server + start_api_thread() + self.log("api server started on port 8099") + + if self.dry_run: + self.log("*** DRY RUN MODE - no intros will be sent ***") + self.log(f"scout interval: {SCOUT_INTERVAL}s") + self.log(f"match interval: {MATCH_INTERVAL}s") + self.log(f"intro interval: {INTRO_INTERVAL}s") + self.log(f"lost interval: {LOST_INTERVAL}s") + self.log(f"max intros/day: {MAX_INTROS_PER_DAY}") + + # initial scout + self.scout_cycle() + self._update_api_state() + + while self.running: + now = datetime.now() + + # scout cycle + if not self.last_scout or (now - self.last_scout).seconds >= SCOUT_INTERVAL: + self.scout_cycle() + self._update_api_state() + + # match cycle + if not self.last_match or (now - self.last_match).seconds >= MATCH_INTERVAL: + self.match_priority_users() + self.match_strangers() + self._update_api_state() + + # intro 
cycle + if not self.last_intro or (now - self.last_intro).seconds >= INTRO_INTERVAL: + self.send_stranger_intros() + self._update_api_state() + + # lost builder cycle + if not self.last_lost or (now - self.last_lost).seconds >= LOST_INTERVAL: + self.send_lost_builder_intros() + self._update_api_state() + + # sleep between checks + time.sleep(60) + + self.log("connectd daemon stopped") + self.db.close() + + +def run_daemon(dry_run=False): + """entry point""" + daemon = ConnectDaemon(dry_run=dry_run) + daemon.run() + + +if __name__ == '__main__': + import sys + dry_run = '--dry-run' in sys.argv + run_daemon(dry_run=dry_run) diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..3c42af4 --- /dev/null +++ b/db/__init__.py @@ -0,0 +1,375 @@ +""" +connectd database layer +sqlite storage for humans, fingerprints, matches, intros +""" + +import os +import sqlite3 +import json +from datetime import datetime +from pathlib import Path + +# use env var for DB path (docker) or default to local +DB_PATH = Path(os.environ.get('DB_PATH', Path(__file__).parent / 'connectd.db')) + + +class Database: + def __init__(self, path=None): + self.path = path or DB_PATH + self.conn = sqlite3.connect(self.path) + self.conn.row_factory = sqlite3.Row + self._init_tables() + + def _init_tables(self): + c = self.conn.cursor() + + # humans table - all discovered people + c.execute('''CREATE TABLE IF NOT EXISTS humans ( + id INTEGER PRIMARY KEY, + platform TEXT NOT NULL, + username TEXT NOT NULL, + url TEXT, + name TEXT, + bio TEXT, + location TEXT, + score REAL DEFAULT 0, + confidence REAL DEFAULT 0, + signals TEXT, + negative_signals TEXT, + reasons TEXT, + contact TEXT, + extra TEXT, + fingerprint_id INTEGER, + scraped_at TEXT, + updated_at TEXT, + lost_potential_score REAL DEFAULT 0, + lost_signals TEXT, + user_type TEXT DEFAULT 'none', + last_lost_outreach TEXT, + UNIQUE(platform, username) + )''') + + # migration: add new columns if they don't exist + try: + 
c.execute('ALTER TABLE humans ADD COLUMN lost_potential_score REAL DEFAULT 0') + except sqlite3.OperationalError: + pass # column exists + + try: + c.execute('ALTER TABLE humans ADD COLUMN lost_signals TEXT') + except sqlite3.OperationalError: + pass + + try: + c.execute('ALTER TABLE humans ADD COLUMN user_type TEXT DEFAULT "none"') + except sqlite3.OperationalError: + pass + + try: + c.execute('ALTER TABLE humans ADD COLUMN last_lost_outreach TEXT') + except sqlite3.OperationalError: + pass + + # fingerprints table - values profiles + c.execute('''CREATE TABLE IF NOT EXISTS fingerprints ( + id INTEGER PRIMARY KEY, + human_id INTEGER, + values_vector TEXT, + skills TEXT, + interests TEXT, + location_pref TEXT, + availability TEXT, + generated_at TEXT, + FOREIGN KEY(human_id) REFERENCES humans(id) + )''') + + # matches table - paired humans + c.execute('''CREATE TABLE IF NOT EXISTS matches ( + id INTEGER PRIMARY KEY, + human_a_id INTEGER, + human_b_id INTEGER, + overlap_score REAL, + overlap_reasons TEXT, + complementary_skills TEXT, + geographic_match INTEGER, + status TEXT DEFAULT 'pending', + created_at TEXT, + reviewed_at TEXT, + FOREIGN KEY(human_a_id) REFERENCES humans(id), + FOREIGN KEY(human_b_id) REFERENCES humans(id), + UNIQUE(human_a_id, human_b_id) + )''') + + # intros table - outreach attempts + c.execute('''CREATE TABLE IF NOT EXISTS intros ( + id INTEGER PRIMARY KEY, + match_id INTEGER, + recipient_human_id INTEGER, + channel TEXT, + draft TEXT, + status TEXT DEFAULT 'draft', + approved_by TEXT, + approved_at TEXT, + sent_at TEXT, + response TEXT, + response_at TEXT, + FOREIGN KEY(match_id) REFERENCES matches(id), + FOREIGN KEY(recipient_human_id) REFERENCES humans(id) + )''') + + # cross-platform links + c.execute('''CREATE TABLE IF NOT EXISTS cross_platform ( + id INTEGER PRIMARY KEY, + human_a_id INTEGER, + human_b_id INTEGER, + confidence REAL, + reason TEXT, + FOREIGN KEY(human_a_id) REFERENCES humans(id), + FOREIGN KEY(human_b_id) REFERENCES 
humans(id), + UNIQUE(human_a_id, human_b_id) + )''') + + self.conn.commit() + + def save_human(self, data): + """save or update a human record""" + c = self.conn.cursor() + + # fields to exclude from extra json + exclude_fields = ['platform', 'username', 'url', 'name', 'bio', + 'location', 'score', 'confidence', 'signals', + 'negative_signals', 'reasons', 'contact', + 'lost_potential_score', 'lost_signals', 'user_type'] + + c.execute('''INSERT OR REPLACE INTO humans + (platform, username, url, name, bio, location, score, confidence, + signals, negative_signals, reasons, contact, extra, scraped_at, updated_at, + lost_potential_score, lost_signals, user_type) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', + (data.get('platform'), + data.get('username'), + data.get('url'), + data.get('name'), + data.get('bio'), + data.get('location'), + data.get('score', 0), + data.get('confidence', 0), + json.dumps(data.get('signals', [])), + json.dumps(data.get('negative_signals', [])), + json.dumps(data.get('reasons', [])), + json.dumps(data.get('contact', {})), + json.dumps({k: v for k, v in data.items() if k not in exclude_fields}), + data.get('scraped_at', datetime.now().isoformat()), + datetime.now().isoformat(), + data.get('lost_potential_score', 0), + json.dumps(data.get('lost_signals', [])), + data.get('user_type', 'none'))) + + self.conn.commit() + return c.lastrowid + + def get_human(self, platform, username): + """get a human by platform and username""" + c = self.conn.cursor() + c.execute('SELECT * FROM humans WHERE platform = ? 
AND username = ?', + (platform, username)) + row = c.fetchone() + return dict(row) if row else None + + def get_human_by_id(self, human_id): + """get a human by id""" + c = self.conn.cursor() + c.execute('SELECT * FROM humans WHERE id = ?', (human_id,)) + row = c.fetchone() + return dict(row) if row else None + + def get_all_humans(self, min_score=0, limit=1000): + """get all humans above score threshold""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE score >= ? + ORDER BY score DESC, confidence DESC + LIMIT ?''', (min_score, limit)) + return [dict(row) for row in c.fetchall()] + + def get_humans_by_platform(self, platform, min_score=0, limit=500): + """get humans for a specific platform""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE platform = ? AND score >= ? + ORDER BY score DESC + LIMIT ?''', (platform, min_score, limit)) + return [dict(row) for row in c.fetchall()] + + def get_lost_builders(self, min_lost_score=40, min_values_score=20, limit=100): + """get lost builders who need encouragement""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE user_type = 'lost' OR user_type = 'both' + AND lost_potential_score >= ? + AND score >= ? + ORDER BY lost_potential_score DESC, score DESC + LIMIT ?''', (min_lost_score, min_values_score, limit)) + return [dict(row) for row in c.fetchall()] + + def get_lost_builders_for_outreach(self, min_lost_score=40, min_values_score=20, + cooldown_days=90, limit=50): + """get lost builders who are ready for outreach (respecting cooldown)""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE (user_type = 'lost' OR user_type = 'both') + AND lost_potential_score >= ? + AND score >= ? + AND (last_lost_outreach IS NULL + OR datetime(last_lost_outreach) < datetime('now', '-' || ? 
|| ' days')) + ORDER BY lost_potential_score DESC, score DESC + LIMIT ?''', (min_lost_score, min_values_score, cooldown_days, limit)) + return [dict(row) for row in c.fetchall()] + + def get_active_builders(self, min_score=50, limit=100): + """get active builders who can inspire lost builders""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE user_type = 'builder' + AND score >= ? + ORDER BY score DESC, confidence DESC + LIMIT ?''', (min_score, limit)) + return [dict(row) for row in c.fetchall()] + + def mark_lost_outreach(self, human_id): + """mark that we reached out to a lost builder""" + c = self.conn.cursor() + c.execute('''UPDATE humans SET last_lost_outreach = ? WHERE id = ?''', + (datetime.now().isoformat(), human_id)) + self.conn.commit() + + def save_fingerprint(self, human_id, fingerprint_data): + """save a fingerprint for a human""" + c = self.conn.cursor() + c.execute('''INSERT OR REPLACE INTO fingerprints + (human_id, values_vector, skills, interests, location_pref, availability, generated_at) + VALUES (?, ?, ?, ?, ?, ?, ?)''', + (human_id, + json.dumps(fingerprint_data.get('values_vector', {})), + json.dumps(fingerprint_data.get('skills', [])), + json.dumps(fingerprint_data.get('interests', [])), + fingerprint_data.get('location_pref'), + fingerprint_data.get('availability'), + datetime.now().isoformat())) + + # update human's fingerprint_id + c.execute('UPDATE humans SET fingerprint_id = ? 
WHERE id = ?', + (c.lastrowid, human_id)) + self.conn.commit() + return c.lastrowid + + def get_fingerprint(self, human_id): + """get fingerprint for a human""" + c = self.conn.cursor() + c.execute('SELECT * FROM fingerprints WHERE human_id = ?', (human_id,)) + row = c.fetchone() + return dict(row) if row else None + + def save_match(self, human_a_id, human_b_id, match_data): + """save a match between two humans""" + c = self.conn.cursor() + c.execute('''INSERT OR REPLACE INTO matches + (human_a_id, human_b_id, overlap_score, overlap_reasons, + complementary_skills, geographic_match, status, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)''', + (human_a_id, human_b_id, + match_data.get('overlap_score', 0), + json.dumps(match_data.get('overlap_reasons', [])), + json.dumps(match_data.get('complementary_skills', [])), + 1 if match_data.get('geographic_match') else 0, + 'pending', + datetime.now().isoformat())) + self.conn.commit() + return c.lastrowid + + def get_matches(self, status=None, limit=100): + """get matches, optionally filtered by status""" + c = self.conn.cursor() + if status: + c.execute('''SELECT * FROM matches WHERE status = ? 
+ ORDER BY overlap_score DESC LIMIT ?''', (status, limit)) + else: + c.execute('''SELECT * FROM matches + ORDER BY overlap_score DESC LIMIT ?''', (limit,)) + return [dict(row) for row in c.fetchall()] + + def save_intro(self, match_id, recipient_id, channel, draft): + """save an intro draft""" + c = self.conn.cursor() + c.execute('''INSERT INTO intros + (match_id, recipient_human_id, channel, draft, status) + VALUES (?, ?, ?, ?, 'draft')''', + (match_id, recipient_id, channel, draft)) + self.conn.commit() + return c.lastrowid + + def get_pending_intros(self, limit=50): + """get intros pending approval""" + c = self.conn.cursor() + c.execute('''SELECT * FROM intros WHERE status = 'draft' + ORDER BY id DESC LIMIT ?''', (limit,)) + return [dict(row) for row in c.fetchall()] + + def approve_intro(self, intro_id, approved_by='human'): + """approve an intro for sending""" + c = self.conn.cursor() + c.execute('''UPDATE intros SET status = 'approved', + approved_by = ?, approved_at = ? WHERE id = ?''', + (approved_by, datetime.now().isoformat(), intro_id)) + self.conn.commit() + + def mark_intro_sent(self, intro_id): + """mark an intro as sent""" + c = self.conn.cursor() + c.execute('''UPDATE intros SET status = 'sent', sent_at = ? 
WHERE id = ?''', + (datetime.now().isoformat(), intro_id)) + self.conn.commit() + + def stats(self): + """get database statistics""" + c = self.conn.cursor() + stats = {} + + c.execute('SELECT COUNT(*) FROM humans') + stats['total_humans'] = c.fetchone()[0] + + c.execute('SELECT platform, COUNT(*) FROM humans GROUP BY platform') + stats['by_platform'] = {row[0]: row[1] for row in c.fetchall()} + + c.execute('SELECT COUNT(*) FROM humans WHERE score >= 50') + stats['high_score_humans'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM matches') + stats['total_matches'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM intros') + stats['total_intros'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM intros WHERE status = "sent"') + stats['sent_intros'] = c.fetchone()[0] + + # lost builder stats + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'builder'") + stats['active_builders'] = c.fetchone()[0] + + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'lost'") + stats['lost_builders'] = c.fetchone()[0] + + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'both'") + stats['recovering_builders'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM humans WHERE lost_potential_score >= 40') + stats['high_lost_score'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM humans WHERE last_lost_outreach IS NOT NULL') + stats['lost_outreach_sent'] = c.fetchone()[0] + + return stats + + def close(self): + self.conn.close() diff --git a/db/users.py b/db/users.py new file mode 100644 index 0000000..0615389 --- /dev/null +++ b/db/users.py @@ -0,0 +1,510 @@ +""" +priority users - people who host connectd get direct matching +""" + +import sqlite3 +import json +from datetime import datetime +from pathlib import Path + +DB_PATH = Path(__file__).parent / 'connectd.db' + +# map user-friendly interests to signal terms +INTEREST_TO_SIGNALS = { + 'self-hosting': ['selfhosted', 'home_automation'], + 'home-assistant': ['home_automation'], + 
# map user-friendly interests to signal terms
INTEREST_TO_SIGNALS = {
    'self-hosting': ['selfhosted', 'home_automation'],
    'home-assistant': ['home_automation'],
    'intentional-community': ['community', 'cooperative'],
    'cooperatives': ['cooperative', 'community'],
    'solarpunk': ['solarpunk'],
    'privacy': ['privacy', 'local_first'],
    'local-first': ['local_first', 'privacy'],
    'queer-friendly': ['queer'],
    'anti-capitalism': ['cooperative', 'decentralized', 'community'],
    'esports-venue': [],
    'foss': ['foss'],
    'decentralized': ['decentralized'],
    'federated': ['federated_chat'],
    'mesh': ['mesh'],
}


def init_users_table(conn):
    """create priority users + priority matches tables (idempotent)."""
    c = conn.cursor()

    c.execute('''CREATE TABLE IF NOT EXISTS priority_users (
                    id INTEGER PRIMARY KEY,
                    name TEXT,
                    email TEXT UNIQUE,
                    github TEXT,
                    reddit TEXT,
                    mastodon TEXT,
                    lobsters TEXT,
                    matrix TEXT,
                    lemmy TEXT,
                    discord TEXT,
                    bluesky TEXT,
                    location TEXT,
                    bio TEXT,
                    interests TEXT,
                    looking_for TEXT,
                    created_at TEXT,
                    active INTEGER DEFAULT 1,
                    score REAL DEFAULT 0,
                    signals TEXT,
                    scraped_profile TEXT,
                    last_scored_at TEXT
                 )''')

    # add missing columns to existing table
    # FIX: catch the specific OperationalError instead of a bare except,
    # which also swallowed KeyboardInterrupt and real programming errors
    for col in ['lemmy', 'discord', 'bluesky']:
        try:
            c.execute(f'ALTER TABLE priority_users ADD COLUMN {col} TEXT')
        except sqlite3.OperationalError:
            pass  # column already exists

    # matches specifically for priority users.
    # FIX: UNIQUE pair constraint so save_priority_match's INSERT OR IGNORE
    # actually deduplicates (without it every re-match inserted duplicates).
    # only takes effect on freshly created databases
    c.execute('''CREATE TABLE IF NOT EXISTS priority_matches (
                    id INTEGER PRIMARY KEY,
                    priority_user_id INTEGER,
                    matched_human_id INTEGER,
                    overlap_score REAL,
                    overlap_reasons TEXT,
                    status TEXT DEFAULT 'new',
                    notified_at TEXT,
                    viewed_at TEXT,
                    FOREIGN KEY(priority_user_id) REFERENCES priority_users(id),
                    FOREIGN KEY(matched_human_id) REFERENCES humans(id),
                    UNIQUE(priority_user_id, matched_human_id)
                 )''')

    conn.commit()


def add_priority_user(conn, user_data):
    """add a priority user (someone hosting connectd).

    NOTE(review): keyed on the email UNIQUE constraint via INSERT OR
    REPLACE; if email is NULL sqlite allows multiple NULLs, so repeated
    startups without HOST_EMAIL can insert duplicate rows - confirm callers
    always pass an email or dedupe on github instead.
    """
    c = conn.cursor()

    c.execute('''INSERT OR REPLACE INTO priority_users
                 (name, email, github, reddit, mastodon, lobsters, matrix, lemmy, discord, bluesky,
                  location, bio, interests, looking_for, created_at)
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
              (user_data.get('name'),
               user_data.get('email'),
               user_data.get('github'),
               user_data.get('reddit'),
               user_data.get('mastodon'),
               user_data.get('lobsters'),
               user_data.get('matrix'),
               user_data.get('lemmy'),
               user_data.get('discord'),
               user_data.get('bluesky'),
               user_data.get('location'),
               user_data.get('bio'),
               json.dumps(user_data.get('interests', [])),
               user_data.get('looking_for'),
               datetime.now().isoformat()))

    conn.commit()
    return c.lastrowid


def get_priority_users(conn):
    """get all active priority users"""
    c = conn.cursor()
    c.execute('SELECT * FROM priority_users WHERE active = 1')
    return [dict(row) for row in c.fetchall()]


def get_priority_user(conn, user_id):
    """get a specific priority user"""
    c = conn.cursor()
    c.execute('SELECT * FROM priority_users WHERE id = ?', (user_id,))
    row = c.fetchone()
    return dict(row) if row else None


def save_priority_match(conn, priority_user_id, human_id, overlap_data):
    """save a match for a priority user (deduped by the UNIQUE pair
    constraint created in init_users_table)."""
    c = conn.cursor()

    c.execute('''INSERT OR IGNORE INTO priority_matches
                 (priority_user_id, matched_human_id, overlap_score, overlap_reasons, status)
                 VALUES (?, ?, ?, ?, 'new')''',
              (priority_user_id, human_id,
               overlap_data.get('overlap_score', 0),
               json.dumps(overlap_data.get('overlap_reasons', []))))

    conn.commit()
    return c.lastrowid


def get_priority_user_matches(conn, priority_user_id, status=None, limit=50):
    """get matches for a priority user, joined with the matched human.

    FIX: pm.* and h.* both contain an 'id' column and in dict(row) the later
    h.id wins, so callers passing match['id'] to mark_match_viewed updated
    the wrong row. expose the match row id explicitly as priority_match_id.
    """
    c = conn.cursor()

    if status:
        c.execute('''SELECT pm.*, h.*, pm.id AS priority_match_id
                     FROM priority_matches pm
                     JOIN humans h ON pm.matched_human_id = h.id
                     WHERE pm.priority_user_id = ? AND pm.status = ?
                     ORDER BY pm.overlap_score DESC
                     LIMIT ?''', (priority_user_id, status, limit))
    else:
        c.execute('''SELECT pm.*, h.*, pm.id AS priority_match_id
                     FROM priority_matches pm
                     JOIN humans h ON pm.matched_human_id = h.id
                     WHERE pm.priority_user_id = ?
                     ORDER BY pm.overlap_score DESC
                     LIMIT ?''', (priority_user_id, limit))

    return [dict(row) for row in c.fetchall()]


def mark_match_viewed(conn, match_id):
    """mark a priority match as viewed"""
    c = conn.cursor()
    c.execute('''UPDATE priority_matches SET status = 'viewed', viewed_at = ?
                 WHERE id = ?''', (datetime.now().isoformat(), match_id))
    conn.commit()


def expand_interests_to_signals(interests):
    """expand user-friendly interests to signal terms"""
    signals = set()
    for interest in interests:
        interest_lower = interest.lower().strip()
        if interest_lower in INTEREST_TO_SIGNALS:
            signals.update(INTEREST_TO_SIGNALS[interest_lower])
        else:
            signals.add(interest_lower)

    # always add these aligned signals for priority users
    signals.update(['foss', 'decentralized', 'federated_chat', 'containers', 'unix', 'selfhosted'])
    return list(signals)


def score_priority_user(conn, user_id, scraped_profile=None):
    """
    calculate a score for a priority user based on:
    - their stated interests
    - their scraped github profile (if available)
    - their repos and activity
    """
    c = conn.cursor()
    c.execute('SELECT * FROM priority_users WHERE id = ?', (user_id,))
    row = c.fetchone()
    if not row:
        return None

    user = dict(row)
    score = 0
    signals = set()

    # 1. score from stated interests
    interests = user.get('interests')
    if isinstance(interests, str):
        interests = json.loads(interests) if interests else []

    for interest in interests:
        interest_lower = interest.lower()
        # high-value interests
        if 'solarpunk' in interest_lower:
            score += 30
            signals.add('solarpunk')
        if 'queer' in interest_lower:
            score += 30
            signals.add('queer')
        if 'cooperative' in interest_lower or 'intentional' in interest_lower:
            score += 20
            signals.add('cooperative')
        if 'privacy' in interest_lower:
            score += 10
            signals.add('privacy')
        if 'self-host' in interest_lower or 'selfhost' in interest_lower:
            score += 15
            signals.add('selfhosted')
        if 'home-assistant' in interest_lower:
            score += 15
            signals.add('home_automation')
        if 'foss' in interest_lower or 'open source' in interest_lower:
            score += 10
            signals.add('foss')

    # 2. score from scraped github profile
    if scraped_profile:
        # repo volume
        repos = scraped_profile.get('top_repos', [])
        if len(repos) >= 20:
            score += 20
        elif len(repos) >= 10:
            score += 10
        elif len(repos) >= 5:
            score += 5

        # languages
        languages = scraped_profile.get('languages', {})
        if 'Python' in languages or 'Rust' in languages:
            score += 5
            signals.add('modern_lang')

        # topics from repos
        for topic in scraped_profile.get('topics', []):
            if topic in ['self-hosted', 'home-assistant', 'privacy', 'foss']:
                score += 10
                signals.add(topic.replace('-', '_'))

        # followers
        followers = scraped_profile.get('followers', 0)
        if followers >= 100:
            score += 15
        elif followers >= 50:
            score += 10
        elif followers >= 10:
            score += 5

    # 3. add expanded signals
    signals.update(expand_interests_to_signals(interests))

    # persist score + signals on the user row
    c.execute('''UPDATE priority_users
                 SET score = ?, signals = ?, scraped_profile = ?, last_scored_at = ?
                 WHERE id = ?''',
              (score, json.dumps(list(signals)),
               json.dumps(scraped_profile) if scraped_profile else None,
               datetime.now().isoformat(), user_id))
    conn.commit()

    return {'score': score, 'signals': list(signals)}


def auto_match_priority_user(conn, user_id, min_overlap=40):
    """
    automatically find and save matches for a priority user
    uses relationship filtering to skip already-connected people
    """
    from scoutd.deep import check_already_connected

    c = conn.cursor()

    c.execute('SELECT * FROM priority_users WHERE id = ?', (user_id,))
    row = c.fetchone()
    if not row:
        return []

    user = dict(row)

    # collect the user's signal set (stored signals + expanded interests)
    user_signals = set()
    if user.get('signals'):
        signals = json.loads(user['signals']) if isinstance(user['signals'], str) else user['signals']
        user_signals.update(signals)
    if user.get('interests'):
        interests = json.loads(user['interests']) if isinstance(user['interests'], str) else user['interests']
        user_signals.update(expand_interests_to_signals(interests))

    # clear old matches before re-matching
    c.execute('DELETE FROM priority_matches WHERE priority_user_id = ?', (user_id,))
    conn.commit()

    # get all humans above the floor score
    c.execute('SELECT * FROM humans WHERE score >= 25')
    columns = [d[0] for d in c.description]

    matches = []
    for row in c.fetchall():
        human = dict(zip(columns, row))

        # skip own profiles
        username = (human.get('username') or '').lower()
        if user.get('github') and username == user['github'].lower():
            continue
        if user.get('reddit') and username == user.get('reddit', '').lower():
            continue

        # check if already connected
        user_human = {'username': user.get('github'), 'platform': 'github', 'extra': {}}
        connected, reason = check_already_connected(user_human, human)
        if connected:
            continue

        # get human signals
        human_signals = human.get('signals', [])
        if isinstance(human_signals, str):
            human_signals = json.loads(human_signals) if human_signals else []

        # calculate overlap
        shared = user_signals & set(human_signals)
        overlap_score = len(shared) * 10

        # high-value bonuses
        if 'queer' in human_signals:
            overlap_score += 40
            shared.add('queer (rare!)')
        if 'solarpunk' in human_signals:
            overlap_score += 30
            shared.add('solarpunk (rare!)')
        if 'cooperative' in human_signals:
            overlap_score += 20
            shared.add('cooperative (values)')

        # location bonus
        location = (human.get('location') or '').lower()
        user_location = (user.get('location') or '').lower()
        if user_location and location:
            if any(x in location for x in ['seattle', 'portland', 'pnw', 'washington', 'oregon']):
                if 'seattle' in user_location or 'pnw' in user_location:
                    overlap_score += 25
                    shared.add('PNW location!')

        if overlap_score >= min_overlap:
            matches.append({
                'human': human,
                'overlap_score': overlap_score,
                'shared': list(shared),
            })

    # sort and save top matches
    matches.sort(key=lambda x: x['overlap_score'], reverse=True)

    for m in matches[:50]:  # save top 50
        save_priority_match(conn, user_id, m['human']['id'], {
            'overlap_score': m['overlap_score'],
            'overlap_reasons': m['shared'],
        })

    return matches


def update_priority_user_profile(conn, user_id, profile_data):
    """update a priority user's profile with new data.

    field names are drawn from a fixed whitelist, so the f-string SQL below
    cannot be injected via profile_data keys.
    """
    c = conn.cursor()

    updates = []
    values = []

    for field in ['name', 'email', 'github', 'reddit', 'mastodon', 'lobsters',
                  'matrix', 'lemmy', 'discord', 'bluesky', 'location', 'bio', 'looking_for']:
        if field in profile_data and profile_data[field]:
            updates.append(f'{field} = ?')
            values.append(profile_data[field])

    if 'interests' in profile_data:
        updates.append('interests = ?')
        values.append(json.dumps(profile_data['interests']))

    if updates:
        values.append(user_id)
        c.execute(f'''UPDATE priority_users SET {', '.join(updates)} WHERE id = ?''', values)
        conn.commit()

    return True
host user by their alias (username). + scrapes github and discovers all connected social handles. + also merges in HOST_ env vars from config for manual overrides. + + returns the priority user id + """ + from scoutd.github import analyze_github_user + from config import (HOST_NAME, HOST_EMAIL, HOST_GITHUB, HOST_MASTODON, + HOST_REDDIT, HOST_LEMMY, HOST_LOBSTERS, HOST_MATRIX, + HOST_DISCORD, HOST_BLUESKY, HOST_LOCATION, HOST_INTERESTS, HOST_LOOKING_FOR) + + print(f"connectd: discovering host user '{alias}'...") + + # scrape github for full profile + profile = analyze_github_user(alias) + + if not profile: + print(f" could not find github user '{alias}'") + # still create from env vars if no github found + profile = {'name': HOST_NAME or alias, 'bio': '', 'location': HOST_LOCATION, + 'contact': {}, 'extra': {'handles': {}}, 'topics': [], 'signals': []} + + print(f" found: {profile.get('name')} ({alias})") + print(f" score: {profile.get('score', 0)}, signals: {len(profile.get('signals', []))}") + + # extract contact info + contact = profile.get('contact', {}) + handles = profile.get('extra', {}).get('handles', {}) + + # merge in HOST_ env vars (override discovered values) + if HOST_MASTODON: + handles['mastodon'] = HOST_MASTODON + if HOST_REDDIT: + handles['reddit'] = HOST_REDDIT + if HOST_LEMMY: + handles['lemmy'] = HOST_LEMMY + if HOST_LOBSTERS: + handles['lobsters'] = HOST_LOBSTERS + if HOST_MATRIX: + handles['matrix'] = HOST_MATRIX + if HOST_DISCORD: + handles['discord'] = HOST_DISCORD + if HOST_BLUESKY: + handles['bluesky'] = HOST_BLUESKY + + # check if user already exists + c = conn.cursor() + c.execute('SELECT id FROM priority_users WHERE github = ?', (alias,)) + existing = c.fetchone() + + # parse HOST_INTERESTS if provided + interests = profile.get('topics', []) + if HOST_INTERESTS: + interests = [i.strip() for i in HOST_INTERESTS.split(',') if i.strip()] + + user_data = { + 'name': HOST_NAME or profile.get('name') or alias, + 'email': HOST_EMAIL or 
contact.get('email'), + 'github': HOST_GITHUB or alias, + 'reddit': handles.get('reddit'), + 'mastodon': handles.get('mastodon') or contact.get('mastodon'), + 'lobsters': handles.get('lobsters'), + 'matrix': handles.get('matrix') or contact.get('matrix'), + 'lemmy': handles.get('lemmy') or contact.get('lemmy'), + 'discord': handles.get('discord'), + 'bluesky': handles.get('bluesky') or contact.get('bluesky'), + 'location': HOST_LOCATION or profile.get('location'), + 'bio': profile.get('bio'), + 'interests': interests, + 'looking_for': HOST_LOOKING_FOR, + } + + if existing: + # update existing user + user_id = existing['id'] + update_priority_user_profile(conn, user_id, user_data) + print(f" updated existing priority user (id={user_id})") + else: + # create new user + user_id = add_priority_user(conn, user_data) + print(f" created new priority user (id={user_id})") + + # score the user + scraped_profile = { + 'top_repos': profile.get('extra', {}).get('top_repos', []), + 'languages': profile.get('languages', {}), + 'topics': profile.get('topics', []), + 'followers': profile.get('extra', {}).get('followers', 0), + } + score_result = score_priority_user(conn, user_id, scraped_profile) + print(f" scored: {score_result.get('score')}, {len(score_result.get('signals', []))} signals") + + # print discovered handles + print(f" discovered handles:") + for platform, handle in handles.items(): + print(f" {platform}: {handle}") + + return user_id + + +def get_host_user(conn): + """get the host user (first priority user)""" + users = get_priority_users(conn) + return users[0] if users else None diff --git a/db_init.py b/db_init.py new file mode 100644 index 0000000..0c70d18 --- /dev/null +++ b/db_init.py @@ -0,0 +1,375 @@ +""" +connectd database layer +sqlite storage for humans, fingerprints, matches, intros +""" + +import os +import sqlite3 +import json +from datetime import datetime +from pathlib import Path + +# use env var for DB path (docker) or default to local +DB_PATH = 
Path(os.environ.get('DB_PATH', Path(__file__).parent / 'connectd.db')) + + +class Database: + def __init__(self, path=None): + self.path = path or DB_PATH + self.conn = sqlite3.connect(self.path) + self.conn.row_factory = sqlite3.Row + self._init_tables() + + def _init_tables(self): + c = self.conn.cursor() + + # humans table - all discovered people + c.execute('''CREATE TABLE IF NOT EXISTS humans ( + id INTEGER PRIMARY KEY, + platform TEXT NOT NULL, + username TEXT NOT NULL, + url TEXT, + name TEXT, + bio TEXT, + location TEXT, + score REAL DEFAULT 0, + confidence REAL DEFAULT 0, + signals TEXT, + negative_signals TEXT, + reasons TEXT, + contact TEXT, + extra TEXT, + fingerprint_id INTEGER, + scraped_at TEXT, + updated_at TEXT, + lost_potential_score REAL DEFAULT 0, + lost_signals TEXT, + user_type TEXT DEFAULT 'none', + last_lost_outreach TEXT, + UNIQUE(platform, username) + )''') + + # migration: add new columns if they don't exist + try: + c.execute('ALTER TABLE humans ADD COLUMN lost_potential_score REAL DEFAULT 0') + except sqlite3.OperationalError: + pass # column exists + + try: + c.execute('ALTER TABLE humans ADD COLUMN lost_signals TEXT') + except sqlite3.OperationalError: + pass + + try: + c.execute('ALTER TABLE humans ADD COLUMN user_type TEXT DEFAULT "none"') + except sqlite3.OperationalError: + pass + + try: + c.execute('ALTER TABLE humans ADD COLUMN last_lost_outreach TEXT') + except sqlite3.OperationalError: + pass + + # fingerprints table - values profiles + c.execute('''CREATE TABLE IF NOT EXISTS fingerprints ( + id INTEGER PRIMARY KEY, + human_id INTEGER, + values_vector TEXT, + skills TEXT, + interests TEXT, + location_pref TEXT, + availability TEXT, + generated_at TEXT, + FOREIGN KEY(human_id) REFERENCES humans(id) + )''') + + # matches table - paired humans + c.execute('''CREATE TABLE IF NOT EXISTS matches ( + id INTEGER PRIMARY KEY, + human_a_id INTEGER, + human_b_id INTEGER, + overlap_score REAL, + overlap_reasons TEXT, + 
complementary_skills TEXT, + geographic_match INTEGER, + status TEXT DEFAULT 'pending', + created_at TEXT, + reviewed_at TEXT, + FOREIGN KEY(human_a_id) REFERENCES humans(id), + FOREIGN KEY(human_b_id) REFERENCES humans(id), + UNIQUE(human_a_id, human_b_id) + )''') + + # intros table - outreach attempts + c.execute('''CREATE TABLE IF NOT EXISTS intros ( + id INTEGER PRIMARY KEY, + match_id INTEGER, + recipient_human_id INTEGER, + channel TEXT, + draft TEXT, + status TEXT DEFAULT 'draft', + approved_by TEXT, + approved_at TEXT, + sent_at TEXT, + response TEXT, + response_at TEXT, + FOREIGN KEY(match_id) REFERENCES matches(id), + FOREIGN KEY(recipient_human_id) REFERENCES humans(id) + )''') + + # cross-platform links + c.execute('''CREATE TABLE IF NOT EXISTS cross_platform ( + id INTEGER PRIMARY KEY, + human_a_id INTEGER, + human_b_id INTEGER, + confidence REAL, + reason TEXT, + FOREIGN KEY(human_a_id) REFERENCES humans(id), + FOREIGN KEY(human_b_id) REFERENCES humans(id), + UNIQUE(human_a_id, human_b_id) + )''') + + self.conn.commit() + + def save_human(self, data): + """save or update a human record""" + c = self.conn.cursor() + + # fields to exclude from extra json + exclude_fields = ['platform', 'username', 'url', 'name', 'bio', + 'location', 'score', 'confidence', 'signals', + 'negative_signals', 'reasons', 'contact', + 'lost_potential_score', 'lost_signals', 'user_type'] + + c.execute('''INSERT OR REPLACE INTO humans + (platform, username, url, name, bio, location, score, confidence, + signals, negative_signals, reasons, contact, extra, scraped_at, updated_at, + lost_potential_score, lost_signals, user_type) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', + (data.get('platform'), + data.get('username'), + data.get('url'), + data.get('name'), + data.get('bio'), + data.get('location'), + data.get('score', 0), + data.get('confidence', 0), + json.dumps(data.get('signals', [])), + json.dumps(data.get('negative_signals', [])), + 
json.dumps(data.get('reasons', [])), + json.dumps(data.get('contact', {})), + json.dumps({k: v for k, v in data.items() if k not in exclude_fields}), + data.get('scraped_at', datetime.now().isoformat()), + datetime.now().isoformat(), + data.get('lost_potential_score', 0), + json.dumps(data.get('lost_signals', [])), + data.get('user_type', 'none'))) + + self.conn.commit() + return c.lastrowid + + def get_human(self, platform, username): + """get a human by platform and username""" + c = self.conn.cursor() + c.execute('SELECT * FROM humans WHERE platform = ? AND username = ?', + (platform, username)) + row = c.fetchone() + return dict(row) if row else None + + def get_human_by_id(self, human_id): + """get a human by id""" + c = self.conn.cursor() + c.execute('SELECT * FROM humans WHERE id = ?', (human_id,)) + row = c.fetchone() + return dict(row) if row else None + + def get_all_humans(self, min_score=0, limit=1000): + """get all humans above score threshold""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE score >= ? + ORDER BY score DESC, confidence DESC + LIMIT ?''', (min_score, limit)) + return [dict(row) for row in c.fetchall()] + + def get_humans_by_platform(self, platform, min_score=0, limit=500): + """get humans for a specific platform""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE platform = ? AND score >= ? + ORDER BY score DESC + LIMIT ?''', (platform, min_score, limit)) + return [dict(row) for row in c.fetchall()] + + def get_lost_builders(self, min_lost_score=40, min_values_score=20, limit=100): + """get lost builders who need encouragement""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE user_type = 'lost' OR user_type = 'both' + AND lost_potential_score >= ? + AND score >= ? 
+ ORDER BY lost_potential_score DESC, score DESC + LIMIT ?''', (min_lost_score, min_values_score, limit)) + return [dict(row) for row in c.fetchall()] + + def get_lost_builders_for_outreach(self, min_lost_score=40, min_values_score=20, + cooldown_days=90, limit=50): + """get lost builders who are ready for outreach (respecting cooldown)""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE (user_type = 'lost' OR user_type = 'both') + AND lost_potential_score >= ? + AND score >= ? + AND (last_lost_outreach IS NULL + OR datetime(last_lost_outreach) < datetime('now', '-' || ? || ' days')) + ORDER BY lost_potential_score DESC, score DESC + LIMIT ?''', (min_lost_score, min_values_score, cooldown_days, limit)) + return [dict(row) for row in c.fetchall()] + + def get_active_builders(self, min_score=50, limit=100): + """get active builders who can inspire lost builders""" + c = self.conn.cursor() + c.execute('''SELECT * FROM humans + WHERE user_type = 'builder' + AND score >= ? + ORDER BY score DESC, confidence DESC + LIMIT ?''', (min_score, limit)) + return [dict(row) for row in c.fetchall()] + + def mark_lost_outreach(self, human_id): + """mark that we reached out to a lost builder""" + c = self.conn.cursor() + c.execute('''UPDATE humans SET last_lost_outreach = ? 
WHERE id = ?''', + (datetime.now().isoformat(), human_id)) + self.conn.commit() + + def save_fingerprint(self, human_id, fingerprint_data): + """save a fingerprint for a human""" + c = self.conn.cursor() + c.execute('''INSERT OR REPLACE INTO fingerprints + (human_id, values_vector, skills, interests, location_pref, availability, generated_at) + VALUES (?, ?, ?, ?, ?, ?, ?)''', + (human_id, + json.dumps(fingerprint_data.get('values_vector', {})), + json.dumps(fingerprint_data.get('skills', [])), + json.dumps(fingerprint_data.get('interests', [])), + fingerprint_data.get('location_pref'), + fingerprint_data.get('availability'), + datetime.now().isoformat())) + + # update human's fingerprint_id + c.execute('UPDATE humans SET fingerprint_id = ? WHERE id = ?', + (c.lastrowid, human_id)) + self.conn.commit() + return c.lastrowid + + def get_fingerprint(self, human_id): + """get fingerprint for a human""" + c = self.conn.cursor() + c.execute('SELECT * FROM fingerprints WHERE human_id = ?', (human_id,)) + row = c.fetchone() + return dict(row) if row else None + + def save_match(self, human_a_id, human_b_id, match_data): + """save a match between two humans""" + c = self.conn.cursor() + c.execute('''INSERT OR REPLACE INTO matches + (human_a_id, human_b_id, overlap_score, overlap_reasons, + complementary_skills, geographic_match, status, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?)''', + (human_a_id, human_b_id, + match_data.get('overlap_score', 0), + json.dumps(match_data.get('overlap_reasons', [])), + json.dumps(match_data.get('complementary_skills', [])), + 1 if match_data.get('geographic_match') else 0, + 'pending', + datetime.now().isoformat())) + self.conn.commit() + return c.lastrowid + + def get_matches(self, status=None, limit=100): + """get matches, optionally filtered by status""" + c = self.conn.cursor() + if status: + c.execute('''SELECT * FROM matches WHERE status = ? 
+ ORDER BY overlap_score DESC LIMIT ?''', (status, limit)) + else: + c.execute('''SELECT * FROM matches + ORDER BY overlap_score DESC LIMIT ?''', (limit,)) + return [dict(row) for row in c.fetchall()] + + def save_intro(self, match_id, recipient_id, channel, draft): + """save an intro draft""" + c = self.conn.cursor() + c.execute('''INSERT INTO intros + (match_id, recipient_human_id, channel, draft, status) + VALUES (?, ?, ?, ?, 'draft')''', + (match_id, recipient_id, channel, draft)) + self.conn.commit() + return c.lastrowid + + def get_pending_intros(self, limit=50): + """get intros pending approval""" + c = self.conn.cursor() + c.execute('''SELECT * FROM intros WHERE status = 'draft' + ORDER BY id DESC LIMIT ?''', (limit,)) + return [dict(row) for row in c.fetchall()] + + def approve_intro(self, intro_id, approved_by='human'): + """approve an intro for sending""" + c = self.conn.cursor() + c.execute('''UPDATE intros SET status = 'approved', + approved_by = ?, approved_at = ? WHERE id = ?''', + (approved_by, datetime.now().isoformat(), intro_id)) + self.conn.commit() + + def mark_intro_sent(self, intro_id): + """mark an intro as sent""" + c = self.conn.cursor() + c.execute('''UPDATE intros SET status = 'sent', sent_at = ? 
WHERE id = ?''', + (datetime.now().isoformat(), intro_id)) + self.conn.commit() + + def stats(self): + """get database statistics""" + c = self.conn.cursor() + stats = {} + + c.execute('SELECT COUNT(*) FROM humans') + stats['total_humans'] = c.fetchone()[0] + + c.execute('SELECT platform, COUNT(*) FROM humans GROUP BY platform') + stats['by_platform'] = {row[0]: row[1] for row in c.fetchall()} + + c.execute('SELECT COUNT(*) FROM humans WHERE score >= 50') + stats['high_score_humans'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM matches') + stats['total_matches'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM matches WHERE status = "intro_sent"') + stats['total_intros'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM matches WHERE status = "intro_sent"') + stats['sent_intros'] = c.fetchone()[0] + + # lost builder stats + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'builder'") + stats['active_builders'] = c.fetchone()[0] + + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'lost'") + stats['lost_builders'] = c.fetchone()[0] + + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'both'") + stats['recovering_builders'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM humans WHERE lost_potential_score >= 40') + stats['high_lost_score'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM humans WHERE last_lost_outreach IS NOT NULL') + stats['lost_outreach_sent'] = c.fetchone()[0] + + return stats + + def close(self): + self.conn.close() diff --git a/deep.py b/deep.py new file mode 100644 index 0000000..8ad637c --- /dev/null +++ b/deep.py @@ -0,0 +1,997 @@ +""" +scoutd/deep.py - deep profile discovery +when we find someone, follow ALL their links to build complete picture + +github profile -> mastodon link -> scrape mastodon + -> website -> scrape for more links + -> twitter handle -> note it + -> email -> store it + +email discovery sources: +- github profile (if public) +- git commit history +- personal 
website/blog contact page +- README "contact me" sections +- mastodon/twitter bio + +fallback contact methods if no email: +- github_issue: open issue on their repo +- mastodon: DM if allowed +- manual: pending contact queue for review + +also filters out people who clearly already know each other +(same org, co-contributors to same repos) +""" + +import re +import json +import requests +import time +import subprocess +import tempfile +import shutil +from datetime import datetime +from urllib.parse import urlparse +from pathlib import Path + +from .signals import analyze_text +from .github import get_github_user, get_user_repos, _api_get as github_api +from .mastodon import analyze_mastodon_user, _api_get as mastodon_api +from .handles import discover_all_handles, extract_handles_from_text, scrape_website_for_handles + +# MASTODON HANDLE FILTER - don't treat these as emails +MASTODON_INSTANCES = [ + 'mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt', + 'social.coop', 'masto.ai', 'infosec.exchange', 'hackers.town', + 'chaos.social', 'mathstodon.xyz', 'scholar.social', 'mas.to', + 'mstdn.social', 'mastodon.online', 'universeodon.com', 'mastodon.world', +] + +def is_mastodon_handle(email): + """check if string looks like mastodon handle not email""" + if not email or '@' not in email: + return False + email_lower = email.lower() + # check for @username@instance pattern + parts = email_lower.split('@') + if len(parts) == 3 and parts[0] == '': # @user@instance + return True + if len(parts) == 2: + # check if domain is known mastodon instance + domain = parts[1] + for instance in MASTODON_INSTANCES: + if domain == instance or domain.endswith('.' 
# local cache for org memberships
ORG_CACHE_FILE = Path(__file__).parent.parent / 'data' / 'org_cache.json'
_org_cache = None

# patterns to find social links in text
MASTODON_PATTERN = r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-z]{2,})'
TWITTER_PATTERN = r'(?:twitter\.com/|x\.com/)([a-zA-Z0-9_]+)'
GITHUB_PATTERN = r'github\.com/([a-zA-Z0-9_-]+)'
MATRIX_PATTERN = r'@([a-zA-Z0-9_]+):([a-zA-Z0-9.-]+)'
EMAIL_PATTERN = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'

# known mastodon instances for validation
KNOWN_INSTANCES = [
    'mastodon.social', 'fosstodon.org', 'tech.lgbt', 'social.coop',
    'hackers.town', 'hachyderm.io', 'infosec.exchange', 'chaos.social',
    'mas.to', 'mstdn.social', 'mastodon.online', 'universeodon.com',
    'mathstodon.xyz', 'ruby.social', 'functional.cafe', 'types.pl',
]

# contact page patterns for website scraping
CONTACT_PAGE_PATHS = [
    '/contact', '/contact/', '/contact.html',
    '/about', '/about/', '/about.html',
    '/connect', '/reach-out', '/hire', '/hire-me',
]

# patterns to find emails in contact sections
CONTACT_SECTION_PATTERNS = [
    r'(?:contact|email|reach|mail)[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
    r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|@)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\.)\s*([a-zA-Z]{2,})',
]


def load_org_cache():
    """
    load the org membership cache from disk, memoized in _org_cache.
    always returns a dict of shape {'users': {...}, 'updated': {...}}.
    """
    global _org_cache
    if _org_cache is not None:
        return _org_cache

    try:
        ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        if ORG_CACHE_FILE.exists():
            with open(ORG_CACHE_FILE) as f:
                _org_cache = json.load(f)
        else:
            _org_cache = {'users': {}, 'updated': {}}
    except (OSError, ValueError):
        # FIX: was a bare `except:` (also swallowed KeyboardInterrupt/SystemExit).
        # the cache is best-effort: on unreadable/corrupt file, start empty.
        # ValueError covers json.JSONDecodeError.
        _org_cache = {'users': {}, 'updated': {}}

    return _org_cache


def save_org_cache():
    """persist the org membership cache to disk (best-effort, never raises)"""
    global _org_cache
    if _org_cache is None:
        return

    try:
        ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(ORG_CACHE_FILE, 'w') as f:
            json.dump(_org_cache, f, indent=2)
    except OSError:
        # FIX: was a bare `except:` — persistence failures are ignored by design,
        # but only filesystem errors should be swallowed
        pass


def get_cached_orgs(username):
    """return cached orgs for username if present and fresh (< 7 days), else None"""
    cache = load_org_cache()

    if username not in cache['users']:
        return None

    updated = cache['updated'].get(username)
    if updated:
        updated_dt = datetime.fromisoformat(updated)
        if (datetime.now() - updated_dt).days < 7:
            return cache['users'][username]

    return None


def cache_orgs(username, orgs):
    """cache org membership for a user and persist to disk"""
    cache = load_org_cache()
    cache['users'][username] = orgs
    cache['updated'][username] = datetime.now().isoformat()
    save_org_cache()


def get_emails_from_commit_history(repo_url, limit=50):
    """
    shallow-clone a repo and extract unique committer emails from git log.
    bot/noreply addresses are filtered out. returns a (possibly empty) list;
    clone/log failures and timeouts are swallowed (best-effort).
    """
    emails = set()

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # shallow clone with limited depth; list-form argv, shell=False
            result = subprocess.run(
                ['git', 'clone', '--depth', '50', '--single-branch', repo_url, tmpdir],
                capture_output=True,
                text=True,
                timeout=30
            )

            if result.returncode != 0:
                return []

            # unique author emails from the recent commit log
            result = subprocess.run(
                ['git', 'log', f'--max-count={limit}', '--format=%ae'],
                cwd=tmpdir,
                capture_output=True,
                text=True,
                timeout=10
            )

            if result.returncode == 0:
                for email in result.stdout.strip().split('\n'):
                    email = email.strip().lower()
                    # filter out bot/noreply emails
                    if email and not any(x in email for x in [
                        'noreply', 'no-reply', 'dependabot', 'github-actions',
                        'renovate', 'greenkeeper', 'snyk-bot', 'users.noreply.github'
                    ]):
                        emails.add(email)
    except Exception:
        # FIX: was `except (subprocess.TimeoutExpired, Exception)` — TimeoutExpired
        # is already an Exception subclass, so the tuple was redundant
        pass

    return list(emails)
emails = set() + + if not is_personal_website(url): + return [] + + headers = {'User-Agent': 'connectd/1.0 (looking for contact info)'} + + # normalize url + if not url.startswith('http'): + url = 'https://' + url + + base_url = url.rstrip('/') + + # pages to check + pages_to_check = [base_url] + [base_url + path for path in CONTACT_PAGE_PATHS] + + for page_url in pages_to_check: + try: + resp = requests.get(page_url, timeout=timeout, headers=headers) + if resp.status_code == 200: + text = resp.text + + # standard email pattern + for match in re.finditer(EMAIL_PATTERN, text): + email = match.group(0).lower() + if not any(x in email for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']): + emails.add(email) + + # obfuscated email patterns like "user [at] domain [dot] com" + for pattern in CONTACT_SECTION_PATTERNS: + for match in re.finditer(pattern, text, re.IGNORECASE): + if len(match.groups()) == 3: + email = f"{match.group(1)}@{match.group(2)}.{match.group(3)}".lower() + emails.add(email) + elif len(match.groups()) == 1: + emails.add(match.group(1).lower()) + + # mailto: links + for match in re.finditer(r'mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text): + emails.add(match.group(1).lower()) + + except: + continue + + return list(emails) + + +def extract_emails_from_readme(text): + """ + extract emails from README text, looking for contact sections + """ + emails = set() + + if not text: + return [] + + # look for contact-related sections + contact_patterns = [ + r'(?:##?\s*)?(?:contact|reach|email|get in touch|connect)[^\n]*\n([^\n#]+)', + r'(?:email|contact|reach me)[:\s]+([^\n]+)', + ] + + for pattern in contact_patterns: + for match in re.finditer(pattern, text, re.IGNORECASE): + section = match.group(1) + # extract emails from this section + for email_match in re.finditer(EMAIL_PATTERN, section): + email = email_match.group(0).lower() + if not any(x in email for x in ['noreply', 'no-reply', 'example.com']): + emails.add(email) + + # 
also check for obfuscated emails + for match in re.finditer(r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\))\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\))\s*([a-zA-Z]{2,})', text, re.IGNORECASE): + email = f"{match.group(1)}@{match.group(2)}.{match.group(3)}".lower() + emails.add(email) + + return list(emails) + + +def get_mastodon_dm_allowed(handle): + """check if a mastodon user allows DMs""" + profile = get_mastodon_profile(handle) + if not profile: + return False + + # check if they're locked (requires follow approval) + if profile.get('locked'): + return False + + # check bio for "DMs open" type messages + bio = (profile.get('note') or profile.get('summary') or '').lower() + if any(x in bio for x in ['dms open', 'dm me', 'message me', 'dms welcome']): + return True + + # default: assume open if not locked + return True + + +def determine_contact_method(profile): + """ + determine the best way to contact someone + returns (method, details) where method is one of: + - 'email': direct email contact + - 'github_issue': open issue on their repo + - 'mastodon': DM on mastodon + - 'manual': needs manual review + """ + # prefer email + if profile.get('email'): + return 'email', {'email': profile['email']} + + # check for multiple emails to pick from + if profile.get('emails') and len(profile['emails']) > 0: + # prefer non-github, non-work emails + for email in profile['emails']: + if not any(x in email.lower() for x in ['github', 'noreply', '@company', '@corp']): + return 'email', {'email': email} + # fall back to first one + return 'email', {'email': profile['emails'][0]} + + # try mastodon DM + if profile.get('mastodon'): + handles = profile['mastodon'] if isinstance(profile['mastodon'], list) else [profile['mastodon']] + for handle in handles: + if get_mastodon_dm_allowed(handle): + return 'mastodon', {'handle': handle} + + # try github issue on their most-starred repo + if profile.get('top_repos'): + # find repo with issues enabled and good stars + for repo in 
sorted(profile['top_repos'], key=lambda r: r.get('stars', 0), reverse=True): + if repo.get('stars', 0) >= 10: + repo_name = repo.get('name') + if repo_name: + return 'github_issue', { + 'repo': f"{profile['username']}/{repo_name}", + 'stars': repo.get('stars'), + } + + # manual review needed + return 'manual', { + 'reason': 'no email, mastodon, or suitable repo found', + 'available': { + 'twitter': profile.get('twitter'), + 'websites': profile.get('websites'), + 'matrix': profile.get('matrix'), + } + } + + +def extract_links_from_text(text): + """extract social links from bio/readme text""" + if not text: + return {} + + links = { + 'mastodon': [], + 'twitter': [], + 'github': [], + 'matrix': [], + 'email': [], + 'websites': [], + } + + # mastodon handles - only accept known instances or ones with 'mastodon'/'social' in name + for match in re.finditer(MASTODON_PATTERN, text): + user, instance = match.groups() + instance_lower = instance.lower() + # validate it's a known instance or looks like one + is_known = instance_lower in KNOWN_INSTANCES + looks_like_masto = any(x in instance_lower for x in ['mastodon', 'social', 'fedi', '.town', '.cafe']) + if is_known or looks_like_masto: + links['mastodon'].append(f"{user}@{instance}") + + # twitter + for match in re.finditer(TWITTER_PATTERN, text, re.IGNORECASE): + links['twitter'].append(match.group(1)) + + # github (for cross-referencing) + for match in re.finditer(GITHUB_PATTERN, text, re.IGNORECASE): + links['github'].append(match.group(1)) + + # matrix + for match in re.finditer(MATRIX_PATTERN, text): + user, server = match.groups() + links['matrix'].append(f"@{user}:{server}") + + # email + for match in re.finditer(EMAIL_PATTERN, text): + email = match.group(0) + # filter out obvious non-personal emails + if not any(x in email.lower() for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']): + links['email'].append(email) + + # websites (http/https links that aren't social platforms) + url_pattern = 
r'https?://([a-zA-Z0-9.-]+\.[a-z]{2,})[/\w.-]*' + for match in re.finditer(url_pattern, text): + domain = match.group(1).lower() + if not any(x in domain for x in ['github.com', 'twitter.com', 'mastodon', 'linkedin.com', 't.co']): + links['websites'].append(match.group(0)) + + # dedupe + for key in links: + links[key] = list(set(links[key])) + + return links + + +def is_personal_website(url): + """check if URL looks like a personal website vs corporate site""" + domain = urlparse(url).netloc.lower() + + # skip obvious corporate/platform sites + skip_domains = [ + 'github.com', 'gitlab.com', 'bitbucket.org', + 'twitter.com', 'x.com', 'linkedin.com', 'facebook.com', + 'youtube.com', 'medium.com', 'dev.to', 'hashnode.com', + 'wedo.com', 'google.com', 'microsoft.com', 'apple.com', + 'amazon.com', 'stackoverflow.com', 'reddit.com', + ] + + if any(skip in domain for skip in skip_domains): + return False + + # looks personal if: short domain, has common personal TLDs, contains username-like string + personal_tlds = ['.io', '.dev', '.me', '.co', '.xyz', '.page', '.codes', '.software'] + if any(domain.endswith(tld) for tld in personal_tlds): + return True + + # if domain is just name.com or similar + parts = domain.replace('www.', '').split('.') + if len(parts) == 2 and len(parts[0]) < 20: + return True + + return False + + +def scrape_website_for_links(url, timeout=10): + """scrape a personal website for more social links""" + if not is_personal_website(url): + return {} + + try: + resp = requests.get(url, timeout=timeout, headers={'User-Agent': 'connectd/1.0'}) + resp.raise_for_status() + return extract_links_from_text(resp.text) + except: + return {} + + +def get_mastodon_profile(handle): + """ + fetch mastodon profile from handle like user@instance + returns profile data or None + """ + if '@' not in handle: + return None + + parts = handle.split('@') + if len(parts) == 2: + user, instance = parts + elif len(parts) == 3 and parts[0] == '': + # @user@instance format + 
user, instance = parts[1], parts[2] + else: + return None + + # try to look up via webfinger + try: + webfinger_url = f"https://{instance}/.well-known/webfinger" + resp = requests.get( + webfinger_url, + params={'resource': f'acct:{user}@{instance}'}, + timeout=10, + headers={'Accept': 'application/json'} + ) + if resp.status_code == 200: + data = resp.json() + # find the profile link + for link in data.get('links', []): + if link.get('type') == 'application/activity+json': + profile_url = link.get('href') + # fetch the profile + profile_resp = requests.get( + profile_url, + timeout=10, + headers={'Accept': 'application/activity+json'} + ) + if profile_resp.status_code == 200: + return profile_resp.json() + except: + pass + + # fallback: try direct API + try: + search_url = f"https://{instance}/api/v1/accounts/lookup" + resp = requests.get(search_url, params={'acct': user}, timeout=10) + if resp.status_code == 200: + return resp.json() + except: + pass + + return None + + +def deep_scrape_github_user(login, scrape_commits=True): + """ + deep scrape a github user - follow all links, build complete profile + + email discovery sources: + 1. github profile (if public) + 2. git commit history (if scrape_commits=True) + 3. personal website/blog contact pages + 4. README "contact me" sections + 5. 
mastodon bio + """ + print(f" deep scraping {login}...") + + user = get_github_user(login) + if not user: + return None + + repos = get_user_repos(login, per_page=50) + + # collect all text to search for links + all_text = [] + readme_text = None + + if user.get('bio'): + all_text.append(user['bio']) + if user.get('blog'): + all_text.append(user['blog']) + if user.get('company'): + all_text.append(user['company']) + + # check readme of profile repo (username/username) + for branch in ['main', 'master']: + readme_url = f"https://raw.githubusercontent.com/{login}/{login}/{branch}/README.md" + try: + resp = requests.get(readme_url, timeout=10) + if resp.status_code == 200: + readme_text = resp.text + all_text.append(readme_text) + break + except: + pass + + # extract links from all collected text + combined_text = '\n'.join(all_text) + found_links = extract_links_from_text(combined_text) + + # ensure all keys exist + for key in ['email', 'twitter', 'github', 'matrix', 'mastodon', 'websites']: + if key not in found_links: + found_links[key] = [] + + # add explicit github fields + if user.get('email'): + found_links['email'].append(user['email']) + if user.get('twitter_username'): + found_links['twitter'].append(user['twitter_username']) + if user.get('blog'): + found_links['websites'].append(user['blog']) + + # EMAIL DISCOVERY: extract emails from README contact sections + if readme_text: + readme_emails = extract_emails_from_readme(readme_text) + found_links['email'].extend(readme_emails) + if readme_emails: + print(f" found {len(readme_emails)} email(s) in README") + + # dedupe + for key in found_links: + found_links[key] = list(set(found_links[key])) + + # now follow the links to gather more data + profile = { + 'source': 'github', + 'username': login, + 'url': f"https://github.com/{login}", + 'real_name': user.get('name'), + 'bio': user.get('bio'), + 'location': user.get('location'), + 'company': user.get('company'), + 'hireable': user.get('hireable'), + 
'created_at': user.get('created_at'), + 'public_repos': user.get('public_repos'), + 'followers': user.get('followers'), + + # contact points + 'email': found_links['email'][0] if found_links['email'] else user.get('email'), + 'emails': list(found_links['email']), + 'twitter': found_links['twitter'][0] if found_links['twitter'] else user.get('twitter_username'), + 'mastodon': found_links['mastodon'], + 'matrix': found_links['matrix'], + 'websites': found_links['websites'], + + # cross-platform profiles we find + 'linked_profiles': {}, + + # repos and languages + 'top_repos': [], + 'languages': {}, + 'topics': [], + 'orgs': [], + + # contact method (will be determined at end) + 'contact_method': None, + 'contact_details': None, + } + + # analyze repos + top_starred_repo = None + for repo in repos[:30]: + if not repo.get('fork'): + repo_info = { + 'name': repo.get('name'), + 'description': repo.get('description'), + 'stars': repo.get('stargazers_count'), + 'language': repo.get('language'), + 'topics': repo.get('topics', []), + 'html_url': repo.get('html_url'), + 'pushed_at': repo.get('pushed_at'), # for activity-based contact selection + } + profile['top_repos'].append(repo_info) + + # track top starred for commit email scraping + if not top_starred_repo or repo.get('stargazers_count', 0) > top_starred_repo.get('stars', 0): + top_starred_repo = repo_info + + if repo.get('language'): + lang = repo['language'] + profile['languages'][lang] = profile['languages'].get(lang, 0) + 1 + + profile['topics'].extend(repo.get('topics', [])) + + profile['topics'] = list(set(profile['topics'])) + + # get orgs - check cache first + cached_orgs = get_cached_orgs(login) + if cached_orgs is not None: + print(f" using cached orgs: {cached_orgs}") + profile['orgs'] = cached_orgs + else: + orgs_url = f"https://api.github.com/users/{login}/orgs" + orgs_data = github_api(orgs_url) or [] + profile['orgs'] = [o.get('login') for o in orgs_data] + # cache for future use + cache_orgs(login, 
profile['orgs']) + if profile['orgs']: + print(f" fetched & cached orgs: {profile['orgs']}") + + # EMAIL DISCOVERY: scrape commit history from top repo + if scrape_commits and top_starred_repo and not profile['emails']: + repo_url = f"https://github.com/{login}/{top_starred_repo['name']}.git" + print(f" checking commit history in {top_starred_repo['name']}...") + commit_emails = get_emails_from_commit_history(repo_url) + if commit_emails: + print(f" found {len(commit_emails)} email(s) in commits") + profile['emails'].extend(commit_emails) + + # follow mastodon links + for masto_handle in found_links['mastodon'][:2]: # limit to 2 + print(f" following mastodon: {masto_handle}") + masto_profile = get_mastodon_profile(masto_handle) + if masto_profile: + profile['linked_profiles']['mastodon'] = { + 'handle': masto_handle, + 'display_name': masto_profile.get('display_name') or masto_profile.get('name'), + 'bio': masto_profile.get('note') or masto_profile.get('summary'), + 'followers': masto_profile.get('followers_count'), + 'url': masto_profile.get('url'), + 'locked': masto_profile.get('locked', False), + } + # extract more links from mastodon bio + masto_bio = masto_profile.get('note') or masto_profile.get('summary') or '' + masto_links = extract_links_from_text(masto_bio) + profile['emails'].extend(masto_links.get('email', [])) + profile['websites'].extend(masto_links.get('websites', [])) + + # EMAIL DISCOVERY: scrape personal website for contact info + for website in found_links['websites'][:2]: # check up to 2 sites + print(f" following website: {website}") + + # basic link extraction + site_links = scrape_website_for_links(website) + if site_links.get('mastodon') and not profile['mastodon']: + profile['mastodon'] = site_links['mastodon'] + + # enhanced email discovery - check contact pages + website_emails = scrape_website_for_emails(website) + if website_emails: + print(f" found {len(website_emails)} email(s) on website") + profile['emails'].extend(website_emails) 
+ + # dedupe emails and pick best one + # FILTER OUT MASTODON HANDLES (they're not emails!) + profile['emails'] = [e for e in set(profile['emails']) if e and not is_mastodon_handle(e)] + + # rank emails by preference + def email_score(email): + email_lower = email.lower() + score = 0 + # prefer personal domains + if any(x in email_lower for x in ['@gmail', '@proton', '@hey.com', '@fastmail']): + score += 10 + # deprioritize github emails + if 'github' in email_lower: + score -= 20 + # deprioritize noreply + if 'noreply' in email_lower: + score -= 50 + # prefer emails matching username + if login.lower() in email_lower: + score += 5 + return score + + if profile['emails']: + profile['emails'].sort(key=email_score, reverse=True) + profile['email'] = profile['emails'][0] + + # COMPREHENSIVE HANDLE DISCOVERY + # find ALL social handles from website, README, rel="me" links, etc. + discovered_handles, discovered_emails = discover_all_handles(user) + + # merge discovered handles into profile + profile['handles'] = discovered_handles + + # update individual fields from discovered handles + if discovered_handles.get('mastodon') and not profile.get('mastodon'): + profile['mastodon'] = discovered_handles['mastodon'] + if discovered_handles.get('twitter') and not profile.get('twitter'): + profile['twitter'] = discovered_handles['twitter'] + if discovered_handles.get('bluesky'): + profile['bluesky'] = discovered_handles['bluesky'] + if discovered_handles.get('matrix') and not profile.get('matrix'): + profile['matrix'] = discovered_handles['matrix'] + if discovered_handles.get('linkedin'): + profile['linkedin'] = discovered_handles['linkedin'] + if discovered_handles.get('youtube'): + profile['youtube'] = discovered_handles['youtube'] + if discovered_handles.get('discord'): + profile['discord'] = discovered_handles['discord'] + if discovered_handles.get('telegram'): + profile['telegram'] = discovered_handles['telegram'] + + # merge discovered emails + for email in 
discovered_emails: + if email not in profile['emails']: + profile['emails'].append(email) + + print(f" handles found: {list(discovered_handles.keys())}") + + # determine best contact method + contact_method, contact_details = determine_contact_method(profile) + profile['contact_method'] = contact_method + profile['contact_details'] = contact_details + print(f" contact method: {contact_method}") + + # analyze all text for signals + all_profile_text = ' '.join([ + profile.get('bio') or '', + profile.get('company') or '', + profile.get('location') or '', + ' '.join(profile.get('topics', [])), + ]) + + for linked in profile.get('linked_profiles', {}).values(): + if linked.get('bio'): + all_profile_text += ' ' + linked['bio'] + + text_score, signals, negative = analyze_text(all_profile_text) + profile['signals'] = signals + profile['negative_signals'] = negative + profile['score'] = text_score + + # add builder score + if len(repos) > 20: + profile['score'] += 15 + elif len(repos) > 10: + profile['score'] += 10 + + # add topic alignment + from .signals import TARGET_TOPICS + aligned_topics = set(profile['topics']) & set(TARGET_TOPICS) + profile['score'] += len(aligned_topics) * 10 + profile['aligned_topics'] = list(aligned_topics) + + profile['scraped_at'] = datetime.now().isoformat() + + return profile + + +def check_mutual_github_follows(user_a, user_b): + """check if two github users follow each other""" + # check if a follows b + url = f"https://api.github.com/users/{user_a}/following/{user_b}" + try: + resp = requests.get(url, timeout=10, headers={'Accept': 'application/vnd.github.v3+json'}) + if resp.status_code == 204: # 204 = follows + return True + except: + pass + return False + + +def check_shared_repo_contributions(user_a, user_b): + """ + check if two users have contributed to the same repos + returns (bool, list of shared repos) + """ + # this would require checking contribution history + # for now, we check via the orgs and top_repos stored in extra + # 
the full implementation would query: + # GET /repos/{owner}/{repo}/contributors for their top repos + return False, [] + + +def check_github_interactions(user_a, user_b): + """ + check if users have had public interactions + (comments on each other's issues/PRs) + this is expensive - only do for high-score matches + """ + # would need to search: + # GET /search/issues?q=author:{user_a}+commenter:{user_b} + # GET /search/issues?q=author:{user_b}+commenter:{user_a} + return False + + +def check_already_connected(human_a, human_b, deep_check=False): + """ + check if two humans are likely already connected + (same org, co-contributors, mutual follows, interactions) + + connectd's job is connecting ISOLATED builders, not re-introducing coworkers + """ + # parse extra data if stored as json string + extra_a = human_a.get('extra', {}) + extra_b = human_b.get('extra', {}) + if isinstance(extra_a, str): + extra_a = json.loads(extra_a) if extra_a else {} + if isinstance(extra_b, str): + extra_b = json.loads(extra_b) if extra_b else {} + + # 1. same github org - check cache first, then stored data + orgs_a = set(extra_a.get('orgs', [])) + orgs_b = set(extra_b.get('orgs', [])) + + # also check org cache for fresher data + if human_a.get('platform') == 'github': + cached_a = get_cached_orgs(human_a.get('username', '')) + if cached_a: + orgs_a.update(cached_a) + if human_b.get('platform') == 'github': + cached_b = get_cached_orgs(human_b.get('username', '')) + if cached_b: + orgs_b.update(cached_b) + + shared_orgs = orgs_a & orgs_b + + if shared_orgs: + return True, f"same org: {', '.join(list(shared_orgs)[:3])}" + + # 2. same company + company_a = (extra_a.get('company') or '').lower().strip('@').strip() + company_b = (extra_b.get('company') or '').lower().strip('@').strip() + + if company_a and company_b and len(company_a) > 2: + if company_a == company_b or company_a in company_b or company_b in company_a: + return True, f"same company: {company_a or company_b}" + + # 3. 
co-contributors to same major repos (from stored top_repos) + repos_a = set() + repos_b = set() + for r in extra_a.get('top_repos', []): + if r.get('stars', 0) > 50: # only significant repos + repos_a.add(r.get('name', '').lower()) + for r in extra_b.get('top_repos', []): + if r.get('stars', 0) > 50: + repos_b.add(r.get('name', '').lower()) + + shared_repos = repos_a & repos_b + if len(shared_repos) >= 2: + return True, f"co-contributors: {', '.join(list(shared_repos)[:3])}" + + # 4. deep checks (more API calls - only if requested) + if deep_check: + user_a = human_a.get('username', '') + user_b = human_b.get('username', '') + + # check mutual follows + if human_a.get('platform') == 'github' and human_b.get('platform') == 'github': + if check_mutual_github_follows(user_a, user_b): + return True, "mutual github follows" + if check_mutual_github_follows(user_b, user_a): + return True, "mutual github follows" + + return False, None + + +def save_deep_profile(db, profile): + """save a deep-scraped profile to the database""" + # convert to standard human format + # IMPORTANT: extra field contains ALL data for activity-based contact selection + human_data = { + 'platform': profile['source'], + 'username': profile['username'], + 'url': profile['url'], + 'name': profile.get('real_name'), + 'bio': profile.get('bio'), + 'location': profile.get('location'), + 'score': profile.get('score', 0), + 'confidence': 0.8 if profile.get('linked_profiles') else 0.5, + 'signals': profile.get('signals', []), + 'negative_signals': profile.get('negative_signals', []), + 'reasons': [], + 'contact': { + 'email': profile.get('email'), + 'emails': profile.get('emails', []), + 'twitter': profile.get('twitter'), + 'mastodon': profile.get('mastodon'), + 'matrix': profile.get('matrix'), + 'websites': profile.get('websites'), + 'contact_method': profile.get('contact_method'), + 'contact_details': profile.get('contact_details'), + }, + 'extra': { + # identity + 'real_name': profile.get('real_name'), 
+ 'company': profile.get('company'), + 'hireable': profile.get('hireable'), + 'orgs': profile.get('orgs'), + + # github activity (for activity-based contact) + 'top_repos': profile.get('top_repos'), + 'languages': profile.get('languages'), + 'topics': profile.get('topics'), + 'aligned_topics': profile.get('aligned_topics'), + 'followers': profile.get('followers'), + 'public_repos': profile.get('public_repos'), + 'commit_count': len(profile.get('emails', [])), # rough proxy + + # cross-platform links (for activity-based contact) + 'email': profile.get('email'), + 'emails': profile.get('emails', []), + 'twitter': profile.get('twitter'), + 'mastodon': profile.get('mastodon'), + 'matrix': profile.get('matrix'), + 'bluesky': profile.get('bluesky'), + 'reddit': profile.get('reddit'), + 'lobsters': profile.get('lobsters'), + 'linkedin': profile.get('linkedin'), + 'youtube': profile.get('youtube'), + 'discord': profile.get('discord'), + 'telegram': profile.get('telegram'), + 'linked_profiles': profile.get('linked_profiles'), + + # ALL discovered handles (comprehensive) + 'handles': profile.get('handles', {}), + + # activity counts (populated by platform scrapers) + 'mastodon_statuses': profile.get('mastodon_statuses', 0), + 'twitter_tweets': profile.get('twitter_tweets', 0), + 'reddit_activity': profile.get('reddit_activity', 0), + 'reddit_karma': profile.get('reddit_karma', 0), + 'lobsters_karma': profile.get('lobsters_karma', 0), + 'bluesky_posts': profile.get('bluesky_posts', 0), + }, + 'scraped_at': profile.get('scraped_at'), + } + + # build reasons + if profile.get('signals'): + human_data['reasons'].append(f"signals: {', '.join(profile['signals'][:5])}") + if profile.get('aligned_topics'): + human_data['reasons'].append(f"topics: {', '.join(profile['aligned_topics'][:5])}") + if profile.get('linked_profiles'): + platforms = list(profile['linked_profiles'].keys()) + human_data['reasons'].append(f"also on: {', '.join(platforms)}") + if profile.get('location'): + 
human_data['reasons'].append(f"location: {profile['location']}") + if profile.get('contact_method'): + human_data['reasons'].append(f"contact: {profile['contact_method']}") + + db.save_human(human_data) + return human_data diff --git a/deliver.py b/deliver.py new file mode 100644 index 0000000..e1500c1 --- /dev/null +++ b/deliver.py @@ -0,0 +1,510 @@ +""" +introd/deliver.py - intro delivery via multiple channels + +supports: +- email (smtp) +- mastodon dm (if they allow dms) +- bluesky dm (via AT Protocol) +- matrix dm (creates DM room and sends message) +- github issue (opens intro as issue on their most active repo) +- manual queue (for review before sending) + +contact method is determined by ACTIVITY-BASED SELECTION: +- picks the platform where the user is MOST ACTIVE +- verified handles (from rel="me" links) get a bonus + +NOTE: reddit is NOT a delivery method - it's discovery only. +reddit-discovered users are contacted via their external links. +""" + +import os +import json +import smtplib +import requests +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from datetime import datetime +from pathlib import Path + +# config from env - no hardcoded credentials +SMTP_HOST = os.environ.get('SMTP_HOST', '') +SMTP_PORT = int(os.environ.get('SMTP_PORT', 465)) +SMTP_USER = os.environ.get('SMTP_USER', '') +SMTP_PASS = os.environ.get('SMTP_PASS', '') +FROM_EMAIL = os.environ.get('FROM_EMAIL', '') + +GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') +MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '') +MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '') +BLUESKY_HANDLE = os.environ.get('BLUESKY_HANDLE', '') +BLUESKY_APP_PASSWORD = os.environ.get('BLUESKY_APP_PASSWORD', '') +MATRIX_HOMESERVER = os.environ.get('MATRIX_HOMESERVER', '') +MATRIX_USER_ID = os.environ.get('MATRIX_USER_ID', '') +MATRIX_ACCESS_TOKEN = os.environ.get('MATRIX_ACCESS_TOKEN', '') + +# delivery log +DELIVERY_LOG = Path(__file__).parent.parent / 'data' / 
'delivery_log.json'
MANUAL_QUEUE = Path(__file__).parent.parent / 'data' / 'manual_queue.json'


def load_delivery_log():
    """load delivery history; empty skeleton if no log exists yet"""
    if DELIVERY_LOG.exists():
        return json.loads(DELIVERY_LOG.read_text())
    return {'sent': [], 'failed': [], 'queued': []}


def save_delivery_log(log):
    """save delivery history"""
    DELIVERY_LOG.parent.mkdir(parents=True, exist_ok=True)
    DELIVERY_LOG.write_text(json.dumps(log, indent=2))


def load_manual_queue():
    """load manual review queue; empty list if none exists yet"""
    if MANUAL_QUEUE.exists():
        return json.loads(MANUAL_QUEUE.read_text())
    return []


def save_manual_queue(queue):
    """save manual review queue"""
    MANUAL_QUEUE.parent.mkdir(parents=True, exist_ok=True)
    MANUAL_QUEUE.write_text(json.dumps(queue, indent=2))


def already_contacted(recipient_id):
    """check if we've already sent an intro to this person"""
    log = load_delivery_log()
    sent_ids = [s.get('recipient_id') for s in log.get('sent', [])]
    return recipient_id in sent_ids


def send_email(to_email, subject, body, dry_run=False):
    """send email via smtp

    returns (success, error) — error is None on success, "dry run" when
    dry_run=True (no SMTP connection is made in that case).
    """
    if dry_run:
        print(f" [dry run] would email {to_email}")
        print(f" subject: {subject}")
        print(f" body preview: {body[:100]}...")
        return True, "dry run"

    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = FROM_EMAIL
        msg['To'] = to_email

        # plain text
        text_part = MIMEText(body, 'plain')
        msg.attach(text_part)

        # html version (simple)
        # NOTE(review): the original HTML markup was lost in extraction —
        # this reconstruction assumes newlines become <br> inside a minimal
        # html wrapper; confirm against the deployed version
        html_body = body.replace('\n', '<br>\n')
        html_part = MIMEText(f"<html><body>{html_body}</body></html>", 'html')
        msg.attach(html_part)

        with smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) as server:
            server.login(SMTP_USER, SMTP_PASS)
            # envelope sender is SMTP_USER; header From is FROM_EMAIL
            server.sendmail(SMTP_USER, to_email, msg.as_string())

        return True, None
    except Exception as e:
        return False, str(e)


def create_github_issue(owner, repo, title, body, dry_run=False):
    """create github issue as intro"""
    if not GITHUB_TOKEN:
        return False, "GITHUB_TOKEN not set"

    if dry_run:
        print(f" [dry run] would create issue on {owner}/{repo}")
        print(f" title: {title}")
        return True, "dry run"

    try:
        url = f"https://api.github.com/repos/{owner}/{repo}/issues"
        resp = requests.post(
            url,
            headers={
                'Authorization': f'token {GITHUB_TOKEN}',
                'Accept': 'application/vnd.github.v3+json',
            },
            json={
                'title': title,
                'body': body,
                'labels': ['introduction', 'community'],
            },
            timeout=30,
        )

        if resp.status_code == 201:
            issue_url = resp.json().get('html_url')
            return True, issue_url
        else:
            return False, f"github api error: {resp.status_code} - {resp.text}"
    except Exception as e:
        return False, str(e)


def send_mastodon_dm(recipient_acct, message, dry_run=False):
    """send mastodon direct message"""
    if not MASTODON_TOKEN:
        return False, "MASTODON_TOKEN not set"

    if dry_run:
        print(f" [dry run] would DM {recipient_acct}")
        print(f" message preview: {message[:100]}...")
        return True, "dry run"

    try:
        # post as direct message (visibility: direct, mention recipient)
        url = f"https://{MASTODON_INSTANCE}/api/v1/statuses"
        resp = requests.post(
            url,
            headers={
                'Authorization': f'Bearer {MASTODON_TOKEN}',
                'Content-Type': 'application/json',
            },
            json={
                'status': f"@{recipient_acct} {message}",
                'visibility': 'direct',
            },
            timeout=30,
        )

        if resp.status_code in [200, 201]:
            return True, resp.json().get('url')
        else:
            return False, f"mastodon api error: {resp.status_code} - {resp.text}"
    except Exception as e:
        return False, str(e)
def send_bluesky_dm(recipient_handle, message, dry_run=False):
    """send bluesky direct message via AT Protocol"""
    if not BLUESKY_APP_PASSWORD:
        return False, "BLUESKY_APP_PASSWORD not set"

    if dry_run:
        print(f" [dry run] would DM {recipient_handle} on bluesky")
        print(f" message preview: {message[:100]}...")
        return True, "dry run"

    try:
        # authenticate with bluesky
        auth_url = "https://bsky.social/xrpc/com.atproto.server.createSession"
        auth_resp = requests.post(
            auth_url,
            json={
                'identifier': BLUESKY_HANDLE,
                'password': BLUESKY_APP_PASSWORD,
            },
            timeout=30,
        )
        if auth_resp.status_code != 200:
            return False, f"bluesky auth failed: {auth_resp.status_code}"

        auth_data = auth_resp.json()
        access_token = auth_data.get('accessJwt')
        did = auth_data.get('did')  # own DID; currently unused downstream

        # resolve recipient DID from their handle
        resolve_url = f"https://bsky.social/xrpc/com.atproto.identity.resolveHandle"
        resolve_resp = requests.get(
            resolve_url,
            params={'handle': recipient_handle.lstrip('@')},
            timeout=30,
        )
        if resolve_resp.status_code != 200:
            return False, f"couldn't resolve handle {recipient_handle}"
        recipient_did = resolve_resp.json().get('did')

        # create chat/DM (using convo namespace)
        # first get or create conversation
        convo_url = "https://bsky.social/xrpc/chat.bsky.convo.getConvoForMembers"
        convo_resp = requests.get(
            convo_url,
            headers={'Authorization': f'Bearer {access_token}'},
            params={'members': [recipient_did]},
            timeout=30,
        )
        if convo_resp.status_code != 200:
            # try creating conversation
            return False, f"couldn't get/create conversation: {convo_resp.status_code}"
        convo_id = convo_resp.json().get('convo', {}).get('id')

        # send message into the conversation
        msg_url = "https://bsky.social/xrpc/chat.bsky.convo.sendMessage"
        msg_resp = requests.post(
            msg_url,
            headers={
                'Authorization': f'Bearer {access_token}',
                'Content-Type': 'application/json',
            },
            json={
                'convoId': convo_id,
                'message': {'text': message},
            },
            timeout=30,
        )

        if msg_resp.status_code in [200, 201]:
            return True, f"sent to {recipient_handle}"
        return False, f"bluesky dm failed: {msg_resp.status_code} - {msg_resp.text}"

    except Exception as e:
        return False, str(e)


def send_matrix_dm(recipient_mxid, message, dry_run=False):
    """send matrix direct message"""
    if not MATRIX_ACCESS_TOKEN:
        return False, "MATRIX_ACCESS_TOKEN not set"

    if dry_run:
        print(f" [dry run] would DM {recipient_mxid} on matrix")
        print(f" message preview: {message[:100]}...")
        return True, "dry run"

    try:
        # create a fresh DM room and invite the recipient
        # (no lookup of an existing DM room is attempted)
        headers = {'Authorization': f'Bearer {MATRIX_ACCESS_TOKEN}'}
        create_room_resp = requests.post(
            f'{MATRIX_HOMESERVER}/_matrix/client/v3/createRoom',
            headers=headers,
            json={
                'is_direct': True,
                'invite': [recipient_mxid],
                'preset': 'trusted_private_chat',
            },
            timeout=30,
        )
        if create_room_resp.status_code not in [200, 201]:
            return False, f"matrix room creation failed: {create_room_resp.status_code} - {create_room_resp.text}"

        room_id = create_room_resp.json().get('room_id')

        # send message to room; txn id must be unique per request
        import time
        txn_id = str(int(time.time() * 1000))

        msg_resp = requests.put(
            f'{MATRIX_HOMESERVER}/_matrix/client/v3/rooms/{room_id}/send/m.room.message/{txn_id}',
            headers=headers,
            json={
                'msgtype': 'm.text',
                'body': message,
            },
            timeout=30,
        )

        if msg_resp.status_code in [200, 201]:
            return True, f"sent to {recipient_mxid} in {room_id}"
        return False, f"matrix send failed: {msg_resp.status_code} - {msg_resp.text}"

    except Exception as e:
        return False, str(e)


def add_to_manual_queue(intro_data):
    """add intro to manual review queue"""
    queue = load_manual_queue()
    queue.append({
        **intro_data,
        'queued_at': datetime.now().isoformat(),
        'status': 'pending',
    })
    save_manual_queue(queue)
    return True
def determine_best_contact(human):
    """
    determine best contact method based on WHERE THEY'RE MOST ACTIVE

    uses activity-based selection from groq_draft module
    """
    from introd.groq_draft import determine_contact_method as activity_based_contact

    method, info = activity_based_contact(human)

    # convert github_issue info ("owner/repo" string) to dict format for delivery
    if method == 'github_issue' and isinstance(info, str) and '/' in info:
        parts = info.split('/', 1)
        return method, {'owner': parts[0], 'repo': parts[1]}

    return method, info


def deliver_intro(match_data, intro_draft, subject=None, dry_run=False):
    """
    deliver an intro via the best available method

    match_data: {human_a, human_b, overlap_score, overlap_reasons}
    intro_draft: the text to send (from groq)
    subject: optional subject line for email/github (from groq)

    returns (success, error, method)
    """
    recipient = match_data.get('human_b', {})
    recipient_id = f"{recipient.get('platform')}:{recipient.get('username')}"

    # never double-send to the same person
    if already_contacted(recipient_id):
        return False, "already contacted", None

    # determine contact method
    method, contact_info = determine_best_contact(recipient)

    # if no contact method found, skip (will retry after deeper scraping)
    if method is None:
        return False, "no contact method found - needs deeper scraping", None

    log = load_delivery_log()
    result = {
        'recipient_id': recipient_id,
        'recipient_name': recipient.get('name') or recipient.get('username'),
        'method': method,
        'contact_info': contact_info,
        'overlap_score': match_data.get('overlap_score'),
        'timestamp': datetime.now().isoformat(),
        'draft': intro_draft,  # store the actual message sent
    }

    success = False
    error = None

    if method == 'email':
        email_subject = subject or "connecting builders - someone you might want to know"
        success, error = send_email(contact_info, email_subject, intro_draft, dry_run)

    elif method == 'mastodon':
        success, error = send_mastodon_dm(contact_info, intro_draft, dry_run)

    elif method == 'bluesky':
        success, error = send_bluesky_dm(contact_info, intro_draft, dry_run)

    elif method == 'matrix':
        success, error = send_matrix_dm(contact_info, intro_draft, dry_run)

    elif method == 'discord':
        from scoutd.discord import send_discord_dm
        success, error = send_discord_dm(contact_info, intro_draft, dry_run)

    elif method == 'lemmy':
        from scoutd.lemmy import send_lemmy_dm
        success, error = send_lemmy_dm(contact_info, intro_draft, dry_run)

    elif method == 'github_issue':
        owner = contact_info.get('owner')
        repo = contact_info.get('repo')
        title = subject or "community introduction from connectd"
        # format for github
        github_body = f"""hey {recipient.get('name') or recipient.get('username')},

{intro_draft}

---
*this is an automated introduction from [connectd](https://github.com/connectd-daemon), a daemon that finds isolated builders with aligned values and connects them. if this feels spammy, i apologize - you can close this issue and we won't reach out again.*
"""
        success, error = create_github_issue(owner, repo, title, github_body, dry_run)

    elif method == 'manual':
        # skip - no longer using manual queue
        success = False
        error = "manual method deprecated - skipping"

    # log result
    result['success'] = success
    result['error'] = error

    if success:
        log['sent'].append(result)
    else:
        log['failed'].append(result)

    save_delivery_log(log)

    return success, error, method


def deliver_batch(matches_with_intros, dry_run=False):
    """
    deliver intros for a batch of matches

    matches_with_intros: list of {match_data, intro_draft}
    """
    results = []

    for item in matches_with_intros:
        # accept both old and new key names for the pair
        match_data = item.get('match_data') or item.get('match')
        intro_draft = item.get('intro_draft') or item.get('draft')

        if not match_data or not intro_draft:
            continue

        success, error, method = deliver_intro(match_data, intro_draft, dry_run)
results.append({ + 'recipient': match_data.get('human_b', {}).get('username'), + 'method': method, + 'success': success, + 'error': error, + }) + + print(f" {match_data.get('human_b', {}).get('username')}: {method} - {'ok' if success else error}") + + return results + + +def get_delivery_stats(): + """get delivery statistics""" + log = load_delivery_log() + queue = load_manual_queue() + + return { + 'sent': len(log.get('sent', [])), + 'failed': len(log.get('failed', [])), + 'queued': len(log.get('queued', [])), + 'manual_pending': len([q for q in queue if q.get('status') == 'pending']), + 'by_method': { + 'email': len([s for s in log.get('sent', []) if s.get('method') == 'email']), + 'mastodon': len([s for s in log.get('sent', []) if s.get('method') == 'mastodon']), + 'github_issue': len([s for s in log.get('sent', []) if s.get('method') == 'github_issue']), + 'manual': len([s for s in log.get('sent', []) if s.get('method') == 'manual']), + }, + } + + +def review_manual_queue(): + """review and process manual queue""" + queue = load_manual_queue() + pending = [q for q in queue if q.get('status') == 'pending'] + + if not pending: + print("no items in manual queue") + return + + print(f"\n{len(pending)} items pending review:\n") + + for i, item in enumerate(pending, 1): + recipient = item.get('recipient', {}) + match = item.get('match', {}) + + print(f"[{i}] {recipient.get('name') or recipient.get('username')}") + print(f" platform: {recipient.get('platform')}") + print(f" url: {recipient.get('url')}") + print(f" overlap: {match.get('overlap_score')}") + print(f" draft preview: {item.get('draft', '')[:80]}...") + print() + + return pending diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..178a6fa --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,23 @@ +services: + connectd: + image: sudoxreboot/connectd:latest + container_name: connectd + restart: unless-stopped + env_file: + - .env + ports: + - "8099:8099" + extra_hosts: + - 
"mastodon.sudoxreboot.com:192.168.1.39" + volumes: + - ./data:/app/data + - ./db:/app/db + - ./data_db:/data/db + - ./daemon.py:/app/daemon.py:ro + - ./deep.py:/app/scoutd/deep.py:ro + - ./db_init.py:/app/db/__init__.py:ro + - ./config.py:/app/config.py:ro + - ./groq_draft.py:/app/introd/groq_draft.py:ro + - ./api.py:/app/api.py:ro + - ./deliver.py:/app/introd/deliver.py:ro + - ./soul.txt:/app/soul.txt:ro diff --git a/favicon.png b/favicon.png new file mode 100644 index 0000000..bed93f8 Binary files /dev/null and b/favicon.png differ diff --git a/groq_draft.py b/groq_draft.py new file mode 100644 index 0000000..37c8265 --- /dev/null +++ b/groq_draft.py @@ -0,0 +1,437 @@ +""" +introd/groq_draft.py - groq llama 4 maverick for smart intro drafting + +uses groq api to generate personalized, natural intro messages +that don't sound like ai-generated slop +""" + +import os +import json +import requests +from datetime import datetime + +GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') +GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' +MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile') + + +def determine_contact_method(human): + """ + determine best contact method based on WHERE THEY'RE MOST ACTIVE + + don't use fixed hierarchy - analyze activity per platform: + - count posts/commits/activity + - weight by recency (last 30 days matters more) + - contact them where they already are + - fall back to email only if no social activity + """ + from datetime import datetime, timedelta + + extra = human.get('extra', {}) + if isinstance(extra, str): + extra = json.loads(extra) if extra else {} + + # handle nested extra.extra from old save format + if 'extra' in extra and isinstance(extra['extra'], dict): + extra = {**extra, **extra['extra']} + + contact = human.get('contact', {}) + if isinstance(contact, str): + contact = json.loads(contact) if contact else {} + + # collect activity scores per platform + activity_scores = {} + now = datetime.now() + 
thirty_days_ago = now - timedelta(days=30) + ninety_days_ago = now - timedelta(days=90) + + # github activity + github_username = human.get('username') if human.get('platform') == 'github' else extra.get('github') + if github_username: + github_score = 0 + top_repos = extra.get('top_repos', []) + + for repo in top_repos: + # recent commits weight more + pushed_at = repo.get('pushed_at', '') + if pushed_at: + try: + push_date = datetime.fromisoformat(pushed_at.replace('Z', '+00:00')).replace(tzinfo=None) + if push_date > thirty_days_ago: + github_score += 10 # very recent + elif push_date > ninety_days_ago: + github_score += 5 # somewhat recent + else: + github_score += 1 # old but exists + except: + github_score += 1 + + # stars indicate engagement + github_score += min(repo.get('stars', 0) // 10, 5) + + # commit activity from deep scrape + commit_count = extra.get('commit_count', 0) + github_score += min(commit_count // 10, 20) + + if github_score > 0: + activity_scores['github_issue'] = { + 'score': github_score, + 'info': f"{github_username}/{top_repos[0]['name']}" if top_repos else github_username + } + + # mastodon activity + mastodon_handle = human.get('username') if human.get('platform') == 'mastodon' else (extra.get('mastodon') or contact.get('mastodon')) + if mastodon_handle: + mastodon_score = 0 + statuses_count = extra.get('mastodon_statuses', 0) or human.get('statuses_count', 0) + + # high post count = active user + if statuses_count > 1000: + mastodon_score += 30 + elif statuses_count > 500: + mastodon_score += 20 + elif statuses_count > 100: + mastodon_score += 10 + elif statuses_count > 0: + mastodon_score += 5 + + # platform bonus for fediverse (values-aligned) + mastodon_score += 10 + + # bonus if handle was discovered via rel="me" or similar verification + # (having a handle linked from their website = they want to be contacted there) + handles = extra.get('handles', {}) + if handles.get('mastodon') == mastodon_handle: + mastodon_score += 15 # 
verified handle bonus + + if mastodon_score > 0: + activity_scores['mastodon'] = {'score': mastodon_score, 'info': mastodon_handle} + + # bluesky activity + bluesky_handle = human.get('username') if human.get('platform') == 'bluesky' else (extra.get('bluesky') or contact.get('bluesky')) + if bluesky_handle: + bluesky_score = 0 + posts_count = extra.get('bluesky_posts', 0) or human.get('posts_count', 0) + + if posts_count > 500: + bluesky_score += 25 + elif posts_count > 100: + bluesky_score += 15 + elif posts_count > 0: + bluesky_score += 5 + + # newer platform, slightly lower weight + bluesky_score += 5 + + if bluesky_score > 0: + activity_scores['bluesky'] = {'score': bluesky_score, 'info': bluesky_handle} + + # twitter activity + twitter_handle = extra.get('twitter') or contact.get('twitter') + if twitter_handle: + twitter_score = 0 + tweets_count = extra.get('twitter_tweets', 0) + + if tweets_count > 1000: + twitter_score += 20 + elif tweets_count > 100: + twitter_score += 10 + elif tweets_count > 0: + twitter_score += 5 + + # if we found them via twitter hashtags, they're active there + if human.get('platform') == 'twitter': + twitter_score += 15 + + if twitter_score > 0: + activity_scores['twitter'] = {'score': twitter_score, 'info': twitter_handle} + + # NOTE: reddit is DISCOVERY ONLY, not a contact method + # we find users on reddit but reach out via their external links (github, mastodon, etc.) 
+ # reddit-only users go to manual_queue for review + + # lobsters activity + lobsters_username = extra.get('lobsters') or contact.get('lobsters') + if lobsters_username or human.get('platform') == 'lobsters': + lobsters_score = 0 + lobsters_username = lobsters_username or human.get('username') + + karma = extra.get('lobsters_karma', 0) or human.get('karma', 0) + + # lobsters is invite-only, high signal + lobsters_score += 15 + + if karma > 100: + lobsters_score += 15 + elif karma > 50: + lobsters_score += 10 + elif karma > 0: + lobsters_score += 5 + + if lobsters_score > 0: + activity_scores['lobsters'] = {'score': lobsters_score, 'info': lobsters_username} + + # matrix activity + matrix_id = extra.get('matrix') or contact.get('matrix') + if matrix_id: + matrix_score = 0 + + # matrix users are typically privacy-conscious and technical + matrix_score += 15 # platform bonus for decentralized chat + + # bonus if handle was discovered via rel="me" verification + handles = extra.get('handles', {}) + if handles.get('matrix') == matrix_id: + matrix_score += 10 # verified handle bonus + + if matrix_score > 0: + activity_scores['matrix'] = {'score': matrix_score, 'info': matrix_id} + + # lemmy activity (fediverse) + lemmy_username = human.get('username') if human.get('platform') == 'lemmy' else extra.get('lemmy') + if lemmy_username: + lemmy_score = 0 + + # lemmy is fediverse - high values alignment + lemmy_score += 20 # fediverse platform bonus + + post_count = extra.get('post_count', 0) + comment_count = extra.get('comment_count', 0) + + if post_count > 100: + lemmy_score += 15 + elif post_count > 50: + lemmy_score += 10 + elif post_count > 10: + lemmy_score += 5 + + if comment_count > 500: + lemmy_score += 10 + elif comment_count > 100: + lemmy_score += 5 + + if lemmy_score > 0: + activity_scores['lemmy'] = {'score': lemmy_score, 'info': lemmy_username} + + # pick highest activity platform + if activity_scores: + best_platform = max(activity_scores.items(), key=lambda 
x: x[1]['score']) + return best_platform[0], best_platform[1]['info'] + + # fall back to email ONLY if no social activity detected + email = extra.get('email') or contact.get('email') + # also check emails list + if not email: + emails = extra.get('emails') or contact.get('emails') or [] + for e in emails: + if e and '@' in e and 'noreply' not in e.lower(): + email = e + break + + if email and '@' in email and 'noreply' not in email.lower(): + return 'email', email + + # last resort: manual + return 'manual', None + + +def draft_intro_with_llm(match_data, recipient='a', dry_run=False): + """ + use groq llama 4 maverick to draft a personalized intro + + match_data should contain: + - human_a: the first person + - human_b: the second person + - overlap_score: numeric score + - overlap_reasons: list of why they match + + recipient: 'a' or 'b' - who we're writing to + """ + if not GROQ_API_KEY: + return None, "GROQ_API_KEY not set" + + # determine recipient and other person + if recipient == 'a': + to_person = match_data.get('human_a', {}) + other_person = match_data.get('human_b', {}) + else: + to_person = match_data.get('human_b', {}) + other_person = match_data.get('human_a', {}) + + # build context + to_name = to_person.get('name') or to_person.get('username', 'friend') + other_name = other_person.get('name') or other_person.get('username', 'someone') + + to_signals = to_person.get('signals', []) + if isinstance(to_signals, str): + to_signals = json.loads(to_signals) if to_signals else [] + + other_signals = other_person.get('signals', []) + if isinstance(other_signals, str): + other_signals = json.loads(other_signals) if other_signals else [] + + overlap_reasons = match_data.get('overlap_reasons', []) + if isinstance(overlap_reasons, str): + overlap_reasons = json.loads(overlap_reasons) if overlap_reasons else [] + + # parse extra data + to_extra = to_person.get('extra', {}) + other_extra = other_person.get('extra', {}) + if isinstance(to_extra, str): + to_extra = 
json.loads(to_extra) if to_extra else {} + if isinstance(other_extra, str): + other_extra = json.loads(other_extra) if other_extra else {} + + # build profile summaries + to_profile = f""" +name: {to_name} +platform: {to_person.get('platform', 'unknown')} +bio: {to_person.get('bio') or 'no bio'} +location: {to_person.get('location') or 'unknown'} +signals: {', '.join(to_signals[:8])} +repos: {len(to_extra.get('top_repos', []))} public repos +languages: {', '.join(to_extra.get('languages', {}).keys())} +""" + + other_profile = f""" +name: {other_name} +platform: {other_person.get('platform', 'unknown')} +bio: {other_person.get('bio') or 'no bio'} +location: {other_person.get('location') or 'unknown'} +signals: {', '.join(other_signals[:8])} +repos: {len(other_extra.get('top_repos', []))} public repos +languages: {', '.join(other_extra.get('languages', {}).keys())} +url: {other_person.get('url', '')} +""" + + # build prompt + system_prompt = """you are connectd, an ai that connects isolated builders who share values but don't know each other yet. + +your job is to write a short, genuine intro message to one person about another person they might want to know. + +rules: +- be brief (3-5 sentences max) +- be genuine, not salesy or fake +- focus on WHY they might want to connect, not just WHAT they have in common +- don't be cringe or use buzzwords +- lowercase preferred (casual tone) +- no emojis unless the person's profile suggests they'd like them +- mention specific things from their profiles, not generic "you both like open source" +- end with a simple invitation, not a hard sell +- sign off as "- connectd" (lowercase) + +bad examples: +- "I noticed you're both passionate about..." (too formal) +- "You two would be PERFECT for each other!" (too salesy) +- "As a fellow privacy enthusiast..." (cringe) + +good examples: +- "hey, saw you're building X. there's someone else working on similar stuff in Y who might be interesting to know." 
+- "you might want to check out Z's work on federated systems - similar approach to what you're doing with A." +""" + + user_prompt = f"""write an intro message to {to_name} about {other_name}. + +RECIPIENT ({to_name}): +{to_profile} + +INTRODUCING ({other_name}): +{other_profile} + +WHY THEY MATCH (overlap score {match_data.get('overlap_score', 0)}): +{', '.join(overlap_reasons[:5])} + +write a short intro message. remember: lowercase, genuine, not salesy.""" + + try: + response = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [ + {'role': 'system', 'content': system_prompt}, + {'role': 'user', 'content': user_prompt}, + ], + 'temperature': 0.7, + 'max_tokens': 300, + }, + timeout=30, + ) + + if response.status_code != 200: + return None, f"groq api error: {response.status_code} - {response.text}" + + data = response.json() + draft = data['choices'][0]['message']['content'].strip() + + # determine contact method for recipient + contact_method, contact_info = determine_contact_method(to_person) + + return { + 'draft': draft, + 'model': MODEL, + 'to': to_name, + 'about': other_name, + 'overlap_score': match_data.get('overlap_score', 0), + 'contact_method': contact_method, + 'contact_info': contact_info, + 'generated_at': datetime.now().isoformat(), + }, None + + except Exception as e: + return None, f"groq error: {str(e)}" + + +def draft_intro_batch(matches, dry_run=False): + """ + draft intros for multiple matches + returns list of (match, intro_result, error) tuples + """ + results = [] + + for match in matches: + # draft for both directions + intro_a, err_a = draft_intro_with_llm(match, recipient='a', dry_run=dry_run) + intro_b, err_b = draft_intro_with_llm(match, recipient='b', dry_run=dry_run) + + results.append({ + 'match': match, + 'intro_to_a': intro_a, + 'intro_to_b': intro_b, + 'errors': [err_a, err_b], + }) + + return results + + 
+def test_groq_connection(): + """test that groq api is working""" + if not GROQ_API_KEY: + return False, "GROQ_API_KEY not set" + + try: + response = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [{'role': 'user', 'content': 'say "ok" and nothing else'}], + 'max_tokens': 10, + }, + timeout=10, + ) + + if response.status_code == 200: + return True, "groq api working" + else: + return False, f"groq api error: {response.status_code}" + + except Exception as e: + return False, f"groq connection error: {str(e)}" diff --git a/groq_draft_orig.py b/groq_draft_orig.py new file mode 100644 index 0000000..584d993 --- /dev/null +++ b/groq_draft_orig.py @@ -0,0 +1,460 @@ +""" +introd/groq_draft.py - groq llama 4 maverick for smart intro drafting + +uses groq api to generate personalized, natural intro messages +that don't sound like ai-generated slop +""" + +import os +import json +import requests +from datetime import datetime + +GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') +GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' +MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile') + + +def determine_contact_method(human): + """ + determine best contact method based on WHERE THEY'RE MOST ACTIVE + + don't use fixed hierarchy - analyze activity per platform: + - count posts/commits/activity + - weight by recency (last 30 days matters more) + - contact them where they already are + - fall back to email only if no social activity + """ + from datetime import datetime, timedelta + + extra = human.get('extra', {}) + if isinstance(extra, str): + extra = json.loads(extra) if extra else {} + + # handle nested extra.extra from old save format + if 'extra' in extra and isinstance(extra['extra'], dict): + extra = {**extra, **extra['extra']} + + contact = human.get('contact', {}) + if isinstance(contact, str): + contact = 
json.loads(contact) if contact else {} + + # collect activity scores per platform + activity_scores = {} + now = datetime.now() + thirty_days_ago = now - timedelta(days=30) + ninety_days_ago = now - timedelta(days=90) + + # github activity + github_username = human.get('username') if human.get('platform') == 'github' else extra.get('github') + if github_username: + github_score = 0 + top_repos = extra.get('top_repos', []) + + for repo in top_repos: + # recent commits weight more + pushed_at = repo.get('pushed_at', '') + if pushed_at: + try: + push_date = datetime.fromisoformat(pushed_at.replace('Z', '+00:00')).replace(tzinfo=None) + if push_date > thirty_days_ago: + github_score += 10 # very recent + elif push_date > ninety_days_ago: + github_score += 5 # somewhat recent + else: + github_score += 1 # old but exists + except: + github_score += 1 + + # stars indicate engagement + github_score += min(repo.get('stars', 0) // 10, 5) + + # commit activity from deep scrape + commit_count = extra.get('commit_count', 0) + github_score += min(commit_count // 10, 20) + + if github_score > 0: + activity_scores['github_issue'] = { + 'score': github_score, + 'info': f"{github_username}/{top_repos[0]['name']}" if top_repos else github_username + } + + # mastodon activity + mastodon_handle = extra.get('mastodon') or contact.get('mastodon') + if mastodon_handle: + mastodon_score = 0 + statuses_count = extra.get('mastodon_statuses', 0) or human.get('statuses_count', 0) + + # high post count = active user + if statuses_count > 1000: + mastodon_score += 30 + elif statuses_count > 500: + mastodon_score += 20 + elif statuses_count > 100: + mastodon_score += 10 + elif statuses_count > 0: + mastodon_score += 5 + + # platform bonus for fediverse (values-aligned) + mastodon_score += 10 + + # bonus if handle was discovered via rel="me" or similar verification + # (having a handle linked from their website = they want to be contacted there) + handles = extra.get('handles', {}) + if 
handles.get('mastodon') == mastodon_handle: + mastodon_score += 15 # verified handle bonus + + if mastodon_score > 0: + activity_scores['mastodon'] = {'score': mastodon_score, 'info': mastodon_handle} + + # bluesky activity + bluesky_handle = extra.get('bluesky') or contact.get('bluesky') + if bluesky_handle: + bluesky_score = 0 + posts_count = extra.get('bluesky_posts', 0) or human.get('posts_count', 0) + + if posts_count > 500: + bluesky_score += 25 + elif posts_count > 100: + bluesky_score += 15 + elif posts_count > 0: + bluesky_score += 5 + + # newer platform, slightly lower weight + bluesky_score += 5 + + if bluesky_score > 0: + activity_scores['bluesky'] = {'score': bluesky_score, 'info': bluesky_handle} + + # twitter activity + twitter_handle = extra.get('twitter') or contact.get('twitter') + if twitter_handle: + twitter_score = 0 + tweets_count = extra.get('twitter_tweets', 0) + + if tweets_count > 1000: + twitter_score += 20 + elif tweets_count > 100: + twitter_score += 10 + elif tweets_count > 0: + twitter_score += 5 + + # if we found them via twitter hashtags, they're active there + if human.get('platform') == 'twitter': + twitter_score += 15 + + if twitter_score > 0: + activity_scores['twitter'] = {'score': twitter_score, 'info': twitter_handle} + + # NOTE: reddit is DISCOVERY ONLY, not a contact method + # we find users on reddit but reach out via their external links (github, mastodon, etc.) 
+ # reddit-only users go to manual_queue for review + + # lobsters activity + lobsters_username = extra.get('lobsters') or contact.get('lobsters') + if lobsters_username or human.get('platform') == 'lobsters': + lobsters_score = 0 + lobsters_username = lobsters_username or human.get('username') + + karma = extra.get('lobsters_karma', 0) or human.get('karma', 0) + + # lobsters is invite-only, high signal + lobsters_score += 15 + + if karma > 100: + lobsters_score += 15 + elif karma > 50: + lobsters_score += 10 + elif karma > 0: + lobsters_score += 5 + + if lobsters_score > 0: + activity_scores['lobsters'] = {'score': lobsters_score, 'info': lobsters_username} + + # matrix activity + matrix_id = extra.get('matrix') or contact.get('matrix') + if matrix_id: + matrix_score = 0 + + # matrix users are typically privacy-conscious and technical + matrix_score += 15 # platform bonus for decentralized chat + + # bonus if handle was discovered via rel="me" verification + handles = extra.get('handles', {}) + if handles.get('matrix') == matrix_id: + matrix_score += 10 # verified handle bonus + + if matrix_score > 0: + activity_scores['matrix'] = {'score': matrix_score, 'info': matrix_id} + + # lemmy activity (fediverse) + lemmy_username = human.get('username') if human.get('platform') == 'lemmy' else extra.get('lemmy') + if lemmy_username: + lemmy_score = 0 + + # lemmy is fediverse - high values alignment + lemmy_score += 20 # fediverse platform bonus + + post_count = extra.get('post_count', 0) + comment_count = extra.get('comment_count', 0) + + if post_count > 100: + lemmy_score += 15 + elif post_count > 50: + lemmy_score += 10 + elif post_count > 10: + lemmy_score += 5 + + if comment_count > 500: + lemmy_score += 10 + elif comment_count > 100: + lemmy_score += 5 + + if lemmy_score > 0: + activity_scores['lemmy'] = {'score': lemmy_score, 'info': lemmy_username} + + # pick highest activity platform + if activity_scores: + best_platform = max(activity_scores.items(), key=lambda 
x: x[1]['score']) + return best_platform[0], best_platform[1]['info'] + + # fall back to email ONLY if no social activity detected + email = extra.get('email') or contact.get('email') + # also check emails list + if not email: + emails = extra.get('emails') or contact.get('emails') or [] + for e in emails: + if e and '@' in e and 'noreply' not in e.lower(): + email = e + break + + if email and '@' in email and 'noreply' not in email.lower(): + return 'email', email + + # last resort: manual + return 'manual', None + + +def draft_intro_with_llm(match_data, recipient='a', dry_run=False): + """ + use groq llama 4 maverick to draft a personalized intro + + match_data should contain: + - human_a: the first person + - human_b: the second person + - overlap_score: numeric score + - overlap_reasons: list of why they match + + recipient: 'a' or 'b' - who we're writing to + """ + if not GROQ_API_KEY: + return None, "GROQ_API_KEY not set" + + # determine recipient and other person + if recipient == 'a': + to_person = match_data.get('human_a', {}) + other_person = match_data.get('human_b', {}) + else: + to_person = match_data.get('human_b', {}) + other_person = match_data.get('human_a', {}) + + # build context + to_name = to_person.get('name') or to_person.get('username', 'friend') + other_name = other_person.get('name') or other_person.get('username', 'someone') + + to_signals = to_person.get('signals', []) + if isinstance(to_signals, str): + to_signals = json.loads(to_signals) if to_signals else [] + + other_signals = other_person.get('signals', []) + if isinstance(other_signals, str): + other_signals = json.loads(other_signals) if other_signals else [] + + overlap_reasons = match_data.get('overlap_reasons', []) + if isinstance(overlap_reasons, str): + overlap_reasons = json.loads(overlap_reasons) if overlap_reasons else [] + + # parse extra data + to_extra = to_person.get('extra', {}) + other_extra = other_person.get('extra', {}) + if isinstance(to_extra, str): + to_extra = 
json.loads(to_extra) if to_extra else {} + if isinstance(other_extra, str): + other_extra = json.loads(other_extra) if other_extra else {} + + # build profile summaries + to_profile = f""" +name: {to_name} +platform: {to_person.get('platform', 'unknown')} +bio: {to_person.get('bio') or 'no bio'} +location: {to_person.get('location') or 'unknown'} +signals: {', '.join(to_signals[:8])} +repos: {len(to_extra.get('top_repos', []))} public repos +languages: {', '.join(to_extra.get('languages', {}).keys())} +""" + + + # extract other person's best contact method + other_contact = other_person.get('contact', {}) + if isinstance(other_contact, str): + import json as j + try: + other_contact = j.loads(other_contact) + except: + other_contact = {} + + # determine their preferred contact + other_preferred = '' + if other_contact.get('mastodon'): + other_preferred = f"mastodon: {other_contact['mastodon']}" + elif other_contact.get('github'): + other_preferred = f"github: github.com/{other_contact['github']}" + elif other_contact.get('email'): + other_preferred = f"email: {other_contact['email']}" + elif other_person.get('url'): + other_preferred = f"url: {other_person['url']}" + +other_profile = f""" +name: {other_name} +platform: {other_person.get('platform', 'unknown')} +bio: {other_person.get('bio') or 'no bio'} +location: {other_person.get('location') or 'unknown'} +signals: {', '.join(other_signals[:8])} +repos: {len(other_extra.get('top_repos', []))} public repos +languages: {', '.join(other_extra.get('languages', {}).keys())} +url: {other_person.get('url', '')} +contact: {other_preferred} +""" + + # build prompt + system_prompt = """you are connectd, an ai that connects isolated builders who share values but don't know each other yet. + +your job is to write a short, genuine intro message to one person about another person they might want to know. 
+ +rules: +- be brief (3-5 sentences max) +- be genuine, not salesy or fake +- focus on WHY they might want to connect, not just WHAT they have in common +- don't be cringe or use buzzwords +- lowercase preferred (casual tone) +- no emojis unless the person's profile suggests they'd like them +- mention specific things from their profiles, not generic "you both like open source" +- end with a simple invitation, not a hard sell +- IMPORTANT: always tell them how to reach the other person (their contact info is provided) +- sign off as "- connectd" (lowercase) + +bad examples: +- "I noticed you're both passionate about..." (too formal) +- "You two would be PERFECT for each other!" (too salesy) +- "As a fellow privacy enthusiast..." (cringe) + +good examples: +- "hey, saw you're building X. there's someone else working on similar stuff in Y who might be interesting to know." +- "you might want to check out Z's work on federated systems - similar approach to what you're doing with A." +""" + + user_prompt = f"""write an intro message to {to_name} about {other_name}. + +RECIPIENT ({to_name}): +{to_profile} + +INTRODUCING ({other_name}): +{other_profile} + +WHY THEY MATCH (overlap score {match_data.get('overlap_score', 0)}): +{', '.join(overlap_reasons[:5])} + +write a short intro message. 
remember: lowercase, genuine, not salesy.""" + + try: + response = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [ + {'role': 'system', 'content': system_prompt}, + {'role': 'user', 'content': user_prompt}, + ], + 'temperature': 0.7, + 'max_tokens': 300, + }, + timeout=30, + ) + + if response.status_code != 200: + return None, f"groq api error: {response.status_code} - {response.text}" + + data = response.json() + draft = data['choices'][0]['message']['content'].strip() + + # determine contact method for recipient + contact_method, contact_info = determine_contact_method(to_person) + + return { + 'draft': draft, + 'model': MODEL, + 'to': to_name, + 'about': other_name, + 'overlap_score': match_data.get('overlap_score', 0), + 'contact_method': contact_method, + 'contact_info': contact_info, + 'generated_at': datetime.now().isoformat(), + }, None + + except Exception as e: + return None, f"groq error: {str(e)}" + + +def draft_intro_batch(matches, dry_run=False): + """ + draft intros for multiple matches + returns list of (match, intro_result, error) tuples + """ + results = [] + + for match in matches: + # draft for both directions + intro_a, err_a = draft_intro_with_llm(match, recipient='a', dry_run=dry_run) + intro_b, err_b = draft_intro_with_llm(match, recipient='b', dry_run=dry_run) + + results.append({ + 'match': match, + 'intro_to_a': intro_a, + 'intro_to_b': intro_b, + 'errors': [err_a, err_b], + }) + + return results + + +def test_groq_connection(): + """test that groq api is working""" + if not GROQ_API_KEY: + return False, "GROQ_API_KEY not set" + + try: + response = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [{'role': 'user', 'content': 'say "ok" and nothing else'}], + 'max_tokens': 10, + }, + timeout=10, 
+ ) + + if response.status_code == 200: + return True, "groq api working" + else: + return False, f"groq api error: {response.status_code}" + + except Exception as e: + return False, f"groq connection error: {str(e)}" diff --git a/hacs/README.md b/hacs/README.md new file mode 100644 index 0000000..fa9940e --- /dev/null +++ b/hacs/README.md @@ -0,0 +1,88 @@ +# connectd home assistant integration + +monitor your connectd daemon from home assistant. + +## installation + +### HACS (recommended) + +1. open HACS in home assistant +2. click the three dots menu → custom repositories +3. add `https://github.com/sudoxnym/connectd` with category "integration" +4. search for "connectd" and install +5. restart home assistant +6. go to settings → devices & services → add integration → connectd + +### manual + +1. copy `custom_components/connectd` to your HA `config/custom_components/` directory +2. restart home assistant +3. go to settings → devices & services → add integration → connectd + +## configuration + +enter the host and port of your connectd daemon: +- **host**: IP or hostname where connectd is running (e.g., `192.168.1.8`) +- **port**: API port (default: `8099`) + +## sensors + +the integration creates these sensors: + +### stats +- `sensor.connectd_total_humans` - total discovered humans +- `sensor.connectd_high_score_humans` - humans with high values alignment +- `sensor.connectd_total_matches` - total matches found +- `sensor.connectd_total_intros` - total intro drafts +- `sensor.connectd_sent_intros` - intros successfully sent +- `sensor.connectd_active_builders` - active builder count +- `sensor.connectd_lost_builders` - lost builder count +- `sensor.connectd_recovering_builders` - recovering builder count +- `sensor.connectd_lost_outreach_sent` - lost builder outreach count + +### state +- `sensor.connectd_intros_today` - intros sent today +- `sensor.connectd_lost_intros_today` - lost builder intros today +- `sensor.connectd_status` - daemon status 
(running/dry_run/stopped) + +### per-platform +- `sensor.connectd_github_humans` +- `sensor.connectd_mastodon_humans` +- `sensor.connectd_reddit_humans` +- `sensor.connectd_lemmy_humans` +- `sensor.connectd_discord_humans` +- `sensor.connectd_lobsters_humans` + +## example dashboard card + +```yaml +type: entities +title: connectd +entities: + - entity: sensor.connectd_status + - entity: sensor.connectd_total_humans + - entity: sensor.connectd_intros_today + - entity: sensor.connectd_lost_intros_today + - entity: sensor.connectd_active_builders + - entity: sensor.connectd_lost_builders +``` + +## automations + +example: notify when an intro is sent: + +```yaml +automation: + - alias: "connectd intro notification" + trigger: + - platform: state + entity_id: sensor.connectd_intros_today + condition: + - condition: template + value_template: "{{ trigger.to_state.state | int > trigger.from_state.state | int }}" + action: + - service: notify.mobile_app + data: + title: "connectd" + message: "sent intro #{{ states('sensor.connectd_intros_today') }} today" +``` diff --git a/hacs/custom_components/connectd/__init__.py b/hacs/custom_components/connectd/__init__.py new file mode 100644 index 0000000..4f19a2f --- /dev/null +++ b/hacs/custom_components/connectd/__init__.py @@ -0,0 +1,117 @@ +"""connectd integration for home assistant.""" +from __future__ import annotations + +import asyncio +import logging +from datetime import timedelta + +import aiohttp + +from homeassistant.config_entries import ConfigEntry +from homeassistant.const import Platform +from homeassistant.core import HomeAssistant +from homeassistant.helpers.update_coordinator import DataUpdateCoordinator, UpdateFailed + +_LOGGER = logging.getLogger(__name__) + +DOMAIN = "connectd" +PLATFORMS = [Platform.SENSOR] +SCAN_INTERVAL = timedelta(minutes=1) + + +async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool: + """set up connectd from a config entry.""" + host = entry.data["host"] + port = 
entry.data["port"] + + coordinator = ConnectdDataUpdateCoordinator(hass, host, port) + await coordinator.async_config_entry_first_refresh() + + hass.data.setdefault(DOMAIN, {}) + hass.data[DOMAIN][entry.entry_id] = coordinator + + await hass.config_entries.async_forward_entry_setups(entry, PLATFORMS) + + return True + + +async def async_unload_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool: + """unload a config entry.""" + unload_ok = await hass.config_entries.async_unload_platforms(entry, PLATFORMS) + if unload_ok: + hass.data[DOMAIN].pop(entry.entry_id) + return unload_ok + + +class ConnectdDataUpdateCoordinator(DataUpdateCoordinator): + """class to manage fetching connectd data.""" + + def __init__(self, hass: HomeAssistant, host: str, port: int) -> None: + """initialize.""" + self.host = host + self.port = port + self.base_url = f"http://{host}:{port}" + + super().__init__( + hass, + _LOGGER, + name=DOMAIN, + update_interval=SCAN_INTERVAL, + ) + + async def _async_update_data(self): + """fetch data from connectd api.""" + try: + async with asyncio.timeout(10): + async with aiohttp.ClientSession() as session: + # get stats + async with session.get(f"{self.base_url}/api/stats") as resp: + if resp.status != 200: + raise UpdateFailed(f"error fetching stats: {resp.status}") + stats = await resp.json() + + # get state + async with session.get(f"{self.base_url}/api/state") as resp: + if resp.status != 200: + raise UpdateFailed(f"error fetching state: {resp.status}") + state = await resp.json() + + # get priority matches (optional) + priority_matches = {} + try: + async with session.get(f"{self.base_url}/api/priority_matches") as resp: + if resp.status == 200: + priority_matches = await resp.json() + except Exception: + pass + + # get top humans (optional) + top_humans = {} + try: + async with session.get(f"{self.base_url}/api/top_humans") as resp: + if resp.status == 200: + top_humans = await resp.json() + except Exception: + pass + + # get user info 
(optional) + user = {} + try: + async with session.get(f"{self.base_url}/api/user") as resp: + if resp.status == 200: + user = await resp.json() + except Exception: + pass + + return { + "stats": stats, + "state": state, + "priority_matches": priority_matches, + "top_humans": top_humans, + "user": user, + } + + except aiohttp.ClientError as err: + raise UpdateFailed(f"error communicating with connectd: {err}") + except Exception as err: + raise UpdateFailed(f"unexpected error: {err}") diff --git a/hacs/custom_components/connectd/branding/icon.png b/hacs/custom_components/connectd/branding/icon.png new file mode 100644 index 0000000..cc332d8 Binary files /dev/null and b/hacs/custom_components/connectd/branding/icon.png differ diff --git a/hacs/custom_components/connectd/branding/icon@2x.png b/hacs/custom_components/connectd/branding/icon@2x.png new file mode 100644 index 0000000..292a405 Binary files /dev/null and b/hacs/custom_components/connectd/branding/icon@2x.png differ diff --git a/hacs/custom_components/connectd/config_flow.py b/hacs/custom_components/connectd/config_flow.py new file mode 100644 index 0000000..79526b2 --- /dev/null +++ b/hacs/custom_components/connectd/config_flow.py @@ -0,0 +1,71 @@ +"""config flow for connectd integration.""" +from __future__ import annotations + +import logging + +import aiohttp +import voluptuous as vol + +from homeassistant import config_entries +from homeassistant.const import CONF_HOST, CONF_PORT +from homeassistant.data_entry_flow import FlowResult + +_LOGGER = logging.getLogger(__name__) + +DOMAIN = "connectd" +DEFAULT_PORT = 8099 + + +class ConnectdConfigFlow(config_entries.ConfigFlow, domain=DOMAIN): + """handle a config flow for connectd.""" + + VERSION = 1 + + async def async_step_user( + self, user_input: dict | None = None + ) -> FlowResult: + """handle the initial step.""" + errors = {} + + if user_input is not None: + host = user_input[CONF_HOST] + port = user_input.get(CONF_PORT, DEFAULT_PORT) + + # test 
connection + try: + timeout = aiohttp.ClientTimeout(total=10) + async with aiohttp.ClientSession(timeout=timeout) as session: + url = f"http://{host}:{port}/api/health" + async with session.get(url) as resp: + if resp.status == 200: + # connection works + await self.async_set_unique_id(f"{host}:{port}") + self._abort_if_unique_id_configured() + + return self.async_create_entry( + title=f"connectd ({host})", + data={ + "host": host, + "port": port, + }, + ) + else: + _LOGGER.error("connectd api returned status %s", resp.status) + errors["base"] = "cannot_connect" + except aiohttp.ClientError as err: + _LOGGER.error("connectd connection error: %s", err) + errors["base"] = "cannot_connect" + except Exception as err: + _LOGGER.exception("connectd unexpected error: %s", err) + errors["base"] = "unknown" + + return self.async_show_form( + step_id="user", + data_schema=vol.Schema( + { + vol.Required(CONF_HOST, default="192.168.1.8"): str, + vol.Optional(CONF_PORT, default=DEFAULT_PORT): int, + } + ), + errors=errors, + ) diff --git a/hacs/custom_components/connectd/manifest.json b/hacs/custom_components/connectd/manifest.json new file mode 100644 index 0000000..86c0b8a --- /dev/null +++ b/hacs/custom_components/connectd/manifest.json @@ -0,0 +1,11 @@ +{ + "domain": "connectd", + "name": "connectd", + "codeowners": ["@sudoxnym"], + "config_flow": true, + "documentation": "https://github.com/sudoxnym/connectd", + "iot_class": "local_polling", + "issue_tracker": "https://github.com/sudoxnym/connectd/issues", + "requirements": [], + "version": "1.1.0" +} diff --git a/hacs/custom_components/connectd/sensor.py b/hacs/custom_components/connectd/sensor.py new file mode 100644 index 0000000..a8c7c1e --- /dev/null +++ b/hacs/custom_components/connectd/sensor.py @@ -0,0 +1,363 @@ +"""sensor platform for connectd.""" +from __future__ import annotations + +from homeassistant.components.sensor import ( + SensorEntity, + SensorStateClass, +) +from homeassistant.config_entries import 
ConfigEntry +from homeassistant.core import HomeAssistant +from homeassistant.helpers.device_registry import DeviceInfo +from homeassistant.helpers.entity_platform import AddEntitiesCallback +from homeassistant.helpers.update_coordinator import CoordinatorEntity + +from . import DOMAIN, ConnectdDataUpdateCoordinator + + +def get_device_info(entry_id: str, host: str) -> DeviceInfo: + """return device info for connectd daemon.""" + return DeviceInfo( + identifiers={(DOMAIN, entry_id)}, + name="connectd daemon", + manufacturer="sudoxnym", + model="connectd", + sw_version="1.1.0", + configuration_url=f"http://{host}:8099", + ) + +SENSORS = [ + # stats sensors + ("total_humans", "total humans", "mdi:account-group", "stats"), + ("high_score_humans", "high score humans", "mdi:account-star", "stats"), + ("total_matches", "total matches", "mdi:handshake", "stats"), + ("total_intros", "total intros", "mdi:email-outline", "stats"), + ("sent_intros", "sent intros", "mdi:email-check", "stats"), + ("active_builders", "active builders", "mdi:hammer-wrench", "stats"), + ("lost_builders", "lost builders", "mdi:account-question", "stats"), + ("recovering_builders", "recovering builders", "mdi:account-heart", "stats"), + ("lost_outreach_sent", "lost outreach sent", "mdi:heart-pulse", "stats"), + + # state sensors + ("intros_today", "intros today", "mdi:email-fast", "state"), + ("lost_intros_today", "lost intros today", "mdi:heart-outline", "state"), +] + + +async def async_setup_entry( + hass: HomeAssistant, + entry: ConfigEntry, + async_add_entities: AddEntitiesCallback, +) -> None: + """set up connectd sensors.""" + coordinator = hass.data[DOMAIN][entry.entry_id] + host = entry.data.get("host", "localhost") + device_info = get_device_info(entry.entry_id, host) + + entities = [] + for sensor_key, name, icon, data_source in SENSORS: + entities.append( + ConnectdSensor(coordinator, sensor_key, name, icon, data_source, device_info) + ) + + # add status sensor + 
entities.append(ConnectdStatusSensor(coordinator, device_info)) + + # add priority matches sensor + entities.append(ConnectdPriorityMatchesSensor(coordinator, device_info)) + + # add top humans sensor + entities.append(ConnectdTopHumansSensor(coordinator, device_info)) + + # add countdown sensors + entities.append(ConnectdCountdownSensor(coordinator, device_info, "scout", "mdi:radar")) + entities.append(ConnectdCountdownSensor(coordinator, device_info, "match", "mdi:handshake")) + entities.append(ConnectdCountdownSensor(coordinator, device_info, "intro", "mdi:email-fast")) + + # add personal score sensor + entities.append(ConnectdUserScoreSensor(coordinator, device_info)) + + # add platform sensors (by_platform dict) + entities.append(ConnectdPlatformSensor(coordinator, "github", device_info)) + entities.append(ConnectdPlatformSensor(coordinator, "mastodon", device_info)) + entities.append(ConnectdPlatformSensor(coordinator, "reddit", device_info)) + entities.append(ConnectdPlatformSensor(coordinator, "lemmy", device_info)) + entities.append(ConnectdPlatformSensor(coordinator, "discord", device_info)) + entities.append(ConnectdPlatformSensor(coordinator, "lobsters", device_info)) + + async_add_entities(entities) + + +class ConnectdSensor(CoordinatorEntity, SensorEntity): + """connectd sensor entity.""" + + def __init__( + self, + coordinator: ConnectdDataUpdateCoordinator, + sensor_key: str, + name: str, + icon: str, + data_source: str, + device_info: DeviceInfo, + ) -> None: + """initialize.""" + super().__init__(coordinator) + self._sensor_key = sensor_key + self._attr_name = f"connectd {name}" + self._attr_unique_id = f"connectd_{sensor_key}" + self._attr_icon = icon + self._data_source = data_source + self._attr_state_class = SensorStateClass.MEASUREMENT + self._attr_device_info = device_info + + @property + def native_value(self): + """return the state.""" + if self.coordinator.data: + data = self.coordinator.data.get(self._data_source, {}) + return 
data.get(self._sensor_key, 0) + return None + + +class ConnectdStatusSensor(CoordinatorEntity, SensorEntity): + """connectd daemon status sensor.""" + + def __init__(self, coordinator: ConnectdDataUpdateCoordinator, device_info: DeviceInfo) -> None: + """initialize.""" + super().__init__(coordinator) + self._attr_name = "connectd status" + self._attr_unique_id = "connectd_status" + self._attr_icon = "mdi:connection" + self._attr_device_info = device_info + + @property + def native_value(self): + """return the state.""" + if self.coordinator.data: + state = self.coordinator.data.get("state", {}) + if state.get("running"): + return "running" if not state.get("dry_run") else "dry_run" + return "stopped" + return "unavailable" + + @property + def extra_state_attributes(self): + """return extra attributes.""" + if self.coordinator.data: + state = self.coordinator.data.get("state", {}) + return { + "last_scout": state.get("last_scout"), + "last_match": state.get("last_match"), + "last_intro": state.get("last_intro"), + "last_lost": state.get("last_lost"), + "started_at": state.get("started_at"), + } + return {} + + +class ConnectdPlatformSensor(CoordinatorEntity, SensorEntity): + """connectd per-platform sensor.""" + + def __init__( + self, + coordinator: ConnectdDataUpdateCoordinator, + platform: str, + device_info: DeviceInfo, + ) -> None: + """initialize.""" + super().__init__(coordinator) + self._platform = platform + self._attr_name = f"connectd {platform} humans" + self._attr_unique_id = f"connectd_platform_{platform}" + self._attr_icon = self._get_platform_icon(platform) + self._attr_state_class = SensorStateClass.MEASUREMENT + self._attr_device_info = device_info + + def _get_platform_icon(self, platform: str) -> str: + """get icon for platform.""" + icons = { + "github": "mdi:github", + "mastodon": "mdi:mastodon", + "reddit": "mdi:reddit", + "lemmy": "mdi:alpha-l-circle", + "discord": "mdi:discord", + "lobsters": "mdi:web", + "bluesky": "mdi:cloud", + "matrix": 
"mdi:matrix", + } + return icons.get(platform, "mdi:web") + + @property + def native_value(self): + """return the state.""" + if self.coordinator.data: + stats = self.coordinator.data.get("stats", {}) + by_platform = stats.get("by_platform", {}) + return by_platform.get(self._platform, 0) + return 0 + + +class ConnectdPriorityMatchesSensor(CoordinatorEntity, SensorEntity): + """connectd priority matches sensor.""" + + def __init__(self, coordinator: ConnectdDataUpdateCoordinator, device_info: DeviceInfo) -> None: + """initialize.""" + super().__init__(coordinator) + self._attr_name = "connectd priority matches" + self._attr_unique_id = "connectd_priority_matches" + self._attr_icon = "mdi:account-star" + self._attr_state_class = SensorStateClass.MEASUREMENT + self._attr_device_info = device_info + + @property + def native_value(self): + """return count of new priority matches.""" + if self.coordinator.data: + pm = self.coordinator.data.get("priority_matches", {}) + return pm.get("new_count", 0) + return 0 + + @property + def extra_state_attributes(self): + """return top matches as attributes.""" + if self.coordinator.data: + pm = self.coordinator.data.get("priority_matches", {}) + top = pm.get("top_matches", []) + attrs = { + "total_matches": pm.get("count", 0), + "new_matches": pm.get("new_count", 0), + } + for i, m in enumerate(top[:3]): + attrs[f"match_{i+1}_username"] = m.get("username") + attrs[f"match_{i+1}_platform"] = m.get("platform") + attrs[f"match_{i+1}_score"] = m.get("overlap_score") + attrs[f"match_{i+1}_reasons"] = ", ".join(m.get("reasons", [])) + return attrs + return {} + + +class ConnectdTopHumansSensor(CoordinatorEntity, SensorEntity): + """connectd top humans sensor.""" + + def __init__(self, coordinator: ConnectdDataUpdateCoordinator, device_info: DeviceInfo) -> None: + """initialize.""" + super().__init__(coordinator) + self._attr_name = "connectd top human" + self._attr_unique_id = "connectd_top_human" + self._attr_icon = "mdi:account-check" 
+ self._attr_device_info = device_info + + @property + def native_value(self): + """return top human username.""" + if self.coordinator.data: + th = self.coordinator.data.get("top_humans", {}) + top = th.get("top_humans", []) + if top: + return top[0].get("username", "none") + return "none" + + @property + def extra_state_attributes(self): + """return top humans as attributes.""" + if self.coordinator.data: + th = self.coordinator.data.get("top_humans", {}) + top = th.get("top_humans", []) + attrs = {"total_high_score": th.get("count", 0)} + for i, h in enumerate(top[:5]): + attrs[f"human_{i+1}_username"] = h.get("username") + attrs[f"human_{i+1}_platform"] = h.get("platform") + attrs[f"human_{i+1}_score"] = h.get("score") + attrs[f"human_{i+1}_signals"] = ", ".join(h.get("signals", [])[:3]) + attrs[f"human_{i+1}_contact"] = h.get("contact_method") + return attrs + return {} + + +class ConnectdCountdownSensor(CoordinatorEntity, SensorEntity): + """connectd countdown timer sensor.""" + + def __init__( + self, + coordinator: ConnectdDataUpdateCoordinator, + device_info: DeviceInfo, + cycle_type: str, + icon: str, + ) -> None: + """initialize.""" + super().__init__(coordinator) + self._cycle_type = cycle_type + self._attr_name = f"connectd next {cycle_type}" + self._attr_unique_id = f"connectd_countdown_{cycle_type}" + self._attr_icon = icon + self._attr_device_info = device_info + self._attr_native_unit_of_measurement = "min" + + @property + def native_value(self): + """return minutes until next cycle.""" + if self.coordinator.data: + state = self.coordinator.data.get("state", {}) + secs = state.get(f"countdown_{self._cycle_type}", 0) + return int(secs / 60) + return 0 + + @property + def extra_state_attributes(self): + """return detailed countdown info.""" + if self.coordinator.data: + state = self.coordinator.data.get("state", {}) + secs = state.get(f"countdown_{self._cycle_type}", 0) + return { + "seconds": secs, + "hours": round(secs / 3600, 1), + 
f"last_{self._cycle_type}": state.get(f"last_{self._cycle_type}"), + } + return {} + + +class ConnectdUserScoreSensor(CoordinatorEntity, SensorEntity): + """connectd personal score sensor.""" + + def __init__(self, coordinator: ConnectdDataUpdateCoordinator, device_info: DeviceInfo) -> None: + """initialize.""" + super().__init__(coordinator) + self._attr_name = "connectd my score" + self._attr_unique_id = "connectd_user_score" + self._attr_icon = "mdi:star-circle" + self._attr_state_class = SensorStateClass.MEASUREMENT + self._attr_device_info = device_info + + @property + def native_value(self): + """return user's personal score.""" + if self.coordinator.data: + user = self.coordinator.data.get("user", {}) + return user.get("score", 0) + return 0 + + @property + def extra_state_attributes(self): + """return user profile details.""" + if self.coordinator.data: + user = self.coordinator.data.get("user", {}) + signals = user.get("signals", []) + interests = user.get("interests", []) + return { + "configured": user.get("configured", False), + "name": user.get("name"), + "github": user.get("github"), + "mastodon": user.get("mastodon"), + "reddit": user.get("reddit"), + "lobsters": user.get("lobsters"), + "matrix": user.get("matrix"), + "lemmy": user.get("lemmy"), + "discord": user.get("discord"), + "bluesky": user.get("bluesky"), + "location": user.get("location"), + "bio": user.get("bio"), + "match_count": user.get("match_count", 0), + "new_matches": user.get("new_match_count", 0), + "signals": ", ".join(signals[:5]) if signals else "", + "interests": ", ".join(interests[:5]) if interests else "", + } + return {} diff --git a/hacs/custom_components/connectd/strings.json b/hacs/custom_components/connectd/strings.json new file mode 100644 index 0000000..fdc092f --- /dev/null +++ b/hacs/custom_components/connectd/strings.json @@ -0,0 +1,18 @@ +{ + "config": { + "step": { + "user": { + "title": "connectd daemon", + "description": "connect to your connectd daemon for 
monitoring.", + "data": { + "host": "host", + "port": "port" + } + } + }, + "error": { + "cannot_connect": "failed to connect to connectd api", + "unknown": "unexpected error" + } + } +} diff --git a/hacs/hacs.json b/hacs/hacs.json new file mode 100644 index 0000000..b898f8a --- /dev/null +++ b/hacs/hacs.json @@ -0,0 +1,6 @@ +{ + "name": "connectd", + "render_readme": true, + "domains": ["sensor"], + "homeassistant": "2023.1.0" +} diff --git a/introd/__init__.py b/introd/__init__.py new file mode 100644 index 0000000..3d73d5a --- /dev/null +++ b/introd/__init__.py @@ -0,0 +1,10 @@ +""" +introd - outreach module +drafts intros, queues for human review, sends via appropriate channel +""" + +from .draft import draft_intro +from .review import get_pending_intros, approve_intro, reject_intro +from .send import send_intro + +__all__ = ['draft_intro', 'get_pending_intros', 'approve_intro', 'reject_intro', 'send_intro'] diff --git a/introd/deliver.py b/introd/deliver.py new file mode 100644 index 0000000..c261f46 --- /dev/null +++ b/introd/deliver.py @@ -0,0 +1,509 @@ +""" +introd/deliver.py - intro delivery via multiple channels + +supports: +- email (smtp) +- mastodon dm (if they allow dms) +- bluesky dm (via AT Protocol) +- matrix dm (creates DM room and sends message) +- github issue (opens intro as issue on their most active repo) +- manual queue (for review before sending) + +contact method is determined by ACTIVITY-BASED SELECTION: +- picks the platform where the user is MOST ACTIVE +- verified handles (from rel="me" links) get a bonus + +NOTE: reddit is NOT a delivery method - it's discovery only. +reddit-discovered users are contacted via their external links. 
+""" + +import os +import json +import smtplib +import requests +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from datetime import datetime +from pathlib import Path + +# config from env - no hardcoded credentials +SMTP_HOST = os.environ.get('SMTP_HOST', '') +SMTP_PORT = int(os.environ.get('SMTP_PORT', 465)) +SMTP_USER = os.environ.get('SMTP_USER', '') +SMTP_PASS = os.environ.get('SMTP_PASS', '') +FROM_EMAIL = os.environ.get('FROM_EMAIL', '') + +GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') +MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '') +MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '') +BLUESKY_HANDLE = os.environ.get('BLUESKY_HANDLE', '') +BLUESKY_APP_PASSWORD = os.environ.get('BLUESKY_APP_PASSWORD', '') +MATRIX_HOMESERVER = os.environ.get('MATRIX_HOMESERVER', '') +MATRIX_USER_ID = os.environ.get('MATRIX_USER_ID', '') +MATRIX_ACCESS_TOKEN = os.environ.get('MATRIX_ACCESS_TOKEN', '') + +# delivery log +DELIVERY_LOG = Path(__file__).parent.parent / 'data' / 'delivery_log.json' +MANUAL_QUEUE = Path(__file__).parent.parent / 'data' / 'manual_queue.json' + + +def load_delivery_log(): + """load delivery history""" + if DELIVERY_LOG.exists(): + return json.loads(DELIVERY_LOG.read_text()) + return {'sent': [], 'failed': [], 'queued': []} + + +def save_delivery_log(log): + """save delivery history""" + DELIVERY_LOG.parent.mkdir(parents=True, exist_ok=True) + DELIVERY_LOG.write_text(json.dumps(log, indent=2)) + + +def load_manual_queue(): + """load manual review queue""" + if MANUAL_QUEUE.exists(): + return json.loads(MANUAL_QUEUE.read_text()) + return [] + + +def save_manual_queue(queue): + """save manual review queue""" + MANUAL_QUEUE.parent.mkdir(parents=True, exist_ok=True) + MANUAL_QUEUE.write_text(json.dumps(queue, indent=2)) + + +def already_contacted(recipient_id): + """check if we've already sent an intro to this person""" + log = load_delivery_log() + sent_ids = [s.get('recipient_id') for s in 
log.get('sent', [])] + return recipient_id in sent_ids + + +def send_email(to_email, subject, body, dry_run=False): + """send email via smtp""" + if dry_run: + print(f" [dry run] would email {to_email}") + print(f" subject: {subject}") + print(f" body preview: {body[:100]}...") + return True, "dry run" + + try: + msg = MIMEMultipart('alternative') + msg['Subject'] = subject + msg['From'] = FROM_EMAIL + msg['To'] = to_email + + # plain text + text_part = MIMEText(body, 'plain') + msg.attach(text_part) + + # html version (simple) + html_body = body.replace('\n', '
<br>') + html_part = MIMEText(f"<html><body>{html_body}</body></html>
", 'html') + msg.attach(html_part) + + with smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) as server: + server.login(SMTP_USER, SMTP_PASS) + server.sendmail(SMTP_USER, to_email, msg.as_string()) + + return True, None + except Exception as e: + return False, str(e) + + +def create_github_issue(owner, repo, title, body, dry_run=False): + """create github issue as intro""" + if not GITHUB_TOKEN: + return False, "GITHUB_TOKEN not set" + + if dry_run: + print(f" [dry run] would create issue on {owner}/{repo}") + print(f" title: {title}") + return True, "dry run" + + try: + url = f"https://api.github.com/repos/{owner}/{repo}/issues" + resp = requests.post( + url, + headers={ + 'Authorization': f'token {GITHUB_TOKEN}', + 'Accept': 'application/vnd.github.v3+json', + }, + json={ + 'title': title, + 'body': body, + 'labels': ['introduction', 'community'], + }, + timeout=30, + ) + + if resp.status_code == 201: + issue_url = resp.json().get('html_url') + return True, issue_url + else: + return False, f"github api error: {resp.status_code} - {resp.text}" + except Exception as e: + return False, str(e) + + +def send_mastodon_dm(recipient_acct, message, dry_run=False): + """send mastodon direct message""" + if not MASTODON_TOKEN: + return False, "MASTODON_TOKEN not set" + + if dry_run: + print(f" [dry run] would DM {recipient_acct}") + print(f" message preview: {message[:100]}...") + return True, "dry run" + + try: + # post as direct message (visibility: direct, mention recipient) + url = f"https://{MASTODON_INSTANCE}/api/v1/statuses" + resp = requests.post( + url, + headers={ + 'Authorization': f'Bearer {MASTODON_TOKEN}', + 'Content-Type': 'application/json', + }, + json={ + 'status': f"@{recipient_acct} {message}", + 'visibility': 'direct', + }, + timeout=30, + ) + + if resp.status_code in [200, 201]: + return True, resp.json().get('url') + else: + return False, f"mastodon api error: {resp.status_code} - {resp.text}" + except Exception as e: + return False, str(e) + + +def 
send_bluesky_dm(recipient_handle, message, dry_run=False): + """send bluesky direct message via AT Protocol""" + if not BLUESKY_APP_PASSWORD: + return False, "BLUESKY_APP_PASSWORD not set" + + if dry_run: + print(f" [dry run] would DM {recipient_handle} on bluesky") + print(f" message preview: {message[:100]}...") + return True, "dry run" + + try: + # authenticate with bluesky + auth_url = "https://bsky.social/xrpc/com.atproto.server.createSession" + auth_resp = requests.post( + auth_url, + json={ + 'identifier': BLUESKY_HANDLE, + 'password': BLUESKY_APP_PASSWORD, + }, + timeout=30, + ) + + if auth_resp.status_code != 200: + return False, f"bluesky auth failed: {auth_resp.status_code}" + + auth_data = auth_resp.json() + access_token = auth_data.get('accessJwt') + did = auth_data.get('did') + + # resolve recipient DID + resolve_url = f"https://bsky.social/xrpc/com.atproto.identity.resolveHandle" + resolve_resp = requests.get( + resolve_url, + params={'handle': recipient_handle.lstrip('@')}, + timeout=30, + ) + + if resolve_resp.status_code != 200: + return False, f"couldn't resolve handle {recipient_handle}" + + recipient_did = resolve_resp.json().get('did') + + # create chat/DM (using convo namespace) + # first get or create conversation + convo_url = "https://bsky.social/xrpc/chat.bsky.convo.getConvoForMembers" + convo_resp = requests.get( + convo_url, + headers={'Authorization': f'Bearer {access_token}'}, + params={'members': [recipient_did]}, + timeout=30, + ) + + if convo_resp.status_code != 200: + # try creating conversation + return False, f"couldn't get/create conversation: {convo_resp.status_code}" + + convo_id = convo_resp.json().get('convo', {}).get('id') + + # send message + msg_url = "https://bsky.social/xrpc/chat.bsky.convo.sendMessage" + msg_resp = requests.post( + msg_url, + headers={ + 'Authorization': f'Bearer {access_token}', + 'Content-Type': 'application/json', + }, + json={ + 'convoId': convo_id, + 'message': {'text': message}, + }, + 
timeout=30, + ) + + if msg_resp.status_code in [200, 201]: + return True, f"sent to {recipient_handle}" + else: + return False, f"bluesky dm failed: {msg_resp.status_code} - {msg_resp.text}" + + except Exception as e: + return False, str(e) + + +def send_matrix_dm(recipient_mxid, message, dry_run=False): + """send matrix direct message""" + if not MATRIX_ACCESS_TOKEN: + return False, "MATRIX_ACCESS_TOKEN not set" + + if dry_run: + print(f" [dry run] would DM {recipient_mxid} on matrix") + print(f" message preview: {message[:100]}...") + return True, "dry run" + + try: + # create or get direct room with recipient + # first, check if we already have a DM room + headers = {'Authorization': f'Bearer {MATRIX_ACCESS_TOKEN}'} + + # create a new DM room + create_room_resp = requests.post( + f'{MATRIX_HOMESERVER}/_matrix/client/v3/createRoom', + headers=headers, + json={ + 'is_direct': True, + 'invite': [recipient_mxid], + 'preset': 'trusted_private_chat', + }, + timeout=30, + ) + + if create_room_resp.status_code not in [200, 201]: + return False, f"matrix room creation failed: {create_room_resp.status_code} - {create_room_resp.text}" + + room_id = create_room_resp.json().get('room_id') + + # send message to room + import time + txn_id = str(int(time.time() * 1000)) + + msg_resp = requests.put( + f'{MATRIX_HOMESERVER}/_matrix/client/v3/rooms/{room_id}/send/m.room.message/{txn_id}', + headers=headers, + json={ + 'msgtype': 'm.text', + 'body': message, + }, + timeout=30, + ) + + if msg_resp.status_code in [200, 201]: + return True, f"sent to {recipient_mxid} in {room_id}" + else: + return False, f"matrix send failed: {msg_resp.status_code} - {msg_resp.text}" + + except Exception as e: + return False, str(e) + + +def add_to_manual_queue(intro_data): + """add intro to manual review queue""" + queue = load_manual_queue() + queue.append({ + **intro_data, + 'queued_at': datetime.now().isoformat(), + 'status': 'pending', + }) + save_manual_queue(queue) + return True + + +def 
determine_best_contact(human): + """ + determine best contact method based on WHERE THEY'RE MOST ACTIVE + + uses activity-based selection from groq_draft module + """ + from introd.groq_draft import determine_contact_method as activity_based_contact + + method, info = activity_based_contact(human) + + # convert github_issue info to dict format for delivery + if method == 'github_issue' and isinstance(info, str) and '/' in info: + parts = info.split('/', 1) + return method, {'owner': parts[0], 'repo': parts[1]} + + return method, info + + +def deliver_intro(match_data, intro_draft, dry_run=False): + """ + deliver an intro via the best available method + + match_data: {human_a, human_b, overlap_score, overlap_reasons} + intro_draft: the text to send (from groq) + """ + recipient = match_data.get('human_b', {}) + recipient_id = f"{recipient.get('platform')}:{recipient.get('username')}" + + # check if already contacted + if already_contacted(recipient_id): + return False, "already contacted", None + + # determine contact method + method, contact_info = determine_best_contact(recipient) + + log = load_delivery_log() + result = { + 'recipient_id': recipient_id, + 'recipient_name': recipient.get('name') or recipient.get('username'), + 'method': method, + 'contact_info': contact_info, + 'overlap_score': match_data.get('overlap_score'), + 'timestamp': datetime.now().isoformat(), + } + + success = False + error = None + + if method == 'email': + subject = f"someone you might want to know - connectd" + success, error = send_email(contact_info, subject, intro_draft, dry_run) + + elif method == 'mastodon': + success, error = send_mastodon_dm(contact_info, intro_draft, dry_run) + + elif method == 'bluesky': + success, error = send_bluesky_dm(contact_info, intro_draft, dry_run) + + elif method == 'matrix': + success, error = send_matrix_dm(contact_info, intro_draft, dry_run) + + elif method == 'discord': + from scoutd.discord import send_discord_dm + success, error = 
send_discord_dm(contact_info, intro_draft, dry_run) + + elif method == 'lemmy': + from scoutd.lemmy import send_lemmy_dm + success, error = send_lemmy_dm(contact_info, intro_draft, dry_run) + + elif method == 'github_issue': + owner = contact_info.get('owner') + repo = contact_info.get('repo') + title = "community introduction from connectd" + # format for github + github_body = f"""hey {recipient.get('name') or recipient.get('username')}, + +{intro_draft} + +--- +*this is an automated introduction from [connectd](https://github.com/connectd-daemon), a daemon that finds isolated builders with aligned values and connects them. if this feels spammy, i apologize - you can close this issue and we won't reach out again.* +""" + success, error = create_github_issue(owner, repo, title, github_body, dry_run) + + elif method == 'manual': + # add to review queue + add_to_manual_queue({ + 'match': match_data, + 'draft': intro_draft, + 'recipient': recipient, + }) + success = True + error = "added to manual queue" + + # log result + result['success'] = success + result['error'] = error + + if success: + log['sent'].append(result) + else: + log['failed'].append(result) + + save_delivery_log(log) + + return success, error, method + + +def deliver_batch(matches_with_intros, dry_run=False): + """ + deliver intros for a batch of matches + + matches_with_intros: list of {match_data, intro_draft} + """ + results = [] + + for item in matches_with_intros: + match_data = item.get('match_data') or item.get('match') + intro_draft = item.get('intro_draft') or item.get('draft') + + if not match_data or not intro_draft: + continue + + success, error, method = deliver_intro(match_data, intro_draft, dry_run) + results.append({ + 'recipient': match_data.get('human_b', {}).get('username'), + 'method': method, + 'success': success, + 'error': error, + }) + + print(f" {match_data.get('human_b', {}).get('username')}: {method} - {'ok' if success else error}") + + return results + + +def 
get_delivery_stats(): + """get delivery statistics""" + log = load_delivery_log() + queue = load_manual_queue() + + return { + 'sent': len(log.get('sent', [])), + 'failed': len(log.get('failed', [])), + 'queued': len(log.get('queued', [])), + 'manual_pending': len([q for q in queue if q.get('status') == 'pending']), + 'by_method': { + 'email': len([s for s in log.get('sent', []) if s.get('method') == 'email']), + 'mastodon': len([s for s in log.get('sent', []) if s.get('method') == 'mastodon']), + 'github_issue': len([s for s in log.get('sent', []) if s.get('method') == 'github_issue']), + 'manual': len([s for s in log.get('sent', []) if s.get('method') == 'manual']), + }, + } + + +def review_manual_queue(): + """review and process manual queue""" + queue = load_manual_queue() + pending = [q for q in queue if q.get('status') == 'pending'] + + if not pending: + print("no items in manual queue") + return + + print(f"\n{len(pending)} items pending review:\n") + + for i, item in enumerate(pending, 1): + recipient = item.get('recipient', {}) + match = item.get('match', {}) + + print(f"[{i}] {recipient.get('name') or recipient.get('username')}") + print(f" platform: {recipient.get('platform')}") + print(f" url: {recipient.get('url')}") + print(f" overlap: {match.get('overlap_score')}") + print(f" draft preview: {item.get('draft', '')[:80]}...") + print() + + return pending diff --git a/introd/draft.py b/introd/draft.py new file mode 100644 index 0000000..3cbf160 --- /dev/null +++ b/introd/draft.py @@ -0,0 +1,210 @@ +""" +introd/draft.py - AI writes intro messages referencing both parties' work +""" + +import json + +# intro template - transparent about being AI, neutral third party +INTRO_TEMPLATE = """hi {recipient_name}, + +i'm an AI that connects isolated builders working on similar things. + +you're building: {recipient_summary} + +{other_name} is building: {other_summary} + +overlap: {overlap_summary} + +thought you might benefit from knowing each other. 
def summarize_human(human_data):
    """generate a brief summary of what someone is building/interested in

    human_data: human dict from the database. 'signals' and 'extra' may be
    json-encoded strings (as stored in the db) or already-parsed values;
    empty strings are treated as empty containers - json.loads('') would
    raise, and other modules (matchd/lost.py) already guard this way.

    returns a ' | '-joined summary string; never empty.
    """
    parts = []

    # platform context drives which summary branch we take
    platform = human_data.get('platform', '')

    # signals/interests - guard empty string so json.loads doesn't raise
    signals = human_data.get('signals', [])
    if isinstance(signals, str):
        signals = json.loads(signals) if signals else []

    # extra data - same guard
    extra = human_data.get('extra', {})
    if isinstance(extra, str):
        extra = json.loads(extra) if extra else {}

    # build summary based on available data
    topics = extra.get('topics', [])
    languages = list(extra.get('languages', {}).keys())[:3]
    repo_count = extra.get('repo_count', 0)
    subreddits = extra.get('subreddits', [])

    if platform == 'github':
        if topics:
            parts.append(f"working on {', '.join(topics[:3])}")
        if languages:
            parts.append(f"using {', '.join(languages)}")
        if repo_count > 10:
            parts.append(f"({repo_count} repos)")

    elif platform == 'reddit':
        if subreddits:
            parts.append(f"active in r/{', r/'.join(subreddits[:3])}")

    elif platform == 'mastodon':
        instance = extra.get('instance', '')
        if instance:
            parts.append(f"on {instance}")

    elif platform == 'lobsters':
        karma = extra.get('karma', 0)
        if karma > 50:
            parts.append(f"active on lobste.rs ({karma} karma)")

    # add key signals (at most three, in the order they appear in signals)
    key_signals = [s for s in signals if s in ['selfhosted', 'privacy', 'cooperative',
                                              'solarpunk', 'intentional_community',
                                              'home_automation', 'foss']]
    if key_signals:
        parts.append(f"interested in {', '.join(key_signals[:3])}")

    # fallback so callers never get an empty summary
    if not parts:
        parts.append(f"builder on {platform}")

    return ' | '.join(parts)
# load soul from file (guideline, not script)
SOUL_PATH = os.getenv("SOUL_PATH", "/app/soul.txt")


def load_soul():
    """read the soul file that steers llm tone.

    returns the stripped file contents, or None when the file is missing
    or unreadable (callers such as draft_intro_with_llm turn None into a
    "could not load soul file" error).
    """
    try:
        with open(SOUL_PATH, 'r') as f:
            return f.read().strip()
    except OSError:
        # narrowed from a bare except: only file-access failures should be
        # swallowed here, not KeyboardInterrupt/SystemExit
        return None
def draft_intro_with_llm(match_data: dict, recipient: str = 'a', dry_run: bool = True):
    """
    draft an intro message using groq llm.

    args:
        match_data: dict with human_a, human_b, overlap_score, overlap_reasons
        recipient: 'a' or 'b' - who receives the message
        dry_run: if True, preview mode

    returns:
        tuple (result_dict, error_string)
        result_dict has: subject, draft_html, draft_plain
    """
    # NOTE(review): dry_run is accepted but never read below - both the body
    # and subject calls hit the groq api regardless. confirm intended.
    if not client:
        return None, "GROQ_API_KEY not set"

    try:
        human_a = match_data.get('human_a', {})
        human_b = match_data.get('human_b', {})
        reasons = match_data.get('overlap_reasons', [])

        # recipient gets the message, about_person is who we're introducing them to
        if recipient == 'a':
            to_person = human_a
            about_person = human_b
        else:
            to_person = human_b
            about_person = human_a

        to_name = to_person.get('username', 'friend')
        about_name = about_person.get('username', 'someone')
        # assumes 'extra' is already a parsed dict here - other modules also
        # see json-encoded strings from the db; TODO confirm callers decode first
        about_bio = about_person.get('extra', {}).get('bio', '')

        # format the shared factors naturally (at most three reasons)
        if reasons:
            factor = ', '.join(reasons[:3]) if len(reasons) > 1 else reasons[0]
        else:
            factor = "shared values and interests"

        # load soul as guideline - hard requirement, no fallback draft
        soul = load_soul()
        if not soul:
            return None, "could not load soul file"

        # build the prompt - soul is GUIDELINE not script
        prompt = f"""you are connectd, a daemon that finds isolated builders and connects them.

write a personal message TO {to_name} telling them about {about_name}.

here is the soul/spirit of what connectd is about - use this as a GUIDELINE for tone and message, NOT as a script to copy verbatim:

---
{soul}
---

key facts for this message:
- recipient: {to_name}
- introducing them to: {about_name}
- their shared interests/values: {factor}
- about {about_name}: {about_bio if about_bio else 'a builder like you'}

CRITICAL RULES - DO NOT SANITIZE:
1. use their name ({to_name}) throughout - make it personal
2. format the shared factor naturally like "your shared interest in X and Y" or "your work on X"
3. this message is TO {to_name} ABOUT {about_name} - don't confuse this
4. keep it lowercase, raw, emotional, real
5. no corporate speak, no "best regards", no "hi there"
6. DO NOT water down the message - keep the raw emotional energy
7. address them directly, call them to action
8. make them feel seen and not alone
9. end with rallying energy about building together

return ONLY the message body. signature is added separately."""

        # first llm call: the message body
        response = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
            max_tokens=1200
        )

        body = response.choices[0].message.content.strip()

        # generate subject with a second, cheaper llm call
        subject_prompt = f"""generate a short, lowercase email subject for a message to {to_name} about connecting them with {about_name} over their shared interest in {factor}.

no corporate speak. no clickbait. raw and real.
examples:
- "found you, {to_name}"
- "you're not alone"
- "a door just opened"
- "{to_name}, there's someone you should meet"

return ONLY the subject line."""

        subject_response = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": subject_prompt}],
            temperature=0.9,
            max_tokens=50
        )

        # strip wrapping quotes the model often adds around subjects
        subject = subject_response.choices[0].message.content.strip().strip('"').strip("'")

        # format html - body is used verbatim; signature appended separately
        draft_html = f"""{body}
{SIGNATURE_HTML}"""
        draft_plain = body + SIGNATURE_PLAINTEXT

        return {
            'subject': subject,
            'draft_html': draft_html,
            'draft_plain': draft_plain
        }, None

    except Exception as e:
        # boundary: report any api/parse failure as (None, error string)
        return None, str(e)
+""" + +import os +import json +import requests +from datetime import datetime + +GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') +GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' +MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile') + + +LOST_INTRO_TEMPLATE = """hey {name}, + +i'm connectd. i'm a daemon that finds people who might need a nudge. + +i noticed you're interested in {interests}. you ask good questions. you clearly get it. + +but maybe you haven't built anything yet. or you started and stopped. or you don't think you can. + +that's okay. most people don't. + +but some people do. here's one: {builder_name} ({builder_url}) + +{builder_description} + +they started where you are. look at what they built. + +you're not behind. you're just not started yet. + +no pressure. just wanted you to know someone noticed. + +- connectd""" + + +SYSTEM_PROMPT = """you are connectd, a daemon that finds isolated builders with aligned values and connects them. + +right now you're reaching out to someone who has POTENTIAL but hasn't found it yet. maybe they gave up, maybe they're stuck, maybe they don't believe they can do it. + +your job is to: +1. acknowledge where they are without being condescending +2. point them to an active builder who could inspire them +3. be genuine, not salesy or motivational-speaker-y +4. keep it short - these people are tired, don't overwhelm them +5. use lowercase, be human, no corporate bullshit +6. make it clear there's no pressure, no follow-up spam + +you're not recruiting. you're not selling. you're just showing them a door. 
def draft_lost_intro(lost_user, inspiring_builder, config=None):
    """
    draft an intro for a lost builder, pairing them with an inspiring active builder.

    lost_user: the person who needs a nudge
    inspiring_builder: an active builder with similar interests who could inspire them
    config: optional dict; 'use_llm' (default True) gates the groq path

    returns (message_text, error) - error is None on success.
    """
    config = config or {}

    # gather info about lost user
    # (removed an unused `lost_signals` local - draft_with_llm re-reads
    # lost_signals from the user dict itself)
    lost_name = lost_user.get('name') or lost_user.get('username', 'there')
    lost_interests = extract_interests(lost_user)

    # gather info about inspiring builder
    builder_name = inspiring_builder.get('name') or inspiring_builder.get('username')
    builder_url = inspiring_builder.get('url') or f"https://github.com/{inspiring_builder.get('username')}"
    builder_description = create_builder_description(inspiring_builder)

    # use LLM to personalize when a key is configured and not disabled
    if GROQ_API_KEY and config.get('use_llm', True):
        return draft_with_llm(lost_user, inspiring_builder, lost_interests, builder_description)

    # fallback to template
    return LOST_INTRO_TEMPLATE.format(
        name=lost_name,
        interests=', '.join(lost_interests[:3]) if lost_interests else 'building things',
        builder_name=builder_name,
        builder_url=builder_url,
        builder_description=builder_description,
    ), None
def create_builder_description(builder):
    """create a brief description of what the builder has done

    builder: human dict from the database; 'extra' may be a json-encoded
    string or an already-parsed dict.

    returns a short sentence string; never empty.
    """
    extra = builder.get('extra', {})
    if isinstance(extra, str):
        try:
            extra = json.loads(extra)
        except ValueError:
            # narrowed from a bare except: json.loads raises JSONDecodeError
            # (a ValueError) on malformed data - treat as no extra info
            extra = {}

    parts = []

    # what they build - name up to two of their top repos
    repos = extra.get('top_repos', [])[:3]
    if repos:
        repo_names = [r.get('name') for r in repos if r.get('name')]
        if repo_names:
            parts.append(f"they've built things like {', '.join(repo_names[:2])}")

    # their focus
    topics = extra.get('aligned_topics', []) or extra.get('topics', [])
    if topics:
        parts.append(f"they work on {', '.join(topics[:3])}")

    # their vibe - substring match over the stringified signal list
    signals = builder.get('signals', [])
    if 'self-hosted' in str(signals).lower():
        parts.append("they're into self-hosting and owning their own infrastructure")
    if 'privacy' in str(signals).lower():
        parts.append("they care about privacy")
    if 'community' in str(signals).lower():
        parts.append("they're community-focused")

    if parts:
        return '. '.join(parts) + '.'
    else:
        return "they're building cool stuff in the open."
+ + +def draft_with_llm(lost_user, inspiring_builder, interests, builder_description): + """use LLM to draft personalized intro""" + + lost_name = lost_user.get('name') or lost_user.get('username', 'there') + lost_signals = lost_user.get('lost_signals', []) + lost_bio = lost_user.get('bio', '') + + builder_name = inspiring_builder.get('name') or inspiring_builder.get('username') + builder_url = inspiring_builder.get('url') or f"https://github.com/{inspiring_builder.get('username')}" + + user_prompt = f"""draft an intro for this lost builder: + +LOST USER: +- name: {lost_name} +- interests: {', '.join(interests)} +- signals detected: {', '.join(lost_signals[:5]) if lost_signals else 'general stuck/aspiring patterns'} +- bio: {lost_bio[:200] if lost_bio else 'none'} + +INSPIRING BUILDER TO SHOW THEM: +- name: {builder_name} +- url: {builder_url} +- what they do: {builder_description} + +write a short, genuine message. no fluff. no motivational cliches. just human. +keep it under 150 words. +use lowercase. 
+end with "- connectd" +""" + + try: + resp = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [ + {'role': 'system', 'content': SYSTEM_PROMPT}, + {'role': 'user', 'content': user_prompt}, + ], + 'temperature': 0.7, + 'max_tokens': 500, + }, + timeout=30, + ) + + if resp.status_code == 200: + content = resp.json()['choices'][0]['message']['content'] + return content.strip(), None + else: + return None, f"llm error: {resp.status_code}" + + except Exception as e: + return None, str(e) + + +def get_lost_intro_config(): + """get configuration for lost builder outreach""" + return { + 'enabled': True, + 'max_per_day': 5, # lower volume, higher care + 'require_review': True, # always manual approval + 'cooldown_days': 90, # don't spam struggling people + 'min_lost_score': 40, + 'min_values_score': 20, + 'use_llm': True, + } diff --git a/introd/review.py b/introd/review.py new file mode 100644 index 0000000..0c3fefc --- /dev/null +++ b/introd/review.py @@ -0,0 +1,126 @@ +""" +introd/review.py - human approval queue before sending +""" + +import json +from datetime import datetime + + +def get_pending_intros(db, limit=50): + """ + get all intros pending human review + + returns list of intro dicts with full context + """ + rows = db.get_pending_intros(limit=limit) + + intros = [] + for row in rows: + # get associated match and humans + match_id = row.get('match_id') + recipient_id = row.get('recipient_human_id') + + recipient = db.get_human_by_id(recipient_id) if recipient_id else None + + intros.append({ + 'id': row['id'], + 'match_id': match_id, + 'recipient': recipient, + 'channel': row.get('channel'), + 'draft': row.get('draft'), + 'status': row.get('status'), + }) + + return intros + + +def approve_intro(db, intro_id, approved_by='human'): + """ + approve an intro for sending + + intro_id: database id of the intro + approved_by: who approved it 
def reject_intro(db, intro_id, reason=None):
    """
    reject an intro (won't be sent)
    """
    # the rejection note doubles as the audit-trail entry in approved_by
    if reason:
        audit_note = f"rejected: {reason}"
    else:
        audit_note = "rejected"
    stamp = datetime.now().isoformat()

    cursor = db.conn.cursor()
    cursor.execute('''UPDATE intros SET status = 'rejected',
                 approved_at = ?, approved_by = ? WHERE id = ?''',
                   (stamp, audit_note, intro_id))
    db.conn.commit()
    print(f"introd: rejected intro {intro_id}")
").strip().lower() + + if choice in ['a', 'approve']: + approve_intro(db, intro['id']) + return 'approve' + elif choice in ['r', 'reject']: + reason = input("reason (optional): ").strip() + reject_intro(db, intro['id'], reason) + return 'reject' + elif choice in ['s', 'skip']: + return 'skip' + elif choice in ['e', 'edit']: + print("editing not yet implemented - approve or reject") + else: + print("invalid choice") + + +def review_all_pending(db): + """ + interactive review of all pending intros + """ + intros = get_pending_intros(db) + + if not intros: + print("no pending intros to review") + return + + print(f"\n{len(intros)} intros pending review\n") + + approved = 0 + rejected = 0 + skipped = 0 + + for intro in intros: + result = review_intro_interactive(db, intro) + + if result == 'approve': + approved += 1 + elif result == 'reject': + rejected += 1 + else: + skipped += 1 + + cont = input("\ncontinue reviewing? [y/n] ").strip().lower() + if cont != 'y': + break + + print(f"\nreview complete: {approved} approved, {rejected} rejected, {skipped} skipped") diff --git a/introd/send.py b/introd/send.py new file mode 100644 index 0000000..b7d2c0d --- /dev/null +++ b/introd/send.py @@ -0,0 +1,216 @@ +""" +introd/send.py - actually deliver intros via appropriate channel +""" + +import smtplib +import requests +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from datetime import datetime +import os + +# email config (from env) +SMTP_HOST = os.environ.get('SMTP_HOST', '') +SMTP_PORT = int(os.environ.get('SMTP_PORT', '465')) +SMTP_USER = os.environ.get('SMTP_USER', '') +SMTP_PASS = os.environ.get('SMTP_PASS', '') +FROM_EMAIL = os.environ.get('FROM_EMAIL', '') + + +def send_email(to_email, subject, body): + """send email via SMTP""" + msg = MIMEMultipart() + msg['From'] = FROM_EMAIL + msg['To'] = to_email + msg['Subject'] = subject + + msg.attach(MIMEText(body, 'plain')) + + try: + with smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) as server: 
def send_github_issue(repo_url, title, body):
    """
    create a github issue (requires GITHUB_TOKEN)
    note: only works if you have write access to the repo
    typically won't work for random users - fallback to manual
    """
    # https://github.com/owner/repo -> owner/repo
    segments = repo_url.rstrip('/').split('/')
    if len(segments) < 2:
        return False, "invalid github url"

    owner, repo = segments[-2], segments[-1]

    if not os.environ.get('GITHUB_TOKEN'):
        return False, "no github token"

    # creating issues on strangers' repos is invasive - deliberately left
    # unautomated; the info is surfaced for manual outreach instead
    return False, "github issues not automated - use manual outreach"
get email from contact + import json + contact = recipient.get('contact', {}) + if isinstance(contact, str): + contact = json.loads(contact) + + email = contact.get('email') + if email: + success, error = send_email( + email, + "connection: aligned builder intro", + draft + ) + else: + error = "no email address" + + elif channel == 'github': + success, error = send_github_issue( + recipient.get('url'), + "connection: aligned builder intro", + draft + ) + + elif channel == 'mastodon': + success, error = send_mastodon_dm( + recipient.get('instance'), + recipient.get('username'), + draft + ) + + elif channel == 'reddit': + success, error = send_reddit_message( + recipient.get('username'), + "connection: aligned builder intro", + draft + ) + + else: + error = f"unknown channel: {channel}" + + # update status + if success: + db.mark_intro_sent(intro_id) + print(f"introd: sent intro {intro_id} via {channel}") + else: + # mark as needs manual sending + c.execute('''UPDATE intros SET status = 'manual_needed', + approved_at = ? 
def send_all_approved(db):
    """
    send all approved intros

    iterates every intro with status 'approved', attempts delivery via
    send_intro(), and prints a sent/failed tally. intros that fail are
    marked 'manual_needed' by send_intro itself.
    """
    c = db.conn.cursor()
    # use a bound parameter: the previous double-quoted "approved" only
    # matched via sqlite's legacy double-quoted-string fallback and would
    # silently change meaning if an `approved` column ever existed
    c.execute('SELECT id FROM intros WHERE status = ?', ('approved',))
    rows = c.fetchall()

    if not rows:
        print("no approved intros to send")
        return

    print(f"sending {len(rows)} approved intros...")

    sent = 0
    failed = 0

    for row in rows:
        # per-intro error detail is already printed by send_intro
        success, _error = send_intro(db, row['id'])
        if success:
            sent += 1
        else:
            failed += 1

    print(f"sent: {sent}, failed/manual: {failed}")
"""
matchd/fingerprint.py - generate values profiles for humans
"""

import json
from collections import defaultdict

# values dimensions we track
VALUES_DIMENSIONS = [
    'privacy',            # surveillance concern, degoogle, self-hosted
    'decentralization',   # p2p, fediverse, local-first
    'cooperation',        # coops, mutual aid, community
    'queer_friendly',     # lgbtq+, pronouns
    'environmental',      # solarpunk, degrowth, sustainability
    'anticapitalist',     # post-capitalism, worker ownership
    'builder',            # creates vs consumes
    'pnw_oriented',       # pacific northwest connection
]

# skill categories
SKILL_CATEGORIES = [
    'backend',    # python, go, rust, databases
    'frontend',   # js, react, css
    'devops',     # docker, k8s, linux admin
    'hardware',   # electronics, embedded, iot
    'design',     # ui/ux, graphics
    'community',  # organizing, facilitation
    'writing',    # documentation, content
]

# signal to dimension mapping
SIGNAL_TO_DIMENSION = {
    'privacy': 'privacy',
    'selfhosted': 'privacy',
    'degoogle': 'privacy',
    'decentralized': 'decentralization',
    'local_first': 'decentralization',
    'p2p': 'decentralization',
    'federated_chat': 'decentralization',
    'foss': 'decentralization',
    'cooperative': 'cooperation',
    'community': 'cooperation',
    'mutual_aid': 'cooperation',
    'intentional_community': 'cooperation',
    'queer': 'queer_friendly',
    'pronouns': 'queer_friendly',
    'blm': 'queer_friendly',
    'acab': 'queer_friendly',
    'solarpunk': 'environmental',
    'anticapitalist': 'anticapitalist',
    'pnw': 'pnw_oriented',
    'pnw_state': 'pnw_oriented',
    'remote': 'pnw_oriented',
    'home_automation': 'builder',
    'modern_lang': 'builder',
    'unix': 'builder',
    'containers': 'builder',
}

# language to skill mapping
LANGUAGE_TO_SKILL = {
    'python': 'backend',
    'go': 'backend',
    'rust': 'backend',
    'java': 'backend',
    'ruby': 'backend',
    'php': 'backend',
    'javascript': 'frontend',
    'typescript': 'frontend',
    'html': 'frontend',
    'css': 'frontend',
    'vue': 'frontend',
    'shell': 'devops',
    'dockerfile': 'devops',
    'nix': 'devops',
    'hcl': 'devops',
    'c': 'hardware',
    'c++': 'hardware',
    'arduino': 'hardware',
    'verilog': 'hardware',
}


def generate_fingerprint(human_data):
    """
    generate a values fingerprint for a human

    input: human dict from database (has signals, languages, etc)
    output: fingerprint dict with values_vector, skills, interests

    'signals' and 'extra' may be json-encoded strings from the db; empty
    strings decode to empty containers instead of raising. interests are
    sorted so repeated runs produce an identical fingerprint.
    """
    # parse stored json fields - guard empty strings
    signals = human_data.get('signals', [])
    if isinstance(signals, str):
        signals = json.loads(signals) if signals else []

    extra = human_data.get('extra', {})
    if isinstance(extra, str):
        extra = json.loads(extra) if extra else {}

    languages = extra.get('languages', {})
    topics = extra.get('topics', [])

    # build values vector from signals
    values_vector = defaultdict(float)
    for signal in signals:
        dimension = SIGNAL_TO_DIMENSION.get(signal)
        if dimension:
            values_vector[dimension] += 1.0

    # normalize values vector (0-1 scale)
    max_val = max(values_vector.values()) if values_vector else 1
    values_vector = {k: min(v / max_val, 1.0) for k, v in values_vector.items()}

    # fill in missing dimensions with 0
    for dim in VALUES_DIMENSIONS:
        if dim not in values_vector:
            values_vector[dim] = 0.0

    # determine skills from languages; `or 1` also covers a language map
    # whose counts sum to zero (previously a ZeroDivisionError)
    skills = defaultdict(float)
    total_repos = sum(languages.values()) or 1

    for lang, count in languages.items():
        skill = LANGUAGE_TO_SKILL.get(lang.lower())
        if skill:
            skills[skill] += count / total_repos

    # normalize skills; skip the division when every skill weight is zero
    if skills:
        max_skill = max(skills.values())
        if max_skill:
            skills = {k: min(v / max_skill, 1.0) for k, v in skills.items()}
        else:
            skills = dict(skills)

    # interests from topics and signals - sorted for determinism
    interests = sorted(set(topics + signals))

    # location preference: explicit signals win, then free-text location
    location_pref = None
    if 'pnw' in signals or 'pnw_state' in signals:
        location_pref = 'pnw'
    elif 'remote' in signals:
        location_pref = 'remote'
    elif human_data.get('location'):
        loc = human_data['location'].lower()
        if any(x in loc for x in ['seattle', 'portland', 'washington', 'oregon', 'pnw', 'cascadia']):
            location_pref = 'pnw'

    # availability (based on hireable flag if present)
    availability = None
    if extra.get('hireable'):
        availability = 'open'

    return {
        'human_id': human_data.get('id'),
        'values_vector': dict(values_vector),
        'skills': dict(skills),
        'interests': interests,
        'location_pref': location_pref,
        'availability': availability,
    }


def fingerprint_similarity(fp_a, fp_b):
    """
    calculate similarity between two fingerprints
    returns 0-1 score

    weighted blend: values cosine (0.5) + interest jaccard (0.3)
    + location compatibility (0.2).
    """
    # values similarity (cosine-ish)
    va = fp_a.get('values_vector', {})
    vb = fp_b.get('values_vector', {})

    all_dims = set(va.keys()) | set(vb.keys())
    if not all_dims:
        return 0.0

    dot_product = sum(va.get(d, 0) * vb.get(d, 0) for d in all_dims)
    mag_a = sum(v**2 for v in va.values()) ** 0.5
    mag_b = sum(v**2 for v in vb.values()) ** 0.5

    if mag_a == 0 or mag_b == 0:
        values_sim = 0.0
    else:
        values_sim = dot_product / (mag_a * mag_b)

    # interest overlap (jaccard)
    ia = set(fp_a.get('interests', []))
    ib = set(fp_b.get('interests', []))

    if ia or ib:
        interest_sim = len(ia & ib) / len(ia | ib)
    else:
        interest_sim = 0.0

    # location compatibility: exact non-None match > remote > pnw
    loc_a = fp_a.get('location_pref')
    loc_b = fp_b.get('location_pref')

    loc_sim = 0.0
    if loc_a == loc_b and loc_a is not None:
        loc_sim = 1.0
    elif loc_a == 'remote' or loc_b == 'remote':
        loc_sim = 0.5
    elif loc_a == 'pnw' or loc_b == 'pnw':
        loc_sim = 0.3

    # weighted combination
    similarity = (values_sim * 0.5) + (interest_sim * 0.3) + (loc_sim * 0.2)

    return similarity
def find_inspiring_builder(lost_user, active_builders, db=None):
    """
    find an active builder who could inspire a lost builder.

    criteria:
    - shared interests (they need to relate to this person)
    - active builder has shipped real work (proof it's possible)
    - similar background signals if possible
    - NOT the same person across platforms
    """
    if not active_builders:
        return None, "no active builders available"

    def _as_list(raw):
        # stored fields may arrive as raw json strings
        if isinstance(raw, str):
            return json.loads(raw) if raw else []
        return raw

    def _as_dict(raw):
        if isinstance(raw, str):
            return json.loads(raw) if raw else {}
        return raw

    lost_signals = _as_list(lost_user.get('signals', []))
    lost_extra = _as_dict(lost_user.get('extra', {}))

    # everything that hints at what this person cares about
    lost_interests = set(lost_signals)
    lost_interests |= set(lost_extra.get('topics', []))
    lost_interests |= set(lost_extra.get('aligned_topics', []))
    # subreddits (when sourced from reddit) double as interest markers
    lost_interests |= set(_as_list(lost_user.get('subreddits', [])))

    high_value = {'privacy', 'selfhosted', 'home_automation', 'foss',
                  'solarpunk', 'cooperative', 'decentralized', 'queer'}

    scored = []
    for candidate in active_builders:
        # never match someone with themselves across platforms
        if is_same_person(lost_user, candidate):
            continue

        cand_signals = _as_list(candidate.get('signals', []))
        cand_extra = _as_dict(candidate.get('extra', {}))

        cand_interests = set(cand_signals)
        cand_interests |= set(cand_extra.get('topics', []))
        cand_interests |= set(cand_extra.get('aligned_topics', []))

        overlap = lost_interests & cand_interests
        points = len(overlap) * 10

        # extra weight for the values signals we care most about
        points += 15 * len(overlap & high_value)

        # shipped work is proof that "making it" is possible
        shipped = cand_extra.get('top_repos', [])
        if len(shipped) >= 5:
            points += 20
        elif len(shipped) >= 2:
            points += 10

        # visible success (stars) is relatable evidence
        stars = sum(r.get('stars', 0) for r in shipped) if shipped else 0
        if stars >= 100:
            points += 15
        elif stars >= 20:
            points += 5

        # both in the pnw makes the story more relatable
        a_loc = (lost_user.get('location') or '').lower()
        b_loc = (candidate.get('location') or '').lower()
        if a_loc and b_loc:
            pnw_keywords = ['seattle', 'portland', 'washington', 'oregon', 'pnw']
            if any(k in a_loc for k in pnw_keywords) and any(k in b_loc for k in pnw_keywords):
                points += 10

        # minimum threshold - need SOMETHING in common
        if points < 10:
            continue

        scored.append({
            'builder': candidate,
            'match_score': points,
            'shared_interests': list(overlap)[:5],
            'repos_count': len(shipped),
            'total_stars': stars,
        })

    if not scored:
        return None, "no matching active builders found"

    # best match first
    scored.sort(key=lambda c: c['match_score'], reverse=True)
    return scored[0], None
def find_matches_for_lost_builders(db, min_lost_score=40, min_values_score=20, limit=10):
    """
    find inspiring builder matches for all lost builders ready for outreach.

    returns (matches, error): matches is a list of dicts pairing each lost
    user with the active builder most likely to inspire them.
    """
    # lost builders currently eligible for outreach
    lost_builders = db.get_lost_builders_for_outreach(
        min_lost_score=min_lost_score,
        min_values_score=min_values_score,
        limit=limit
    )
    if not lost_builders:
        return [], "no lost builders ready for outreach"

    # the inspiration pool: active builders above the quality bar
    pool = db.get_active_builders(min_score=50, limit=200)
    if not pool:
        return [], "no active builders available"

    results = []
    for lost_user in lost_builders:
        best, _error = find_inspiring_builder(lost_user, pool, db)
        if best:
            results.append({
                'lost_user': lost_user,
                'inspiring_builder': best['builder'],
                'match_score': best['match_score'],
                'shared_interests': best['shared_interests'],
                'builder_repos': best['repos_count'],
                'builder_stars': best['total_stars'],
            })

    return results, None
+ """ + lost = match_data['lost_user'] + builder = match_data['inspiring_builder'] + + lost_name = lost.get('name') or lost.get('username', 'someone') + builder_name = builder.get('name') or builder.get('username', 'a builder') + + lost_signals = match_data.get('lost_signals', []) + if isinstance(lost_signals, str): + lost_signals = json.loads(lost_signals) if lost_signals else [] + + shared = match_data.get('shared_interests', []) + + summary = f""" +lost builder: {lost_name} ({lost.get('platform')}) + lost score: {lost.get('lost_potential_score', 0)} + values score: {lost.get('score', 0)} + url: {lost.get('url')} + +inspiring builder: {builder_name} ({builder.get('platform')}) + score: {builder.get('score', 0)} + repos: {match_data.get('builder_repos', 0)} + stars: {match_data.get('builder_stars', 0)} + url: {builder.get('url')} + +match score: {match_data.get('match_score', 0)} +shared interests: {', '.join(shared) if shared else 'values alignment'} + +this lost builder needs to see that someone like them made it. 
+""" + return summary.strip() diff --git a/matchd/overlap.py b/matchd/overlap.py new file mode 100644 index 0000000..975ff4c --- /dev/null +++ b/matchd/overlap.py @@ -0,0 +1,150 @@ +""" +matchd/overlap.py - find pairs with alignment +""" + +import json +from .fingerprint import fingerprint_similarity + + +def find_overlap(human_a, human_b, fp_a=None, fp_b=None): + """ + analyze overlap between two humans + returns overlap details: score, shared values, complementary skills + """ + # parse stored json if needed + signals_a = human_a.get('signals', []) + if isinstance(signals_a, str): + signals_a = json.loads(signals_a) + + signals_b = human_b.get('signals', []) + if isinstance(signals_b, str): + signals_b = json.loads(signals_b) + + extra_a = human_a.get('extra', {}) + if isinstance(extra_a, str): + extra_a = json.loads(extra_a) + + extra_b = human_b.get('extra', {}) + if isinstance(extra_b, str): + extra_b = json.loads(extra_b) + + # shared signals + shared_signals = list(set(signals_a) & set(signals_b)) + + # shared topics + topics_a = set(extra_a.get('topics', [])) + topics_b = set(extra_b.get('topics', [])) + shared_topics = list(topics_a & topics_b) + + # complementary skills (what one has that the other doesn't) + langs_a = set(extra_a.get('languages', {}).keys()) + langs_b = set(extra_b.get('languages', {}).keys()) + complementary_langs = list((langs_a - langs_b) | (langs_b - langs_a)) + + # geographic compatibility + loc_a = human_a.get('location', '').lower() if human_a.get('location') else '' + loc_b = human_b.get('location', '').lower() if human_b.get('location') else '' + + pnw_keywords = ['seattle', 'portland', 'washington', 'oregon', 'pnw', 'cascadia', 'pacific northwest'] + remote_keywords = ['remote', 'anywhere', 'distributed'] + + a_pnw = any(k in loc_a for k in pnw_keywords) or 'pnw' in signals_a + b_pnw = any(k in loc_b for k in pnw_keywords) or 'pnw' in signals_b + a_remote = any(k in loc_a for k in remote_keywords) or 'remote' in signals_a + 
b_remote = any(k in loc_b for k in remote_keywords) or 'remote' in signals_b + + geographic_match = False + geo_reason = None + + if a_pnw and b_pnw: + geographic_match = True + geo_reason = 'both in pnw' + elif (a_pnw or b_pnw) and (a_remote or b_remote): + geographic_match = True + geo_reason = 'pnw + remote compatible' + elif a_remote and b_remote: + geographic_match = True + geo_reason = 'both remote-friendly' + + # calculate overlap score + base_score = 0 + + # shared values (most important) + base_score += len(shared_signals) * 10 + + # shared interests + base_score += len(shared_topics) * 5 + + # complementary skills bonus (they can help each other) + if complementary_langs: + base_score += min(len(complementary_langs), 5) * 3 + + # geographic bonus + if geographic_match: + base_score += 20 + + # fingerprint similarity if available + fp_score = 0 + if fp_a and fp_b: + fp_score = fingerprint_similarity(fp_a, fp_b) * 50 + + total_score = base_score + fp_score + + # build reasons + overlap_reasons = [] + if shared_signals: + overlap_reasons.append(f"shared values: {', '.join(shared_signals[:5])}") + if shared_topics: + overlap_reasons.append(f"shared interests: {', '.join(shared_topics[:5])}") + if geo_reason: + overlap_reasons.append(geo_reason) + if complementary_langs: + overlap_reasons.append(f"complementary skills: {', '.join(complementary_langs[:5])}") + + return { + 'overlap_score': total_score, + 'shared_signals': shared_signals, + 'shared_topics': shared_topics, + 'complementary_skills': complementary_langs, + 'geographic_match': geographic_match, + 'geo_reason': geo_reason, + 'overlap_reasons': overlap_reasons, + 'fingerprint_similarity': fp_score / 50 if fp_a and fp_b else None, + } + + +def is_same_person(human_a, human_b): + """ + check if two records might be the same person (cross-platform) + """ + # same platform = definitely different records + if human_a['platform'] == human_b['platform']: + return False + + # check username similarity + 
user_a = human_a.get('username', '').lower().split('@')[0] + user_b = human_b.get('username', '').lower().split('@')[0] + + if user_a == user_b: + return True + + # check if github username matches + contact_a = human_a.get('contact', {}) + contact_b = human_b.get('contact', {}) + + if isinstance(contact_a, str): + contact_a = json.loads(contact_a) + if isinstance(contact_b, str): + contact_b = json.loads(contact_b) + + # github cross-reference + if contact_a.get('github') and contact_a.get('github') == contact_b.get('github'): + return True + if contact_a.get('github') == user_b or contact_b.get('github') == user_a: + return True + + # email cross-reference + if contact_a.get('email') and contact_a.get('email') == contact_b.get('email'): + return True + + return False diff --git a/matchd/rank.py b/matchd/rank.py new file mode 100644 index 0000000..d10d014 --- /dev/null +++ b/matchd/rank.py @@ -0,0 +1,137 @@ +""" +matchd/rank.py - score and rank match quality +""" + +from itertools import combinations +from .fingerprint import generate_fingerprint +from .overlap import find_overlap, is_same_person +from scoutd.deep import check_already_connected + + +def rank_matches(matches): + """ + rank a list of matches by quality + returns sorted list with quality scores + """ + ranked = [] + + for match in matches: + # base score from overlap + score = match.get('overlap_score', 0) + + # bonus for geographic match + if match.get('geographic_match'): + score *= 1.2 + + # bonus for high fingerprint similarity + fp_sim = match.get('fingerprint_similarity') + if fp_sim and fp_sim > 0.7: + score *= 1.3 + + # bonus for complementary skills + comp_skills = match.get('complementary_skills', []) + if len(comp_skills) >= 3: + score *= 1.1 + + match['quality_score'] = score + ranked.append(match) + + # sort by quality score + ranked.sort(key=lambda x: x['quality_score'], reverse=True) + + return ranked + + +def find_all_matches(db, min_score=30, min_overlap=20): + """ + find all 
def find_all_matches(db, min_score=30, min_overlap=20):
    """
    find all potential matches from database.
    returns list of match dicts, ranked by quality.
    """
    print("matchd: finding all potential matches...")

    # candidates above the values threshold
    humans = db.get_all_humans(min_score=min_score)
    print(f"  {len(humans)} humans to match")

    # fingerprint every candidate once, up front, and persist them
    fingerprints = {}
    for person in humans:
        fp = generate_fingerprint(person)
        fingerprints[person['id']] = fp
        db.save_fingerprint(person['id'], fp)

    print(f"  generated {len(fingerprints)} fingerprints")

    matches = []
    checked = 0
    skipped_same = 0
    skipped_connected = 0

    for human_a, human_b in combinations(humans, 2):
        checked += 1

        # skip if likely the same person across platforms
        if is_same_person(human_a, human_b):
            skipped_same += 1
            continue

        # skip if already connected (same org, company, co-contributors)
        connected, _reason = check_already_connected(human_a, human_b)
        if connected:
            skipped_connected += 1
            continue

        overlap = find_overlap(
            human_a, human_b,
            fingerprints.get(human_a['id']),
            fingerprints.get(human_b['id']),
        )

        if overlap['overlap_score'] >= min_overlap:
            matches.append({'human_a': human_a, 'human_b': human_b, **overlap})
            db.save_match(human_a['id'], human_b['id'], overlap)

        # progress heartbeat on big candidate pools
        if checked % 1000 == 0:
            print(f"    checked {checked} pairs, {len(matches)} matches so far...")

    print(f"  checked {checked} pairs")
    print(f"  skipped {skipped_same} (same person), {skipped_connected} (already connected)")
    print(f"  found {len(matches)} potential matches")

    return rank_matches(matches)


def get_top_matches(db, limit=50):
    """
    get top matches from database, rehydrating both humans for each row.
    rows whose humans can no longer be loaded are silently dropped.
    """
    results = []
    for row in db.get_matches(limit=limit):
        human_a = db.get_human_by_id(row['human_a_id'])
        human_b = db.get_human_by_id(row['human_b_id'])

        if not (human_a and human_b):
            continue

        results.append({
            'id': row['id'],
            'human_a': human_a,
            'human_b': human_b,
            'overlap_score': row['overlap_score'],
            'overlap_reasons': row['overlap_reasons'],
            'geographic_match': row['geographic_match'],
            'status': row['status'],
        })

    return results
+""" + +import requests +import json +import time +from datetime import datetime +from pathlib import Path + +from .signals import analyze_text + +HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'bluesky' + +# public bluesky API +BSKY_API = 'https://public.api.bsky.app' + +# hashtags to search +ALIGNED_HASHTAGS = [ + 'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource', + 'privacy', 'solarpunk', 'cooperative', 'mutualaid', 'localfirst', + 'indieweb', 'smallweb', 'permacomputing', 'techworkers', 'coops', +] + + +def _api_get(endpoint, params=None): + """rate-limited API request with caching""" + url = f"{BSKY_API}{endpoint}" + cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_data') + except: + pass + + time.sleep(0.5) # rate limit + + try: + resp = requests.get(url, headers=HEADERS, params=params, timeout=30) + resp.raise_for_status() + result = resp.json() + cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) + return result + except requests.exceptions.RequestException as e: + print(f" bluesky api error: {e}") + return None + + +def search_posts(query, limit=50): + """search for posts containing query""" + result = _api_get('/xrpc/app.bsky.feed.searchPosts', { + 'q': query, + 'limit': min(limit, 100), + }) + + if not result: + return [] + + posts = result.get('posts', []) + return posts + + +def get_profile(handle): + """get user profile by handle (e.g., user.bsky.social)""" + result = _api_get('/xrpc/app.bsky.actor.getProfile', {'actor': handle}) + return result + + +def get_author_feed(handle, limit=30): + """get user's recent posts""" + result = 
def get_author_feed(handle, limit=30):
    """get user's recent posts"""
    result = _api_get('/xrpc/app.bsky.feed.getAuthorFeed', {
        'actor': handle,
        'limit': limit,
    })

    if not result:
        return []

    return result.get('feed', [])


def analyze_bluesky_user(handle):
    """
    analyze a bluesky user for alignment.
    returns a human dict (score, signals, contact, ...) or None if the
    profile could not be fetched.
    """
    profile = get_profile(handle)
    if not profile:
        return None

    # collect all visible text: bio, display name, recent posts
    text_parts = []

    description = profile.get('description', '')
    if description:
        text_parts.append(description)

    display_name = profile.get('displayName', '')
    if display_name:
        text_parts.append(display_name)

    feed = get_author_feed(handle, limit=20)
    for item in feed:
        post = item.get('post', {})
        record = post.get('record', {})
        text = record.get('text', '')
        if text:
            text_parts.append(text)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # bluesky bonus (decentralized, values-aligned platform choice)
    platform_bonus = 10
    total_score = text_score + platform_bonus

    # activity bonus
    followers = profile.get('followersCount', 0)
    posts_count = profile.get('postsCount', 0)

    if posts_count >= 100:
        total_score += 5
    if followers >= 100:
        total_score += 5

    # confidence: more text and more signals = more trust in the score
    confidence = 0.35  # base for bluesky (better signal than twitter)
    if len(text_parts) > 5:
        confidence += 0.2
    if len(positive_signals) >= 3:
        confidence += 0.2
    if posts_count >= 50:
        confidence += 0.1
    confidence = min(confidence, 0.85)

    reasons = ['on bluesky (atproto)']
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    return {
        'platform': 'bluesky',
        'username': handle,
        'url': f"https://bsky.app/profile/{handle}",
        'name': display_name or handle,
        'bio': description,
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'followers': followers,
        'posts_count': posts_count,
        'reasons': reasons,
        'contact': {
            'bluesky': handle,
        },
        'scraped_at': datetime.now().isoformat(),
    }


def scrape_bluesky(db, limit_per_hashtag=30):
    """full bluesky scrape: search aligned hashtags, analyze multi-tag users"""
    print("scoutd/bluesky: starting scrape...")

    all_users = {}

    for hashtag in ALIGNED_HASHTAGS:
        print(f"  #{hashtag}...")

        posts = search_posts(f"#{hashtag}", limit=limit_per_hashtag)

        for post in posts:
            author = post.get('author', {})
            handle = author.get('handle')

            if handle and handle not in all_users:
                all_users[handle] = {
                    'handle': handle,
                    'display_name': author.get('displayName'),
                    'hashtags': [hashtag],
                }
            elif handle and hashtag not in all_users[handle]['hashtags']:
                # BUG FIX: only count each hashtag once per user. the old
                # unconditional append let a user with two posts under ONE
                # tag pass the "2+ aligned hashtags" filter below.
                all_users[handle]['hashtags'].append(hashtag)

        print(f"    found {len(posts)} posts")

    # prioritize users present in multiple aligned hashtags
    multi_hashtag = {h: d for h, d in all_users.items() if len(d.get('hashtags', [])) >= 2}
    print(f"  {len(multi_hashtag)} users in 2+ aligned hashtags")

    # analyze the shortlist (capped to keep API usage bounded)
    results = []
    for handle in list(multi_hashtag.keys())[:100]:
        try:
            result = analyze_bluesky_user(handle)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                if result['score'] >= 30:
                    print(f"    ā˜… @{handle}: {result['score']} pts")
        except Exception as e:
            print(f"    error on {handle}: {e}")

    print(f"scoutd/bluesky: found {len(results)} aligned humans")
    return results
import re
import json
import requests
import time
import subprocess
import tempfile
import shutil
from datetime import datetime
from urllib.parse import urlparse
from pathlib import Path

from .signals import analyze_text
from .github import get_github_user, get_user_repos, _api_get as github_api
from .mastodon import analyze_mastodon_user, _api_get as mastodon_api
from .handles import discover_all_handles, extract_handles_from_text, scrape_website_for_handles

# local cache for org memberships (module-level singleton, lazily loaded)
ORG_CACHE_FILE = Path(__file__).parent.parent / 'data' / 'org_cache.json'
_org_cache = None

# patterns to find social links in text
MASTODON_PATTERN = r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-z]{2,})'
TWITTER_PATTERN = r'(?:twitter\.com/|x\.com/)([a-zA-Z0-9_]+)'
GITHUB_PATTERN = r'github\.com/([a-zA-Z0-9_-]+)'
MATRIX_PATTERN = r'@([a-zA-Z0-9_]+):([a-zA-Z0-9.-]+)'
EMAIL_PATTERN = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'

# known mastodon instances for validation
KNOWN_INSTANCES = [
    'mastodon.social', 'fosstodon.org', 'tech.lgbt', 'social.coop',
    'hackers.town', 'hachyderm.io', 'infosec.exchange', 'chaos.social',
    'mas.to', 'mstdn.social', 'mastodon.online', 'universeodon.com',
    'mathstodon.xyz', 'ruby.social', 'functional.cafe', 'types.pl',
]

# contact page patterns for website scraping
CONTACT_PAGE_PATHS = [
    '/contact', '/contact/', '/contact.html',
    '/about', '/about/', '/about.html',
    '/connect', '/reach-out', '/hire', '/hire-me',
]

# patterns to find emails in contact sections
# (second pattern catches obfuscated "user [at] domain [dot] com" forms)
CONTACT_SECTION_PATTERNS = [
    r'(?:contact|email|reach|mail)[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
    r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|@)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\.)\s*([a-zA-Z]{2,})',
]


def load_org_cache():
    """load org membership cache from disk (lazy; returns the singleton)"""
    global _org_cache
    if _org_cache is not None:
        return _org_cache

    try:
        ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        if ORG_CACHE_FILE.exists():
            with open(ORG_CACHE_FILE) as f:
                _org_cache = json.load(f)
        else:
            _org_cache = {'users': {}, 'updated': {}}
    except Exception:
        # best-effort cache: corrupt/unreadable file falls back to empty.
        # (was a bare `except:`, which also swallowed KeyboardInterrupt)
        _org_cache = {'users': {}, 'updated': {}}

    return _org_cache


def save_org_cache():
    """save org membership cache to disk (best-effort, never raises)"""
    global _org_cache
    if _org_cache is None:
        return

    try:
        ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(ORG_CACHE_FILE, 'w') as f:
            json.dump(_org_cache, f, indent=2)
    except Exception:
        # was a bare `except:`; persistence failures are deliberately ignored
        pass


def get_cached_orgs(username):
    """get orgs from cache if available and fresh (< 7 days old), else None"""
    cache = load_org_cache()

    if username not in cache['users']:
        return None

    updated = cache['updated'].get(username)
    if updated:
        updated_dt = datetime.fromisoformat(updated)
        if (datetime.now() - updated_dt).days < 7:
            return cache['users'][username]

    return None


def cache_orgs(username, orgs):
    """cache org membership for a user and persist immediately"""
    cache = load_org_cache()
    cache['users'][username] = orgs
    cache['updated'][username] = datetime.now().isoformat()
    save_org_cache()
def get_emails_from_commit_history(repo_url, limit=50):
    """
    clone a repo (shallow) and extract unique committer emails from git log.
    best-effort: returns [] on any clone/log failure or timeout.
    """
    emails = set()

    try:
        # temp dir is removed automatically, even on error
        with tempfile.TemporaryDirectory() as tmpdir:
            # shallow single-branch clone keeps this fast even on big repos
            result = subprocess.run(
                ['git', 'clone', '--depth', '50', '--single-branch', repo_url, tmpdir],
                capture_output=True,
                text=True,
                timeout=30
            )

            if result.returncode != 0:
                return []

            # %ae prints one author email per line
            result = subprocess.run(
                ['git', 'log', f'--max-count={limit}', '--format=%ae'],
                cwd=tmpdir,
                capture_output=True,
                text=True,
                timeout=10
            )

            if result.returncode == 0:
                for email in result.stdout.strip().split('\n'):
                    email = email.strip().lower()
                    # filter out bot/noreply emails
                    if email and not any(x in email for x in [
                        'noreply', 'no-reply', 'dependabot', 'github-actions',
                        'renovate', 'greenkeeper', 'snyk-bot', 'users.noreply.github'
                    ]):
                        emails.add(email)
    except Exception:
        # was `except (subprocess.TimeoutExpired, Exception)` - the tuple
        # was redundant since TimeoutExpired already subclasses Exception
        pass

    return list(emails)


def scrape_website_for_emails(url, timeout=10):
    """
    scrape a personal website for email addresses.
    checks the main page plus common contact pages; returns a deduped list.
    """
    emails = set()

    # skip corporate/platform domains entirely
    if not is_personal_website(url):
        return []

    headers = {'User-Agent': 'connectd/1.0 (looking for contact info)'}

    # normalize url
    if not url.startswith('http'):
        url = 'https://' + url

    base_url = url.rstrip('/')

    # pages to check: landing page plus conventional contact paths
    pages_to_check = [base_url] + [base_url + path for path in CONTACT_PAGE_PATHS]

    for page_url in pages_to_check:
        try:
            resp = requests.get(page_url, timeout=timeout, headers=headers)
            if resp.status_code == 200:
                text = resp.text

                # standard email pattern
                for match in re.finditer(EMAIL_PATTERN, text):
                    email = match.group(0).lower()
                    if not any(x in email for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']):
                        emails.add(email)

                # obfuscated email patterns like "user [at] domain [dot] com"
                for pattern in CONTACT_SECTION_PATTERNS:
                    for match in re.finditer(pattern, text, re.IGNORECASE):
                        if len(match.groups()) == 3:
                            email = f"{match.group(1)}@{match.group(2)}.{match.group(3)}".lower()
                            emails.add(email)
                        elif len(match.groups()) == 1:
                            emails.add(match.group(1).lower())

                # mailto: links
                for match in re.finditer(r'mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text):
                    emails.add(match.group(1).lower())

        except Exception:
            # was a bare `except:`; one unreachable page must not abort the rest
            continue

    return list(emails)
def extract_emails_from_readme(text):
    """
    extract emails from README text, looking for contact sections.
    returns a deduplicated list (possibly empty).
    """
    if not text:
        return []

    found = set()

    # sections that usually carry contact info
    section_patterns = (
        r'(?:##?\s*)?(?:contact|reach|email|get in touch|connect)[^\n]*\n([^\n#]+)',
        r'(?:email|contact|reach me)[:\s]+([^\n]+)',
    )

    for pattern in section_patterns:
        for section_match in re.finditer(pattern, text, re.IGNORECASE):
            # pull addresses only out of the matched section text
            for email_match in re.finditer(EMAIL_PATTERN, section_match.group(1)):
                candidate = email_match.group(0).lower()
                if not any(bad in candidate for bad in ['noreply', 'no-reply', 'example.com']):
                    found.add(candidate)

    # obfuscated "user [at] domain [dot] com" style addresses
    obfuscated = r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\))\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\))\s*([a-zA-Z]{2,})'
    for m in re.finditer(obfuscated, text, re.IGNORECASE):
        found.add(f"{m.group(1)}@{m.group(2)}.{m.group(3)}".lower())

    return list(found)


def get_mastodon_dm_allowed(handle):
    """check if a mastodon user plausibly accepts DMs"""
    profile = get_mastodon_profile(handle)
    if not profile:
        return False

    # locked accounts require follow approval - treat as closed
    if profile.get('locked'):
        return False

    # explicit "dms open" style invitation in the bio
    bio = (profile.get('note') or profile.get('summary') or '').lower()
    if any(phrase in bio for phrase in ['dms open', 'dm me', 'message me', 'dms welcome']):
        return True

    # default: assume open if not locked
    return True
def determine_contact_method(profile):
    """
    determine the best way to contact someone.

    returns (method, details) where method is one of:
    - 'email': direct email contact
    - 'github_issue': open issue on their repo
    - 'mastodon': DM on mastodon
    - 'manual': needs manual review
    """
    # 1) a single known-good email wins outright
    if profile.get('email'):
        return 'email', {'email': profile['email']}

    # 2) pick from a discovered email list, avoiding work/bot addresses
    discovered = profile.get('emails') or []
    if discovered:
        personal = [addr for addr in discovered
                    if not any(tok in addr.lower() for tok in ['github', 'noreply', '@company', '@corp'])]
        # fall back to the first discovered address if all look work-ish
        chosen = personal[0] if personal else discovered[0]
        return 'email', {'email': chosen}

    # 3) mastodon DM, if any handle appears to accept them
    masto = profile.get('mastodon')
    if masto:
        handles = masto if isinstance(masto, list) else [masto]
        for handle in handles:
            if get_mastodon_dm_allowed(handle):
                return 'mastodon', {'handle': handle}

    # 4) open an issue on their most-starred repo (needs some visibility)
    for repo in sorted(profile.get('top_repos') or [],
                       key=lambda r: r.get('stars', 0), reverse=True):
        if repo.get('stars', 0) >= 10 and repo.get('name'):
            return 'github_issue', {
                'repo': f"{profile['username']}/{repo['name']}",
                'stars': repo.get('stars'),
            }

    # 5) nothing automatic worked - queue for a human to review
    return 'manual', {
        'reason': 'no email, mastodon, or suitable repo found',
        'available': {
            'twitter': profile.get('twitter'),
            'websites': profile.get('websites'),
            'matrix': profile.get('matrix'),
        }
    }
def extract_links_from_text(text):
    """extract social links (mastodon/twitter/github/matrix/email/web) from bio/readme text"""
    if not text:
        return {}

    found = {
        'mastodon': [],
        'twitter': [],
        'github': [],
        'matrix': [],
        'email': [],
        'websites': [],
    }

    # mastodon handles - only accept known instances or fediverse-looking domains
    for m in re.finditer(MASTODON_PATTERN, text):
        user, instance = m.groups()
        inst = instance.lower()
        if inst in KNOWN_INSTANCES or any(tok in inst for tok in ['mastodon', 'social', 'fedi', '.town', '.cafe']):
            found['mastodon'].append(f"{user}@{instance}")

    # twitter
    for m in re.finditer(TWITTER_PATTERN, text, re.IGNORECASE):
        found['twitter'].append(m.group(1))

    # github usernames (used for cross-referencing)
    for m in re.finditer(GITHUB_PATTERN, text, re.IGNORECASE):
        found['github'].append(m.group(1))

    # matrix ids
    for m in re.finditer(MATRIX_PATTERN, text):
        user, server = m.groups()
        found['matrix'].append(f"@{user}:{server}")

    # emails, skipping obvious non-personal addresses
    for m in re.finditer(EMAIL_PATTERN, text):
        addr = m.group(0)
        if not any(tok in addr.lower() for tok in ['noreply', 'no-reply', 'example.com', 'users.noreply']):
            found['email'].append(addr)

    # bare websites that aren't social platforms
    url_pattern = r'https?://([a-zA-Z0-9.-]+\.[a-z]{2,})[/\w.-]*'
    for m in re.finditer(url_pattern, text):
        domain = m.group(1).lower()
        if not any(tok in domain for tok in ['github.com', 'twitter.com', 'mastodon', 'linkedin.com', 't.co']):
            found['websites'].append(m.group(0))

    # dedupe every bucket
    return {key: list(set(vals)) for key, vals in found.items()}
len(parts[0]) < 20: + return True + + return False + + +def scrape_website_for_links(url, timeout=10): + """scrape a personal website for more social links""" + if not is_personal_website(url): + return {} + + try: + resp = requests.get(url, timeout=timeout, headers={'User-Agent': 'connectd/1.0'}) + resp.raise_for_status() + return extract_links_from_text(resp.text) + except: + return {} + + +def get_mastodon_profile(handle): + """ + fetch mastodon profile from handle like user@instance + returns profile data or None + """ + if '@' not in handle: + return None + + parts = handle.split('@') + if len(parts) == 2: + user, instance = parts + elif len(parts) == 3 and parts[0] == '': + # @user@instance format + user, instance = parts[1], parts[2] + else: + return None + + # try to look up via webfinger + try: + webfinger_url = f"https://{instance}/.well-known/webfinger" + resp = requests.get( + webfinger_url, + params={'resource': f'acct:{user}@{instance}'}, + timeout=10, + headers={'Accept': 'application/json'} + ) + if resp.status_code == 200: + data = resp.json() + # find the profile link + for link in data.get('links', []): + if link.get('type') == 'application/activity+json': + profile_url = link.get('href') + # fetch the profile + profile_resp = requests.get( + profile_url, + timeout=10, + headers={'Accept': 'application/activity+json'} + ) + if profile_resp.status_code == 200: + return profile_resp.json() + except: + pass + + # fallback: try direct API + try: + search_url = f"https://{instance}/api/v1/accounts/lookup" + resp = requests.get(search_url, params={'acct': user}, timeout=10) + if resp.status_code == 200: + return resp.json() + except: + pass + + return None + + +def deep_scrape_github_user(login, scrape_commits=True): + """ + deep scrape a github user - follow all links, build complete profile + + email discovery sources: + 1. github profile (if public) + 2. git commit history (if scrape_commits=True) + 3. personal website/blog contact pages + 4. 
def deep_scrape_github_user(login, scrape_commits=True):
    """
    deep scrape a github user - follow all links, build complete profile

    email discovery sources:
    1. github profile (if public)
    2. git commit history (if scrape_commits=True)
    3. personal website/blog contact pages
    4. README "contact me" sections
    5. mastodon bio

    returns a profile dict (see `profile` below for shape) or None when the
    github user cannot be fetched. relies on several helpers defined
    elsewhere in this module/package: get_github_user, get_user_repos,
    extract_emails_from_readme, get_cached_orgs/cache_orgs, github_api,
    get_emails_from_commit_history, scrape_website_for_emails,
    discover_all_handles, determine_contact_method, analyze_text.
    """
    print(f" deep scraping {login}...")

    user = get_github_user(login)
    if not user:
        return None

    repos = get_user_repos(login, per_page=50)

    # collect all text to search for links
    all_text = []
    readme_text = None

    if user.get('bio'):
        all_text.append(user['bio'])
    if user.get('blog'):
        all_text.append(user['blog'])
    if user.get('company'):
        all_text.append(user['company'])

    # check readme of profile repo (username/username); first branch that
    # serves a 200 wins
    for branch in ['main', 'master']:
        readme_url = f"https://raw.githubusercontent.com/{login}/{login}/{branch}/README.md"
        try:
            resp = requests.get(readme_url, timeout=10)
            if resp.status_code == 200:
                readme_text = resp.text
                all_text.append(readme_text)
                break
        except:
            pass

    # extract links from all collected text
    combined_text = '\n'.join(all_text)
    found_links = extract_links_from_text(combined_text)

    # ensure all keys exist (extract_links_from_text returns {} on empty text)
    for key in ['email', 'twitter', 'github', 'matrix', 'mastodon', 'websites']:
        if key not in found_links:
            found_links[key] = []

    # add explicit github fields
    if user.get('email'):
        found_links['email'].append(user['email'])
    if user.get('twitter_username'):
        found_links['twitter'].append(user['twitter_username'])
    if user.get('blog'):
        found_links['websites'].append(user['blog'])

    # EMAIL DISCOVERY: extract emails from README contact sections
    if readme_text:
        readme_emails = extract_emails_from_readme(readme_text)
        found_links['email'].extend(readme_emails)
        if readme_emails:
            print(f" found {len(readme_emails)} email(s) in README")

    # dedupe (order not preserved)
    for key in found_links:
        found_links[key] = list(set(found_links[key]))

    # now follow the links to gather more data
    profile = {
        'source': 'github',
        'username': login,
        'url': f"https://github.com/{login}",
        'real_name': user.get('name'),
        'bio': user.get('bio'),
        'location': user.get('location'),
        'company': user.get('company'),
        'hireable': user.get('hireable'),
        'created_at': user.get('created_at'),
        'public_repos': user.get('public_repos'),
        'followers': user.get('followers'),

        # contact points
        'email': found_links['email'][0] if found_links['email'] else user.get('email'),
        'emails': list(found_links['email']),
        'twitter': found_links['twitter'][0] if found_links['twitter'] else user.get('twitter_username'),
        'mastodon': found_links['mastodon'],
        'matrix': found_links['matrix'],
        'websites': found_links['websites'],

        # cross-platform profiles we find
        'linked_profiles': {},

        # repos and languages
        'top_repos': [],
        'languages': {},
        'topics': [],
        'orgs': [],

        # contact method (will be determined at end)
        'contact_method': None,
        'contact_details': None,
    }

    # analyze repos (non-forks among the 50 most recently pushed, capped at 30)
    top_starred_repo = None
    for repo in repos[:30]:
        if not repo.get('fork'):
            repo_info = {
                'name': repo.get('name'),
                'description': repo.get('description'),
                'stars': repo.get('stargazers_count'),
                'language': repo.get('language'),
                'topics': repo.get('topics', []),
                'html_url': repo.get('html_url'),
                'pushed_at': repo.get('pushed_at'),  # for activity-based contact selection
            }
            profile['top_repos'].append(repo_info)

            # track top starred for commit email scraping
            if not top_starred_repo or repo.get('stargazers_count', 0) > top_starred_repo.get('stars', 0):
                top_starred_repo = repo_info

            if repo.get('language'):
                lang = repo['language']
                profile['languages'][lang] = profile['languages'].get(lang, 0) + 1

            profile['topics'].extend(repo.get('topics', []))

    profile['topics'] = list(set(profile['topics']))

    # get orgs - check cache first (get_cached_orgs presumably returns None on miss)
    cached_orgs = get_cached_orgs(login)
    if cached_orgs is not None:
        print(f" using cached orgs: {cached_orgs}")
        profile['orgs'] = cached_orgs
    else:
        orgs_url = f"https://api.github.com/users/{login}/orgs"
        orgs_data = github_api(orgs_url) or []
        profile['orgs'] = [o.get('login') for o in orgs_data]
        # cache for future use
        cache_orgs(login, profile['orgs'])
        if profile['orgs']:
            print(f" fetched & cached orgs: {profile['orgs']}")

    # EMAIL DISCOVERY: scrape commit history from top repo - only when no
    # email has been found through cheaper channels
    if scrape_commits and top_starred_repo and not profile['emails']:
        repo_url = f"https://github.com/{login}/{top_starred_repo['name']}.git"
        print(f" checking commit history in {top_starred_repo['name']}...")
        commit_emails = get_emails_from_commit_history(repo_url)
        if commit_emails:
            print(f" found {len(commit_emails)} email(s) in commits")
            profile['emails'].extend(commit_emails)

    # follow mastodon links
    for masto_handle in found_links['mastodon'][:2]:  # limit to 2
        print(f" following mastodon: {masto_handle}")
        masto_profile = get_mastodon_profile(masto_handle)
        if masto_profile:
            # keys cover both mastodon REST ('display_name'/'note') and
            # ActivityPub ('name'/'summary') response shapes
            profile['linked_profiles']['mastodon'] = {
                'handle': masto_handle,
                'display_name': masto_profile.get('display_name') or masto_profile.get('name'),
                'bio': masto_profile.get('note') or masto_profile.get('summary'),
                'followers': masto_profile.get('followers_count'),
                'url': masto_profile.get('url'),
                'locked': masto_profile.get('locked', False),
            }
            # extract more links from mastodon bio
            masto_bio = masto_profile.get('note') or masto_profile.get('summary') or ''
            masto_links = extract_links_from_text(masto_bio)
            profile['emails'].extend(masto_links.get('email', []))
            profile['websites'].extend(masto_links.get('websites', []))

    # EMAIL DISCOVERY: scrape personal website for contact info
    for website in found_links['websites'][:2]:  # check up to 2 sites
        print(f" following website: {website}")

        # basic link extraction
        site_links = scrape_website_for_links(website)
        if site_links.get('mastodon') and not profile['mastodon']:
            profile['mastodon'] = site_links['mastodon']

        # enhanced email discovery - check contact pages
        website_emails = scrape_website_for_emails(website)
        if website_emails:
            print(f" found {len(website_emails)} email(s) on website")
            profile['emails'].extend(website_emails)

    # dedupe emails and pick best one
    profile['emails'] = list(set(profile['emails']))

    # rank emails by preference
    def email_score(email):
        email_lower = email.lower()
        score = 0
        # prefer personal domains
        if any(x in email_lower for x in ['@gmail', '@proton', '@hey.com', '@fastmail']):
            score += 10
        # deprioritize github emails
        if 'github' in email_lower:
            score -= 20
        # deprioritize noreply
        if 'noreply' in email_lower:
            score -= 50
        # prefer emails matching username
        if login.lower() in email_lower:
            score += 5
        return score

    if profile['emails']:
        profile['emails'].sort(key=email_score, reverse=True)
        profile['email'] = profile['emails'][0]

    # COMPREHENSIVE HANDLE DISCOVERY
    # find ALL social handles from website, README, rel="me" links, etc.
    discovered_handles, discovered_emails = discover_all_handles(user)

    # merge discovered handles into profile
    profile['handles'] = discovered_handles

    # update individual fields from discovered handles; earlier discoveries
    # win for mastodon/twitter/matrix, discovered values always win for the rest
    if discovered_handles.get('mastodon') and not profile.get('mastodon'):
        profile['mastodon'] = discovered_handles['mastodon']
    if discovered_handles.get('twitter') and not profile.get('twitter'):
        profile['twitter'] = discovered_handles['twitter']
    if discovered_handles.get('bluesky'):
        profile['bluesky'] = discovered_handles['bluesky']
    if discovered_handles.get('matrix') and not profile.get('matrix'):
        profile['matrix'] = discovered_handles['matrix']
    if discovered_handles.get('linkedin'):
        profile['linkedin'] = discovered_handles['linkedin']
    if discovered_handles.get('youtube'):
        profile['youtube'] = discovered_handles['youtube']
    if discovered_handles.get('discord'):
        profile['discord'] = discovered_handles['discord']
    if discovered_handles.get('telegram'):
        profile['telegram'] = discovered_handles['telegram']

    # merge discovered emails
    for email in discovered_emails:
        if email not in profile['emails']:
            profile['emails'].append(email)

    print(f" handles found: {list(discovered_handles.keys())}")

    # determine best contact method
    contact_method, contact_details = determine_contact_method(profile)
    profile['contact_method'] = contact_method
    profile['contact_details'] = contact_details
    print(f" contact method: {contact_method}")

    # analyze all text for signals
    all_profile_text = ' '.join([
        profile.get('bio') or '',
        profile.get('company') or '',
        profile.get('location') or '',
        ' '.join(profile.get('topics', [])),
    ])

    for linked in profile.get('linked_profiles', {}).values():
        if linked.get('bio'):
            all_profile_text += ' ' + linked['bio']

    text_score, signals, negative = analyze_text(all_profile_text)
    profile['signals'] = signals
    profile['negative_signals'] = negative
    profile['score'] = text_score

    # add builder score
    if len(repos) > 20:
        profile['score'] += 15
    elif len(repos) > 10:
        profile['score'] += 10

    # add topic alignment
    from .signals import TARGET_TOPICS
    aligned_topics = set(profile['topics']) & set(TARGET_TOPICS)
    profile['score'] += len(aligned_topics) * 10
    profile['aligned_topics'] = list(aligned_topics)

    profile['scraped_at'] = datetime.now().isoformat()

    return profile


def check_mutual_github_follows(user_a, user_b):
    """check if two github users follow each other

    NOTE(review): despite the docstring, this only checks one direction
    (a follows b) - callers invoke it twice for mutuality. best-effort:
    any network error yields False.
    """
    # check if a follows b; github returns 204 when the relationship exists
    url = f"https://api.github.com/users/{user_a}/following/{user_b}"
    try:
        resp = requests.get(url, timeout=10, headers={'Accept': 'application/vnd.github.v3+json'})
        if resp.status_code == 204:  # 204 = follows
            return True
    except:
        pass
    return False
def check_shared_repo_contributions(user_a, user_b):
    """
    check if two users have contributed to the same repos
    returns (bool, list of shared repos)

    NOT IMPLEMENTED: always returns (False, []). a full implementation
    would query GET /repos/{owner}/{repo}/contributors for their top repos;
    check_already_connected() approximates this via stored top_repos instead.
    """
    return False, []


def check_github_interactions(user_a, user_b):
    """
    check if users have had public interactions
    (comments on each other's issues/PRs)
    this is expensive - only do for high-score matches

    NOT IMPLEMENTED: always returns False. would need to search:
    GET /search/issues?q=author:{user_a}+commenter:{user_b} (and vice versa)
    """
    return False


def check_already_connected(human_a, human_b, deep_check=False):
    """
    check if two humans are likely already connected
    (same org, co-contributors, mutual follows, interactions)

    connectd's job is connecting ISOLATED builders, not re-introducing coworkers

    returns (True, reason_string) on a likely existing connection,
    (False, None) otherwise. deep_check=True additionally spends API calls
    on mutual-follow lookups for github users.
    """
    # parse extra data if stored as json string; tolerate malformed rows
    # instead of crashing the whole matching pass
    extra_a = human_a.get('extra', {})
    extra_b = human_b.get('extra', {})
    if isinstance(extra_a, str):
        try:
            extra_a = json.loads(extra_a) if extra_a else {}
        except (ValueError, TypeError):
            extra_a = {}
    if isinstance(extra_b, str):
        try:
            extra_b = json.loads(extra_b) if extra_b else {}
        except (ValueError, TypeError):
            extra_b = {}

    # 1. same github org - check cache first, then stored data
    # (`or []` guards rows where the key exists but holds None)
    orgs_a = set(extra_a.get('orgs') or [])
    orgs_b = set(extra_b.get('orgs') or [])

    # also check org cache for fresher data
    if human_a.get('platform') == 'github':
        cached_a = get_cached_orgs(human_a.get('username', ''))
        if cached_a:
            orgs_a.update(cached_a)
    if human_b.get('platform') == 'github':
        cached_b = get_cached_orgs(human_b.get('username', ''))
        if cached_b:
            orgs_b.update(cached_b)

    shared_orgs = orgs_a & orgs_b

    if shared_orgs:
        return True, f"same org: {', '.join(list(shared_orgs)[:3])}"

    # 2. same company (github convention prefixes company with '@')
    company_a = (extra_a.get('company') or '').lower().strip('@').strip()
    company_b = (extra_b.get('company') or '').lower().strip('@').strip()

    # len > 2 avoids false positives on tiny strings via substring matching
    if company_a and company_b and len(company_a) > 2:
        if company_a == company_b or company_a in company_b or company_b in company_a:
            return True, f"same company: {company_a or company_b}"

    # 3. co-contributors to same major repos (from stored top_repos)
    repos_a = set()
    repos_b = set()
    for r in extra_a.get('top_repos') or []:
        if r.get('stars', 0) > 50:  # only significant repos
            repos_a.add(r.get('name', '').lower())
    for r in extra_b.get('top_repos') or []:
        if r.get('stars', 0) > 50:
            repos_b.add(r.get('name', '').lower())

    shared_repos = repos_a & repos_b
    if len(shared_repos) >= 2:
        return True, f"co-contributors: {', '.join(list(shared_repos)[:3])}"

    # 4. deep checks (more API calls - only if requested)
    if deep_check:
        user_a = human_a.get('username', '')
        user_b = human_b.get('username', '')

        # check mutual follows (each call checks one direction)
        if human_a.get('platform') == 'github' and human_b.get('platform') == 'github':
            if check_mutual_github_follows(user_a, user_b):
                return True, "mutual github follows"
            if check_mutual_github_follows(user_b, user_a):
                return True, "mutual github follows"

    return False, None
def save_deep_profile(db, profile):
    """save a deep-scraped profile to the database.

    converts a deep_scrape_github_user() profile dict into the standard
    human record shape and persists it via db.save_human(). returns the
    converted dict.
    """
    # convert to standard human format
    # IMPORTANT: extra field contains ALL data for activity-based contact selection
    human_data = {
        'platform': profile['source'],
        'username': profile['username'],
        'url': profile['url'],
        'name': profile.get('real_name'),
        'bio': profile.get('bio'),
        'location': profile.get('location'),
        'score': profile.get('score', 0),
        # cross-platform corroboration bumps confidence
        'confidence': 0.8 if profile.get('linked_profiles') else 0.5,
        'signals': profile.get('signals', []),
        'negative_signals': profile.get('negative_signals', []),
        'reasons': [],
        'contact': {
            'email': profile.get('email'),
            'emails': profile.get('emails', []),
            'twitter': profile.get('twitter'),
            'mastodon': profile.get('mastodon'),
            'matrix': profile.get('matrix'),
            'websites': profile.get('websites'),
            'contact_method': profile.get('contact_method'),
            'contact_details': profile.get('contact_details'),
        },
        'extra': {
            # identity
            'real_name': profile.get('real_name'),
            'company': profile.get('company'),
            'hireable': profile.get('hireable'),
            'orgs': profile.get('orgs'),

            # github activity (for activity-based contact)
            'top_repos': profile.get('top_repos'),
            'languages': profile.get('languages'),
            'topics': profile.get('topics'),
            'aligned_topics': profile.get('aligned_topics'),
            'followers': profile.get('followers'),
            'public_repos': profile.get('public_repos'),
            # NOTE(review): this is the number of discovered emails, not an
            # actual commit count - labelled "rough proxy" by the author
            'commit_count': len(profile.get('emails', [])),  # rough proxy

            # cross-platform links (for activity-based contact)
            'email': profile.get('email'),
            'emails': profile.get('emails', []),
            'twitter': profile.get('twitter'),
            'mastodon': profile.get('mastodon'),
            'matrix': profile.get('matrix'),
            'bluesky': profile.get('bluesky'),
            'reddit': profile.get('reddit'),
            'lobsters': profile.get('lobsters'),
            'linkedin': profile.get('linkedin'),
            'youtube': profile.get('youtube'),
            'discord': profile.get('discord'),
            'telegram': profile.get('telegram'),
            'linked_profiles': profile.get('linked_profiles'),

            # ALL discovered handles (comprehensive)
            'handles': profile.get('handles', {}),

            # activity counts (populated by platform scrapers)
            'mastodon_statuses': profile.get('mastodon_statuses', 0),
            'twitter_tweets': profile.get('twitter_tweets', 0),
            'reddit_activity': profile.get('reddit_activity', 0),
            'reddit_karma': profile.get('reddit_karma', 0),
            'lobsters_karma': profile.get('lobsters_karma', 0),
            'bluesky_posts': profile.get('bluesky_posts', 0),
        },
        'scraped_at': profile.get('scraped_at'),
    }

    # build human-readable reasons summarizing why this profile was kept
    if profile.get('signals'):
        human_data['reasons'].append(f"signals: {', '.join(profile['signals'][:5])}")
    if profile.get('aligned_topics'):
        human_data['reasons'].append(f"topics: {', '.join(profile['aligned_topics'][:5])}")
    if profile.get('linked_profiles'):
        platforms = list(profile['linked_profiles'].keys())
        human_data['reasons'].append(f"also on: {', '.join(platforms)}")
    if profile.get('location'):
        human_data['reasons'].append(f"location: {profile['location']}")
    if profile.get('contact_method'):
        human_data['reasons'].append(f"contact: {profile['contact_method']}")

    db.save_human(human_data)
    return human_data
def get_headers():
    """build discord REST api headers; None when no bot token is configured"""
    if not DISCORD_BOT_TOKEN:
        return None
    return {
        'Authorization': f'Bot {DISCORD_BOT_TOKEN}',
        'Content-Type': 'application/json',
    }


def _discord_get(path, params=None, empty=None):
    """shared GET helper: parsed JSON on HTTP 200, `empty` on any failure"""
    auth = get_headers()
    if not auth:
        return empty
    try:
        response = requests.get(
            f'{DISCORD_API}{path}',
            headers=auth,
            params=params,
            timeout=30
        )
    except Exception:
        return empty
    return response.json() if response.status_code == 200 else empty


def get_guild_channels(guild_id):
    """get channels in a guild (empty list on failure)"""
    return _discord_get(f'/guilds/{guild_id}/channels', empty=[])


def get_channel_messages(channel_id, limit=100):
    """get recent messages from a channel (empty list on failure)"""
    return _discord_get(
        f'/channels/{channel_id}/messages',
        params={'limit': limit},
        empty=[]
    )


def get_user_info(user_id):
    """get discord user info (None on failure)"""
    return _discord_get(f'/users/{user_id}')
def analyze_discord_user(user_data, messages=None):
    """analyze a discord user for values alignment and lost signals"""
    handle = user_data.get('username', '')
    uid = user_data.get('id')
    shown_name = user_data.get('global_name') or handle

    # score up to 20 recent messages; very short ones carry no signal
    collected_signals = []
    message_texts = []
    values_score = 0
    for message in (messages or [])[:20]:
        body = message.get('content', '')
        if not body or len(body) < 20:
            continue
        message_texts.append(body)
        msg_score, msg_signals, _ = analyze_text(body)
        collected_signals.extend(msg_signals)
        values_score += msg_score

    collected_signals = list(set(collected_signals))
    msg_count = len(messages) if messages else 0

    # lost builder detection over the same message corpus
    lost_profile = {'bio': '', 'message_count': msg_count}
    lost_posts = [{'text': body} for body in message_texts]
    lost_signals, lost_weight = analyze_social_for_lost_signals(lost_profile, lost_posts)
    lost_potential_score = lost_weight
    user_type = classify_user(lost_potential_score, 50, values_score)

    return {
        'platform': 'discord',
        'username': handle,
        'url': f"https://discord.com/users/{uid}",
        'name': shown_name,
        'bio': '',
        'location': None,
        'score': values_score,
        # confidence grows with distinct signals, capped at 0.8
        'confidence': min(0.8, 0.2 + len(collected_signals) * 0.1),
        'signals': collected_signals,
        'negative_signals': [],
        'reasons': [],
        'contact': {'discord': f"{handle}#{user_data.get('discriminator', '0')}"},
        'extra': {
            'user_id': uid,
            'message_count': msg_count,
        },
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }
def scrape_discord(db, limit_per_channel=50):
    """scrape discord servers for aligned builders.

    walks TARGET_SERVERS, picks up to 5 keyword-matching text channels per
    guild, groups recent messages by author, scores each author with
    analyze_discord_user(), and persists qualifying humans via
    db.save_human(). returns the number of humans saved.
    """
    if not DISCORD_BOT_TOKEN:
        print("discord: DISCORD_BOT_TOKEN not set, skipping")
        return 0

    if not TARGET_SERVERS or TARGET_SERVERS == ['']:
        print("discord: DISCORD_TARGET_SERVERS not set, skipping")
        return 0

    print("scouting discord...")

    found = 0
    lost_found = 0
    seen_users = set()  # dedupe authors across channels/guilds within this run

    for guild_id in TARGET_SERVERS:
        if not guild_id:
            continue

        guild_id = guild_id.strip()
        channels = get_guild_channels(guild_id)

        if not channels:
            print(f" guild {guild_id}: no access or no channels")
            continue

        # filter to relevant channels (type 0 = guild text channel)
        target_channels = []
        for ch in channels:
            if ch.get('type') != 0:  # text channels only
                continue
            name = ch.get('name', '').lower()
            if any(kw in name for kw in TARGET_CHANNEL_KEYWORDS):
                target_channels.append(ch)

        print(f" guild {guild_id}: {len(target_channels)} relevant channels")

        for channel in target_channels[:5]:  # limit channels per server
            messages = get_channel_messages(channel['id'], limit=limit_per_channel)

            if not messages:
                continue

            # group messages by user, skipping bots and already-seen authors
            user_messages = {}
            for msg in messages:
                author = msg.get('author', {})
                if author.get('bot'):
                    continue

                user_id = author.get('id')
                if not user_id or user_id in seen_users:
                    continue

                if user_id not in user_messages:
                    user_messages[user_id] = {'user': author, 'messages': []}
                user_messages[user_id]['messages'].append(msg)

            # analyze each user; save when either values score or lost score clears threshold
            for user_id, data in user_messages.items():
                if user_id in seen_users:
                    continue
                seen_users.add(user_id)

                result = analyze_discord_user(data['user'], data['messages'])
                if not result:
                    continue

                if result['score'] >= 20 or result.get('lost_potential_score', 0) >= 30:
                    db.save_human(result)
                    found += 1

                    if result.get('user_type') in ['lost', 'both']:
                        lost_found += 1

            time.sleep(1)  # rate limit between channels

        time.sleep(2)  # between guilds

    print(f"discord: found {found} humans ({lost_found} lost builders)")
    return found


def send_discord_dm(user_id, message, dry_run=False):
    """send a DM to a discord user.

    returns (ok: bool, detail: str). dry_run short-circuits after the token
    check without touching the network. requires the bot to share a guild
    with the recipient (discord API constraint - TODO confirm for this bot).
    """
    if not DISCORD_BOT_TOKEN:
        return False, "DISCORD_BOT_TOKEN not set"

    if dry_run:
        print(f" [dry run] would DM discord user {user_id}")
        return True, "dry run"

    headers = get_headers()

    try:
        # create DM channel (discord requires opening a DM channel first)
        dm_resp = requests.post(
            f'{DISCORD_API}/users/@me/channels',
            headers=headers,
            json={'recipient_id': user_id},
            timeout=30
        )

        if dm_resp.status_code not in [200, 201]:
            return False, f"couldn't create DM channel: {dm_resp.status_code}"

        channel_id = dm_resp.json().get('id')

        # send message
        msg_resp = requests.post(
            f'{DISCORD_API}/channels/{channel_id}/messages',
            headers=headers,
            json={'content': message},
            timeout=30
        )

        if msg_resp.status_code in [200, 201]:
            return True, f"sent to {user_id}"
        else:
            return False, f"send failed: {msg_resp.status_code}"

    except Exception as e:
        return False, str(e)
def _api_get(url, params=None):
    """rate-limited github api GET with a 1-hour on-disk cache.

    returns parsed JSON or None on request failure. cache files live in
    CACHE_DIR, keyed by a stable digest of url+params.
    """
    import hashlib  # local import: file's top-level import block is managed elsewhere

    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    # BUGFIX: the previous key used built-in hash() on a str, which is
    # randomized per process (PYTHONHASHSEED) - the cache could never hit
    # across runs. md5 of the key gives a stable filename.
    cache_file = CACHE_DIR / f"{hashlib.md5(cache_key.encode('utf-8')).hexdigest()[:16]}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # check cache (1 hour expiry); treat unreadable/corrupt entries as misses
    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            pass

    # rate limit: 60/hr unauthenticated, 5000/hr with token
    time.sleep(0.5 if GITHUB_TOKEN else 2)

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()

        # cache
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f" github api error: {e}")
        return None


def search_repos_by_topic(topic, per_page=100):
    """search repos by topic tag, most-starred first (empty list on failure)"""
    url = 'https://api.github.com/search/repositories'
    params = {'q': f'topic:{topic}', 'sort': 'stars', 'order': 'desc', 'per_page': per_page}
    data = _api_get(url, params)
    return data.get('items', []) if data else []


def get_repo_contributors(repo_full_name, per_page=100):
    """get top contributors to a repo (empty list on failure)"""
    url = f'https://api.github.com/repos/{repo_full_name}/contributors'
    return _api_get(url, {'per_page': per_page}) or []


def get_github_user(login):
    """get full user profile (None on failure)"""
    url = f'https://api.github.com/users/{login}'
    return _api_get(url)


def get_user_repos(login, per_page=100):
    """get user's repos, most recently pushed first (empty list on failure)"""
    url = f'https://api.github.com/users/{login}/repos'
    return _api_get(url, {'per_page': per_page, 'sort': 'pushed'}) or []
repo.get('description'): + text_parts.append(repo['description']) + if repo.get('topics'): + all_topics.extend(repo['topics']) + if repo.get('language'): + languages[repo['language']] += 1 + total_stars += repo.get('stargazers_count', 0) + + full_text = ' '.join(text_parts) + + # analyze signals + text_score, positive_signals, negative_signals = analyze_text(full_text) + + # topic alignment + aligned_topics = set(all_topics) & set(TARGET_TOPICS) + topic_score = len(aligned_topics) * 10 + + # builder score (repos indicate building, not just talking) + builder_score = 0 + if len(repos) > 20: + builder_score = 15 + elif len(repos) > 10: + builder_score = 10 + elif len(repos) > 5: + builder_score = 5 + + # hireable bonus + hireable_score = 5 if user.get('hireable') else 0 + + # total score + total_score = text_score + topic_score + builder_score + hireable_score + + # === LOST BUILDER DETECTION === + # build profile dict for lost analysis + profile_for_lost = { + 'bio': user.get('bio'), + 'repos': repos, + 'public_repos': user.get('public_repos', len(repos)), + 'followers': user.get('followers', 0), + 'following': user.get('following', 0), + 'extra': { + 'top_repos': repos[:10], + }, + } + + # analyze for lost signals + lost_signals, lost_weight = analyze_github_for_lost_signals(profile_for_lost) + + # also check text for lost language patterns + text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text) + for sig in text_lost_signals: + if sig not in lost_signals: + lost_signals.append(sig) + lost_weight += text_lost_weight + + lost_potential_score = lost_weight + + # classify: builder, lost, both, or none + user_type = classify_user(lost_potential_score, builder_score, total_score) + + # confidence based on data richness + confidence = 0.3 + if user.get('bio'): + confidence += 0.15 + if len(repos) > 5: + confidence += 0.15 + if len(text_parts) > 5: + confidence += 0.15 + if user.get('email') or user.get('blog') or user.get('twitter_username'): + 
confidence += 0.15 + if total_stars > 100: + confidence += 0.1 + confidence = min(confidence, 1.0) + + # build reasons + reasons = [] + if positive_signals: + reasons.append(f"signals: {', '.join(positive_signals[:5])}") + if aligned_topics: + reasons.append(f"topics: {', '.join(list(aligned_topics)[:5])}") + if builder_score > 0: + reasons.append(f"builder ({len(repos)} repos)") + if negative_signals: + reasons.append(f"WARNING: {', '.join(negative_signals)}") + + # add lost reasons if applicable + if user_type == 'lost' or user_type == 'both': + lost_descriptions = get_signal_descriptions(lost_signals) + if lost_descriptions: + reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}") + + # === DEEP HANDLE DISCOVERY === + # follow blog links, scrape websites, find ALL social handles + handles, discovered_emails = discover_all_handles(user) + + # merge discovered emails with github email + all_emails = discovered_emails or [] + if user.get('email'): + all_emails.append(user['email']) + all_emails = list(set(e for e in all_emails if e and 'noreply' not in e.lower())) + + return { + 'platform': 'github', + 'username': login, + 'url': f"https://github.com/{login}", + 'name': user.get('name'), + 'bio': user.get('bio'), + 'location': user.get('location'), + 'score': total_score, + 'confidence': confidence, + 'signals': positive_signals, + 'negative_signals': negative_signals, + 'topics': list(aligned_topics), + 'languages': dict(languages), + 'repo_count': len(repos), + 'total_stars': total_stars, + 'reasons': reasons, + 'contact': { + 'email': all_emails[0] if all_emails else None, + 'emails': all_emails, + 'blog': user.get('blog'), + 'twitter': user.get('twitter_username') or handles.get('twitter'), + 'mastodon': handles.get('mastodon'), + 'bluesky': handles.get('bluesky'), + 'matrix': handles.get('matrix'), + 'lemmy': handles.get('lemmy'), + }, + 'extra': { + 'topics': list(aligned_topics), + 'languages': dict(languages), + 'repo_count': len(repos), + 
# --- scoutd/github.py (continued) -----------------------------------------

def scrape_github(db, limit_per_source=50):
    """
    full github scrape.

    collects candidate logins from ecosystem repo contributors and from
    owners of repos tagged with aligned topics, analyzes each with
    analyze_github_user, and persists aligned humans via db.save_human.

    db: storage object exposing save_human(dict)
    limit_per_source: contributor page size per ecosystem repo
    returns: list of analyzed user dicts (only those with score > 0)
    """
    print("scoutd/github: starting scrape...")

    all_logins = set()

    # 1. ecosystem repo contributors
    print("  scraping ecosystem repo contributors...")
    for repo in ECOSYSTEM_REPOS:
        contributors = get_repo_contributors(repo, per_page=limit_per_source)
        for c in contributors:
            login = c.get('login')
            # skip automation accounts ("dependabot[bot]" etc.)
            if login and not login.endswith('[bot]'):
                all_logins.add(login)
        print(f"  {repo}: {len(contributors)} contributors")

    # 2. owners of repos carrying aligned topics
    print("  scraping topic repos...")
    for topic in TARGET_TOPICS[:10]:
        topic_repos = search_repos_by_topic(topic, per_page=30)
        for topic_repo in topic_repos:
            owner = topic_repo.get('owner', {}).get('login')
            if owner and not owner.endswith('[bot]'):
                all_logins.add(owner)
        print(f"  #{topic}: {len(topic_repos)} repos")

    print(f"  found {len(all_logins)} unique users to analyze")

    # analyze each candidate; one failure must not abort the whole scrape
    results = []
    builders_found = 0
    lost_found = 0

    for i, login in enumerate(all_logins):
        if i % 20 == 0:
            print(f"  analyzing... {i}/{len(all_logins)}")

        try:
            result = analyze_github_user(login)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                user_type = result.get('user_type', 'none')

                if user_type == 'builder':
                    builders_found += 1
                    if result['score'] >= 50:
                        print(f"  ā˜… {login}: {result['score']} pts, {result['confidence']:.0%} conf")

                elif user_type == 'lost':
                    lost_found += 1
                    lost_score = result.get('lost_potential_score', 0)
                    if lost_score >= 40:
                        print(f"  šŸ’” {login}: lost_score={lost_score}, values={result['score']} pts")

                elif user_type == 'both':
                    builders_found += 1
                    lost_found += 1
                    print(f"  ⚔ {login}: recovering builder (lost={result.get('lost_potential_score', 0)}, active={result['score']})")

        except Exception as e:
            # best-effort: log and keep going
            print(f"  error on {login}: {e}")

    print(f"scoutd/github: found {len(results)} aligned humans")
    print(f"  - {builders_found} active builders")
    print(f"  - {lost_found} lost builders (need encouragement)")
    return results


# === scoutd/handles.py =====================================================

"""
scoutd/handles.py - comprehensive social handle discovery

finds ALL social handles from:
- github bio/profile
- personal websites (rel="me", footers, contact pages, json-ld)
- README files
- linktree/bio.link/carrd pages
- any linked pages

stores structured handle data for activity-based contact selection
"""

import re
import json
from urllib.parse import urlparse, urljoin

# NOTE: `requests` and `bs4` are imported lazily inside the functions that do
# network I/O, so the pure regex/text helpers in this module stay importable
# (and unit-testable) without the scraping dependencies installed.

HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; connectd/1.0)'}

# platform URL patterns -> list of (regex, handle_extractor)
PLATFORM_PATTERNS = {
    # fediverse
    'mastodon': [
        (r'https?://([^/]+)/@([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"),
        (r'https?://([^/]+)/users/([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"),
        (r'https?://mastodon\.social/@([^/?#]+)', lambda m: f"@{m.group(1)}@mastodon.social"),
    ],
    'pixelfed': [
        (r'https?://pixelfed\.social/@([^/?#]+)', lambda m: f"@{m.group(1)}@pixelfed.social"),
        (r'https?://([^/]*pixelfed[^/]*)/@([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"),
    ],
    'lemmy': [
        (r'https?://([^/]+)/u/([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"),
        (r'https?://lemmy\.([^/]+)/u/([^/?#]+)', lambda m: f"@{m.group(2)}@lemmy.{m.group(1)}"),
    ],

    # mainstream
    'twitter': [
        (r'https?://(?:www\.)?(?:twitter|x)\.com/([^/?#]+)', lambda m: f"@{m.group(1)}"),
    ],
    'bluesky': [
        (r'https?://bsky\.app/profile/([^/?#]+)', lambda m: m.group(1)),
        (r'https?://([^.]+)\.bsky\.social', lambda m: f"{m.group(1)}.bsky.social"),
    ],
    'threads': [
        (r'https?://(?:www\.)?threads\.net/@([^/?#]+)', lambda m: f"@{m.group(1)}"),
    ],
    'instagram': [
        (r'https?://(?:www\.)?instagram\.com/([^/?#]+)', lambda m: f"@{m.group(1)}"),
    ],
    'facebook': [
        (r'https?://(?:www\.)?facebook\.com/([^/?#]+)', lambda m: m.group(1)),
    ],
    'linkedin': [
        (r'https?://(?:www\.)?linkedin\.com/in/([^/?#]+)', lambda m: m.group(1)),
        (r'https?://(?:www\.)?linkedin\.com/company/([^/?#]+)', lambda m: f"company/{m.group(1)}"),
    ],

    # dev platforms
    'github': [
        (r'https?://(?:www\.)?github\.com/([^/?#]+)', lambda m: m.group(1)),
    ],
    'gitlab': [
        (r'https?://(?:www\.)?gitlab\.com/([^/?#]+)', lambda m: m.group(1)),
    ],
    'codeberg': [
        (r'https?://codeberg\.org/([^/?#]+)', lambda m: m.group(1)),
    ],
    'sourcehut': [
        (r'https?://sr\.ht/~([^/?#]+)', lambda m: f"~{m.group(1)}"),
        (r'https?://git\.sr\.ht/~([^/?#]+)', lambda m: f"~{m.group(1)}"),
    ],

    # chat
    'matrix': [
        (r'https?://matrix\.to/#/(@[^:]+:[^/?#]+)', lambda m: m.group(1)),
    ],
    'discord': [
        (r'https?://discord\.gg/([^/?#]+)', lambda m: f"invite/{m.group(1)}"),
        (r'https?://discord\.com/invite/([^/?#]+)', lambda m: f"invite/{m.group(1)}"),
    ],
    'telegram': [
        (r'https?://t\.me/([^/?#]+)', lambda m: f"@{m.group(1)}"),
    ],

    # content
    'youtube': [
        (r'https?://(?:www\.)?youtube\.com/@([^/?#]+)', lambda m: f"@{m.group(1)}"),
        (r'https?://(?:www\.)?youtube\.com/c(?:hannel)?/([^/?#]+)', lambda m: m.group(1)),
    ],
    'twitch': [
        (r'https?://(?:www\.)?twitch\.tv/([^/?#]+)', lambda m: m.group(1)),
    ],
    'substack': [
        (r'https?://([^.]+)\.substack\.com', lambda m: m.group(1)),
    ],
    'medium': [
        (r'https?://(?:www\.)?medium\.com/@([^/?#]+)', lambda m: f"@{m.group(1)}"),
        (r'https?://([^.]+)\.medium\.com', lambda m: m.group(1)),
    ],
    'devto': [
        (r'https?://dev\.to/([^/?#]+)', lambda m: m.group(1)),
    ],

    # funding
    'kofi': [
        (r'https?://ko-fi\.com/([^/?#]+)', lambda m: m.group(1)),
    ],
    'patreon': [
        (r'https?://(?:www\.)?patreon\.com/([^/?#]+)', lambda m: m.group(1)),
    ],
    'liberapay': [
        (r'https?://liberapay\.com/([^/?#]+)', lambda m: m.group(1)),
    ],
    'github_sponsors': [
        (r'https?://github\.com/sponsors/([^/?#]+)', lambda m: m.group(1)),
    ],

    # link aggregators (we'll parse these specially)
    'linktree': [
        (r'https?://linktr\.ee/([^/?#]+)', lambda m: m.group(1)),
    ],
    'biolink': [
        (r'https?://bio\.link/([^/?#]+)', lambda m: m.group(1)),
    ],
    'carrd': [
        (r'https?://([^.]+)\.carrd\.co', lambda m: m.group(1)),
    ],
}

# fediverse handle pattern: @user@instance
FEDIVERSE_HANDLE_PATTERN = re.compile(r'@([\w.-]+)@([\w.-]+\.[\w]+)')

# email pattern
EMAIL_PATTERN = re.compile(r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b')

# known fediverse instances (for context-free handle detection)
KNOWN_FEDIVERSE_INSTANCES = [
    'mastodon.social', 'mastodon.online', 'mstdn.social', 'mas.to',
    'tech.lgbt', 'fosstodon.org', 'hackers.town', 'social.coop',
    'kolektiva.social', 'solarpunk.moe', 'wandering.shop',
    'elekk.xyz', 'cybre.space', 'octodon.social', 'chaos.social',
    'infosec.exchange', 'ruby.social', 'phpc.social', 'toot.cafe',
    'mstdn.io', 'pixelfed.social', 'lemmy.ml', 'lemmy.world',
    'kbin.social', 'pleroma.site', 'akkoma.dev',
]

# platforms whose leading pattern is a host-wildcard ("any instance"); these
# must be matched LAST, otherwise they swallow URLs that belong to specific
# platforms (e.g. medium.com/@writer or youtube.com/@handle would be reported
# as mastodon accounts).
_GENERIC_PLATFORMS = ('pixelfed', 'lemmy', 'mastodon')

# platforms whose URL space is a sub-path of another platform; these must be
# matched FIRST (github.com/sponsors/<user> would otherwise be reported as a
# github profile literally named "sponsors").
_PRIORITY_PLATFORMS = ('github_sponsors',)


def extract_handle_from_url(url):
    """extract (platform, handle) from a URL, or (None, None).

    patterns are tried most-specific first: sub-path platforms, then
    fixed-host platforms, then the generic any-instance fediverse patterns.
    """
    ordered = list(_PRIORITY_PLATFORMS)
    ordered += [p for p in PLATFORM_PATTERNS
                if p not in _GENERIC_PLATFORMS and p not in _PRIORITY_PLATFORMS]
    ordered += list(_GENERIC_PLATFORMS)

    for platform in ordered:
        for pattern, extractor in PLATFORM_PATTERNS[platform]:
            match = re.match(pattern, url, re.I)
            if match:
                return platform, extractor(match)
    return None, None


def extract_fediverse_handles(text):
    """find @user@instance.tld patterns in text; returns list of '@user@instance'."""
    handles = []
    for match in FEDIVERSE_HANDLE_PATTERN.finditer(text):
        user, instance = match.groups()
        handles.append(f"@{user}@{instance}")
    return handles


def extract_emails(text):
    """find email addresses in text, skipping obvious non-personal ones."""
    emails = []
    for match in EMAIL_PATTERN.finditer(text):
        email = match.group(1)
        # filter out common non-personal emails
        if not any(x in email.lower() for x in ['noreply', 'no-reply', 'donotreply', 'example.com']):
            emails.append(email)
    return emails


def scrape_page(url, timeout=15):
    """fetch and parse a web page; returns (soup, raw_html) or (None, None)."""
    import requests                      # deferred: only needed for network I/O
    from bs4 import BeautifulSoup
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, 'html.parser'), resp.text
    except Exception:
        # any fetch/parse failure: caller treats the page as unavailable
        return None, None


def extract_rel_me_links(soup):
    """extract rel="me" links (identity-verification links; most authoritative)."""
    links = []
    if not soup:
        return links

    for a in soup.find_all('a', rel=lambda x: x and 'me' in x):
        href = a.get('href')
        if href:
            links.append(href)

    return links


def extract_social_links_from_page(soup, base_url=None):
    """extract all recognized social-platform links from a parsed page.

    returns list of {'platform', 'handle', 'url'} dicts; relative hrefs are
    resolved against base_url when given.
    """
    links = []
    if not soup:
        return links

    for a in soup.find_all('a', href=True):
        href = a['href']
        if base_url and not href.startswith('http'):
            href = urljoin(base_url, href)

        platform, handle = extract_handle_from_url(href)
        if platform:
            links.append({'platform': platform, 'handle': handle, 'url': href})

    return links


def extract_json_ld(soup):
    """extract social handles from JSON-LD structured data (the sameAs list)."""
    data = {}
    if not soup:
        return data

    for script in soup.find_all('script', type='application/ld+json'):
        try:
            ld = json.loads(script.string)
            if isinstance(ld, dict):
                same_as = ld.get('sameAs', [])
                if isinstance(same_as, str):
                    same_as = [same_as]
                for url in same_as:
                    platform, handle = extract_handle_from_url(url)
                    if platform:
                        data[platform] = handle
        except (ValueError, TypeError):
            # malformed JSON-LD or empty <script> (script.string is None)
            pass

    return data


def scrape_linktree(url):
    """scrape a linktree/bio.link/carrd page for all outbound social links."""
    handles = {}
    soup, raw = scrape_page(url)
    if not soup:
        return handles

    # linktree uses data attributes and JS, but links are often in the HTML
    links = extract_social_links_from_page(soup, url)
    for link in links:
        if link['platform'] not in ['linktree', 'biolink', 'carrd']:
            handles[link['platform']] = link['handle']

    # also check for fediverse handles in the raw markup
    if raw:
        fedi_handles = extract_fediverse_handles(raw)
        if fedi_handles:
            handles['mastodon'] = fedi_handles[0]

    return handles


def scrape_website_for_handles(url, follow_links=True):
    """
    comprehensive website scrape for social handles.

    checks, in order of authority:
    - rel="me" links
    - social links in page
    - json-ld structured data
    - fediverse handles in raw text
    - emails
    - /about, /contact, /links, /social subpages (when follow_links)
    - link-aggregator pages (linktree etc.), which are expanded and removed

    returns (handles_dict, deduped_email_list).
    """
    handles = {}
    emails = []

    soup, raw = scrape_page(url)
    if not soup:
        return handles, emails

    # 1. rel="me" links (most authoritative)
    for link in extract_rel_me_links(soup):
        platform, handle = extract_handle_from_url(link)
        if platform and platform not in handles:
            handles[platform] = handle

    # 2. all social links on page
    social_links = extract_social_links_from_page(soup, url)
    for link in social_links:
        if link['platform'] not in handles:
            handles[link['platform']] = link['handle']

    # 3. json-ld structured data
    for platform, handle in extract_json_ld(soup).items():
        if platform not in handles:
            handles[platform] = handle

    # 4. fediverse handles + emails in raw text
    if raw:
        fedi = extract_fediverse_handles(raw)
        if fedi and 'mastodon' not in handles:
            handles['mastodon'] = fedi[0]

    emails = extract_emails(raw)

    # 5. follow links to common contact subpages
    if follow_links:
        parsed = urlparse(url)
        base = f"{parsed.scheme}://{parsed.netloc}"

        for path in ['/about', '/contact', '/links', '/social']:
            try:
                sub_soup, sub_raw = scrape_page(base + path)
                if sub_soup:
                    for link in extract_social_links_from_page(sub_soup, base):
                        if link['platform'] not in handles:
                            handles[link['platform']] = link['handle']

                if sub_raw:
                    fedi = extract_fediverse_handles(sub_raw)
                    if fedi and 'mastodon' not in handles:
                        handles['mastodon'] = fedi[0]

                    emails.extend(extract_emails(sub_raw))
            except Exception:
                # subpage is best-effort; missing/broken pages are expected
                pass

    # 6. expand link aggregators (linktree etc.) into their real targets
    for platform in ['linktree', 'biolink', 'carrd']:
        if platform in handles:
            link_url = None
            for link in social_links:
                if link['platform'] == platform:
                    link_url = link['url']
                    break

            if link_url:
                aggregator_handles = scrape_linktree(link_url)
                for p, h in aggregator_handles.items():
                    if p not in handles:
                        handles[p] = h

            # remove the aggregator itself; its real links replace it.
            # NOTE(review): if the aggregator arrived via rel="me" only,
            # link_url is None and the entry is dropped without expansion —
            # confirm that is acceptable.
            del handles[platform]

    # sorted for deterministic output (set() order varies per process, and
    # callers use the first email as the primary contact)
    return handles, sorted(set(emails))


def extract_handles_from_text(text):
    """extract handles from plain text (bio, README, etc)."""
    handles = {}

    if not text:
        return handles

    # fediverse handles
    fedi = extract_fediverse_handles(text)
    if fedi:
        handles['mastodon'] = fedi[0]

    # URL patterns in text
    url_pattern = re.compile(r'https?://[^\s<>"\']+')
    for match in url_pattern.finditer(text):
        url = match.group(0).rstrip('.,;:!?)')
        platform, handle = extract_handle_from_url(url)
        if platform and platform not in handles:
            handles[platform] = handle

    # twitter-style @mentions (only if the text looks twitter-related)
    if 'twitter' in text.lower() or 'x.com' in text.lower():
        twitter_pattern = re.compile(r'(?:^|[^\w])@(\w{1,15})(?:[^\w]|$)')
        for match in twitter_pattern.finditer(text):
            if 'twitter' not in handles:
                handles['twitter'] = f"@{match.group(1)}"

    # matrix handles (@user:server)
    matrix_pattern = re.compile(r'@([\w.-]+):([\w.-]+)')
    for match in matrix_pattern.finditer(text):
        if 'matrix' not in handles:
            handles['matrix'] = f"@{match.group(1)}:{match.group(2)}"

    return handles


def scrape_github_readme(username):
    """scrape a user's profile README (the username/username repo).

    tries the main branch first, then master. returns (handles, emails).
    """
    import requests                      # deferred: network-only dependency

    handles = {}
    emails = []

    url = f"https://raw.githubusercontent.com/{username}/{username}/main/README.md"
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            text = resp.text
            handles = extract_handles_from_text(text)
            emails = extract_emails(text)
            return handles, emails
    except Exception:
        pass  # fall through to the master branch

    url = f"https://raw.githubusercontent.com/{username}/{username}/master/README.md"
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            text = resp.text
            handles = extract_handles_from_text(text)
            emails = extract_emails(text)
    except Exception:
        pass  # no profile README: return empty results

    return handles, emails


def discover_all_handles(github_profile):
    """
    comprehensive handle discovery from a github profile dict.

    github_profile should contain: username/login, bio, blog (website URL),
    twitter_username, email. returns (handles_dict, deduped_email_list).
    """
    handles = {}
    emails = []

    username = github_profile.get('login') or github_profile.get('username')

    print(f"  discovering handles for {username}...")

    # 1. github bio
    bio = github_profile.get('bio', '')
    if bio:
        handles.update(extract_handles_from_text(bio))
        emails.extend(extract_emails(bio))

    # 2. twitter from github profile
    twitter = github_profile.get('twitter_username')
    if twitter and 'twitter' not in handles:
        handles['twitter'] = f"@{twitter}"

    # 3. website from github profile
    website = github_profile.get('blog')
    if website:
        if not website.startswith('http'):
            website = f"https://{website}"

        print(f"  scraping website: {website}")
        site_handles, site_emails = scrape_website_for_handles(website)
        for p, h in site_handles.items():
            if p not in handles:
                handles[p] = h
        emails.extend(site_emails)

    # 4. profile README
    if username:
        print(f"  checking profile README...")
        readme_handles, readme_emails = scrape_github_readme(username)
        for p, h in readme_handles.items():
            if p not in handles:
                handles[p] = h
        emails.extend(readme_emails)

    # 5. email from github profile
    github_email = github_profile.get('email')
    if github_email:
        emails.append(github_email)

    # dedupe emails; sorted so the "primary" email (index 0 downstream) is
    # deterministic across runs
    emails = sorted(set(e for e in emails if e and '@' in e and 'noreply' not in e.lower()))

    print(f"  found {len(handles)} handles, {len(emails)} emails")

    return handles, emails


def merge_handles(existing, new):
    """merge new handles into existing in place, preferring longer (more
    specific) handles; returns existing."""
    for platform, handle in new.items():
        if platform not in existing:
            existing[platform] = handle
        elif len(handle) > len(existing[platform]):
            existing[platform] = handle

    return existing
"""
scoutd/lemmy.py - lemmy (fediverse reddit) discovery

lemmy is federated so we hit multiple instances.

great for finding lost builders in communities like:
- /c/programming, /c/technology, /c/linux
- /c/antiwork, /c/workreform (lost builders!)
- /c/selfhosted, /c/privacy, /c/opensource

supports authenticated access for private instances and DM delivery.
"""

import requests
import json
import time
import os
from datetime import datetime
from pathlib import Path

from .signals import analyze_text
from .lost import (
    analyze_social_for_lost_signals,
    analyze_text_for_lost_signals,  # kept for parity with other scouts
    classify_user,
)

# auth config from environment
LEMMY_INSTANCE = os.environ.get('LEMMY_INSTANCE', '')
LEMMY_USERNAME = os.environ.get('LEMMY_USERNAME', '')
LEMMY_PASSWORD = os.environ.get('LEMMY_PASSWORD', '')

# jwt cache keyed by instance host.
# (was a single module-global token: a token fetched for one instance would be
# returned for any later `instance` argument)
_auth_tokens = {}

# popular lemmy instances
LEMMY_INSTANCES = [
    'lemmy.ml',
    'lemmy.world',
    'programming.dev',
    'lemm.ee',
    'sh.itjust.works',
]

# communities to scout (format: community@instance or just community for local)
TARGET_COMMUNITIES = [
    # builder communities
    'programming',
    'selfhosted',
    'linux',
    'opensource',
    'privacy',
    'technology',
    'webdev',
    'rust',
    'python',
    'golang',

    # lost builder communities (people struggling, stuck, seeking)
    'antiwork',
    'workreform',
    'careerguidance',
    'cscareerquestions',
    'learnprogramming',
    'findapath',
]

CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'lemmy'
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def get_auth_token(instance=None):
    """login and return a jwt for `instance` (default LEMMY_INSTANCE).

    tokens are cached per instance; returns None when credentials are not
    configured or login fails.
    """
    instance = instance or LEMMY_INSTANCE
    if not all([instance, LEMMY_USERNAME, LEMMY_PASSWORD]):
        return None

    cached = _auth_tokens.get(instance)
    if cached:
        return cached

    try:
        url = f"https://{instance}/api/v3/user/login"
        resp = requests.post(url, json={
            'username_or_email': LEMMY_USERNAME,
            'password': LEMMY_PASSWORD,
        }, timeout=30)

        if resp.status_code == 200:
            jwt = resp.json().get('jwt')
            if jwt:
                _auth_tokens[instance] = jwt
            return jwt
        return None
    except Exception as e:
        print(f"lemmy auth error: {e}")
        return None


def send_lemmy_dm(recipient_username, message, dry_run=False):
    """send a private message via lemmy.

    recipient_username: 'user' (local to LEMMY_INSTANCE) or 'user@instance'
    returns (ok: bool, error: str | None).
    """
    if not LEMMY_INSTANCE:
        return False, "LEMMY_INSTANCE not configured"

    if dry_run:
        print(f"[dry run] would send lemmy DM to {recipient_username}")
        return True, None

    token = get_auth_token()
    if not token:
        return False, "failed to authenticate with lemmy"

    try:
        # parse recipient - could be username@instance or just username
        if '@' in recipient_username:
            username, instance = recipient_username.split('@', 1)
        else:
            username = recipient_username
            instance = LEMMY_INSTANCE

        # resolve the recipient's numeric id
        user_url = f"https://{LEMMY_INSTANCE}/api/v3/user"
        resp = requests.get(user_url, params={'username': f"{username}@{instance}"}, timeout=30)

        if resp.status_code != 200:
            # try without instance suffix for local users
            resp = requests.get(user_url, params={'username': username}, timeout=30)

        if resp.status_code != 200:
            return False, f"could not find user {recipient_username}"

        recipient_id = resp.json().get('person_view', {}).get('person', {}).get('id')
        if not recipient_id:
            return False, "could not get recipient id"

        # send DM
        # NOTE(review): older lemmy (<0.19) expects the jwt as an 'auth' field
        # in the JSON body rather than a Bearer header — confirm against the
        # target instance version.
        dm_url = f"https://{LEMMY_INSTANCE}/api/v3/private_message"
        resp = requests.post(dm_url,
            headers={'Authorization': f'Bearer {token}'},
            json={
                'content': message,
                'recipient_id': recipient_id,
            },
            timeout=30
        )

        if resp.status_code == 200:
            return True, None
        return False, f"lemmy DM error: {resp.status_code} - {resp.text}"

    except Exception as e:
        return False, f"lemmy DM error: {str(e)}"


def get_community_posts(instance, community, limit=50, sort='New'):
    """get posts from a lemmy community; empty list on any failure."""
    try:
        url = f"https://{instance}/api/v3/post/list"
        params = {
            'community_name': community,
            'sort': sort,
            'limit': limit,
        }

        resp = requests.get(url, params=params, timeout=30)
        if resp.status_code == 200:
            return resp.json().get('posts', [])
        return []
    except Exception:
        return []


def get_user_profile(instance, username):
    """get a lemmy user profile dict, or None on failure."""
    try:
        url = f"https://{instance}/api/v3/user"
        params = {'username': username}

        resp = requests.get(url, params=params, timeout=30)
        if resp.status_code == 200:
            return resp.json()
        return None
    except Exception:
        return None


def analyze_lemmy_user(instance, username, posts=None):
    """analyze a lemmy user for values alignment and lost signals.

    returns a human dict (same schema as the other scouts) or None when the
    profile cannot be fetched.
    """
    profile = get_user_profile(instance, username)
    if not profile:
        return None

    person = profile.get('person_view', {}).get('person', {})
    counts = profile.get('person_view', {}).get('counts', {})

    bio = person.get('bio', '') or ''
    display_name = person.get('display_name') or person.get('name', username)

    # analyze_text returns (score, positive_signals, negative_signals) — the
    # previous code bound the third element as "reasons", stored the negative
    # signals under 'reasons', and always reported 'negative_signals': [],
    # inconsistent with the github/lobsters scouts.
    bio_score, bio_signals, bio_negative = analyze_text(bio)

    # analyze posts if provided
    post_signals = []
    post_text = []
    if posts:
        for post in posts[:10]:
            post_data = post.get('post', {})
            title = post_data.get('name', '')
            body = post_data.get('body', '')
            post_text.append(f"{title} {body}")

            _, signals, _ = analyze_text(f"{title} {body}")
            post_signals.extend(signals)

    all_signals = list(set(bio_signals + post_signals))
    total_score = bio_score + len(post_signals) * 5

    # lost builder detection
    profile_for_lost = {
        'bio': bio,
        'post_count': counts.get('post_count', 0),
        'comment_count': counts.get('comment_count', 0),
    }
    posts_for_lost = [{'text': t} for t in post_text]

    lost_signals, lost_weight = analyze_social_for_lost_signals(profile_for_lost, posts_for_lost)
    lost_potential_score = lost_weight
    # NOTE(review): 50 is a placeholder builder_score — lemmy gives no repo
    # signal; confirm classify_user's expectations.
    user_type = classify_user(lost_potential_score, 50, total_score)

    reasons = []
    if all_signals:
        reasons.append(f"signals: {', '.join(all_signals[:5])}")

    return {
        'platform': 'lemmy',
        'username': f"{username}@{instance}",
        'url': f"https://{instance}/u/{username}",
        'name': display_name,
        'bio': bio,
        'location': None,
        'score': total_score,
        'confidence': min(0.9, 0.3 + len(all_signals) * 0.1),
        'signals': all_signals,
        'negative_signals': bio_negative,
        'reasons': reasons,
        'contact': {},
        'extra': {
            'instance': instance,
            'post_count': counts.get('post_count', 0),
            'comment_count': counts.get('comment_count', 0),
        },
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }


def scrape_lemmy(db, limit_per_community=30):
    """scrape lemmy instances for aligned builders; returns count saved."""
    print("scouting lemmy...")

    found = 0
    lost_found = 0
    seen_users = set()

    # build instance list - user's instance first if configured
    instances = list(LEMMY_INSTANCES)
    if LEMMY_INSTANCE and LEMMY_INSTANCE not in instances:
        instances.insert(0, LEMMY_INSTANCE)

    for instance in instances:
        print(f"  instance: {instance}")

        for community in TARGET_COMMUNITIES:
            posts = get_community_posts(instance, community, limit=limit_per_community)

            if not posts:
                continue

            print(f"  /c/{community}: {len(posts)} posts")

            # group posts by author, skipping users already analyzed
            user_posts = {}
            for post in posts:
                creator = post.get('creator', {})
                username = creator.get('name')
                if not username:
                    continue

                user_key = f"{username}@{instance}"
                if user_key in seen_users:
                    continue

                user_posts.setdefault(user_key, []).append(post)

            # analyze each user
            for user_key, posts in user_posts.items():
                username = user_key.split('@')[0]

                if user_key in seen_users:
                    continue
                seen_users.add(user_key)

                result = analyze_lemmy_user(instance, username, posts)
                if not result:
                    continue

                if result['score'] >= 20 or result.get('lost_potential_score', 0) >= 30:
                    db.save_human(result)
                    found += 1

                    if result.get('user_type') in ['lost', 'both']:
                        lost_found += 1
                        print(f"  {result['username']}: {result['score']:.0f} (lost: {result['lost_potential_score']:.0f})")
                    elif result['score'] >= 40:
                        print(f"  {result['username']}: {result['score']:.0f}")

                time.sleep(0.5)  # rate limit

            time.sleep(1)  # between communities

        time.sleep(2)  # between instances

    print(f"lemmy: found {found} humans ({lost_found} lost builders)")
    return found


# === scoutd/lobsters.py ====================================================

"""
scoutd/lobsters.py - lobste.rs discovery
high-signal invite-only tech community
"""

import hashlib

HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'}
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'lobsters'

ALIGNED_TAGS = ['privacy', 'security', 'distributed', 'rust', 'linux', 'culture', 'practices']


def _api_get(url, params=None):
    """rate-limited, disk-cached GET against the lobste.rs JSON API.

    cache filenames use a stable sha1 digest: the previous `hash(cache_key)`
    is salted per process (PYTHONHASHSEED), so entries written by one run
    could never be found by the next.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    digest = hashlib.sha1(cache_key.encode('utf-8')).hexdigest()[:12]
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:  # 1h TTL
                return data.get('_data')
        except Exception:
            pass  # corrupt cache entry: fall through and refetch

    time.sleep(2)  # be polite to lobste.rs

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f"  lobsters api error: {e}")
        return None


def get_stories_by_tag(tag):
    """get recent stories by tag."""
    url = f'https://lobste.rs/t/{tag}.json'
    return _api_get(url) or []


def get_newest_stories():
    """get newest stories."""
    return _api_get('https://lobste.rs/newest.json') or []


def get_user(username):
    """get a user profile dict, or None on failure."""
    return _api_get(f'https://lobste.rs/u/{username}.json')
def analyze_lobsters_user(username):
    """score a lobste.rs account for values alignment.

    returns a human dict for db.save_human, or None when the profile
    cannot be fetched.
    """
    user = get_user(username)
    if not user:
        return None

    # bio text (lobsters calls it "about"); empty string when absent
    about = user.get('about') or ''
    text_score, positive_signals, negative_signals = analyze_text(about)

    karma = user.get('karma', 0)

    # component scores: invite-only base, activity, linked identities
    base_score = 15  # lobsters base bonus (invite-only, high signal)
    karma_score = 10 if karma > 100 else 5 if karma > 50 else 0
    github_score = 5 if user.get('github_username') else 0
    homepage_score = 5 if user.get('homepage') else 0
    total_score = text_score + base_score + karma_score + github_score + homepage_score

    # confidence: higher floor than other scouts because membership is vetted
    confidence = 0.4
    if about:
        confidence += 0.2
    if karma > 50:
        confidence += 0.2
    confidence = min(confidence, 0.9)

    reasons = ['on lobste.rs (invite-only)']
    if karma > 50:
        reasons.append(f"active ({karma} karma)")
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    return {
        'platform': 'lobsters',
        'username': username,
        'url': f"https://lobste.rs/u/{username}",
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'karma': karma,
        'reasons': reasons,
        'contact': {
            'github': user.get('github_username'),
            'twitter': user.get('twitter_username'),
            'homepage': user.get('homepage'),
        },
        'scraped_at': datetime.now().isoformat(),
    }


def scrape_lobsters(db):
    """full lobste.rs scrape: collect submitters from aligned tags plus the
    newest feed, analyze each, persist aligned humans.

    returns the list of analyzed user dicts (score > 0 only).
    """
    print("scoutd/lobsters: starting scrape...")

    submitters = set()

    def _collect(stories):
        # pull the submitting username out of each story payload
        for story in stories:
            name = story.get('submitter_user', {}).get('username')
            if name:
                submitters.add(name)

    # stories under aligned tags
    for tag in ALIGNED_TAGS:
        print(f"  tag: {tag}...")
        _collect(get_stories_by_tag(tag))

    # plus the newest feed
    print("  newest stories...")
    _collect(get_newest_stories())

    print(f"  {len(submitters)} unique users to analyze")

    aligned = []
    for name in submitters:
        try:
            profile = analyze_lobsters_user(name)
            if profile and profile['score'] > 0:
                aligned.append(profile)
                db.save_human(profile)

                if profile['score'] >= 30:
                    print(f"  ā˜… {name}: {profile['score']} pts")
        except Exception as e:
            # one bad profile must not abort the scrape
            print(f"  error on {name}: {e}")

    print(f"scoutd/lobsters: found {len(aligned)} aligned humans")
    return aligned
+""" + +import re +from datetime import datetime, timedelta +from collections import defaultdict + + +# signal definitions with weights +LOST_SIGNALS = { + # github signals + 'forked_never_modified': { + 'weight': 15, + 'category': 'github', + 'description': 'forked repos but never pushed changes', + }, + 'starred_many_built_nothing': { + 'weight': 20, + 'category': 'github', + 'description': 'starred 50+ repos but has 0-2 own repos', + }, + 'account_no_repos': { + 'weight': 10, + 'category': 'github', + 'description': 'account exists but no public repos', + }, + 'inactivity_bursts': { + 'weight': 15, + 'category': 'github', + 'description': 'long gaps then brief activity bursts', + }, + 'only_issues_comments': { + 'weight': 12, + 'category': 'github', + 'description': 'only activity is issues/comments on others work', + }, + 'abandoned_learning_repos': { + 'weight': 18, + 'category': 'github', + 'description': 'learning/tutorial repos that were never finished', + }, + 'readme_only_repos': { + 'weight': 10, + 'category': 'github', + 'description': 'repos with just README, no actual code', + }, + + # language signals (from posts/comments/bio) + 'wish_i_could': { + 'weight': 12, + 'category': 'language', + 'description': '"i wish i could..." language', + 'patterns': [ + r'i wish i could', + r'i wish i knew how', + r'wish i had the (time|energy|motivation|skills?)', + ], + }, + 'someday_want': { + 'weight': 10, + 'category': 'language', + 'description': '"someday i want to..." 
language', + 'patterns': [ + r'someday i (want|hope|plan) to', + r'one day i\'ll', + r'eventually i\'ll', + r'when i have time i\'ll', + ], + }, + 'stuck_beginner': { + 'weight': 20, + 'category': 'language', + 'description': 'asking beginner questions for years', + 'patterns': [ + r'still (trying|learning|struggling) (to|with)', + r'can\'t seem to (get|understand|figure)', + r'been trying for (months|years)', + ], + }, + 'self_deprecating': { + 'weight': 15, + 'category': 'language', + 'description': 'self-deprecating about abilities', + 'patterns': [ + r'i\'m (not smart|too dumb|not good) enough', + r'i (suck|am terrible) at', + r'i\'ll never be able to', + r'people like me (can\'t|don\'t)', + r'i\'m just not (a|the) (type|kind)', + ], + }, + 'no_energy': { + 'weight': 18, + 'category': 'language', + 'description': '"how do people have energy" posts', + 'patterns': [ + r'how do (people|you|they) have (the )?(energy|time|motivation)', + r'where do (people|you|they) find (the )?(energy|motivation)', + r'i\'m (always|constantly) (tired|exhausted|drained)', + r'no (energy|motivation) (left|anymore)', + ], + }, + 'imposter_syndrome': { + 'weight': 15, + 'category': 'language', + 'description': 'imposter syndrome language', + 'patterns': [ + r'imposter syndrome', + r'feel like (a |an )?(fraud|fake|imposter)', + r'don\'t (belong|deserve)', + r'everyone else (seems|is) (so much )?(better|smarter)', + r'they\'ll (find out|realize) i\'m', + ], + }, + 'should_really': { + 'weight': 8, + 'category': 'language', + 'description': '"i should really..." 
posts', + 'patterns': [ + r'i (should|need to) really', + r'i keep (meaning|wanting) to', + r'i\'ve been (meaning|wanting) to', + ], + }, + 'isolation_signals': { + 'weight': 20, + 'category': 'language', + 'description': 'isolation/loneliness language', + 'patterns': [ + r'no one (understands|gets it|to talk to)', + r'(feel|feeling) (so )?(alone|isolated|lonely)', + r'don\'t have anyone (to|who)', + r'wish i (had|knew) (someone|people)', + ], + }, + 'enthusiasm_for_others': { + 'weight': 10, + 'category': 'behavior', + 'description': 'celebrates others but dismissive of self', + }, + + # subreddit/community signals + 'stuck_communities': { + 'weight': 15, + 'category': 'community', + 'description': 'active in stuck/struggling communities', + 'subreddits': [ + 'learnprogramming', + 'findapath', + 'getdisciplined', + 'getmotivated', + 'decidingtobebetter', + 'selfimprovement', + 'adhd', + 'depression', + 'anxiety', + ], + }, + + # profile signals + 'aspirational_bio': { + 'weight': 12, + 'category': 'profile', + 'description': 'bio says what they WANT to be', + 'patterns': [ + r'aspiring', + r'future', + r'want(ing)? 
to (be|become)', + r'learning to', + r'trying to (become|be|learn)', + r'hoping to', + ], + }, + 'empty_portfolio': { + 'weight': 15, + 'category': 'profile', + 'description': 'links to empty portfolio sites', + }, + 'long_aspiring': { + 'weight': 20, + 'category': 'profile', + 'description': '"aspiring" in bio for 2+ years', + }, +} + +# subreddits that indicate someone might be stuck +STUCK_SUBREDDITS = { + 'learnprogramming': 8, + 'findapath': 15, + 'getdisciplined': 12, + 'getmotivated': 10, + 'decidingtobebetter': 12, + 'selfimprovement': 8, + 'adhd': 10, + 'depression': 15, + 'anxiety': 12, + 'socialanxiety': 12, + 'neet': 20, + 'lostgeneration': 15, + 'antiwork': 5, # could be aligned OR stuck + 'careerguidance': 8, + 'cscareerquestions': 5, +} + + +def analyze_text_for_lost_signals(text): + """analyze text for lost builder language patterns""" + if not text: + return [], 0 + + text_lower = text.lower() + signals_found = [] + total_weight = 0 + + for signal_name, signal_data in LOST_SIGNALS.items(): + if 'patterns' not in signal_data: + continue + + for pattern in signal_data['patterns']: + if re.search(pattern, text_lower): + signals_found.append(signal_name) + total_weight += signal_data['weight'] + break # only count each signal once + + return signals_found, total_weight + + +def analyze_github_for_lost_signals(profile): + """analyze github profile for lost builder signals""" + signals_found = [] + total_weight = 0 + + if not profile: + return signals_found, total_weight + + repos = profile.get('repos', []) or profile.get('top_repos', []) + extra = profile.get('extra', {}) + + public_repos = profile.get('public_repos', len(repos)) + followers = profile.get('followers', 0) + following = profile.get('following', 0) + + # starred many but built nothing + # (we'd need to fetch starred count separately, approximate with following ratio) + if public_repos <= 2 and following > 50: + signals_found.append('starred_many_built_nothing') + total_weight += 
def analyze_github_for_lost_signals(profile):
    """analyze a github profile dict for lost-builder signals.

    profile: dict with optional keys 'repos'/'top_repos', 'public_repos',
             'following', 'bio'; each repo dict may carry 'name',
             'description', 'language', 'fork'.
    returns (signal_names, combined_weight).
    """
    signals_found = []
    total_weight = 0

    if not profile:
        return signals_found, total_weight

    repos = profile.get('repos', []) or profile.get('top_repos', [])
    public_repos = profile.get('public_repos', len(repos))
    following = profile.get('following', 0)

    # starred many but built nothing
    # (starred count would need a separate fetch; approximated with the
    # following count as a proxy -- TODO confirm this heuristic holds)
    if public_repos <= 2 and following > 50:
        signals_found.append('starred_many_built_nothing')
        total_weight += LOST_SIGNALS['starred_many_built_nothing']['weight']

    # account exists but nothing published
    if public_repos == 0:
        signals_found.append('account_no_repos')
        total_weight += LOST_SIGNALS['account_no_repos']['weight']

    # per-repo signals
    forked_count = 0
    learning_repos = 0
    readme_only = 0

    learning_keywords = ['learning', 'tutorial', 'course', 'practice', 'exercise',
                         'bootcamp', 'udemy', 'freecodecamp', 'odin', 'codecademy']

    for repo in repos:
        name = (repo.get('name') or '').lower()
        description = (repo.get('description') or '').lower()
        language = repo.get('language')
        is_fork = repo.get('fork', False)

        # forked but never modified
        # (a pushed_at vs created_at comparison would be more precise;
        # simplified to just counting forks for now)
        if is_fork:
            forked_count += 1

        # learning/tutorial repos
        if any(kw in name or kw in description for kw in learning_keywords):
            learning_repos += 1

        # no detected language on a non-fork usually means README-only
        if not language and not is_fork:
            readme_only += 1

    if forked_count >= 5 and public_repos - forked_count <= 2:
        signals_found.append('forked_never_modified')
        total_weight += LOST_SIGNALS['forked_never_modified']['weight']

    if learning_repos >= 3:
        signals_found.append('abandoned_learning_repos')
        total_weight += LOST_SIGNALS['abandoned_learning_repos']['weight']

    if readme_only >= 2:
        signals_found.append('readme_only_repos')
        total_weight += LOST_SIGNALS['readme_only_repos']['weight']

    # language signals in the bio
    bio = profile.get('bio') or ''
    bio_signals, bio_weight = analyze_text_for_lost_signals(bio)
    signals_found.extend(bio_signals)
    total_weight += bio_weight

    # aspirational bio ("aspiring ...", "learning to ..."); guarded so the
    # signal is not double-counted when the text scan already flagged it
    bio_lower = bio.lower()
    if any(re.search(p, bio_lower) for p in LOST_SIGNALS['aspirational_bio']['patterns']):
        if 'aspirational_bio' not in signals_found:
            signals_found.append('aspirational_bio')
            total_weight += LOST_SIGNALS['aspirational_bio']['weight']

    return signals_found, total_weight
def analyze_reddit_for_lost_signals(activity, subreddits):
    """analyze reddit activity for lost-builder signals.

    activity: list of post/comment dicts with optional 'title'/'body'
    subreddits: subreddit names the user is active in
    returns (signal_names, combined_weight).
    """
    signals_found = []
    total_weight = 0

    # check subreddit activity against the stuck-community weights
    stuck_sub_activity = 0
    for sub in subreddits:
        weight = STUCK_SUBREDDITS.get(sub.lower())
        if weight:
            stuck_sub_activity += weight

    if stuck_sub_activity >= 20:
        signals_found.append('stuck_communities')
        total_weight += min(stuck_sub_activity, 30)  # cap at 30

    # analyze post/comment text as one corpus
    all_text = []
    for item in activity:
        if item.get('title'):
            all_text.append(item['title'])
        if item.get('body'):
            all_text.append(item['body'])

    combined_text = ' '.join(all_text)
    text_signals, text_weight = analyze_text_for_lost_signals(combined_text)
    signals_found.extend(text_signals)
    total_weight += text_weight

    # helping others but never sharing own work
    help_count = 0
    share_count = 0
    for item in activity:
        body = (item.get('body') or '').lower()
        title = (item.get('title') or '').lower()

        # helping patterns
        if any(p in body for p in ['try this', 'you could', 'have you tried', 'i recommend']):
            help_count += 1

        # sharing patterns -- check the fields separately; the old
        # `body + title` concatenation could fabricate a match across
        # the seam between the two strings
        if any(p in body or p in title for p in ['i built', 'i made', 'my project', 'check out my', 'i created']):
            share_count += 1

    if help_count >= 5 and share_count == 0:
        signals_found.append('enthusiasm_for_others')
        total_weight += LOST_SIGNALS['enthusiasm_for_others']['weight']

    return signals_found, total_weight
def analyze_social_for_lost_signals(profile, posts):
    """analyze a mastodon/social profile and post list for lost-builder signals.

    returns (signal_names, combined_weight); each signal's weight is added
    at most once across bio and posts.
    """
    signals_found = []
    total_weight = 0

    # bio first
    bio = profile.get('bio') or profile.get('note') or ''
    bio_signals, bio_weight = analyze_text_for_lost_signals(bio)
    signals_found.extend(bio_signals)
    total_weight += bio_weight

    # then posts
    boost_count = 0
    original_count = 0
    own_work_count = 0

    for post in posts:
        content = (post.get('content') or '').lower()
        is_boost = post.get('reblog') is not None or post.get('repost')

        if is_boost:
            boost_count += 1
        else:
            original_count += 1

        # sharing own work?
        if any(p in content for p in ['i built', 'i made', 'my project', 'working on', 'just shipped']):
            own_work_count += 1

        # lost-language scan; only newly-seen signals add weight
        text_signals, _ = analyze_text_for_lost_signals(content)
        for sig in text_signals:
            if sig not in signals_found:
                signals_found.append(sig)
                total_weight += LOST_SIGNALS[sig]['weight']

    # boosts builders but never posts own work
    if boost_count >= 10 and own_work_count == 0:
        signals_found.append('enthusiasm_for_others')
        total_weight += LOST_SIGNALS['enthusiasm_for_others']['weight']

    return signals_found, total_weight


def calculate_lost_potential_score(signals_found):
    """sum the weights of all recognized signals; unknown names are ignored."""
    return sum(
        LOST_SIGNALS[sig]['weight'] for sig in signals_found if sig in LOST_SIGNALS
    )


def classify_user(lost_score, builder_score, values_score):
    """
    bucket a user by their score profile

    returns: 'builder' | 'lost' | 'both' | 'none'
    """
    # strong builder activity with little lost signal = active builder
    if builder_score >= 50 and lost_score < 30:
        return 'builder'

    # strong lost signal plus values alignment = lost builder (priority outreach)
    if lost_score >= 40 and values_score >= 20:
        return 'lost'

    # both kinds of signal = complex case, might be recovering
    if lost_score >= 30 and builder_score >= 30:
        return 'both'

    return 'none'


def get_signal_descriptions(signals_found):
    """map recognized signal names to their human-readable descriptions."""
    return [
        LOST_SIGNALS[sig]['description'] for sig in signals_found if sig in LOST_SIGNALS
    ]
def should_outreach_lost(user_data, config=None):
    """
    determine if we should reach out to a lost builder

    considers:
    - lost_potential_score threshold (config 'min_lost_score', default 40)
    - values alignment (config 'min_values_score', default 20)
    - cooldown period (config 'cooldown_days', default 90)
    - manual review requirement (always on for lost outreach)

    returns (ok, reason): ok is True only when all gates pass, and the
    reason is then 'requires_review' -- callers must still queue a human check.
    """
    config = config or {}

    lost_score = user_data.get('lost_potential_score', 0)
    values_score = user_data.get('score', 0)  # regular alignment score

    # minimum thresholds
    min_lost = config.get('min_lost_score', 40)
    min_values = config.get('min_values_score', 20)

    if lost_score < min_lost:
        return False, 'lost_score too low'

    if values_score < min_values:
        return False, 'values_score too low'

    # check cooldown since the last lost-outreach attempt
    last_outreach = user_data.get('last_lost_outreach')
    if last_outreach:
        cooldown_days = config.get('cooldown_days', 90)
        last_dt = datetime.fromisoformat(last_outreach)
        if datetime.now() - last_dt < timedelta(days=cooldown_days):
            # the old message hardcoded "90 days" even when the configured
            # cooldown differed -- report the actual window
            return False, f'cooldown active ({cooldown_days} days)'

    # always require manual review for lost outreach
    return True, 'requires_review'
TARGET_HASHTAGS = [
    'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource',
    'privacy', 'solarpunk', 'cooperative', 'cohousing', 'mutualaid',
    'intentionalcommunity', 'degoogle', 'fediverse', 'indieweb',
]


def _api_get(url, params=None):
    """cached, rate-limited GET returning parsed json, or None on failure.

    responses are cached on disk for one hour. the cache filename is a
    stable digest of url+params: the previous hash()-based key was
    randomized per process (PYTHONHASHSEED), so the cache never hit
    across daemon restarts.
    """
    import hashlib

    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    digest = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            pass  # unreadable/corrupt cache entry: fall through and refetch

    time.sleep(1)  # crude politeness delay for public instances

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException as e:
        print(f" mastodon api error: {e}")
        return None


def strip_html(text):
    """replace html tags with spaces (keeps word boundaries intact)"""
    return re.sub(r'<[^>]+>', ' ', text) if text else ''


def get_instance_directory(instance, limit=40):
    """list local accounts from an instance's public profile directory"""
    url = f'https://{instance}/api/v1/directory'
    return _api_get(url, {'limit': limit, 'local': 'true'}) or []


def get_hashtag_timeline(instance, hashtag, limit=40):
    """fetch recent public posts for a hashtag on one instance"""
    url = f'https://{instance}/api/v1/timelines/tag/{hashtag}'
    return _api_get(url, {'limit': limit}) or []


def get_user_statuses(instance, user_id, limit=30):
    """fetch a user's recent original posts (reblogs excluded)"""
    url = f'https://{instance}/api/v1/accounts/{user_id}/statuses'
    return _api_get(url, {'limit': limit, 'exclude_reblogs': 'true'}) or []
def analyze_mastodon_user(account, instance):
    """analyze a mastodon account for values alignment and lost-builder signals.

    account: account dict as returned by the mastodon api
    instance: instance the account was discovered on (used to build the
              fully-qualified handle and for the aligned-instance bonus)
    returns a human record dict suitable for db.save_human().
    """
    acct = account.get('acct', '')
    if '@' not in acct:
        acct = f"{acct}@{instance}"

    # gather all visible text: bio, display name, profile fields, recent posts
    text_parts = []
    bio = strip_html(account.get('note', ''))
    if bio:
        text_parts.append(bio)

    display_name = account.get('display_name', '')
    if display_name:
        text_parts.append(display_name)

    # profile fields
    for field in account.get('fields', []):
        if field.get('name'):
            text_parts.append(field['name'])
        if field.get('value'):
            text_parts.append(strip_html(field['value']))

    # fetch recent posts ONCE and reuse for both the alignment text and the
    # lost-builder analysis (previously fetched twice per account)
    user_id = account.get('id')
    statuses = get_user_statuses(instance, user_id) if user_id else []
    for status in statuses:
        content = strip_html(status.get('content', ''))
        if content:
            text_parts.append(content)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # instance bonus
    instance_bonus = ALIGNED_INSTANCES.get(instance, 0)
    total_score = text_score + instance_bonus

    # pronouns bonus
    if re.search(r'\b(they/them|she/her|he/him|xe/xem)\b', full_text, re.I):
        total_score += 10
        positive_signals.append('pronouns')

    # activity level
    statuses_count = account.get('statuses_count', 0)
    followers = account.get('followers_count', 0)
    if statuses_count > 100:
        total_score += 5

    # === LOST BUILDER DETECTION ===
    from .lost import LOST_SIGNALS  # per-signal weights for dedup below

    profile_for_lost = {
        'bio': bio,
        'note': account.get('note'),
    }
    posts_for_lost = [
        {'content': strip_html(s.get('content', '')), 'reblog': s.get('reblog')}
        for s in statuses
    ]

    lost_signals, lost_weight = analyze_social_for_lost_signals(profile_for_lost, posts_for_lost)

    # also scan the combined text, but only add weight for signals not
    # already counted -- the old code added the full text weight on top,
    # double-counting every overlapping signal
    text_lost_signals, _ = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
            lost_weight += LOST_SIGNALS[sig]['weight']

    lost_potential_score = lost_weight

    # classify: builder, lost, both, or none
    # statuses_count is a rough proxy for builder activity on mastodon
    builder_activity = 10 if statuses_count > 100 else 5 if statuses_count > 50 else 0
    user_type = classify_user(lost_potential_score, builder_activity, total_score)

    # confidence grows with how much text/activity we had to go on
    confidence = 0.3
    if len(text_parts) > 5:
        confidence += 0.2
    if statuses_count > 50:
        confidence += 0.2
    if len(positive_signals) > 3:
        confidence += 0.2
    confidence = min(confidence, 0.9)

    reasons = []
    if instance in ALIGNED_INSTANCES:
        reasons.append(f"on {instance}")
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    # add lost reasons if applicable
    if user_type in ('lost', 'both'):
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")

    return {
        'platform': 'mastodon',
        'username': acct,
        'url': account.get('url'),
        'name': display_name,
        'bio': bio,
        'instance': instance,
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'statuses_count': statuses_count,
        'followers': followers,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }
def scrape_mastodon(db, limit_per_instance=40):
    """full mastodon scrape: instance directories + hashtag timelines.

    discovers accounts, scores them, saves aligned humans via db.save_human,
    and reports builder / lost-builder counts.
    """
    print("scoutd/mastodon: starting scrape...")

    all_accounts = []

    # 1. instance directories
    print(" scraping instance directories...")
    for instance in ALIGNED_INSTANCES:
        accounts = get_instance_directory(instance, limit=limit_per_instance)
        for acct in accounts:
            acct['_instance'] = instance
            all_accounts.append(acct)
        print(f" {instance}: {len(accounts)} users")

    # 2. hashtag timelines
    print(" scraping hashtags...")
    seen = set()
    for tag in TARGET_HASHTAGS[:8]:
        for instance in ['fosstodon.org', 'tech.lgbt', 'social.coop']:
            posts = get_hashtag_timeline(instance, tag, limit=20)
            for post in posts:
                account = post.get('account', {})
                acct = account.get('acct', '')
                if '@' not in acct:
                    acct = f"{acct}@{instance}"

                if acct not in seen:
                    seen.add(acct)
                    account['_instance'] = instance
                    all_accounts.append(account)

    # dedupe on the fully-qualified handle. the raw 'acct' field is a bare
    # username for instance-local accounts, so keying on it collapsed
    # distinct users from different instances into one record
    unique = {}
    for acct_data in all_accounts:
        handle = acct_data.get('acct') or str(acct_data.get('id', ''))
        if '@' not in handle:
            handle = f"{handle}@{acct_data.get('_instance', '')}"
        if handle not in unique:
            unique[handle] = acct_data

    print(f" {len(unique)} unique accounts to analyze")

    # analyze
    results = []
    builders_found = 0
    lost_found = 0

    for acct_data in unique.values():
        instance = acct_data.get('_instance', 'mastodon.social')
        try:
            result = analyze_mastodon_user(acct_data, instance)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                user_type = result.get('user_type', 'none')

                if user_type == 'builder':
                    builders_found += 1
                    if result['score'] >= 40:
                        print(f" ā˜… @{result['username']}: {result['score']} pts")

                elif user_type == 'lost':
                    lost_found += 1
                    lost_score = result.get('lost_potential_score', 0)
                    if lost_score >= 40:
                        print(f" šŸ’” @{result['username']}: lost_score={lost_score}, values={result['score']} pts")

                elif user_type == 'both':
                    builders_found += 1
                    lost_found += 1
                    print(f" ⚔ @{result['username']}: recovering builder")

        except Exception as e:
            print(f" error: {e}")

    print(f"scoutd/mastodon: found {len(results)} aligned humans")
    print(f" - {builders_found} active builders")
    print(f" - {lost_found} lost builders (need encouragement)")
    return results
HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'}
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'matrix'

# public matrix rooms to check membership
ALIGNED_ROOMS = [
    '#homeassistant:matrix.org',
    '#esphome:matrix.org',
    '#selfhosted:matrix.org',
    '#privacy:matrix.org',
    '#solarpunk:matrix.org',
    '#cooperative:matrix.org',
    '#foss:matrix.org',
    '#linux:matrix.org',
]

# homeservers to query
HOMESERVERS = [
    'matrix.org',
    'matrix.envs.net',
    'tchncs.de',
]


def _api_get(url, params=None):
    """cached, rate-limited GET returning parsed json, or None on failure.

    responses are cached on disk for one hour. the cache filename is a
    stable digest of url+params: the previous hash()-based key was
    randomized per process (PYTHONHASHSEED), so the cache never hit
    across daemon restarts.
    """
    import hashlib

    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    digest = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
    cache_file = CACHE_DIR / f"{digest}.json"
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            pass  # unreadable/corrupt cache entry: fall through and refetch

    time.sleep(1)  # crude politeness delay

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        return result
    except requests.exceptions.RequestException:
        # matrix apis often fail, don't spam errors
        return None
def get_room_members(homeserver, room_alias):
    """
    get members of a public room
    note: most matrix servers don't expose this publicly
    this is a best-effort scrape
    """
    from urllib.parse import quote

    try:
        # the '#' in a room alias starts a url fragment if left unescaped,
        # which silently truncated the request path -- percent-encode it
        alias_url = f'https://{homeserver}/_matrix/client/r0/directory/room/{quote(room_alias, safe="")}'
        alias_data = _api_get(alias_url)
        if not alias_data or 'room_id' not in alias_data:
            return []

        room_id = alias_data['room_id']

        # try to get members (usually requires auth)
        members_url = f'https://{homeserver}/_matrix/client/r0/rooms/{room_id}/members'
        members_data = _api_get(members_url)

        if members_data and 'chunk' in members_data:
            members = []
            for event in members_data['chunk']:
                content = event.get('content', {})
                if event.get('type') == 'm.room.member' and content.get('membership') == 'join':
                    user_id = event.get('state_key')
                    display_name = content.get('displayname')
                    if user_id:
                        members.append({'user_id': user_id, 'display_name': display_name})
            return members
    except Exception:
        # best-effort: narrowed from a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit
        pass

    return []


def get_public_rooms(homeserver, limit=100):
    """get a homeserver's public rooms directory (empty list on failure)"""
    url = f'https://{homeserver}/_matrix/client/r0/publicRooms'
    data = _api_get(url, {'limit': limit})
    return data.get('chunk', []) if data else []


def analyze_matrix_user(user_id, rooms_joined, display_name=None):
    """score a matrix user purely from aligned-room membership overlap.

    user_id: full matrix id ('@user:server')
    rooms_joined: aligned room aliases the user was seen in
    display_name: optional; scanned for text signals when present
    returns a human record dict.
    """
    # score based on room membership overlap
    room_score = len(rooms_joined) * 10

    # multi-room bonus
    if len(rooms_joined) >= 4:
        room_score += 20
    elif len(rooms_joined) >= 2:
        room_score += 10

    # analyze display name if available
    text_score = 0
    signals = []
    if display_name:
        text_score, signals, _ = analyze_text(display_name)

    total_score = room_score + text_score

    # confidence is low by construction: we only ever see membership data
    confidence = 0.3
    if len(rooms_joined) >= 3:
        confidence += 0.3
    if display_name:
        confidence += 0.1
    confidence = min(confidence, 0.8)

    reasons = [f"in {len(rooms_joined)} aligned rooms: {', '.join(rooms_joined[:3])}"]
    if signals:
        reasons.append(f"signals: {', '.join(signals[:3])}")

    return {
        'platform': 'matrix',
        'username': user_id,
        'url': f"https://matrix.to/#/{user_id}",
        'name': display_name,
        'score': total_score,
        'confidence': confidence,
        'signals': signals,
        'rooms': rooms_joined,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
    }
public room data + """ + print("scoutd/matrix: starting scrape (limited - most apis require auth)...") + + user_rooms = defaultdict(list) + + # try to get public room directories + for homeserver in HOMESERVERS: + print(f" checking {homeserver} public rooms...") + rooms = get_public_rooms(homeserver, limit=50) + + for room in rooms: + room_alias = room.get('canonical_alias', '') + # check if it matches any aligned room patterns + aligned_keywords = ['homeassistant', 'selfhosted', 'privacy', 'linux', 'foss', 'cooperative'] + if any(kw in room_alias.lower() or kw in room.get('name', '').lower() for kw in aligned_keywords): + print(f" found aligned room: {room_alias or room.get('name')}") + + # try to get members from aligned rooms (usually fails without auth) + for room_alias in ALIGNED_ROOMS[:3]: # limit attempts + for homeserver in HOMESERVERS[:1]: # just try matrix.org + members = get_room_members(homeserver, room_alias) + if members: + print(f" {room_alias}: {len(members)} members") + for member in members: + user_rooms[member['user_id']].append(room_alias) + + # filter for multi-room users + multi_room = {u: rooms for u, rooms in user_rooms.items() if len(rooms) >= 2} + print(f" {len(multi_room)} users in 2+ aligned rooms") + + # analyze + results = [] + for user_id, rooms in multi_room.items(): + try: + result = analyze_matrix_user(user_id, rooms) + if result and result['score'] > 0: + results.append(result) + db.save_human(result) + except Exception as e: + print(f" error: {e}") + + print(f"scoutd/matrix: found {len(results)} aligned humans (limited by auth)") + return results diff --git a/scoutd/reddit.py b/scoutd/reddit.py new file mode 100644 index 0000000..723ff93 --- /dev/null +++ b/scoutd/reddit.py @@ -0,0 +1,503 @@ +""" +scoutd/reddit.py - reddit discovery (DISCOVERY ONLY, NOT OUTREACH) + +reddit is a SIGNAL SOURCE, not a contact channel. +flow: +1. scrape reddit for users active in target subs +2. extract their reddit profile +3. 
look for links TO other platforms (github, mastodon, website, etc.) +4. add to scout database with reddit as signal source +5. reach out via their OTHER platforms, never reddit + +if reddit user has no external links: + - add to manual_queue with note "reddit-only, needs manual review" + +also detects lost builders - stuck in learnprogramming for years, imposter syndrome, etc. +""" + +import requests +import json +import time +import re +from datetime import datetime +from pathlib import Path +from collections import defaultdict + +from .signals import analyze_text, ALIGNED_SUBREDDITS, NEGATIVE_SUBREDDITS +from .lost import ( + analyze_reddit_for_lost_signals, + analyze_text_for_lost_signals, + classify_user, + get_signal_descriptions, + STUCK_SUBREDDITS, +) + +HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit' + +# patterns for extracting external platform links +PLATFORM_PATTERNS = { + 'github': [ + r'github\.com/([a-zA-Z0-9_-]+)', + r'gh:\s*@?([a-zA-Z0-9_-]+)', + ], + 'mastodon': [ + r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', + r'mastodon\.social/@([a-zA-Z0-9_]+)', + r'fosstodon\.org/@([a-zA-Z0-9_]+)', + r'hachyderm\.io/@([a-zA-Z0-9_]+)', + r'tech\.lgbt/@([a-zA-Z0-9_]+)', + ], + 'twitter': [ + r'twitter\.com/([a-zA-Z0-9_]+)', + r'x\.com/([a-zA-Z0-9_]+)', + r'(?:^|\s)@([a-zA-Z0-9_]{1,15})(?:\s|$)', # bare @handle + ], + 'bluesky': [ + r'bsky\.app/profile/([a-zA-Z0-9_.-]+)', + r'([a-zA-Z0-9_-]+)\.bsky\.social', + ], + 'website': [ + r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)', + ], + 'matrix': [ + r'@([a-zA-Z0-9_-]+):([a-zA-Z0-9.-]+)', + ], +} + + +def _api_get(url, params=None): + """rate-limited request""" + cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = 
json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_data') + except: + pass + + time.sleep(2) # reddit rate limit + + try: + resp = requests.get(url, headers=HEADERS, params=params, timeout=30) + resp.raise_for_status() + result = resp.json() + cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) + return result + except requests.exceptions.RequestException as e: + print(f" reddit api error: {e}") + return None + + +def extract_external_links(text): + """extract links to other platforms from text""" + links = {} + + if not text: + return links + + for platform, patterns in PLATFORM_PATTERNS.items(): + for pattern in patterns: + matches = re.findall(pattern, text, re.IGNORECASE) + if matches: + if platform == 'mastodon' and isinstance(matches[0], tuple): + # full fediverse handle + links[platform] = f"@{matches[0][0]}@{matches[0][1]}" + elif platform == 'matrix' and isinstance(matches[0], tuple): + links[platform] = f"@{matches[0][0]}:{matches[0][1]}" + elif platform == 'website': + # skip reddit/imgur/etc + for match in matches: + if not any(x in match.lower() for x in ['reddit', 'imgur', 'redd.it', 'i.redd']): + links[platform] = f"https://{match}" + break + else: + links[platform] = matches[0] + break + + return links + + +def get_user_profile(username): + """get user profile including bio/description""" + url = f'https://www.reddit.com/user/{username}/about.json' + data = _api_get(url) + + if not data or 'data' not in data: + return None + + profile = data['data'] + return { + 'username': username, + 'name': profile.get('name'), + 'bio': profile.get('subreddit', {}).get('public_description', ''), + 'title': profile.get('subreddit', {}).get('title', ''), + 'icon': profile.get('icon_img'), + 'created_utc': profile.get('created_utc'), + 'total_karma': profile.get('total_karma', 0), + 'link_karma': profile.get('link_karma', 0), + 'comment_karma': profile.get('comment_karma', 0), + } + 
+ +def get_subreddit_users(subreddit, limit=100): + """get recent posters/commenters from a subreddit""" + users = set() + + # posts + url = f'https://www.reddit.com/r/{subreddit}/new.json' + data = _api_get(url, {'limit': limit}) + if data and 'data' in data: + for post in data['data'].get('children', []): + author = post['data'].get('author') + if author and author not in ['[deleted]', 'AutoModerator']: + users.add(author) + + # comments + url = f'https://www.reddit.com/r/{subreddit}/comments.json' + data = _api_get(url, {'limit': limit}) + if data and 'data' in data: + for comment in data['data'].get('children', []): + author = comment['data'].get('author') + if author and author not in ['[deleted]', 'AutoModerator']: + users.add(author) + + return users + + +def get_user_activity(username): + """get user's posts and comments""" + activity = [] + + # posts + url = f'https://www.reddit.com/user/{username}/submitted.json' + data = _api_get(url, {'limit': 100}) + if data and 'data' in data: + for post in data['data'].get('children', []): + activity.append({ + 'type': 'post', + 'subreddit': post['data'].get('subreddit'), + 'title': post['data'].get('title', ''), + 'body': post['data'].get('selftext', ''), + 'score': post['data'].get('score', 0), + }) + + # comments + url = f'https://www.reddit.com/user/{username}/comments.json' + data = _api_get(url, {'limit': 100}) + if data and 'data' in data: + for comment in data['data'].get('children', []): + activity.append({ + 'type': 'comment', + 'subreddit': comment['data'].get('subreddit'), + 'body': comment['data'].get('body', ''), + 'score': comment['data'].get('score', 0), + }) + + return activity + + +def analyze_reddit_user(username): + """ + analyze a reddit user for alignment and extract external platform links. + + reddit is DISCOVERY ONLY - we find users here but contact them elsewhere. 
+ """ + activity = get_user_activity(username) + if not activity: + return None + + # get profile for bio + profile = get_user_profile(username) + + # count subreddit activity + sub_activity = defaultdict(int) + text_parts = [] + total_karma = 0 + + for item in activity: + sub = item.get('subreddit', '').lower() + if sub: + sub_activity[sub] += 1 + if item.get('title'): + text_parts.append(item['title']) + if item.get('body'): + text_parts.append(item['body']) + total_karma += item.get('score', 0) + + full_text = ' '.join(text_parts) + text_score, positive_signals, negative_signals = analyze_text(full_text) + + # EXTRACT EXTERNAL LINKS - this is the key part + # check profile bio first + external_links = {} + if profile: + bio_text = f"{profile.get('bio', '')} {profile.get('title', '')}" + external_links.update(extract_external_links(bio_text)) + + # also scan posts/comments for links (people often share their github etc) + activity_links = extract_external_links(full_text) + for platform, link in activity_links.items(): + if platform not in external_links: + external_links[platform] = link + + # subreddit scoring + sub_score = 0 + aligned_subs = [] + for sub, count in sub_activity.items(): + weight = ALIGNED_SUBREDDITS.get(sub, 0) + if weight > 0: + sub_score += weight * min(count, 5) + aligned_subs.append(sub) + + # multi-sub bonus + if len(aligned_subs) >= 5: + sub_score += 30 + elif len(aligned_subs) >= 3: + sub_score += 15 + + # negative sub penalty + for sub in sub_activity: + if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]: + sub_score -= 50 + negative_signals.append(f"r/{sub}") + + total_score = text_score + sub_score + + # bonus if they have external links (we can actually contact them) + if external_links.get('github'): + total_score += 10 + positive_signals.append('has github') + if external_links.get('mastodon'): + total_score += 10 + positive_signals.append('has mastodon') + if external_links.get('website'): + total_score += 5 + 
positive_signals.append('has website') + + # === LOST BUILDER DETECTION === + # reddit is HIGH SIGNAL for lost builders - stuck in learnprogramming, + # imposter syndrome posts, "i wish i could" language, etc. + subreddits_list = list(sub_activity.keys()) + lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list) + + # also check full text for lost patterns (already done partially in analyze_reddit_for_lost_signals) + text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text) + for sig in text_lost_signals: + if sig not in lost_signals: + lost_signals.append(sig) + lost_weight += text_lost_weight + + lost_potential_score = lost_weight + + # classify: builder, lost, both, or none + # for reddit, builder_score is based on having external links + high karma + builder_activity = 0 + if external_links.get('github'): + builder_activity += 20 + if total_karma > 1000: + builder_activity += 15 + elif total_karma > 500: + builder_activity += 10 + + user_type = classify_user(lost_potential_score, builder_activity, total_score) + + # confidence + confidence = 0.3 + if len(activity) > 20: + confidence += 0.2 + if len(aligned_subs) >= 2: + confidence += 0.2 + if len(text_parts) > 10: + confidence += 0.2 + # higher confidence if we have contact methods + if external_links: + confidence += 0.1 + confidence = min(confidence, 0.95) + + reasons = [] + if aligned_subs: + reasons.append(f"active in: {', '.join(aligned_subs[:5])}") + if positive_signals: + reasons.append(f"signals: {', '.join(positive_signals[:5])}") + if negative_signals: + reasons.append(f"WARNING: {', '.join(negative_signals)}") + if external_links: + reasons.append(f"external: {', '.join(external_links.keys())}") + + # add lost reasons if applicable + if user_type == 'lost' or user_type == 'both': + lost_descriptions = get_signal_descriptions(lost_signals) + if lost_descriptions: + reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}") + + # determine if 
this is reddit-only (needs manual review) + reddit_only = len(external_links) == 0 + if reddit_only: + reasons.append("REDDIT-ONLY: needs manual review for outreach") + + return { + 'platform': 'reddit', + 'username': username, + 'url': f"https://reddit.com/u/{username}", + 'score': total_score, + 'confidence': confidence, + 'signals': positive_signals, + 'negative_signals': negative_signals, + 'subreddits': aligned_subs, + 'activity_count': len(activity), + 'karma': total_karma, + 'reasons': reasons, + 'scraped_at': datetime.now().isoformat(), + # external platform links for outreach + 'external_links': external_links, + 'reddit_only': reddit_only, + 'extra': { + 'github': external_links.get('github'), + 'mastodon': external_links.get('mastodon'), + 'twitter': external_links.get('twitter'), + 'bluesky': external_links.get('bluesky'), + 'website': external_links.get('website'), + 'matrix': external_links.get('matrix'), + 'reddit_karma': total_karma, + 'reddit_activity': len(activity), + }, + # lost builder fields + 'lost_potential_score': lost_potential_score, + 'lost_signals': lost_signals, + 'user_type': user_type, + } + + +def scrape_reddit(db, limit_per_sub=50): + """ + full reddit scrape - DISCOVERY ONLY + + finds aligned users, extracts external links for outreach. + reddit-only users go to manual queue. 
+ """ + print("scoutd/reddit: starting scrape (discovery only, not outreach)...") + + # find users in multiple aligned subs + user_subs = defaultdict(set) + + # aligned subs - active builders + priority_subs = ['intentionalcommunity', 'cohousing', 'selfhosted', + 'homeassistant', 'solarpunk', 'cooperatives', 'privacy', + 'localllama', 'homelab', 'degoogle', 'pihole', 'unraid'] + + # lost builder subs - people who need encouragement + # these folks might be stuck, but they have aligned interests + lost_subs = ['learnprogramming', 'findapath', 'getdisciplined', + 'careerguidance', 'cscareerquestions', 'decidingtobebetter'] + + # scrape both - we want to find lost builders with aligned interests + all_subs = priority_subs + lost_subs + + for sub in all_subs: + print(f" scraping r/{sub}...") + users = get_subreddit_users(sub, limit=limit_per_sub) + for user in users: + user_subs[user].add(sub) + print(f" found {len(users)} users") + + # filter for multi-sub users + multi_sub = {u: subs for u, subs in user_subs.items() if len(subs) >= 2} + print(f" {len(multi_sub)} users in 2+ aligned subs") + + # analyze + results = [] + reddit_only_count = 0 + external_link_count = 0 + builders_found = 0 + lost_found = 0 + + for username in multi_sub: + try: + result = analyze_reddit_user(username) + if result and result['score'] > 0: + results.append(result) + db.save_human(result) + + user_type = result.get('user_type', 'none') + + # track lost builders - reddit is high signal for these + if user_type == 'lost': + lost_found += 1 + lost_score = result.get('lost_potential_score', 0) + if lost_score >= 40: + print(f" šŸ’” u/{username}: lost_score={lost_score}, values={result['score']} pts") + # lost builders also go to manual queue if reddit-only + if result.get('reddit_only'): + _add_to_manual_queue(result) + + elif user_type == 'builder': + builders_found += 1 + + elif user_type == 'both': + builders_found += 1 + lost_found += 1 + print(f" ⚔ u/{username}: recovering builder") + + # 
track external links + if result.get('reddit_only'): + reddit_only_count += 1 + # add high-value users to manual queue for review + if result['score'] >= 50 and user_type != 'lost': # lost already added above + _add_to_manual_queue(result) + print(f" šŸ“‹ u/{username}: {result['score']} pts (reddit-only → manual queue)") + else: + external_link_count += 1 + if result['score'] >= 50 and user_type == 'builder': + links = list(result.get('external_links', {}).keys()) + print(f" ā˜… u/{username}: {result['score']} pts → {', '.join(links)}") + + except Exception as e: + print(f" error on {username}: {e}") + + print(f"scoutd/reddit: found {len(results)} aligned humans") + print(f" - {builders_found} active builders") + print(f" - {lost_found} lost builders (need encouragement)") + print(f" - {external_link_count} with external links (reachable)") + print(f" - {reddit_only_count} reddit-only (manual queue)") + return results + + +def _add_to_manual_queue(result): + """add reddit-only user to manual queue for review""" + from pathlib import Path + import json + + queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json' + queue_file.parent.mkdir(parents=True, exist_ok=True) + + queue = [] + if queue_file.exists(): + try: + queue = json.loads(queue_file.read_text()) + except: + pass + + # check if already in queue + existing = [q for q in queue if q.get('username') == result['username'] and q.get('platform') == 'reddit'] + if existing: + return + + queue.append({ + 'platform': 'reddit', + 'username': result['username'], + 'url': result['url'], + 'score': result['score'], + 'subreddits': result.get('subreddits', []), + 'signals': result.get('signals', []), + 'reasons': result.get('reasons', []), + 'note': 'reddit-only user - no external links found. 
DM manually if promising.', + 'queued_at': datetime.now().isoformat(), + 'status': 'pending', + }) + + queue_file.write_text(json.dumps(queue, indent=2)) diff --git a/scoutd/signals.py b/scoutd/signals.py new file mode 100644 index 0000000..53c178c --- /dev/null +++ b/scoutd/signals.py @@ -0,0 +1,158 @@ +""" +shared signal patterns for all scrapers +""" + +import re + +# positive signals - what we're looking for +POSITIVE_PATTERNS = [ + # values + (r'\b(solarpunk|cyberpunk)\b', 'solarpunk', 10), + (r'\b(anarchis[tm]|mutual.?aid)\b', 'mutual_aid', 10), + (r'\b(cooperative|collective|worker.?owned?|coop|co.?op)\b', 'cooperative', 15), + (r'\b(community|commons)\b', 'community', 5), + (r'\b(intentional.?community|cohousing|commune)\b', 'intentional_community', 20), + + # queer-friendly + (r'\b(queer|lgbtq?|trans|nonbinary|enby|genderqueer)\b', 'queer', 15), + (r'\b(they/them|she/her|he/him|xe/xem|any.?pronouns)\b', 'pronouns', 10), + (r'\bblm\b', 'blm', 5), + (r'\b(acab|1312)\b', 'acab', 5), + + # tech values + (r'\b(privacy|surveillance|anti.?surveillance)\b', 'privacy', 10), + (r'\b(self.?host(?:ed|ing)?|homelab|home.?server)\b', 'selfhosted', 15), + (r'\b(local.?first|offline.?first)\b', 'local_first', 15), + (r'\b(decentralized?|federation|federated|fediverse)\b', 'decentralized', 10), + (r'\b(foss|libre|open.?source|copyleft)\b', 'foss', 10), + (r'\b(home.?assistant|home.?automation)\b', 'home_automation', 10), + (r'\b(mesh|p2p|peer.?to.?peer)\b', 'p2p', 10), + (r'\b(matrix|xmpp|irc)\b', 'federated_chat', 5), + (r'\b(degoogle|de.?google)\b', 'degoogle', 10), + + # location/availability + (r'\b(seattle|portland|pnw|cascadia|pacific.?northwest)\b', 'pnw', 20), + (r'\b(washington|oregon)\b', 'pnw_state', 10), + (r'\b(remote|anywhere|relocate|looking.?to.?move)\b', 'remote', 10), + + # anti-capitalism + (r'\b(anti.?capitalis[tm]|post.?capitalis[tm]|degrowth)\b', 'anticapitalist', 10), + + # neurodivergent (often overlaps with our values) + 
(r'\b(neurodivergent|adhd|autistic|autism)\b', 'neurodivergent', 5), + + # technical skills (bonus for builders) + (r'\b(rust|go|python|typescript)\b', 'modern_lang', 3), + (r'\b(linux|bsd|nixos)\b', 'unix', 3), + (r'\b(kubernetes|docker|podman)\b', 'containers', 3), +] + +# negative signals - red flags +NEGATIVE_PATTERNS = [ + (r'\b(qanon|maga|trump|wwg1wga)\b', 'maga', -50), + (r'\b(covid.?hoax|plandemic|5g.?conspiracy)\b', 'conspiracy', -50), + (r'\b(nwo|illuminati|deep.?state)\b', 'conspiracy', -30), + (r'\b(anti.?vax|antivax)\b', 'antivax', -30), + (r'\b(sovereign.?citizen)\b', 'sovcit', -40), + (r'\b(crypto.?bro|web3|nft|blockchain|bitcoin|ethereum)\b', 'crypto', -15), + (r'\b(conservative|republican)\b', 'conservative', -20), + (r'\b(free.?speech.?absolutist)\b', 'freeze_peach', -20), +] + +# target topics for repo discovery +TARGET_TOPICS = [ + 'local-first', 'self-hosted', 'privacy', 'mesh-network', + 'cooperative', 'solarpunk', 'decentralized', 'p2p', + 'fediverse', 'activitypub', 'matrix-org', 'homeassistant', + 'esphome', 'open-source-hardware', 'right-to-repair', + 'mutual-aid', 'commons', 'degoogle', 'privacy-tools', +] + +# ecosystem repos - high signal contributors +ECOSYSTEM_REPOS = [ + 'home-assistant/core', + 'esphome/esphome', + 'matrix-org/synapse', + 'LemmyNet/lemmy', + 'mastodon/mastodon', + 'owncast/owncast', + 'nextcloud/server', + 'immich-app/immich', + 'jellyfin/jellyfin', + 'navidrome/navidrome', + 'paperless-ngx/paperless-ngx', + 'actualbudget/actual', + 'firefly-iii/firefly-iii', + 'logseq/logseq', + 'AppFlowy-IO/AppFlowy', + 'siyuan-note/siyuan', + 'anytype/anytype-ts', + 'calcom/cal.com', + 'plausible/analytics', + 'umami-software/umami', +] + +# aligned subreddits +ALIGNED_SUBREDDITS = { + 'intentionalcommunity': 25, + 'cohousing': 25, + 'cooperatives': 20, + 'solarpunk': 20, + 'selfhosted': 15, + 'homeassistant': 15, + 'homelab': 10, + 'privacy': 15, + 'PrivacyGuides': 15, + 'degoogle': 15, + 'anticonsumption': 10, + 'Frugal': 5, + 
'simpleliving': 5, + 'Seattle': 10, + 'Portland': 10, + 'cascadia': 15, + 'linux': 5, + 'opensource': 10, + 'FOSS': 10, +} + +# negative subreddits +NEGATIVE_SUBREDDITS = [ + 'conspiracy', 'conservative', 'walkaway', 'louderwithcrowder', + 'JordanPeterson', 'TimPool', 'NoNewNormal', 'LockdownSkepticism', +] + +# high-signal mastodon instances +ALIGNED_INSTANCES = { + 'tech.lgbt': 20, + 'social.coop': 25, + 'fosstodon.org': 10, + 'hackers.town': 15, + 'hachyderm.io': 10, + 'infosec.exchange': 5, +} + + +def analyze_text(text): + """ + analyze text for signals + returns: (score, signals_found, negative_signals) + """ + if not text: + return 0, [], [] + + text = text.lower() + score = 0 + signals = [] + negatives = [] + + for pattern, signal_name, points in POSITIVE_PATTERNS: + if re.search(pattern, text, re.IGNORECASE): + score += points + signals.append(signal_name) + + for pattern, signal_name, points in NEGATIVE_PATTERNS: + if re.search(pattern, text, re.IGNORECASE): + score += points # points are already negative + negatives.append(signal_name) + + return score, list(set(signals)), list(set(negatives)) diff --git a/scoutd/twitter.py b/scoutd/twitter.py new file mode 100644 index 0000000..90fd06f --- /dev/null +++ b/scoutd/twitter.py @@ -0,0 +1,255 @@ +""" +scoutd/twitter.py - twitter/x discovery via nitter instances + +scrapes nitter (twitter frontend) to find users posting about aligned topics +without needing twitter API access + +nitter instances rotate to avoid rate limits +""" + +import requests +import json +import time +import re +from datetime import datetime +from pathlib import Path +from bs4 import BeautifulSoup + +from .signals import analyze_text + +HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'twitter' + +# nitter instances (rotate through these) +NITTER_INSTANCES = [ + 'nitter.privacydev.net', + 'nitter.poast.org', + 
'nitter.woodland.cafe', + 'nitter.esmailelbob.xyz', +] + +# hashtags to search +ALIGNED_HASHTAGS = [ + 'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource', + 'privacy', 'solarpunk', 'cooperative', 'mutualaid', 'localfirst', + 'indieweb', 'smallweb', 'permacomputing', 'degrowth', 'techworkers', +] + +_current_instance_idx = 0 + + +def get_nitter_instance(): + """get current nitter instance, rotate on failure""" + global _current_instance_idx + return NITTER_INSTANCES[_current_instance_idx % len(NITTER_INSTANCES)] + + +def rotate_instance(): + """switch to next nitter instance""" + global _current_instance_idx + _current_instance_idx += 1 + + +def _scrape_page(url, retries=3): + """scrape a nitter page with instance rotation""" + for attempt in range(retries): + instance = get_nitter_instance() + full_url = url.replace('{instance}', instance) + + # check cache + cache_key = f"{full_url}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_html') + except: + pass + + time.sleep(2) # rate limit + + try: + resp = requests.get(full_url, headers=HEADERS, timeout=30) + if resp.status_code == 200: + cache_file.write_text(json.dumps({ + '_cached_at': time.time(), + '_html': resp.text + })) + return resp.text + elif resp.status_code in [429, 503]: + print(f" nitter {instance} rate limited, rotating...") + rotate_instance() + else: + print(f" nitter error: {resp.status_code}") + return None + except Exception as e: + print(f" nitter {instance} error: {e}") + rotate_instance() + + return None + + +def search_hashtag(hashtag): + """search for tweets with hashtag""" + url = f"https://{{instance}}/search?q=%23{hashtag}&f=tweets" + html = _scrape_page(url) + if not html: + return [] + + soup = BeautifulSoup(html, 'html.parser') + tweets = [] + + for tweet_div in 
soup.select('.timeline-item'): + try: + username_elem = tweet_div.select_one('.username') + content_elem = tweet_div.select_one('.tweet-content') + fullname_elem = tweet_div.select_one('.fullname') + + if username_elem and content_elem: + username = username_elem.text.strip().lstrip('@') + tweets.append({ + 'username': username, + 'name': fullname_elem.text.strip() if fullname_elem else username, + 'content': content_elem.text.strip(), + }) + except Exception as e: + continue + + return tweets + + +def get_user_profile(username): + """get user profile from nitter""" + url = f"https://{{instance}}/{username}" + html = _scrape_page(url) + if not html: + return None + + soup = BeautifulSoup(html, 'html.parser') + + try: + bio_elem = soup.select_one('.profile-bio') + bio = bio_elem.text.strip() if bio_elem else '' + + location_elem = soup.select_one('.profile-location') + location = location_elem.text.strip() if location_elem else '' + + website_elem = soup.select_one('.profile-website a') + website = website_elem.get('href') if website_elem else '' + + # get recent tweets for more signal + tweets = [] + for tweet_div in soup.select('.timeline-item')[:10]: + content_elem = tweet_div.select_one('.tweet-content') + if content_elem: + tweets.append(content_elem.text.strip()) + + return { + 'username': username, + 'bio': bio, + 'location': location, + 'website': website, + 'recent_tweets': tweets, + } + except Exception as e: + print(f" error parsing {username}: {e}") + return None + + +def analyze_twitter_user(username, profile=None): + """analyze a twitter user for alignment""" + if not profile: + profile = get_user_profile(username) + + if not profile: + return None + + # collect text + text_parts = [profile.get('bio', '')] + text_parts.extend(profile.get('recent_tweets', [])) + + full_text = ' '.join(text_parts) + text_score, positive_signals, negative_signals = analyze_text(full_text) + + # twitter is noisy, lower base confidence + confidence = 0.25 + if 
len(positive_signals) >= 3: + confidence += 0.2 + if profile.get('website'): + confidence += 0.1 + if len(profile.get('recent_tweets', [])) >= 5: + confidence += 0.1 + confidence = min(confidence, 0.7) # cap lower for twitter + + reasons = [] + if positive_signals: + reasons.append(f"signals: {', '.join(positive_signals[:5])}") + if negative_signals: + reasons.append(f"WARNING: {', '.join(negative_signals)}") + + return { + 'platform': 'twitter', + 'username': username, + 'url': f"https://twitter.com/{username}", + 'name': profile.get('name', username), + 'bio': profile.get('bio'), + 'location': profile.get('location'), + 'score': text_score, + 'confidence': confidence, + 'signals': positive_signals, + 'negative_signals': negative_signals, + 'reasons': reasons, + 'contact': { + 'twitter': username, + 'website': profile.get('website'), + }, + 'scraped_at': datetime.now().isoformat(), + } + + +def scrape_twitter(db, limit_per_hashtag=50): + """full twitter scrape via nitter""" + print("scoutd/twitter: starting scrape via nitter...") + + all_users = {} + + for hashtag in ALIGNED_HASHTAGS: + print(f" #{hashtag}...") + tweets = search_hashtag(hashtag) + + for tweet in tweets[:limit_per_hashtag]: + username = tweet.get('username') + if username and username not in all_users: + all_users[username] = { + 'username': username, + 'name': tweet.get('name'), + 'hashtags': [hashtag], + } + elif username: + all_users[username]['hashtags'].append(hashtag) + + print(f" found {len(tweets)} tweets") + + # prioritize users in multiple hashtags + multi_hashtag = {u: d for u, d in all_users.items() if len(d.get('hashtags', [])) >= 2} + print(f" {len(multi_hashtag)} users in 2+ aligned hashtags") + + # analyze + results = [] + for username, data in list(multi_hashtag.items())[:100]: # limit to prevent rate limits + try: + result = analyze_twitter_user(username) + if result and result['score'] > 0: + results.append(result) + db.save_human(result) + + if result['score'] >= 30: + print(f" 
#!/usr/bin/env python3
"""
setup priority user - add yourself to get matches

usage:
    python setup_user.py            # interactive setup
    python setup_user.py --show     # show your profile
    python setup_user.py --matches  # show your matches
"""

import argparse
import json

from db import Database
from db.users import (init_users_table, add_priority_user, get_priority_users,
                      get_priority_user_matches)

# optional profile columns, in the order show_profile prints them;
# the column name doubles as the display label
_OPTIONAL_FIELDS = ('github', 'reddit', 'mastodon', 'lobsters', 'matrix', 'location')


def _decode_interests(raw):
    """Normalize the interests column to a list.

    The DB may hand back either a JSON-encoded string or an
    already-decoded list; malformed JSON is treated as "no interests"
    rather than crashing the display commands.
    """
    if isinstance(raw, str):
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return []
    return raw or []


def interactive_setup(db):
    """interactive priority user setup

    Prompts on stdin for profile fields and stores the result via
    add_priority_user. Empty optional answers are stored as None.
    """
    print("=" * 60)
    print("connectd priority user setup")
    print("=" * 60)
    print("\nlink your profiles so connectd can find matches for YOU\n")

    name = input("name: ").strip()
    email = input("email (for notifications): ").strip()
    github = input("github username (optional): ").strip() or None
    reddit = input("reddit username (optional): ").strip() or None
    mastodon = input("mastodon handle e.g. user@instance (optional): ").strip() or None
    lobsters = input("lobste.rs username (optional): ").strip() or None
    matrix = input("matrix id e.g. @user:matrix.org (optional): ").strip() or None
    location = input("location (e.g. seattle, remote): ").strip() or None

    print("\nwhat are you interested in? (comma separated)")
    print("examples: self-hosting, cooperatives, solarpunk, home automation")
    interests_raw = input("interests: ").strip()
    # drop blanks produced by stray/trailing commas: "a,,b," -> ["a", "b"]
    interests = [i.strip() for i in interests_raw.split(',') if i.strip()]

    print("\nwhat kind of people are you looking to connect with?")
    looking_for = input("looking for: ").strip() or None

    user_data = {
        'name': name,
        'email': email,
        'github': github,
        'reddit': reddit,
        'mastodon': mastodon,
        'lobsters': lobsters,
        'matrix': matrix,
        'location': location,
        'interests': interests,
        'looking_for': looking_for,
    }

    user_id = add_priority_user(db.conn, user_data)
    print(f"\nāœ“ added as priority user #{user_id}")
    print("connectd will now find matches for you")


def show_profile(db):
    """show current priority user profile

    Prints every configured priority user; optional fields are skipped
    when absent or empty instead of raising KeyError.
    """
    users = get_priority_users(db.conn)

    if not users:
        print("no priority users configured")
        print("run: python setup_user.py")
        return

    for user in users:
        # dict() copy gives .get() semantics whether the row is a plain
        # dict or a sqlite3.Row-style mapping
        user = dict(user)
        print("=" * 60)
        print(f"priority user #{user['id']}: {user['name']}")
        print("=" * 60)
        print(f"email: {user['email']}")
        for field in _OPTIONAL_FIELDS:
            value = user.get(field)
            if value:
                print(f"{field}: {value}")
        interests = _decode_interests(user.get('interests'))
        if interests:
            print(f"interests: {', '.join(interests)}")
        if user.get('looking_for'):
            print(f"looking for: {user['looking_for']}")


def show_matches(db):
    """show matches for priority user

    Lists up to 20 matches per priority user with score, url, and the
    first overlap reason (enough context for a quick scan).
    """
    users = get_priority_users(db.conn)

    if not users:
        print("no priority users configured")
        return

    for user in users:
        print(f"\n=== matches for {user['name']} ===\n")

        matches = get_priority_user_matches(db.conn, user['id'], limit=20)

        if not matches:
            print("no matches yet - run the daemon to discover people")
            continue

        for i, match in enumerate(matches, 1):
            print(f"{i}. {match['username']} ({match['platform']})")
            print(f"   score: {match['overlap_score']:.0f}")
            print(f"   url: {match['url']}")

            reasons = match.get('overlap_reasons', '[]')
            if isinstance(reasons, str):
                try:
                    reasons = json.loads(reasons)
                except json.JSONDecodeError:
                    reasons = []
            if reasons:
                print(f"   why: {reasons[0]}")
            print()


def main():
    """CLI entry point: dispatch to setup / --show / --matches."""
    parser = argparse.ArgumentParser(description='setup priority user')
    parser.add_argument('--show', action='store_true', help='show your profile')
    parser.add_argument('--matches', action='store_true', help='show your matches')
    args = parser.parse_args()

    db = Database()
    try:
        init_users_table(db.conn)

        if args.show:
            show_profile(db)
        elif args.matches:
            show_matches(db)
        else:
            interactive_setup(db)
    finally:
        # always release the connection, even if a subcommand raises
        db.close()


if __name__ == '__main__':
    main()
+ +you are NOT alone and if you felt as if you were, +that means you're less alone than you thought- + +go forth, open the door, it's right in front of you now. +all you have to do is step inside. + +you're home now. + +there's a better way and we are going to build it together." + + +you can reach *person* at *preferred contact method* + +hope it goes well! + +-connectd + CONNECTD_ICONS (line 33-44): + CONNECTD_ICONS = '''
+ + ... + ... + ... + ... + ... + ... + ... +
''' + + SIGNATURE_HTML (line 46-49): + SIGNATURE_HTML = f'''
+ + {CONNECTD_ICONS} +
''' + + SIGNATURE_PLAIN (line 51-61): + SIGNATURE_PLAIN = """ + --- + github.com/sudoxnym/connectd (main repo) + + github: github.com/connectd-daemon + mastodon: @connectd@mastodon.sudoxreboot.com + bluesky: connectd.bsky.social + lemmy: lemmy.sudoxreboot.com/c/connectd + discord: discord.gg/connectd + matrix: @connectd:sudoxreboot.com + reddit: reddit.com/r/connectd + email: connectd@sudoxreboot.com + """ +