commit 3c02ee85c27f3a301f30ec2ece1afa57e743629d Author: Your Name Date: Mon Dec 15 11:06:51 2025 -0600 initial release: connectd add-on v1.1.0 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c10348a --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.pyc +__pycache__/ +*.db +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..300f889 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +# connectd add-ons for home assistant + +## installation + +1. go to **settings** → **add-ons** → **add-on store** +2. click the three dots in the top right → **repositories** +3. add: `https://github.com/sudoxnym/ha-addons` +4. find **connectd** in the store and install + +## add-ons + +### connectd + +find isolated builders with aligned values. auto-discovers humans on github, mastodon, lemmy, discord, and more. + +[![Open your Home Assistant instance and show the add add-on repository dialog with a specific repository URL pre-filled.](https://my.home-assistant.io/badges/supervisor_add_addon_repository.svg)](https://my.home-assistant.io/redirect/supervisor_add_addon_repository/?repository_url=https%3A%2F%2Fgithub.com%2Fsudoxnym%2Fha-addons) diff --git a/connectd/Dockerfile b/connectd/Dockerfile new file mode 100644 index 0000000..a6ca8e1 --- /dev/null +++ b/connectd/Dockerfile @@ -0,0 +1,28 @@ +ARG BUILD_FROM +FROM ${BUILD_FROM} + +# install python deps +RUN apk add --no-cache python3 py3-pip py3-requests py3-beautifulsoup4 + +# create app directory +WORKDIR /app + +# copy requirements and install +COPY requirements.txt . 
+RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt + +# copy app code +COPY api.py config.py daemon.py cli.py setup_user.py ./ +COPY db/ db/ +COPY scoutd/ scoutd/ +COPY matchd/ matchd/ +COPY introd/ introd/ + +# create data directory +RUN mkdir -p /data/db /data/cache + +# copy run script +COPY run.sh / +RUN chmod a+x /run.sh + +CMD ["/run.sh"] diff --git a/connectd/README.md b/connectd/README.md new file mode 100644 index 0000000..2971138 --- /dev/null +++ b/connectd/README.md @@ -0,0 +1,52 @@ +# connectd add-on for home assistant + +find isolated builders with aligned values. auto-discovers humans on github, mastodon, lemmy, discord, and more. + +## installation + +1. add this repository to your home assistant add-on store +2. install the connectd add-on +3. configure your HOST_USER (github username) in the add-on settings +4. start the add-on + +## configuration + +### required +- **host_user**: your github username (connectd will auto-discover your profile) + +### optional host info +- **host_name**: your display name +- **host_email**: your email +- **host_mastodon**: mastodon handle (@user@instance) +- **host_reddit**: reddit username +- **host_lemmy**: lemmy handle (@user@instance) +- **host_lobsters**: lobsters username +- **host_matrix**: matrix handle (@user:server) +- **host_discord**: discord user id +- **host_bluesky**: bluesky handle (handle.bsky.social) +- **host_location**: your location +- **host_interests**: comma-separated interests +- **host_looking_for**: what you're looking for + +### api credentials +- **github_token**: for higher rate limits +- **groq_api_key**: for LLM-drafted intros +- **mastodon_token**: for DM delivery +- **discord_bot_token**: for discord discovery/delivery + +## hacs integration + +after starting the add-on, install the connectd integration via HACS: + +1. add custom repository: `https://github.com/sudoxnym/connectd` +2. install connectd integration +3. add integration in HA settings +4. 
#!/usr/bin/env python3
"""
connectd/api.py - REST API for stats and control

exposes daemon stats for home assistant integration.
runs on port 8099 by default (override via CONNECTD_API_PORT).
"""

import json
import os
import threading
from datetime import datetime
from http.server import BaseHTTPRequestHandler, HTTPServer

# NOTE(review): project imports (db, db.users) are deferred into the request
# handlers so this module stays importable without the add-on packages on the
# path; request-time behavior is unchanged.

# port the REST API listens on; configurable via environment
API_PORT = int(os.environ.get('CONNECTD_API_PORT', 8099))

# shared state (updated by daemon)
_daemon_state = {
    'running': False,
    'dry_run': False,
    'last_scout': None,
    'last_match': None,
    'last_intro': None,
    'last_lost': None,
    'intros_today': 0,
    'lost_intros_today': 0,
    'started_at': None,
}


def update_daemon_state(state_dict):
    """update shared daemon state (called by daemon).

    mutates the module-level dict in place, so no `global` statement is
    needed (the name is never rebound).
    """
    _daemon_state.update(state_dict)


def get_daemon_state():
    """get a shallow copy of the current daemon state."""
    return _daemon_state.copy()


def _parse_json_field(value, default):
    """decode a JSON-encoded db column; pass already-decoded values through.

    returns `default` for empty strings and None so callers can index the
    result safely (the previous inline parsing crashed on None columns).
    """
    if isinstance(value, str):
        return json.loads(value) if value else default
    return value if value is not None else default


class APIHandler(BaseHTTPRequestHandler):
    """simple REST API handler"""

    def log_message(self, format, *args):
        """suppress default per-request stderr logging"""
        pass

    def _send_json(self, data, status=200):
        """send a JSON response with a permissive CORS header"""
        self.send_response(status)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(json.dumps(data).encode())

    def do_GET(self):
        """route GET requests to the matching endpoint handler"""
        routes = {
            '/api/stats': self._handle_stats,
            '/api/health': self._handle_health,
            '/api/state': self._handle_state,
            '/api/priority_matches': self._handle_priority_matches,
            '/api/top_humans': self._handle_top_humans,
            '/api/user': self._handle_user,
        }
        handler = routes.get(self.path)
        if handler is None:
            self._send_json({'error': 'not found'}, 404)
        else:
            handler()

    def _handle_stats(self):
        """return database statistics"""
        try:
            from db import Database  # deferred project import
            db = Database()
            try:
                stats = db.stats()
            finally:
                # close even when stats() raises (the original leaked the
                # connection on the error path)
                db.close()
            self._send_json(stats)
        except Exception as e:
            self._send_json({'error': str(e)}, 500)

    def _handle_health(self):
        """return daemon health status (running flag + uptime)"""
        state = get_daemon_state()

        health = {
            'status': 'running' if state['running'] else 'stopped',
            'dry_run': state['dry_run'],
            'uptime_seconds': None,
        }

        # started_at is stored as an ISO-8601 timestamp string
        if state['started_at']:
            uptime = datetime.now() - datetime.fromisoformat(state['started_at'])
            health['uptime_seconds'] = int(uptime.total_seconds())

        self._send_json(health)

    def _handle_state(self):
        """return full daemon state with any datetime values serialized"""
        state = get_daemon_state()

        for key in ('last_scout', 'last_match', 'last_intro', 'last_lost', 'started_at'):
            if state[key] and isinstance(state[key], datetime):
                state[key] = state[key].isoformat()

        self._send_json(state)

    def _handle_priority_matches(self):
        """return priority matches for the HA sensor"""
        try:
            from db import Database
            from db.users import get_priority_users, get_priority_user_matches

            db = Database()
            try:
                users = get_priority_users(db.conn)
                if not users:
                    self._send_json({
                        'count': 0,
                        'new_count': 0,
                        'top_matches': [],
                    })
                    return

                # matches for the first priority user (the host)
                user = users[0]
                matches = get_priority_user_matches(db.conn, user['id'], limit=10)
            finally:
                db.close()

            new_count = sum(1 for m in matches if m.get('status') == 'new')

            top_matches = []
            for m in matches[:5]:
                reasons = _parse_json_field(m.get('overlap_reasons', '[]'), [])
                top_matches.append({
                    'username': m.get('username'),
                    'platform': m.get('platform'),
                    'score': m.get('score', 0),
                    'overlap_score': m.get('overlap_score', 0),
                    'reasons': reasons[:3],
                    'url': m.get('url'),
                    'status': m.get('status', 'new'),
                })

            self._send_json({
                'count': len(matches),
                'new_count': new_count,
                'top_matches': top_matches,
            })
        except Exception as e:
            self._send_json({'error': str(e)}, 500)

    def _handle_top_humans(self):
        """return top scoring humans for the HA sensor"""
        try:
            from db import Database

            db = Database()
            try:
                humans = db.get_all_humans(min_score=50, limit=5)
            finally:
                db.close()

            top_humans = []
            for h in humans:
                contact = _parse_json_field(h.get('contact', '{}'), {})
                signals = _parse_json_field(h.get('signals', '[]'), [])

                # first available delivery channel, in preference order
                if contact.get('email'):
                    contact_method = 'email'
                elif contact.get('mastodon'):
                    contact_method = 'mastodon'
                elif contact.get('matrix'):
                    contact_method = 'matrix'
                else:
                    contact_method = 'manual'

                top_humans.append({
                    'username': h.get('username'),
                    'platform': h.get('platform'),
                    'score': h.get('score', 0),
                    'name': h.get('name'),
                    'signals': signals[:5],
                    'contact_method': contact_method,
                })

            self._send_json({
                'count': len(humans),
                'top_humans': top_humans,
            })
        except Exception as e:
            self._send_json({'error': str(e)}, 500)

    def _handle_user(self):
        """return priority user info for the HA sensor"""
        try:
            from db import Database
            from db.users import get_priority_users, get_priority_user_matches

            db = Database()
            try:
                users = get_priority_users(db.conn)
                if not users:
                    self._send_json({
                        'configured': False,
                        'score': 0,
                        'signals': [],
                        'match_count': 0,
                    })
                    return

                user = users[0]
                matches = get_priority_user_matches(db.conn, user['id'], limit=100)
            finally:
                db.close()

            signals = _parse_json_field(user.get('signals', '[]'), [])
            interests = _parse_json_field(user.get('interests', '[]'), [])

            self._send_json({
                'configured': True,
                'name': user.get('name'),
                'github': user.get('github'),
                'mastodon': user.get('mastodon'),
                'reddit': user.get('reddit'),
                'lobsters': user.get('lobsters'),
                'matrix': user.get('matrix'),
                'lemmy': user.get('lemmy'),
                'discord': user.get('discord'),
                'bluesky': user.get('bluesky'),
                'score': user.get('score', 0),
                'signals': signals[:10],
                'interests': interests,
                'location': user.get('location'),
                'bio': user.get('bio'),
                'match_count': len(matches),
                'new_match_count': sum(1 for m in matches if m.get('status') == 'new'),
            })
        except Exception as e:
            self._send_json({'error': str(e)}, 500)


def run_api_server():
    """run the API server (blocking)"""
    server = HTTPServer(('0.0.0.0', API_PORT), APIHandler)
    print(f"connectd api running on port {API_PORT}")
    server.serve_forever()


def start_api_thread():
    """start the API server in a background daemon thread"""
    thread = threading.Thread(target=run_api_server, daemon=True)
    thread.start()
    return thread


if __name__ == '__main__':
    # standalone mode for testing
    print(f"starting connectd api on port {API_PORT}...")
    run_api_server()
#!/usr/bin/env python3
"""
connectd - people discovery and matchmaking daemon
finds isolated builders and connects them
also finds LOST builders who need encouragement

usage:
  connectd scout              # run all scrapers
  connectd scout --github     # github only
  connectd scout --reddit     # reddit only
  connectd scout --mastodon   # mastodon only
  connectd scout --lobsters   # lobste.rs only
  connectd scout --matrix     # matrix only
  connectd scout --lost       # show lost builder stats after scout

  connectd match              # find all matches
  connectd match --top 20     # show top 20 matches
  connectd match --mine       # show YOUR matches (priority user)
  connectd match --lost       # find matches for lost builders

  connectd intro              # generate intros for top matches
  connectd intro --match 123  # generate intro for specific match
  connectd intro --dry-run    # preview intros without saving
  connectd intro --lost       # generate intros for lost builders

  connectd review             # interactive review queue
  connectd send               # send all approved intros
  connectd send --export      # export for manual sending

  connectd daemon             # run as continuous daemon
  connectd daemon --oneshot   # run once then exit
  connectd daemon --dry-run   # run but never send intros
  connectd daemon --oneshot --dry-run  # one cycle, preview only

  connectd user               # show your priority user profile
  connectd user --setup       # setup/update your profile
  connectd user --matches     # show matches found for you

  connectd status             # show database stats (including lost builders)
  connectd lost               # show lost builders ready for outreach
"""

import argparse
import json  # hoisted: fixes the NameError in cmd_send (--export used json.dump
             # while only `import json as json_mod` was in scope) and replaces
             # the per-function `import json as json_mod` aliases
import sys
from pathlib import Path

# add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from db import Database
from db.users import (init_users_table, add_priority_user, get_priority_users,
                      get_priority_user_matches, score_priority_user, auto_match_priority_user,
                      update_priority_user_profile)
from scoutd import scrape_github, scrape_reddit, scrape_mastodon, scrape_lobsters, scrape_matrix
from scoutd.deep import deep_scrape_github_user
from scoutd.lost import get_signal_descriptions
from introd.deliver import (deliver_intro, deliver_batch, get_delivery_stats,
                            review_manual_queue, determine_best_contact, load_manual_queue,
                            save_manual_queue)
from matchd import find_all_matches, generate_fingerprint
from matchd.rank import get_top_matches
from matchd.lost import find_matches_for_lost_builders, get_lost_match_summary
from introd import draft_intro
from introd.draft import draft_intros_for_match
from introd.lost_intro import draft_lost_intro, get_lost_intro_config
from introd.review import review_all_pending, get_pending_intros
from introd.send import send_all_approved, export_manual_intros


def cmd_scout(args, db):
    """run discovery scrapers"""
    from scoutd.deep import deep_scrape_github_user, save_deep_profile

    print("=" * 60)
    print("connectd scout - discovering aligned humans")
    print("=" * 60)

    # deep scrape a single, explicitly-named github user
    if args.user:
        print(f"\ndeep scraping github user: {args.user}")
        profile = deep_scrape_github_user(args.user)
        if profile:
            save_deep_profile(db, profile)
            print(f"\n=== {profile['username']} ===")
            print(f"real name: {profile.get('real_name')}")
            print(f"location: {profile.get('location')}")
            print(f"company: {profile.get('company')}")
            print(f"email: {profile.get('email')}")
            print(f"twitter: {profile.get('twitter')}")
            print(f"mastodon: {profile.get('mastodon')}")
            print(f"orgs: {', '.join(profile.get('orgs', []))}")
            print(f"languages: {', '.join(list(profile.get('languages', {}).keys())[:5])}")
            print(f"topics: {', '.join(profile.get('topics', [])[:10])}")
            print(f"signals: {', '.join(profile.get('signals', []))}")
            print(f"score: {profile.get('score')}")
            if profile.get('linked_profiles'):
                print(f"linked profiles: {list(profile['linked_profiles'].keys())}")
        else:
            print("failed to scrape user")
        return

    # no platform flag at all means "scrape everything"
    run_all = not any([args.github, args.reddit, args.mastodon, args.lobsters, args.matrix,
                       args.twitter, args.bluesky, args.lemmy, args.discord])

    if args.github or run_all:
        if args.deep:
            # deep scrape mode - slower but more thorough
            print("\nrunning DEEP github scrape (follows all links)...")
            from scoutd.github import get_repo_contributors
            from scoutd.signals import ECOSYSTEM_REPOS

            all_logins = set()
            for repo in ECOSYSTEM_REPOS[:5]:  # limit for deep mode
                contributors = get_repo_contributors(repo, per_page=20)
                for c in contributors:
                    login = c.get('login')
                    if login and not login.endswith('[bot]'):
                        all_logins.add(login)
                print(f"  {repo}: {len(contributors)} contributors")

            print(f"\ndeep scraping {len(all_logins)} users...")
            for login in all_logins:
                try:
                    profile = deep_scrape_github_user(login)
                    if profile and profile.get('score', 0) > 0:
                        save_deep_profile(db, profile)
                        if profile['score'] >= 30:
                            print(f"  ★ {login}: {profile['score']} pts")
                            if profile.get('email'):
                                print(f"    email: {profile['email']}")
                            if profile.get('mastodon'):
                                print(f"    mastodon: {profile['mastodon']}")
                except Exception as e:
                    # best-effort: one bad user must not kill the scan
                    print(f"  error on {login}: {e}")
        else:
            scrape_github(db)

    if args.reddit or run_all:
        scrape_reddit(db)

    if args.mastodon or run_all:
        scrape_mastodon(db)

    if args.lobsters or run_all:
        scrape_lobsters(db)

    if args.matrix or run_all:
        scrape_matrix(db)

    if args.twitter or run_all:
        from scoutd.twitter import scrape_twitter
        scrape_twitter(db)

    if args.bluesky or run_all:
        from scoutd.bluesky import scrape_bluesky
        scrape_bluesky(db)

    if args.lemmy or run_all:
        from scoutd.lemmy import scrape_lemmy
        scrape_lemmy(db)

    if args.discord or run_all:
        from scoutd.discord import scrape_discord
        scrape_discord(db)

    # show stats
    stats = db.stats()
    print("\n" + "=" * 60)
    print("SCOUT COMPLETE")
    print("=" * 60)
    print(f"total humans: {stats['total_humans']}")
    for platform, count in stats.get('by_platform', {}).items():
        print(f"  {platform}: {count}")

    # always show lost builder stats (the original guard `args.lost or True`
    # was a tautology, so this was already unconditional)
    print("\n--- lost builder stats ---")
    print(f"active builders: {stats.get('active_builders', 0)}")
    print(f"lost builders: {stats.get('lost_builders', 0)}")
    print(f"recovering builders: {stats.get('recovering_builders', 0)}")
    print(f"high lost score (40+): {stats.get('high_lost_score', 0)}")
    print(f"lost outreach sent: {stats.get('lost_outreach_sent', 0)}")


def cmd_match(args, db):
    """find and rank matches"""
    print("=" * 60)
    print("connectd match - finding aligned pairs")
    print("=" * 60)

    # lost builder matching
    if args.lost:
        print("\n--- LOST BUILDER MATCHING ---")
        print("finding inspiring builders for lost souls...\n")

        matches, error = find_matches_for_lost_builders(db, limit=args.top or 20)

        if error:
            print(f"error: {error}")
            return

        if not matches:
            print("no lost builders ready for outreach")
            return

        print(f"found {len(matches)} lost builders with matching active builders\n")

        for i, match in enumerate(matches, 1):
            lost = match['lost_user']
            builder = match['inspiring_builder']

            lost_name = lost.get('name') or lost.get('username')
            builder_name = builder.get('name') or builder.get('username')

            print(f"{i}. {lost_name} ({lost.get('platform')}) → needs inspiration from")
            print(f"   {builder_name} ({builder.get('platform')})")
            print(f"   lost score: {lost.get('lost_potential_score', 0)} | values: {lost.get('score', 0)}")
            print(f"   shared interests: {', '.join(match.get('shared_interests', []))}")
            print(f"   builder has: {match.get('builder_repos', 0)} repos, {match.get('builder_stars', 0)} stars")
            print()

        return

    if args.mine:
        # show matches for priority user
        init_users_table(db.conn)
        users = get_priority_users(db.conn)
        if not users:
            print("no priority user configured. run: connectd user --setup")
            return

        for user in users:
            print(f"\n=== matches for {user['name']} ===\n")
            matches = get_priority_user_matches(db.conn, user['id'], limit=args.top or 20)

            if not matches:
                print("no matches yet - run: connectd scout && connectd match")
                continue

            for i, match in enumerate(matches, 1):
                print(f"{i}. {match['username']} ({match['platform']})")
                print(f"   score: {match['overlap_score']:.0f}")
                print(f"   url: {match['url']}")
                reasons = match.get('overlap_reasons', '[]')
                if isinstance(reasons, str):
                    reasons = json.loads(reasons)
                if reasons:
                    print(f"   why: {reasons[0]}")
                print()
        return

    if args.top and not args.mine:
        # just show existing top matches
        matches = get_top_matches(db, limit=args.top)
    else:
        # run full matching
        matches = find_all_matches(db, min_score=args.min_score, min_overlap=args.min_overlap)

    print("\n" + "-" * 60)
    print(f"TOP {min(len(matches), args.top or 20)} MATCHES")
    print("-" * 60)

    for i, match in enumerate(matches[:args.top or 20], 1):
        human_a = match.get('human_a', {})
        human_b = match.get('human_b', {})

        print(f"\n{i}. {human_a.get('username')} <-> {human_b.get('username')}")
        print(f"   platforms: {human_a.get('platform')} / {human_b.get('platform')}")
        print(f"   overlap: {match.get('overlap_score', 0):.0f} pts")

        reasons = match.get('overlap_reasons', [])
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        if reasons:
            print(f"   why: {' | '.join(reasons[:3])}")

        if match.get('geographic_match'):
            print(f"   location: compatible ✓")


def cmd_intro(args, db):
    """generate intro drafts"""
    print("=" * 60)
    print("connectd intro - drafting introductions")
    print("=" * 60)

    if args.dry_run:
        print("*** DRY RUN MODE - previewing only ***\n")

    # lost builder intros - different tone entirely
    if args.lost:
        print("\n--- LOST BUILDER INTROS ---")
        print("drafting encouragement for lost souls...\n")

        matches, error = find_matches_for_lost_builders(db, limit=args.limit or 10)

        if error:
            print(f"error: {error}")
            return

        if not matches:
            print("no lost builders ready for outreach")
            return

        config = get_lost_intro_config()
        count = 0

        for match in matches:
            lost = match['lost_user']
            builder = match['inspiring_builder']

            lost_name = lost.get('name') or lost.get('username')
            builder_name = builder.get('name') or builder.get('username')

            # draft intro
            draft, error = draft_lost_intro(lost, builder, config)

            if error:
                print(f"  error drafting intro for {lost_name}: {error}")
                continue

            if args.dry_run:
                print("=" * 60)
                print(f"TO: {lost_name} ({lost.get('platform')})")
                print(f"LOST SCORE: {lost.get('lost_potential_score', 0)}")
                print(f"INSPIRING: {builder_name} ({builder.get('url')})")
                print("-" * 60)
                print("MESSAGE:")
                print(draft)
                print("-" * 60)
                print("[DRY RUN - NOT SAVED]")
                print("=" * 60)
            else:
                print(f"  drafted intro for {lost_name} → {builder_name}")

            count += 1

        if args.dry_run:
            print(f"\npreviewed {count} lost builder intros (dry run)")
        else:
            print(f"\ndrafted {count} lost builder intros")
            print("these require manual review before sending")

        return

    if args.match:
        # specific match
        matches = [m for m in get_top_matches(db, limit=1000) if m.get('id') == args.match]
    else:
        # top matches
        matches = get_top_matches(db, limit=args.limit or 10)

    if not matches:
        print("no matches found")
        return

    print(f"generating intros for {len(matches)} matches...")

    count = 0
    for match in matches:
        intros = draft_intros_for_match(match)

        for intro in intros:
            recipient = intro['recipient_human']
            other = intro['other_human']

            if args.dry_run:
                # get contact info
                contact = recipient.get('contact', {})
                if isinstance(contact, str):
                    contact = json.loads(contact)
                email = contact.get('email', 'no email')

                # get overlap reasons
                reasons = match.get('overlap_reasons', [])
                if isinstance(reasons, str):
                    reasons = json.loads(reasons)
                reason_summary = ', '.join(reasons[:3]) if reasons else 'aligned values'

                # print preview
                print("\n" + "=" * 60)
                print(f"TO: {recipient.get('username')} ({recipient.get('platform')})")
                print(f"EMAIL: {email}")
                print(f"SUBJECT: you might want to meet {other.get('username')}")
                print(f"SCORE: {match.get('overlap_score', 0):.0f} ({reason_summary})")
                print("-" * 60)
                print("MESSAGE:")
                print(intro['draft'])
                print("-" * 60)
                print("[DRY RUN - NOT SENT]")
                print("=" * 60)
            else:
                print(f"\n  {recipient.get('username')} ({intro['channel']})")

                # save to db
                db.save_intro(
                    match.get('id'),
                    recipient.get('id'),
                    intro['channel'],
                    intro['draft']
                )

            count += 1

    if args.dry_run:
        print(f"\npreviewed {count} intros (dry run - nothing saved)")
    else:
        print(f"\ngenerated {count} intro drafts")
        print("run 'connectd review' to approve before sending")


def cmd_review(args, db):
    """interactive review queue"""
    review_all_pending(db)


def cmd_send(args, db):
    """send approved intros"""
    if args.export:
        # export manual queue to file for review
        queue = load_manual_queue()
        pending = [q for q in queue if q.get('status') == 'pending']

        # fix: the original called json.dump while only the json_mod alias was
        # in scope, raising NameError; json is now a module-level import
        with open(args.export, 'w') as f:
            json.dump(pending, f, indent=2)

        print(f"exported {len(pending)} pending intros to {args.export}")
        return

    # send all approved from manual queue
    queue = load_manual_queue()
    approved = [q for q in queue if q.get('status') == 'approved']

    if not approved:
        print("no approved intros to send")
        print("use 'connectd review' to approve intros first")
        return

    print(f"sending {len(approved)} approved intros...")

    for item in approved:
        match_data = item.get('match', {})
        intro_draft = item.get('draft', '')
        recipient = item.get('recipient', {})

        success, error, method = deliver_intro(
            {'human_b': recipient, **match_data},
            intro_draft,
            dry_run=args.dry_run if hasattr(args, 'dry_run') else False
        )

        status = 'ok' if success else f'failed: {error}'
        print(f"  {recipient.get('username')}: {method} - {status}")

        # update queue status
        item['status'] = 'sent' if success else 'failed'
        item['error'] = error

    save_manual_queue(queue)

    # show stats
    stats = get_delivery_stats()
    print(f"\ndelivery stats: {stats['sent']} sent, {stats['failed']} failed")


def cmd_lost(args, db):
    """show lost builders ready for outreach"""
    print("=" * 60)
    print("connectd lost - lost builders who need encouragement")
    print("=" * 60)

    # get lost builders
    lost_builders = db.get_lost_builders_for_outreach(
        min_lost_score=args.min_score or 40,
        min_values_score=20,
        limit=args.limit or 50
    )

    if not lost_builders:
        print("\nno lost builders ready for outreach")
        print("run 'connectd scout' to discover more")
        return

    print(f"\n{len(lost_builders)} lost builders ready for outreach:\n")

    for i, lost in enumerate(lost_builders, 1):
        name = lost.get('name') or lost.get('username')
        platform = lost.get('platform')
        lost_score = lost.get('lost_potential_score', 0)
        values_score = lost.get('score', 0)

        # parse lost signals (stored JSON-encoded)
        lost_signals = lost.get('lost_signals', [])
        if isinstance(lost_signals, str):
            lost_signals = json.loads(lost_signals) if lost_signals else []

        # get signal descriptions
        signal_descriptions = get_signal_descriptions(lost_signals)

        print(f"{i}. {name} ({platform})")
        print(f"   lost score: {lost_score} | values score: {values_score}")
        print(f"   url: {lost.get('url')}")
        if signal_descriptions:
            print(f"   why lost: {', '.join(signal_descriptions[:3])}")
        print()

    if args.verbose:
        print("-" * 60)
        print("these people need encouragement, not networking.")
        print("the goal: show them someone like them made it.")
        print("-" * 60)


def cmd_status(args, db):
    """show database stats"""
    init_users_table(db.conn)
    stats = db.stats()

    print("=" * 60)
    print("connectd status")
    print("=" * 60)

    # priority users
    users = get_priority_users(db.conn)
    print(f"\npriority users: {len(users)}")
    for user in users:
        print(f"  - {user['name']} ({user['email']})")

    print(f"\nhumans discovered: {stats['total_humans']}")
    print(f"  high-score (50+): {stats['high_score_humans']}")

    print("\nby platform:")
    for platform, count in stats.get('by_platform', {}).items():
        print(f"  {platform}: {count}")

    print(f"\nstranger matches: {stats['total_matches']}")
    print(f"intros created: {stats['total_intros']}")
    print(f"intros sent: {stats['sent_intros']}")

    # lost builder stats
    print("\n--- lost builder stats ---")
    print(f"active builders: {stats.get('active_builders', 0)}")
    print(f"lost builders: {stats.get('lost_builders', 0)}")
    print(f"recovering builders: {stats.get('recovering_builders', 0)}")
    print(f"high lost score (40+): {stats.get('high_lost_score', 0)}")
    print(f"lost outreach sent: {stats.get('lost_outreach_sent', 0)}")

    # priority user matches
    for user in users:
        matches = get_priority_user_matches(db.conn, user['id'])
        print(f"\nmatches for {user['name']}: {len(matches)}")

    # pending intros
    pending = get_pending_intros(db)
    print(f"\nintros pending review: {len(pending)}")


def cmd_daemon(args, db):
    """run as continuous daemon"""
    from daemon import ConnectDaemon

    daemon = ConnectDaemon(dry_run=args.dry_run)

    if args.oneshot:
        print("running one cycle...")
        if args.dry_run:
            print("*** DRY RUN MODE - no intros will be sent ***")
        daemon.scout_cycle()
        daemon.match_priority_users()
        daemon.match_strangers()
        daemon.send_stranger_intros()
        print("done")
    else:
        daemon.run()


def cmd_user(args, db):
    """manage priority user profile"""
    init_users_table(db.conn)

    if args.setup:
        # interactive setup
        print("=" * 60)
        print("connectd priority user setup")
        print("=" * 60)
        print("\nlink your profiles so connectd finds matches for YOU\n")

        name = input("name: ").strip()
        email = input("email: ").strip()
        github = input("github username: ").strip() or None
        reddit = input("reddit username: ").strip() or None
        mastodon = input("mastodon (user@instance): ").strip() or None
        location = input("location (e.g. seattle): ").strip() or None

        print("\ninterests (comma separated):")
        interests_raw = input("> ").strip()
        interests = [i.strip() for i in interests_raw.split(',')] if interests_raw else []

        looking_for = input("looking for: ").strip() or None

        user_data = {
            'name': name, 'email': email, 'github': github,
            'reddit': reddit, 'mastodon': mastodon,
            'location': location, 'interests': interests,
            'looking_for': looking_for,
        }
        user_id = add_priority_user(db.conn, user_data)
        print(f"\n✓ added as priority user #{user_id}")

    elif args.matches:
        # show matches
        users = get_priority_users(db.conn)
        if not users:
            print("no priority user. run: connectd user --setup")
            return

        for user in users:
            print(f"\n=== matches for {user['name']} ===\n")
            matches = get_priority_user_matches(db.conn, user['id'], limit=20)

            if not matches:
                print("no matches yet")
                continue

            for i, match in enumerate(matches, 1):
                print(f"{i}. {match['username']} ({match['platform']})")
                print(f"   {match['url']}")
                print(f"   score: {match['overlap_score']:.0f}")
                print()

    else:
        # show profile
        users = get_priority_users(db.conn)
        if not users:
            print("no priority user configured")
            print("run: connectd user --setup")
            return

        for user in users:
            print("=" * 60)
            print(f"priority user #{user['id']}: {user['name']}")
            print("=" * 60)
            print(f"email: {user['email']}")
            if user['github']:
                print(f"github: {user['github']}")
            if user['reddit']:
                print(f"reddit: {user['reddit']}")
            if user['mastodon']:
                print(f"mastodon: {user['mastodon']}")
            if user['location']:
                print(f"location: {user['location']}")
            if user['interests']:
                interests = json.loads(user['interests']) if isinstance(user['interests'], str) else user['interests']
                print(f"interests: {', '.join(interests)}")
            if user['looking_for']:
                print(f"looking for: {user['looking_for']}")


def cmd_me(args, db):
    """auto-score and auto-match for priority user with optional groq intros"""
    init_users_table(db.conn)

    # get priority user
    users = get_priority_users(db.conn)
    if not users:
        print("no priority user configured")
        print("run: connectd user --setup")
        return

    user = users[0]  # first/main user
    print("=" * 60)
    print(f"connectd me - {user['name']}")
    print("=" * 60)

    # step 1: scrape github profile
    if user.get('github') and not args.skip_scrape:
        print(f"\n[1/4] scraping github profile: {user['github']}")
        profile = deep_scrape_github_user(user['github'], scrape_commits=False)
        if profile:
            print(f"  repos: {len(profile.get('top_repos', []))}")
            print(f"  languages: {', '.join(list(profile.get('languages', {}).keys())[:5])}")
        else:
            print("  failed to scrape (rate limited?)")
            profile = None
    else:
        print("\n[1/4] skipping github scrape (using saved profile)")
        # use saved profile if available
        saved = user.get('scraped_profile')
        if saved:
            profile = json.loads(saved) if isinstance(saved, str) else saved
            print(f"  loaded saved profile: {len(profile.get('top_repos', []))} repos")
        else:
            profile = None

    # step 2: calculate score
    print(f"\n[2/4] calculating your score...")
    result = score_priority_user(db.conn, user['id'], profile)
    if result:
        print(f"  score: {result['score']}")
        print(f"  signals: {', '.join(sorted(result['signals'])[:10])}")

    # step 3: find matches
    print(f"\n[3/4] finding matches...")
    matches = auto_match_priority_user(db.conn, user['id'], min_overlap=args.min_overlap)
    print(f"  found {len(matches)} matches")

    # step 4: show results (optionally with groq intros)
    print(f"\n[4/4] top matches:")
    print("-" * 60)

    limit = args.limit or 10
    for i, m in enumerate(matches[:limit], 1):
        human = m['human']
        shared = m['shared']

        print(f"\n{i}. {human.get('name') or human['username']} ({human['platform']})")
        print(f"   {human.get('url', '')}")
        print(f"   score: {human.get('score', 0):.0f} | overlap: {m['overlap_score']:.0f}")
        print(f"   location: {human.get('location') or 'unknown'}")
        print(f"   why: {', '.join(shared[:5])}")

        # groq intro draft
        if args.groq:
            try:
                from introd.groq_draft import draft_intro_with_llm
                match_data = {
                    'human_a': {'name': user['name'], 'username': user.get('github'),
                                'platform': 'github', 'signals': result.get('signals', []) if result else [],
                                'bio': user.get('bio'), 'location': user.get('location'),
                                'extra': profile or {}},
                    'human_b': human,
                    'overlap_score': m['overlap_score'],
                    'overlap_reasons': shared,
                }
                intro, err = draft_intro_with_llm(match_data, recipient='b')
                if intro:
                    print(f"\n   --- groq draft ({intro.get('contact_method', 'manual')}) ---")
                    if intro.get('contact_info'):
                        print(f"   deliver via: {intro['contact_info']}")
                    for line in intro['draft'].split('\n'):
                        print(f"   {line}")
                    print(f"   ------------------")
                elif err:
                    print(f"   [groq error: {err}]")
            except Exception as e:
                print(f"   [groq error: {e}]")

    # summary
    print("\n" + "=" * 60)
    print(f"your score: {result['score'] if result else 'unknown'}")
    print(f"matches found: {len(matches)}")
    if args.groq:
        print("groq intros: enabled")
    else:
        print("tip: add --groq to generate ai intro drafts")


# NOTE(review): main() and the argparse subcommand wiring continue beyond this
# chunk of the file (cli.py is 878 lines; the remainder is not visible here)
# and are intentionally not reproduced.
def main():
    """CLI entry point: build the argparse tree and dispatch to cmd_* handlers."""
    parser = argparse.ArgumentParser(
        description='connectd - people discovery and matchmaking daemon',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    subparsers = parser.add_subparsers(dest='command', help='commands')

    # scout command - platform flags narrow discovery to one source
    scout_parser = subparsers.add_parser('scout', help='discover aligned humans')
    scout_parser.add_argument('--github', action='store_true', help='github only')
    scout_parser.add_argument('--reddit', action='store_true', help='reddit only')
    scout_parser.add_argument('--mastodon', action='store_true', help='mastodon only')
    scout_parser.add_argument('--lobsters', action='store_true', help='lobste.rs only')
    scout_parser.add_argument('--matrix', action='store_true', help='matrix only')
    scout_parser.add_argument('--twitter', action='store_true', help='twitter/x via nitter')
    scout_parser.add_argument('--bluesky', action='store_true', help='bluesky/atproto')
    scout_parser.add_argument('--lemmy', action='store_true', help='lemmy (fediverse reddit)')
    scout_parser.add_argument('--discord', action='store_true', help='discord servers')
    scout_parser.add_argument('--deep', action='store_true', help='deep scrape - follow all links')
    scout_parser.add_argument('--user', type=str, help='deep scrape specific github user')
    scout_parser.add_argument('--lost', action='store_true', help='show lost builder stats')

    # match command
    match_parser = subparsers.add_parser('match', help='find and rank matches')
    match_parser.add_argument('--top', type=int, help='show top N matches')
    match_parser.add_argument('--mine', action='store_true', help='show YOUR matches')
    match_parser.add_argument('--lost', action='store_true', help='find matches for lost builders')
    match_parser.add_argument('--min-score', type=int, default=30, help='min human score')
    match_parser.add_argument('--min-overlap', type=int, default=20, help='min overlap score')

    # intro command
    intro_parser = subparsers.add_parser('intro', help='generate intro drafts')
    intro_parser.add_argument('--match', type=int, help='specific match id')
    intro_parser.add_argument('--limit', type=int, default=10, help='number of matches')
    intro_parser.add_argument('--dry-run', action='store_true', help='preview only, do not save')
    intro_parser.add_argument('--lost', action='store_true', help='generate intros for lost builders')

    # lost command - show lost builders ready for outreach
    lost_parser = subparsers.add_parser('lost', help='show lost builders who need encouragement')
    lost_parser.add_argument('--min-score', type=int, default=40, help='min lost score')
    lost_parser.add_argument('--limit', type=int, default=50, help='max results')
    lost_parser.add_argument('--verbose', '-v', action='store_true', help='show philosophy')

    # review command (no options)
    # fix: drop unused `review_parser = ...` / `status_parser = ...` bindings
    subparsers.add_parser('review', help='review intro queue')

    # send command
    send_parser = subparsers.add_parser('send', help='send approved intros')
    send_parser.add_argument('--export', type=str, help='export to file for manual sending')

    # status command (no options)
    subparsers.add_parser('status', help='show stats')

    # daemon command
    daemon_parser = subparsers.add_parser('daemon', help='run as continuous daemon')
    daemon_parser.add_argument('--oneshot', action='store_true', help='run once then exit')
    daemon_parser.add_argument('--dry-run', action='store_true', help='preview intros, do not send')

    # user command
    user_parser = subparsers.add_parser('user', help='manage priority user profile')
    user_parser.add_argument('--setup', action='store_true', help='setup/update profile')
    user_parser.add_argument('--matches', action='store_true', help='show your matches')

    # me command - auto score + match + optional groq intros
    me_parser = subparsers.add_parser('me', help='auto-score and match yourself')
    me_parser.add_argument('--groq', action='store_true', help='generate groq llama intro drafts')
    me_parser.add_argument('--skip-scrape', action='store_true', help='skip github scraping')
    me_parser.add_argument('--min-overlap', type=int, default=40, help='min overlap score')
    me_parser.add_argument('--limit', type=int, default=10, help='number of matches to show')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # init database
    db = Database()

    # dispatch table replaces the if/elif ladder; argparse restricts
    # args.command to these keys, so a KeyError is unreachable
    handlers = {
        'scout': cmd_scout,
        'match': cmd_match,
        'intro': cmd_intro,
        'review': cmd_review,
        'send': cmd_send,
        'status': cmd_status,
        'daemon': cmd_daemon,
        'user': cmd_user,
        'me': cmd_me,
        'lost': cmd_lost,
    }

    try:
        handlers[args.command](args, db)
    finally:
        # always release the sqlite connection, even if a handler raises
        db.close()


if __name__ == '__main__':
    main()
# === LOST BUILDER CONFIG ===
# runtime-mutable via update_lost_config(); read via get_lost_config()
LOST_CONFIG = {
    # detection thresholds
    'min_lost_score': 40,       # minimum lost_potential_score
    'min_values_score': 20,     # must have SOME values alignment

    # outreach settings
    'enabled': True,
    'max_per_day': 5,           # lower volume, higher care
    'require_review': False,    # fully autonomous
    'cooldown_days': 90,        # don't spam struggling people

    # matching settings
    'min_builder_score': 50,    # inspiring builders must be active
    'min_match_overlap': 10,    # must have SOME shared interests

    # LLM drafting
    'use_llm': True,
    'llm_temperature': 0.7,     # be genuine, not robotic

    # message guidelines (for LLM prompt)
    'tone': 'genuine, not salesy',
    'max_words': 150,           # they don't have energy for long messages
    'no_pressure': True,        # never pushy
    'sign_off': '- connectd',
}


# === API CREDENTIALS ===
# all credentials from environment variables - no defaults

GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '')
GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions'
GROQ_MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile')

GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '')
MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '')

BLUESKY_HANDLE = os.environ.get('BLUESKY_HANDLE', '')
BLUESKY_APP_PASSWORD = os.environ.get('BLUESKY_APP_PASSWORD', '')

MATRIX_HOMESERVER = os.environ.get('MATRIX_HOMESERVER', '')
MATRIX_USER_ID = os.environ.get('MATRIX_USER_ID', '')
MATRIX_ACCESS_TOKEN = os.environ.get('MATRIX_ACCESS_TOKEN', '')

DISCORD_BOT_TOKEN = os.environ.get('DISCORD_BOT_TOKEN', '')
DISCORD_TARGET_SERVERS = os.environ.get('DISCORD_TARGET_SERVERS', '')

# lemmy (for authenticated access to private instance)
LEMMY_INSTANCE = os.environ.get('LEMMY_INSTANCE', '')
LEMMY_USERNAME = os.environ.get('LEMMY_USERNAME', '')
LEMMY_PASSWORD = os.environ.get('LEMMY_PASSWORD', '')

# email (for sending intros)
SMTP_HOST = os.environ.get('SMTP_HOST', '')
# NOTE(review): int() raises ValueError if SMTP_PORT is set to a
# non-numeric string - presumably acceptable fail-fast; confirm
SMTP_PORT = int(os.environ.get('SMTP_PORT', '465'))
SMTP_USER = os.environ.get('SMTP_USER', '')
SMTP_PASS = os.environ.get('SMTP_PASS', '')

# === HOST USER CONFIG ===
# the person running connectd - gets priority matching
HOST_USER = os.environ.get('HOST_USER', '')            # alias like sudoxnym
HOST_NAME = os.environ.get('HOST_NAME', '')
HOST_EMAIL = os.environ.get('HOST_EMAIL', '')
HOST_GITHUB = os.environ.get('HOST_GITHUB', '')
HOST_MASTODON = os.environ.get('HOST_MASTODON', '')    # user@instance
HOST_REDDIT = os.environ.get('HOST_REDDIT', '')
HOST_LEMMY = os.environ.get('HOST_LEMMY', '')          # user@instance
HOST_LOBSTERS = os.environ.get('HOST_LOBSTERS', '')
HOST_MATRIX = os.environ.get('HOST_MATRIX', '')        # @user:server
HOST_DISCORD = os.environ.get('HOST_DISCORD', '')      # user id
HOST_BLUESKY = os.environ.get('HOST_BLUESKY', '')      # handle.bsky.social
HOST_LOCATION = os.environ.get('HOST_LOCATION', '')
HOST_INTERESTS = os.environ.get('HOST_INTERESTS', '')  # comma separated
HOST_LOOKING_FOR = os.environ.get('HOST_LOOKING_FOR', '')


def get_lost_config():
    """Return a shallow copy of the lost builder configuration.

    Callers get a copy so mutating the result does not affect the live config.
    """
    return LOST_CONFIG.copy()


def update_lost_config(updates):
    """Merge *updates* into the live lost builder config; return a copy.

    fix: dropped the unnecessary `global LOST_CONFIG` - the dict is
    mutated in place via .update(), never rebound.
    """
    LOST_CONFIG.update(updates)
    return LOST_CONFIG.copy()
+url: "https://github.com/sudoxnym/connectd" +arch: + - amd64 + - aarch64 + - armv7 +startup: application +boot: auto +ports: + 8099/tcp: 8099 +ports_description: + 8099/tcp: "connectd API (for HACS integration)" +map: + - config:rw +options: + host_user: "" + host_name: "" + host_email: "" + host_mastodon: "" + host_reddit: "" + host_lemmy: "" + host_lobsters: "" + host_matrix: "" + host_discord: "" + host_bluesky: "" + host_location: "" + host_interests: "" + host_looking_for: "" + github_token: "" + groq_api_key: "" + mastodon_token: "" + mastodon_instance: "" + discord_bot_token: "" + discord_target_servers: "" + lemmy_instance: "" + lemmy_username: "" + lemmy_password: "" + smtp_host: "" + smtp_port: 465 + smtp_user: "" + smtp_pass: "" +schema: + host_user: str? + host_name: str? + host_email: email? + host_mastodon: str? + host_reddit: str? + host_lemmy: str? + host_lobsters: str? + host_matrix: str? + host_discord: str? + host_bluesky: str? + host_location: str? + host_interests: str? + host_looking_for: str? + github_token: str? + groq_api_key: str? + mastodon_token: str? + mastodon_instance: str? + discord_bot_token: str? + discord_target_servers: str? + lemmy_instance: str? + lemmy_username: str? + lemmy_password: str? + smtp_host: str? + smtp_port: int? + smtp_user: str? + smtp_pass: str? +image: sudoxreboot/connectd-addon-{arch} diff --git a/connectd/daemon.py b/connectd/daemon.py new file mode 100644 index 0000000..932c063 --- /dev/null +++ b/connectd/daemon.py @@ -0,0 +1,546 @@ +#!/usr/bin/env python3 +""" +connectd daemon - continuous discovery and matchmaking + +two modes of operation: +1. priority matching: find matches FOR hosts who run connectd +2. 
import time
import json
import signal
import sys
from datetime import datetime, timedelta
from pathlib import Path

from db import Database
from db.users import (init_users_table, get_priority_users, save_priority_match,
                      get_priority_user_matches, discover_host_user)
from scoutd import scrape_github, scrape_reddit, scrape_mastodon, scrape_lobsters, scrape_lemmy, scrape_discord
from scoutd.github import analyze_github_user, get_github_user
from scoutd.signals import analyze_text
from matchd.fingerprint import generate_fingerprint, fingerprint_similarity
from matchd.overlap import find_overlap
from matchd.lost import find_matches_for_lost_builders
from introd.draft import draft_intro, summarize_human, summarize_overlap
from introd.lost_intro import draft_lost_intro, get_lost_intro_config
from introd.send import send_email
from introd.deliver import deliver_intro, determine_best_contact
# fix: merged the two separate `from config import ...` lines
from config import HOST_USER, get_lost_config
from api import start_api_thread, update_daemon_state

# daemon config (seconds / counts); duplicates constants in config.py -
# NOTE(review): consider importing from config to keep one source of truth
SCOUT_INTERVAL = 3600 * 4     # full scout every 4 hours
MATCH_INTERVAL = 3600         # check matches every hour
INTRO_INTERVAL = 3600 * 2     # send intros every 2 hours
LOST_INTERVAL = 3600 * 6      # lost builder outreach every 6 hours (lower volume)
MAX_INTROS_PER_DAY = 20       # rate limit outreach
MIN_OVERLAP_PRIORITY = 30     # min score for priority user matches
MIN_OVERLAP_STRANGERS = 50    # higher bar for stranger intros


class ConnectDaemon:
    """Continuous discovery + matchmaking loop.

    Four periodic cycles: scout (discover humans), match (priority users
    and strangers), intro (stranger outreach), lost (lost-builder
    encouragement). State is exposed to the HA integration via the api
    module.
    """

    def __init__(self, dry_run=False):
        self.db = Database()
        init_users_table(self.db.conn)
        self.running = True
        self.dry_run = dry_run
        self.started_at = datetime.now()
        # last-run timestamps; None means the cycle has never run
        self.last_scout = None
        self.last_match = None
        self.last_intro = None
        self.last_lost = None
        self.intros_today = 0
        self.lost_intros_today = 0
        self.today = datetime.now().date()

        # handle shutdown gracefully
        signal.signal(signal.SIGINT, self._shutdown)
        signal.signal(signal.SIGTERM, self._shutdown)

        # auto-discover host user from env
        if HOST_USER:
            self.log(f"HOST_USER set: {HOST_USER}")
            discover_host_user(self.db.conn, HOST_USER)

        # update API state
        self._update_api_state()

    def _shutdown(self, signum, frame):
        """signal handler: request loop exit and publish final state"""
        print("\nconnectd: shutting down...")
        self.running = False
        self._update_api_state()

    def _update_api_state(self):
        """update API state for HA integration"""
        now = datetime.now()

        # calculate countdowns - if no cycle has run, use started_at
        def secs_until(last, interval):
            base = last if last else self.started_at
            next_run = base + timedelta(seconds=interval)
            remaining = (next_run - now).total_seconds()
            return max(0, int(remaining))

        update_daemon_state({
            'running': self.running,
            'dry_run': self.dry_run,
            'last_scout': self.last_scout.isoformat() if self.last_scout else None,
            'last_match': self.last_match.isoformat() if self.last_match else None,
            'last_intro': self.last_intro.isoformat() if self.last_intro else None,
            'last_lost': self.last_lost.isoformat() if self.last_lost else None,
            'intros_today': self.intros_today,
            'lost_intros_today': self.lost_intros_today,
            'started_at': self.started_at.isoformat(),
            'countdown_scout': secs_until(self.last_scout, SCOUT_INTERVAL),
            'countdown_match': secs_until(self.last_match, MATCH_INTERVAL),
            'countdown_intro': secs_until(self.last_intro, INTRO_INTERVAL),
            'countdown_lost': secs_until(self.last_lost, LOST_INTERVAL),
        })

    def log(self, msg):
        """timestamped log"""
        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {msg}")

    def reset_daily_limits(self):
        """reset daily intro count when the calendar day rolls over"""
        if datetime.now().date() != self.today:
            self.today = datetime.now().date()
            self.intros_today = 0
            self.lost_intros_today = 0
            self.log("reset daily intro limits")

    def scout_cycle(self):
        """run discovery on all platforms; each scraper fails independently"""
        self.log("starting scout cycle...")

        try:
            scrape_github(self.db, limit_per_source=30)
        except Exception as e:
            self.log(f"github scout error: {e}")

        try:
            scrape_reddit(self.db, limit_per_sub=30)
        except Exception as e:
            self.log(f"reddit scout error: {e}")

        try:
            scrape_mastodon(self.db, limit_per_instance=30)
        except Exception as e:
            self.log(f"mastodon scout error: {e}")

        try:
            scrape_lobsters(self.db)
        except Exception as e:
            self.log(f"lobsters scout error: {e}")

        try:
            scrape_lemmy(self.db, limit_per_community=30)
        except Exception as e:
            self.log(f"lemmy scout error: {e}")

        try:
            scrape_discord(self.db, limit_per_channel=50)
        except Exception as e:
            self.log(f"discord scout error: {e}")

        self.last_scout = datetime.now()
        stats = self.db.stats()
        self.log(f"scout complete: {stats['total_humans']} humans in db")

    def match_priority_users(self):
        """find matches for priority users (hosts)"""
        priority_users = get_priority_users(self.db.conn)

        if not priority_users:
            return

        self.log(f"matching for {len(priority_users)} priority users...")

        humans = self.db.get_all_humans(min_score=20, limit=500)

        for puser in priority_users:
            # build priority user's signal set from their linked profiles
            puser_signals = []
            puser_text = []

            if puser.get('bio'):
                puser_text.append(puser['bio'])
            if puser.get('interests'):
                # interests may be stored as a JSON string or already a list
                interests = json.loads(puser['interests']) if isinstance(puser['interests'], str) else puser['interests']
                puser_signals.extend(interests)
            if puser.get('looking_for'):
                puser_text.append(puser['looking_for'])

            # analyze their linked github if available
            if puser.get('github'):
                gh_user = analyze_github_user(puser['github'])
                if gh_user:
                    puser_signals.extend(gh_user.get('signals', []))

            # NOTE(review): this fingerprint is built but not used by the
            # overlap loop below - presumably a future hook; confirm
            puser_fingerprint = {
                'values_vector': {},
                'skills': {},
                'interests': list(set(puser_signals)),
                'location_pref': 'pnw' if puser.get('location') and 'seattle' in puser['location'].lower() else None,
            }

            # score text
            if puser_text:
                _, text_signals, _ = analyze_text(' '.join(puser_text))
                puser_signals.extend(text_signals)

            # find matches
            matches_found = 0
            for human in humans:
                # skip if it's their own profile on another platform
                human_user = human.get('username', '').lower()
                if puser.get('github') and human_user == puser['github'].lower():
                    continue
                if puser.get('reddit') and human_user == puser['reddit'].lower():
                    continue
                if puser.get('mastodon') and human_user == puser['mastodon'].lower().split('@')[0]:
                    continue

                # calculate overlap: 10 points per shared signal
                human_signals = human.get('signals', [])
                if isinstance(human_signals, str):
                    human_signals = json.loads(human_signals)

                shared = set(puser_signals) & set(human_signals)
                overlap_score = len(shared) * 10

                # location bonus
                if puser.get('location') and human.get('location'):
                    if 'seattle' in human['location'].lower() or 'pnw' in human['location'].lower():
                        overlap_score += 20

                if overlap_score >= MIN_OVERLAP_PRIORITY:
                    overlap_data = {
                        'overlap_score': overlap_score,
                        'overlap_reasons': [f"shared: {', '.join(list(shared)[:5])}"] if shared else [],
                    }
                    save_priority_match(self.db.conn, puser['id'], human['id'], overlap_data)
                    matches_found += 1

            if matches_found:
                self.log(f"  found {matches_found} matches for {puser['name'] or puser['email']}")

    def match_strangers(self):
        """find matches between discovered humans (altruistic)"""
        self.log("matching strangers...")

        humans = self.db.get_all_humans(min_score=40, limit=200)

        if len(humans) < 2:
            return

        # generate fingerprints once per human
        fingerprints = {}
        for human in humans:
            fp = generate_fingerprint(human)
            fingerprints[human['id']] = fp

        # find pairs
        matches_found = 0
        from itertools import combinations

        for human_a, human_b in combinations(humans, 2):
            # skip same platform same user
            if human_a['platform'] == human_b['platform']:
                if human_a['username'] == human_b['username']:
                    continue

            fp_a = fingerprints.get(human_a['id'])
            fp_b = fingerprints.get(human_b['id'])

            overlap = find_overlap(human_a, human_b, fp_a, fp_b)

            if overlap['overlap_score'] >= MIN_OVERLAP_STRANGERS:
                # save match
                self.db.save_match(human_a['id'], human_b['id'], overlap)
                matches_found += 1

        if matches_found:
            self.log(f"found {matches_found} stranger matches")

        self.last_match = datetime.now()

    def send_stranger_intros(self):
        """send intros to connect strangers (or preview in dry-run mode)"""
        self.reset_daily_limits()

        if not self.dry_run and self.intros_today >= MAX_INTROS_PER_DAY:
            self.log("daily intro limit reached")
            return

        # get unsent matches (top 10 by overlap), with both humans joined in
        c = self.db.conn.cursor()
        c.execute('''SELECT m.*,
                            ha.id as a_id, ha.username as a_user, ha.platform as a_platform,
                            ha.name as a_name, ha.url as a_url, ha.contact as a_contact,
                            ha.signals as a_signals, ha.extra as a_extra,
                            hb.id as b_id, hb.username as b_user, hb.platform as b_platform,
                            hb.name as b_name, hb.url as b_url, hb.contact as b_contact,
                            hb.signals as b_signals, hb.extra as b_extra
                     FROM matches m
                     JOIN humans ha ON m.human_a_id = ha.id
                     JOIN humans hb ON m.human_b_id = hb.id
                     WHERE m.status = 'pending'
                     ORDER BY m.overlap_score DESC
                     LIMIT 10''')

        matches = c.fetchall()

        if self.dry_run:
            self.log(f"DRY RUN: previewing {len(matches)} potential intros")

        for match in matches:
            if not self.dry_run and self.intros_today >= MAX_INTROS_PER_DAY:
                break

            match = dict(match)

            # build human dicts
            human_a = {
                'id': match['a_id'],
                'username': match['a_user'],
                'platform': match['a_platform'],
                'name': match['a_name'],
                'url': match['a_url'],
                'contact': match['a_contact'],
                'signals': match['a_signals'],
                'extra': match['a_extra'],
            }
            human_b = {
                'id': match['b_id'],
                'username': match['b_user'],
                'platform': match['b_platform'],
                'name': match['b_name'],
                'url': match['b_url'],
                'contact': match['b_contact'],
                'signals': match['b_signals'],
                'extra': match['b_extra'],
            }

            match_data = {
                'id': match['id'],
                'human_a': human_a,
                'human_b': human_b,
                'overlap_score': match['overlap_score'],
                'overlap_reasons': match['overlap_reasons'],
            }

            # try to send intro to the first of the pair who has an email
            for recipient, other in [(human_a, human_b), (human_b, human_a)]:
                contact = recipient.get('contact', {})
                if isinstance(contact, str):
                    contact = json.loads(contact)

                email = contact.get('email')
                if not email:
                    continue

                # draft intro
                intro = draft_intro(match_data, recipient='a' if recipient == human_a else 'b')

                # parse overlap reasons for display
                reasons = match['overlap_reasons']
                if isinstance(reasons, str):
                    reasons = json.loads(reasons)
                reason_summary = ', '.join(reasons[:3]) if reasons else 'aligned values'

                if self.dry_run:
                    # print preview
                    print("\n" + "=" * 60)
                    print(f"TO: {recipient['username']} ({recipient['platform']})")
                    print(f"EMAIL: {email}")
                    print(f"SUBJECT: you might want to meet {other['username']}")
                    print(f"SCORE: {match['overlap_score']:.0f} ({reason_summary})")
                    print("-" * 60)
                    print("MESSAGE:")
                    print(intro['draft'])
                    print("-" * 60)
                    print("[DRY RUN - NOT SENT]")
                    print("=" * 60)
                    break
                else:
                    # actually send
                    success, error = send_email(
                        email,
                        f"connectd: you might want to meet {other['username']}",
                        intro['draft']
                    )

                    if success:
                        self.log(f"sent intro to {recipient['username']} ({email})")
                        self.intros_today += 1

                        # mark match as intro_sent
                        # fix: bind the value instead of a double-quoted SQL
                        # "string" (double quotes are identifiers in SQL)
                        c.execute('UPDATE matches SET status = ? WHERE id = ?',
                                  ('intro_sent', match['id']))
                        self.db.conn.commit()
                        break
                    else:
                        self.log(f"failed to send to {email}: {error}")

        self.last_intro = datetime.now()

    def send_lost_builder_intros(self):
        """
        reach out to lost builders - different tone, lower volume.
        these people need encouragement, not networking.
        """
        self.reset_daily_limits()

        lost_config = get_lost_config()

        if not lost_config.get('enabled', True):
            return

        max_per_day = lost_config.get('max_per_day', 5)
        if not self.dry_run and self.lost_intros_today >= max_per_day:
            self.log("daily lost builder intro limit reached")
            return

        # find lost builders with matching active builders
        matches, error = find_matches_for_lost_builders(
            self.db,
            min_lost_score=lost_config.get('min_lost_score', 40),
            min_values_score=lost_config.get('min_values_score', 20),
            limit=max_per_day - self.lost_intros_today
        )

        if error:
            self.log(f"lost builder matching error: {error}")
            return

        if not matches:
            self.log("no lost builders ready for outreach")
            return

        if self.dry_run:
            self.log(f"DRY RUN: previewing {len(matches)} lost builder intros")

        for match in matches:
            if not self.dry_run and self.lost_intros_today >= max_per_day:
                break

            lost = match['lost_user']
            builder = match['inspiring_builder']

            lost_name = lost.get('name') or lost.get('username')
            builder_name = builder.get('name') or builder.get('username')

            # draft intro
            draft, draft_error = draft_lost_intro(lost, builder, lost_config)

            if draft_error:
                self.log(f"error drafting lost intro for {lost_name}: {draft_error}")
                continue

            # determine best contact method (activity-based)
            method, contact_info = determine_best_contact(lost)

            if self.dry_run:
                print("\n" + "=" * 60)
                print("LOST BUILDER OUTREACH")
                print("=" * 60)
                print(f"TO: {lost_name} ({lost.get('platform')})")
                print(f"DELIVERY: {method} → {contact_info}")
                print(f"LOST SCORE: {lost.get('lost_potential_score', 0)}")
                print(f"VALUES SCORE: {lost.get('score', 0)}")
                print(f"INSPIRING BUILDER: {builder_name}")
                print(f"SHARED INTERESTS: {', '.join(match.get('shared_interests', []))}")
                print("-" * 60)
                print("MESSAGE:")
                print(draft)
                print("-" * 60)
                print("[DRY RUN - NOT SENT]")
                print("=" * 60)
            else:
                # build match data for unified delivery
                match_data = {
                    'human_a': builder,  # inspiring builder
                    'human_b': lost,     # lost builder (recipient)
                    'overlap_score': match.get('match_score', 0),
                    'overlap_reasons': match.get('shared_interests', []),
                }

                success, error, delivery_method = deliver_intro(match_data, draft)

                if success:
                    self.log(f"sent lost builder intro to {lost_name} via {delivery_method}")
                    self.lost_intros_today += 1
                    self.db.mark_lost_outreach(lost['id'])
                else:
                    self.log(f"failed to reach {lost_name} via {delivery_method}: {error}")

        self.last_lost = datetime.now()
        self.log(f"lost builder cycle complete: {self.lost_intros_today} sent today")

    def run(self):
        """main daemon loop"""
        self.log("connectd daemon starting...")

        # start API server
        start_api_thread()
        self.log("api server started on port 8099")

        if self.dry_run:
            self.log("*** DRY RUN MODE - no intros will be sent ***")
        self.log(f"scout interval: {SCOUT_INTERVAL}s")
        self.log(f"match interval: {MATCH_INTERVAL}s")
        self.log(f"intro interval: {INTRO_INTERVAL}s")
        self.log(f"lost interval: {LOST_INTERVAL}s")
        self.log(f"max intros/day: {MAX_INTROS_PER_DAY}")

        # initial scout
        self.scout_cycle()
        self._update_api_state()

        # fix: the original compared `(now - last).seconds`, which wraps at
        # 24 h (timedelta.seconds is only the sub-day remainder), so a gap
        # longer than a day would silently postpone every cycle.
        # total_seconds() measures the full elapsed time.
        while self.running:
            now = datetime.now()

            # scout cycle
            if not self.last_scout or (now - self.last_scout).total_seconds() >= SCOUT_INTERVAL:
                self.scout_cycle()
                self._update_api_state()

            # match cycle
            if not self.last_match or (now - self.last_match).total_seconds() >= MATCH_INTERVAL:
                self.match_priority_users()
                self.match_strangers()
                self._update_api_state()

            # intro cycle
            if not self.last_intro or (now - self.last_intro).total_seconds() >= INTRO_INTERVAL:
                self.send_stranger_intros()
                self._update_api_state()

            # lost builder cycle
            if not self.last_lost or (now - self.last_lost).total_seconds() >= LOST_INTERVAL:
                self.send_lost_builder_intros()
                self._update_api_state()

            # sleep between checks
            time.sleep(60)

        self.log("connectd daemon stopped")
        self.db.close()
"""
connectd database layer
sqlite storage for humans, fingerprints, matches, intros
"""

import os
import sqlite3
import json
from datetime import datetime
from pathlib import Path

# use env var for DB path (docker) or default to local
DB_PATH = Path(os.environ.get('DB_PATH', Path(__file__).parent / 'connectd.db'))


class Database:
    """Thin sqlite wrapper; creates/migrates the schema on construction."""

    def __init__(self, path=None):
        self.path = path or DB_PATH
        self.conn = sqlite3.connect(self.path)
        # rows come back as sqlite3.Row so callers can dict() them
        self.conn.row_factory = sqlite3.Row
        self._init_tables()

    def _init_tables(self):
        """create all tables if missing and apply column migrations"""
        c = self.conn.cursor()

        # humans table - all discovered people
        c.execute('''CREATE TABLE IF NOT EXISTS humans (
            id INTEGER PRIMARY KEY,
            platform TEXT NOT NULL,
            username TEXT NOT NULL,
            url TEXT,
            name TEXT,
            bio TEXT,
            location TEXT,
            score REAL DEFAULT 0,
            confidence REAL DEFAULT 0,
            signals TEXT,
            negative_signals TEXT,
            reasons TEXT,
            contact TEXT,
            extra TEXT,
            fingerprint_id INTEGER,
            scraped_at TEXT,
            updated_at TEXT,
            lost_potential_score REAL DEFAULT 0,
            lost_signals TEXT,
            user_type TEXT DEFAULT 'none',
            last_lost_outreach TEXT,
            UNIQUE(platform, username)
        )''')

        # migration: add new columns if they don't exist
        try:
            c.execute('ALTER TABLE humans ADD COLUMN lost_potential_score REAL DEFAULT 0')
        except sqlite3.OperationalError:
            pass  # column exists

        try:
            c.execute('ALTER TABLE humans ADD COLUMN lost_signals TEXT')
        except sqlite3.OperationalError:
            pass

        try:
            # fix: single-quoted SQL string literal; double quotes denote
            # identifiers in standard SQL and only work via a sqlite quirk
            c.execute("ALTER TABLE humans ADD COLUMN user_type TEXT DEFAULT 'none'")
        except sqlite3.OperationalError:
            pass

        try:
            c.execute('ALTER TABLE humans ADD COLUMN last_lost_outreach TEXT')
        except sqlite3.OperationalError:
            pass

        # fingerprints table - values profiles
        c.execute('''CREATE TABLE IF NOT EXISTS fingerprints (
            id INTEGER PRIMARY KEY,
            human_id INTEGER,
            values_vector TEXT,
            skills TEXT,
            interests TEXT,
            location_pref TEXT,
            availability TEXT,
            generated_at TEXT,
            FOREIGN KEY(human_id) REFERENCES humans(id)
        )''')

        # matches table - paired humans
        c.execute('''CREATE TABLE IF NOT EXISTS matches (
            id INTEGER PRIMARY KEY,
            human_a_id INTEGER,
            human_b_id INTEGER,
            overlap_score REAL,
            overlap_reasons TEXT,
            complementary_skills TEXT,
            geographic_match INTEGER,
            status TEXT DEFAULT 'pending',
            created_at TEXT,
            reviewed_at TEXT,
            FOREIGN KEY(human_a_id) REFERENCES humans(id),
            FOREIGN KEY(human_b_id) REFERENCES humans(id),
            UNIQUE(human_a_id, human_b_id)
        )''')

        # intros table - outreach attempts
        c.execute('''CREATE TABLE IF NOT EXISTS intros (
            id INTEGER PRIMARY KEY,
            match_id INTEGER,
            recipient_human_id INTEGER,
            channel TEXT,
            draft TEXT,
            status TEXT DEFAULT 'draft',
            approved_by TEXT,
            approved_at TEXT,
            sent_at TEXT,
            response TEXT,
            response_at TEXT,
            FOREIGN KEY(match_id) REFERENCES matches(id),
            FOREIGN KEY(recipient_human_id) REFERENCES humans(id)
        )''')

        # cross-platform links
        c.execute('''CREATE TABLE IF NOT EXISTS cross_platform (
            id INTEGER PRIMARY KEY,
            human_a_id INTEGER,
            human_b_id INTEGER,
            confidence REAL,
            reason TEXT,
            FOREIGN KEY(human_a_id) REFERENCES humans(id),
            FOREIGN KEY(human_b_id) REFERENCES humans(id),
            UNIQUE(human_a_id, human_b_id)
        )''')

        self.conn.commit()

    def save_human(self, data):
        """save or update a human record.

        NOTE(review): INSERT OR REPLACE deletes and re-inserts the row,
        which resets fingerprint_id / last_lost_outreach and changes the
        rowid on update - confirm that is intended.
        """
        c = self.conn.cursor()

        # fields stored in dedicated columns; everything else goes into
        # the `extra` json blob
        exclude_fields = ['platform', 'username', 'url', 'name', 'bio',
                          'location', 'score', 'confidence', 'signals',
                          'negative_signals', 'reasons', 'contact',
                          'lost_potential_score', 'lost_signals', 'user_type']

        c.execute('''INSERT OR REPLACE INTO humans
                     (platform, username, url, name, bio, location, score, confidence,
                      signals, negative_signals, reasons, contact, extra, scraped_at, updated_at,
                      lost_potential_score, lost_signals, user_type)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                  (data.get('platform'),
                   data.get('username'),
                   data.get('url'),
                   data.get('name'),
                   data.get('bio'),
                   data.get('location'),
                   data.get('score', 0),
                   data.get('confidence', 0),
                   json.dumps(data.get('signals', [])),
                   json.dumps(data.get('negative_signals', [])),
                   json.dumps(data.get('reasons', [])),
                   json.dumps(data.get('contact', {})),
                   json.dumps({k: v for k, v in data.items() if k not in exclude_fields}),
                   data.get('scraped_at', datetime.now().isoformat()),
                   datetime.now().isoformat(),
                   data.get('lost_potential_score', 0),
                   json.dumps(data.get('lost_signals', [])),
                   data.get('user_type', 'none')))

        self.conn.commit()
        return c.lastrowid

    def get_human(self, platform, username):
        """get a human by platform and username; None if absent"""
        c = self.conn.cursor()
        c.execute('SELECT * FROM humans WHERE platform = ? AND username = ?',
                  (platform, username))
        row = c.fetchone()
        return dict(row) if row else None

    def get_human_by_id(self, human_id):
        """get a human by id; None if absent"""
        c = self.conn.cursor()
        c.execute('SELECT * FROM humans WHERE id = ?', (human_id,))
        row = c.fetchone()
        return dict(row) if row else None

    def get_all_humans(self, min_score=0, limit=1000):
        """get all humans above score threshold, best first"""
        c = self.conn.cursor()
        c.execute('''SELECT * FROM humans
                     WHERE score >= ?
                     ORDER BY score DESC, confidence DESC
                     LIMIT ?''', (min_score, limit))
        return [dict(row) for row in c.fetchall()]

    def get_humans_by_platform(self, platform, min_score=0, limit=500):
        """get humans for a specific platform"""
        c = self.conn.cursor()
        c.execute('''SELECT * FROM humans
                     WHERE platform = ? AND score >= ?
                     ORDER BY score DESC
                     LIMIT ?''', (platform, min_score, limit))
        return [dict(row) for row in c.fetchall()]

    def get_lost_builders(self, min_lost_score=40, min_values_score=20, limit=100):
        """get lost builders who need encouragement.

        fix: the original WHERE clause was
            user_type = 'lost' OR user_type = 'both' AND ...
        AND binds tighter than OR, so every user_type='lost' row bypassed
        BOTH score filters. Parenthesized to match the (correct) sibling
        query in get_lost_builders_for_outreach.
        """
        c = self.conn.cursor()
        c.execute('''SELECT * FROM humans
                     WHERE (user_type = 'lost' OR user_type = 'both')
                     AND lost_potential_score >= ?
                     AND score >= ?
                     ORDER BY lost_potential_score DESC, score DESC
                     LIMIT ?''', (min_lost_score, min_values_score, limit))
        return [dict(row) for row in c.fetchall()]

    def get_lost_builders_for_outreach(self, min_lost_score=40, min_values_score=20,
                                       cooldown_days=90, limit=50):
        """get lost builders who are ready for outreach (respecting cooldown)"""
        c = self.conn.cursor()
        c.execute('''SELECT * FROM humans
                     WHERE (user_type = 'lost' OR user_type = 'both')
                     AND lost_potential_score >= ?
                     AND score >= ?
                     AND (last_lost_outreach IS NULL
                          OR datetime(last_lost_outreach) < datetime('now', '-' || ? || ' days'))
                     ORDER BY lost_potential_score DESC, score DESC
                     LIMIT ?''', (min_lost_score, min_values_score, cooldown_days, limit))
        return [dict(row) for row in c.fetchall()]
    def mark_lost_outreach(self, human_id):
        """mark that we reached out to a lost builder (stamps now() as ISO text)"""
        c = self.conn.cursor()
        c.execute('''UPDATE humans SET last_lost_outreach = ? WHERE id = ?''',
                  (datetime.now().isoformat(), human_id))
        self.conn.commit()

    def save_fingerprint(self, human_id, fingerprint_data):
        """save a fingerprint (values/skills/interests profile) for a human.

        NOTE(review): the fingerprints table has no visible UNIQUE(human_id)
        constraint, so INSERT OR REPLACE never conflicts and repeated calls
        will accumulate duplicate rows per human — confirm the schema intent.
        """
        c = self.conn.cursor()
        c.execute('''INSERT OR REPLACE INTO fingerprints
                     (human_id, values_vector, skills, interests, location_pref, availability, generated_at)
                     VALUES (?, ?, ?, ?, ?, ?, ?)''',
                  (human_id,
                   json.dumps(fingerprint_data.get('values_vector', {})),
                   json.dumps(fingerprint_data.get('skills', [])),
                   json.dumps(fingerprint_data.get('interests', [])),
                   fingerprint_data.get('location_pref'),
                   fingerprint_data.get('availability'),
                   datetime.now().isoformat()))

        # update human's fingerprint_id (c.lastrowid is the INSERT above —
        # this must stay immediately after it, before any other execute)
        c.execute('UPDATE humans SET fingerprint_id = ? WHERE id = ?',
                  (c.lastrowid, human_id))
        self.conn.commit()
        return c.lastrowid

    def get_fingerprint(self, human_id):
        """get the fingerprint for a human (first matching row), or None"""
        c = self.conn.cursor()
        c.execute('SELECT * FROM fingerprints WHERE human_id = ?', (human_id,))
        row = c.fetchone()
        return dict(row) if row else None

    def save_match(self, human_a_id, human_b_id, match_data):
        """save a match between two humans.

        relies on UNIQUE(human_a_id, human_b_id) so re-saving the same pair
        replaces the previous row; status always resets to 'pending'.
        """
        c = self.conn.cursor()
        c.execute('''INSERT OR REPLACE INTO matches
                     (human_a_id, human_b_id, overlap_score, overlap_reasons,
                      complementary_skills, geographic_match, status, created_at)
                     VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
                  (human_a_id, human_b_id,
                   match_data.get('overlap_score', 0),
                   json.dumps(match_data.get('overlap_reasons', [])),
                   json.dumps(match_data.get('complementary_skills', [])),
                   1 if match_data.get('geographic_match') else 0,
                   'pending',
                   datetime.now().isoformat()))
        self.conn.commit()
        return c.lastrowid

    def get_matches(self, status=None, limit=100):
        """get matches ordered by overlap score, optionally filtered by status"""
        c = self.conn.cursor()
        if status:
            c.execute('''SELECT * FROM matches WHERE status = ?
                         ORDER BY overlap_score DESC LIMIT ?''', (status, limit))
        else:
            c.execute('''SELECT * FROM matches
                         ORDER BY overlap_score DESC LIMIT ?''', (limit,))
        return [dict(row) for row in c.fetchall()]
+ ORDER BY overlap_score DESC LIMIT ?''', (status, limit)) + else: + c.execute('''SELECT * FROM matches + ORDER BY overlap_score DESC LIMIT ?''', (limit,)) + return [dict(row) for row in c.fetchall()] + + def save_intro(self, match_id, recipient_id, channel, draft): + """save an intro draft""" + c = self.conn.cursor() + c.execute('''INSERT INTO intros + (match_id, recipient_human_id, channel, draft, status) + VALUES (?, ?, ?, ?, 'draft')''', + (match_id, recipient_id, channel, draft)) + self.conn.commit() + return c.lastrowid + + def get_pending_intros(self, limit=50): + """get intros pending approval""" + c = self.conn.cursor() + c.execute('''SELECT * FROM intros WHERE status = 'draft' + ORDER BY id DESC LIMIT ?''', (limit,)) + return [dict(row) for row in c.fetchall()] + + def approve_intro(self, intro_id, approved_by='human'): + """approve an intro for sending""" + c = self.conn.cursor() + c.execute('''UPDATE intros SET status = 'approved', + approved_by = ?, approved_at = ? WHERE id = ?''', + (approved_by, datetime.now().isoformat(), intro_id)) + self.conn.commit() + + def mark_intro_sent(self, intro_id): + """mark an intro as sent""" + c = self.conn.cursor() + c.execute('''UPDATE intros SET status = 'sent', sent_at = ? 
WHERE id = ?''', + (datetime.now().isoformat(), intro_id)) + self.conn.commit() + + def stats(self): + """get database statistics""" + c = self.conn.cursor() + stats = {} + + c.execute('SELECT COUNT(*) FROM humans') + stats['total_humans'] = c.fetchone()[0] + + c.execute('SELECT platform, COUNT(*) FROM humans GROUP BY platform') + stats['by_platform'] = {row[0]: row[1] for row in c.fetchall()} + + c.execute('SELECT COUNT(*) FROM humans WHERE score >= 50') + stats['high_score_humans'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM matches') + stats['total_matches'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM intros') + stats['total_intros'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM intros WHERE status = "sent"') + stats['sent_intros'] = c.fetchone()[0] + + # lost builder stats + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'builder'") + stats['active_builders'] = c.fetchone()[0] + + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'lost'") + stats['lost_builders'] = c.fetchone()[0] + + c.execute("SELECT COUNT(*) FROM humans WHERE user_type = 'both'") + stats['recovering_builders'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM humans WHERE lost_potential_score >= 40') + stats['high_lost_score'] = c.fetchone()[0] + + c.execute('SELECT COUNT(*) FROM humans WHERE last_lost_outreach IS NOT NULL') + stats['lost_outreach_sent'] = c.fetchone()[0] + + return stats + + def close(self): + self.conn.close() diff --git a/connectd/db/users.py b/connectd/db/users.py new file mode 100644 index 0000000..0615389 --- /dev/null +++ b/connectd/db/users.py @@ -0,0 +1,510 @@ +""" +priority users - people who host connectd get direct matching +""" + +import sqlite3 +import json +from datetime import datetime +from pathlib import Path + +DB_PATH = Path(__file__).parent / 'connectd.db' + +# map user-friendly interests to signal terms +INTEREST_TO_SIGNALS = { + 'self-hosting': ['selfhosted', 'home_automation'], + 'home-assistant': 
# map user-friendly interests to internal signal terms used by scoring/matching
INTEREST_TO_SIGNALS = {
    'self-hosting': ['selfhosted', 'home_automation'],
    'home-assistant': ['home_automation'],
    'intentional-community': ['community', 'cooperative'],
    'cooperatives': ['cooperative', 'community'],
    'solarpunk': ['solarpunk'],
    'privacy': ['privacy', 'local_first'],
    'local-first': ['local_first', 'privacy'],
    'queer-friendly': ['queer'],
    'anti-capitalism': ['cooperative', 'decentralized', 'community'],
    'esports-venue': [],
    'foss': ['foss'],
    'decentralized': ['decentralized'],
    'federated': ['federated_chat'],
    'mesh': ['mesh'],
}


def init_users_table(conn):
    """create the priority_users and priority_matches tables if missing.

    also runs a lightweight migration that adds newer columns to a
    pre-existing priority_users table.
    """
    c = conn.cursor()

    c.execute('''CREATE TABLE IF NOT EXISTS priority_users (
        id INTEGER PRIMARY KEY,
        name TEXT,
        email TEXT UNIQUE,
        github TEXT,
        reddit TEXT,
        mastodon TEXT,
        lobsters TEXT,
        matrix TEXT,
        lemmy TEXT,
        discord TEXT,
        bluesky TEXT,
        location TEXT,
        bio TEXT,
        interests TEXT,
        looking_for TEXT,
        created_at TEXT,
        active INTEGER DEFAULT 1,
        score REAL DEFAULT 0,
        signals TEXT,
        scraped_profile TEXT,
        last_scored_at TEXT
    )''')

    # add missing columns to an existing table (no-op when the column exists)
    for col in ['lemmy', 'discord', 'bluesky']:
        try:
            c.execute(f'ALTER TABLE priority_users ADD COLUMN {col} TEXT')
        except sqlite3.OperationalError:
            # FIX: narrowed from a bare `except:` — only "duplicate column
            # name" should be swallowed here, not KeyboardInterrupt or
            # genuine programming errors. matches the migration style used
            # for the humans table.
            pass

    # matches specifically for priority users
    c.execute('''CREATE TABLE IF NOT EXISTS priority_matches (
        id INTEGER PRIMARY KEY,
        priority_user_id INTEGER,
        matched_human_id INTEGER,
        overlap_score REAL,
        overlap_reasons TEXT,
        status TEXT DEFAULT 'new',
        notified_at TEXT,
        viewed_at TEXT,
        FOREIGN KEY(priority_user_id) REFERENCES priority_users(id),
        FOREIGN KEY(matched_human_id) REFERENCES humans(id)
    )''')

    conn.commit()


def add_priority_user(conn, user_data):
    """add a priority user (someone hosting connectd); returns the rowid.

    INSERT OR REPLACE keys on the UNIQUE email column, so re-adding the
    same email replaces the prior row.
    """
    c = conn.cursor()

    c.execute('''INSERT OR REPLACE INTO priority_users
                 (name, email, github, reddit, mastodon, lobsters, matrix, lemmy, discord, bluesky,
                  location, bio, interests, looking_for, created_at)
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
              (user_data.get('name'),
               user_data.get('email'),
               user_data.get('github'),
               user_data.get('reddit'),
               user_data.get('mastodon'),
               user_data.get('lobsters'),
               user_data.get('matrix'),
               user_data.get('lemmy'),
               user_data.get('discord'),
               user_data.get('bluesky'),
               user_data.get('location'),
               user_data.get('bio'),
               json.dumps(user_data.get('interests', [])),
               user_data.get('looking_for'),
               datetime.now().isoformat()))

    conn.commit()
    return c.lastrowid


def get_priority_users(conn):
    """get all active priority users as dicts"""
    c = conn.cursor()
    c.execute('SELECT * FROM priority_users WHERE active = 1')
    return [dict(row) for row in c.fetchall()]


def get_priority_user(conn, user_id):
    """get a specific priority user, or None"""
    c = conn.cursor()
    c.execute('SELECT * FROM priority_users WHERE id = ?', (user_id,))
    row = c.fetchone()
    return dict(row) if row else None


def save_priority_match(conn, priority_user_id, human_id, overlap_data):
    """save a match for a priority user.

    NOTE(review): INSERT OR IGNORE means lastrowid is stale when the row
    already existed — callers currently ignore the return value, but
    don't rely on it for the ignored case.
    """
    c = conn.cursor()

    c.execute('''INSERT OR IGNORE INTO priority_matches
                 (priority_user_id, matched_human_id, overlap_score, overlap_reasons, status)
                 VALUES (?, ?, ?, ?, 'new')''',
              (priority_user_id, human_id,
               overlap_data.get('overlap_score', 0),
               json.dumps(overlap_data.get('overlap_reasons', []))))

    conn.commit()
    return c.lastrowid


def get_priority_user_matches(conn, priority_user_id, status=None, limit=50):
    """get matches for a priority user, joined with the matched human row"""
    c = conn.cursor()

    if status:
        c.execute('''SELECT pm.*, h.* FROM priority_matches pm
                     JOIN humans h ON pm.matched_human_id = h.id
                     WHERE pm.priority_user_id = ? AND pm.status = ?
                     ORDER BY pm.overlap_score DESC
                     LIMIT ?''', (priority_user_id, status, limit))
    else:
        c.execute('''SELECT pm.*, h.* FROM priority_matches pm
                     JOIN humans h ON pm.matched_human_id = h.id
                     WHERE pm.priority_user_id = ?
                     ORDER BY pm.overlap_score DESC
                     LIMIT ?''', (priority_user_id, limit))

    return [dict(row) for row in c.fetchall()]
def mark_match_viewed(conn, match_id):
    """mark a priority match as viewed (stamps viewed_at with now())"""
    c = conn.cursor()
    c.execute('''UPDATE priority_matches SET status = 'viewed', viewed_at = ?
                 WHERE id = ?''', (datetime.now().isoformat(), match_id))
    conn.commit()


def expand_interests_to_signals(interests):
    """expand user-friendly interests to internal signal terms.

    known interests map through INTEREST_TO_SIGNALS; unknown ones pass
    through lowercased. a baseline set of aligned signals is always added
    for priority users.
    """
    signals = set()
    for interest in interests:
        interest_lower = interest.lower().strip()
        if interest_lower in INTEREST_TO_SIGNALS:
            signals.update(INTEREST_TO_SIGNALS[interest_lower])
        else:
            signals.add(interest_lower)

    # always add these aligned signals for priority users
    signals.update(['foss', 'decentralized', 'federated_chat', 'containers', 'unix', 'selfhosted'])
    return list(signals)


def score_priority_user(conn, user_id, scraped_profile=None):
    """
    calculate a score for a priority user based on:
    - their stated interests
    - their scraped github profile (if available)
    - their repos and activity

    returns {'score': int, 'signals': [...]} or None when the user id
    does not exist. persists score/signals/profile onto the user row.
    """
    c = conn.cursor()
    c.execute('SELECT * FROM priority_users WHERE id = ?', (user_id,))
    row = c.fetchone()
    if not row:
        return None

    user = dict(row)
    score = 0
    signals = set()

    # 1. score from stated interests
    interests = user.get('interests')
    if isinstance(interests, str):
        interests = json.loads(interests) if interests else []
    # FIX: a NULL interests column comes back as None (not a str), which
    # previously crashed the loop below with a TypeError — normalize to [].
    interests = interests or []

    for interest in interests:
        interest_lower = interest.lower()
        # high-value interests
        if 'solarpunk' in interest_lower:
            score += 30
            signals.add('solarpunk')
        if 'queer' in interest_lower:
            score += 30
            signals.add('queer')
        if 'cooperative' in interest_lower or 'intentional' in interest_lower:
            score += 20
            signals.add('cooperative')
        if 'privacy' in interest_lower:
            score += 10
            signals.add('privacy')
        if 'self-host' in interest_lower or 'selfhost' in interest_lower:
            score += 15
            signals.add('selfhosted')
        if 'home-assistant' in interest_lower:
            score += 15
            signals.add('home_automation')
        if 'foss' in interest_lower or 'open source' in interest_lower:
            score += 10
            signals.add('foss')

    # 2. score from scraped profile
    if scraped_profile:
        # repos: more public repos -> more activity credit
        repos = scraped_profile.get('top_repos', [])
        if len(repos) >= 20:
            score += 20
        elif len(repos) >= 10:
            score += 10
        elif len(repos) >= 5:
            score += 5

        # languages
        languages = scraped_profile.get('languages', {})
        if 'Python' in languages or 'Rust' in languages:
            score += 5
            signals.add('modern_lang')

        # topics from repos
        topics = scraped_profile.get('topics', [])
        for topic in topics:
            if topic in ['self-hosted', 'home-assistant', 'privacy', 'foss']:
                score += 10
                signals.add(topic.replace('-', '_'))

        # followers
        followers = scraped_profile.get('followers', 0)
        if followers >= 100:
            score += 15
        elif followers >= 50:
            score += 10
        elif followers >= 10:
            score += 5

    # 3. add expanded signals
    expanded = expand_interests_to_signals(interests)
    signals.update(expanded)

    # persist score + signals back onto the user row
    c.execute('''UPDATE priority_users
                 SET score = ?, signals = ?, scraped_profile = ?, last_scored_at = ?
                 WHERE id = ?''',
              (score, json.dumps(list(signals)), json.dumps(scraped_profile) if scraped_profile else None,
               datetime.now().isoformat(), user_id))
    conn.commit()

    return {'score': score, 'signals': list(signals)}
def auto_match_priority_user(conn, user_id, min_overlap=40):
    """
    automatically find and save matches for a priority user
    uses relationship filtering to skip already-connected people

    returns the full (unsaved-beyond-top-50) sorted match list; only the
    top 50 are persisted via save_priority_match. old matches for the
    user are cleared first.
    """
    from scoutd.deep import check_already_connected

    c = conn.cursor()

    # get user
    c.execute('SELECT * FROM priority_users WHERE id = ?', (user_id,))
    row = c.fetchone()
    if not row:
        return []

    user = dict(row)

    # get user signals (stored as a json array in the signals column)
    user_signals = set()
    if user.get('signals'):
        signals = json.loads(user['signals']) if isinstance(user['signals'], str) else user['signals']
        user_signals.update(signals)

    # also expand interests
    if user.get('interests'):
        interests = json.loads(user['interests']) if isinstance(user['interests'], str) else user['interests']
        user_signals.update(expand_interests_to_signals(interests))

    # clear old matches
    c.execute('DELETE FROM priority_matches WHERE priority_user_id = ?', (user_id,))
    conn.commit()

    # get all humans above the baseline discovery score
    c.execute('SELECT * FROM humans WHERE score >= 25')
    columns = [d[0] for d in c.description]

    matches = []
    for row in c.fetchall():
        human = dict(zip(columns, row))

        # skip own profiles (same username on github or reddit)
        username = (human.get('username') or '').lower()
        if user.get('github') and username == user['github'].lower():
            continue
        if user.get('reddit') and username == user.get('reddit', '').lower():
            continue

        # check if already connected (relationship filtering)
        user_human = {'username': user.get('github'), 'platform': 'github', 'extra': {}}
        connected, reason = check_already_connected(user_human, human)
        if connected:
            continue

        # get human signals (json array column)
        human_signals = human.get('signals', [])
        if isinstance(human_signals, str):
            human_signals = json.loads(human_signals) if human_signals else []

        # calculate overlap: 10 points per shared signal
        shared = user_signals & set(human_signals)
        overlap_score = len(shared) * 10

        # high-value bonuses for rare/values-aligned signals
        if 'queer' in human_signals:
            overlap_score += 40
            shared.add('queer (rare!)')
        if 'solarpunk' in human_signals:
            overlap_score += 30
            shared.add('solarpunk (rare!)')
        if 'cooperative' in human_signals:
            overlap_score += 20
            shared.add('cooperative (values)')

        # location bonus
        # NOTE(review): the PNW region list is hard-coded here — presumably
        # tailored to the original host; confirm whether this should be
        # derived from the user's location instead.
        location = (human.get('location') or '').lower()
        user_location = (user.get('location') or '').lower()
        if user_location and location:
            if any(x in location for x in ['seattle', 'portland', 'pnw', 'washington', 'oregon']):
                if 'seattle' in user_location or 'pnw' in user_location:
                    overlap_score += 25
                    shared.add('PNW location!')

        if overlap_score >= min_overlap:
            matches.append({
                'human': human,
                'overlap_score': overlap_score,
                'shared': list(shared),
            })

    # sort and save top matches
    matches.sort(key=lambda x: x['overlap_score'], reverse=True)

    for m in matches[:50]:  # save top 50
        save_priority_match(conn, user_id, m['human']['id'], {
            'overlap_score': m['overlap_score'],
            'overlap_reasons': m['shared'],
        })

    return matches


def update_priority_user_profile(conn, user_id, profile_data):
    """update a priority user's profile with new data.

    only whitelisted fields are written (field names are interpolated into
    the SQL, so the fixed list below is also the injection guard); falsy
    values are skipped, so this cannot clear a field.
    """
    c = conn.cursor()

    updates = []
    values = []

    for field in ['name', 'email', 'github', 'reddit', 'mastodon', 'lobsters',
                  'matrix', 'lemmy', 'discord', 'bluesky', 'location', 'bio', 'looking_for']:
        if field in profile_data and profile_data[field]:
            updates.append(f'{field} = ?')
            values.append(profile_data[field])

    if 'interests' in profile_data:
        updates.append('interests = ?')
        values.append(json.dumps(profile_data['interests']))

    if updates:
        values.append(user_id)
        c.execute(f'''UPDATE priority_users SET {', '.join(updates)} WHERE id = ?''', values)
        conn.commit()

    return True


def discover_host_user(conn, alias):
    """
    auto-discover a host user by their alias (username).
    scrapes github and discovers all connected social handles.
    also merges in HOST_ env vars from config for manual overrides.

    returns the priority user id
    """
    from scoutd.github import analyze_github_user
    from config import (HOST_NAME, HOST_EMAIL, HOST_GITHUB, HOST_MASTODON,
                        HOST_REDDIT, HOST_LEMMY, HOST_LOBSTERS, HOST_MATRIX,
                        HOST_DISCORD, HOST_BLUESKY, HOST_LOCATION, HOST_INTERESTS, HOST_LOOKING_FOR)

    print(f"connectd: discovering host user '{alias}'...")

    # scrape github for full profile
    profile = analyze_github_user(alias)

    if not profile:
        print(f" could not find github user '{alias}'")
        # still create from env vars if no github found
        profile = {'name': HOST_NAME or alias, 'bio': '', 'location': HOST_LOCATION,
                   'contact': {}, 'extra': {'handles': {}}, 'topics': [], 'signals': []}

    print(f" found: {profile.get('name')} ({alias})")
    print(f" score: {profile.get('score', 0)}, signals: {len(profile.get('signals', []))}")

    # extract contact info from the scraped profile
    contact = profile.get('contact', {})
    handles = profile.get('extra', {}).get('handles', {})

    # merge in HOST_ env vars (override discovered values)
    if HOST_MASTODON:
        handles['mastodon'] = HOST_MASTODON
    if HOST_REDDIT:
        handles['reddit'] = HOST_REDDIT
    if HOST_LEMMY:
        handles['lemmy'] = HOST_LEMMY
    if HOST_LOBSTERS:
        handles['lobsters'] = HOST_LOBSTERS
    if HOST_MATRIX:
        handles['matrix'] = HOST_MATRIX
    if HOST_DISCORD:
        handles['discord'] = HOST_DISCORD
    if HOST_BLUESKY:
        handles['bluesky'] = HOST_BLUESKY

    # check if user already exists (keyed on the github alias)
    c = conn.cursor()
    c.execute('SELECT id FROM priority_users WHERE github = ?', (alias,))
    existing = c.fetchone()

    # parse HOST_INTERESTS if provided (comma-separated override)
    interests = profile.get('topics', [])
    if HOST_INTERESTS:
        interests = [i.strip() for i in HOST_INTERESTS.split(',') if i.strip()]

    # env vars win over discovered values for every field below
    user_data = {
        'name': HOST_NAME or profile.get('name') or alias,
        'email': HOST_EMAIL or contact.get('email'),
        'github': HOST_GITHUB or alias,
        'reddit': handles.get('reddit'),
        'mastodon': handles.get('mastodon') or contact.get('mastodon'),
        'lobsters': handles.get('lobsters'),
        'matrix': handles.get('matrix') or contact.get('matrix'),
        'lemmy': handles.get('lemmy') or contact.get('lemmy'),
        'discord': handles.get('discord'),
        'bluesky': handles.get('bluesky') or contact.get('bluesky'),
        'location': HOST_LOCATION or profile.get('location'),
        'bio': profile.get('bio'),
        'interests': interests,
        'looking_for': HOST_LOOKING_FOR,
    }

    if existing:
        # update existing user
        user_id = existing['id']
        update_priority_user_profile(conn, user_id, user_data)
        print(f" updated existing priority user (id={user_id})")
    else:
        # create new user
        user_id = add_priority_user(conn, user_data)
        print(f" created new priority user (id={user_id})")

    # score the user against the scraped activity data
    scraped_profile = {
        'top_repos': profile.get('extra', {}).get('top_repos', []),
        'languages': profile.get('languages', {}),
        'topics': profile.get('topics', []),
        'followers': profile.get('extra', {}).get('followers', 0),
    }
    score_result = score_priority_user(conn, user_id, scraped_profile)
    print(f" scored: {score_result.get('score')}, {len(score_result.get('signals', []))} signals")

    # print discovered handles
    print(f" discovered handles:")
    for platform, handle in handles.items():
        print(f" {platform}: {handle}")

    return user_id


def get_host_user(conn):
    """get the host user (first priority user), or None if none exist"""
    users = get_priority_users(conn)
    return users[0] if users else None
review, sends via appropriate channel +""" + +from .draft import draft_intro +from .review import get_pending_intros, approve_intro, reject_intro +from .send import send_intro + +__all__ = ['draft_intro', 'get_pending_intros', 'approve_intro', 'reject_intro', 'send_intro'] diff --git a/connectd/introd/deliver.py b/connectd/introd/deliver.py new file mode 100644 index 0000000..c261f46 --- /dev/null +++ b/connectd/introd/deliver.py @@ -0,0 +1,509 @@ +""" +introd/deliver.py - intro delivery via multiple channels + +supports: +- email (smtp) +- mastodon dm (if they allow dms) +- bluesky dm (via AT Protocol) +- matrix dm (creates DM room and sends message) +- github issue (opens intro as issue on their most active repo) +- manual queue (for review before sending) + +contact method is determined by ACTIVITY-BASED SELECTION: +- picks the platform where the user is MOST ACTIVE +- verified handles (from rel="me" links) get a bonus + +NOTE: reddit is NOT a delivery method - it's discovery only. +reddit-discovered users are contacted via their external links. 
# config from env - no hardcoded credentials
SMTP_HOST = os.environ.get('SMTP_HOST', '')
SMTP_PORT = int(os.environ.get('SMTP_PORT', 465))
SMTP_USER = os.environ.get('SMTP_USER', '')
SMTP_PASS = os.environ.get('SMTP_PASS', '')
FROM_EMAIL = os.environ.get('FROM_EMAIL', '')

GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '')
MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '')
BLUESKY_HANDLE = os.environ.get('BLUESKY_HANDLE', '')
BLUESKY_APP_PASSWORD = os.environ.get('BLUESKY_APP_PASSWORD', '')
MATRIX_HOMESERVER = os.environ.get('MATRIX_HOMESERVER', '')
MATRIX_USER_ID = os.environ.get('MATRIX_USER_ID', '')
MATRIX_ACCESS_TOKEN = os.environ.get('MATRIX_ACCESS_TOKEN', '')

# delivery log + manual review queue, stored as json next to the package
DELIVERY_LOG = Path(__file__).parent.parent / 'data' / 'delivery_log.json'
MANUAL_QUEUE = Path(__file__).parent.parent / 'data' / 'manual_queue.json'


def load_delivery_log():
    """load delivery history, or an empty log when none exists yet"""
    if DELIVERY_LOG.exists():
        return json.loads(DELIVERY_LOG.read_text())
    return {'sent': [], 'failed': [], 'queued': []}


def save_delivery_log(log):
    """save delivery history (creates the data directory on first write)"""
    DELIVERY_LOG.parent.mkdir(parents=True, exist_ok=True)
    DELIVERY_LOG.write_text(json.dumps(log, indent=2))


def load_manual_queue():
    """load the manual review queue, or an empty list when none exists"""
    if MANUAL_QUEUE.exists():
        return json.loads(MANUAL_QUEUE.read_text())
    return []


def save_manual_queue(queue):
    """save the manual review queue (creates the data directory on first write)"""
    MANUAL_QUEUE.parent.mkdir(parents=True, exist_ok=True)
    MANUAL_QUEUE.write_text(json.dumps(queue, indent=2))


def already_contacted(recipient_id):
    """check if we've already sent an intro to this person"""
    log = load_delivery_log()
    sent_ids = [s.get('recipient_id') for s in log.get('sent', [])]
    return recipient_id in sent_ids


def send_email(to_email, subject, body, dry_run=False):
    """send email via smtp over ssl.

    builds a multipart/alternative message with a plain-text part and a
    minimal html part. returns (ok, error_or_none).

    NOTE(review): the html wrapper markup was garbled in extraction;
    reconstructed here as a <br>-joined html paragraph — confirm against
    the original template.

    FIX: the envelope sender previously used SMTP_USER while the From:
    header used FROM_EMAIL; they now agree (FROM_EMAIL, falling back to
    the smtp login user when unset).
    """
    if dry_run:
        print(f" [dry run] would email {to_email}")
        print(f" subject: {subject}")
        print(f" body preview: {body[:100]}...")
        return True, "dry run"

    try:
        msg = MIMEMultipart('alternative')
        msg['Subject'] = subject
        msg['From'] = FROM_EMAIL
        msg['To'] = to_email

        # plain text part
        msg.attach(MIMEText(body, 'plain'))

        # html version (simple): newlines become <br> tags
        html_body = body.replace('\n', '<br>')
        msg.attach(MIMEText(f"<html><body><p>{html_body}</p></body></html>", 'html'))

        with smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) as server:
            server.login(SMTP_USER, SMTP_PASS)
            server.sendmail(FROM_EMAIL or SMTP_USER, to_email, msg.as_string())

        return True, None
    except Exception as e:
        return False, str(e)


def create_github_issue(owner, repo, title, body, dry_run=False):
    """create a github issue as an intro; returns (ok, issue_url_or_error)"""
    if not GITHUB_TOKEN:
        return False, "GITHUB_TOKEN not set"

    if dry_run:
        print(f" [dry run] would create issue on {owner}/{repo}")
        print(f" title: {title}")
        return True, "dry run"

    try:
        url = f"https://api.github.com/repos/{owner}/{repo}/issues"
        resp = requests.post(
            url,
            headers={
                'Authorization': f'token {GITHUB_TOKEN}',
                'Accept': 'application/vnd.github.v3+json',
            },
            json={
                'title': title,
                'body': body,
                # NOTE(review): these labels must already exist on the target
                # repo or the api may reject/ignore them — confirm.
                'labels': ['introduction', 'community'],
            },
            timeout=30,
        )

        if resp.status_code == 201:
            issue_url = resp.json().get('html_url')
            return True, issue_url
        else:
            return False, f"github api error: {resp.status_code} - {resp.text}"
    except Exception as e:
        return False, str(e)


def send_mastodon_dm(recipient_acct, message, dry_run=False):
    """send a mastodon direct message (a status with visibility 'direct'
    that mentions the recipient); returns (ok, url_or_error)"""
    if not MASTODON_TOKEN:
        return False, "MASTODON_TOKEN not set"

    if dry_run:
        print(f" [dry run] would DM {recipient_acct}")
        print(f" message preview: {message[:100]}...")
        return True, "dry run"

    try:
        # post as direct message (visibility: direct, mention recipient)
        url = f"https://{MASTODON_INSTANCE}/api/v1/statuses"
        resp = requests.post(
            url,
            headers={
                'Authorization': f'Bearer {MASTODON_TOKEN}',
                'Content-Type': 'application/json',
            },
            json={
                'status': f"@{recipient_acct} {message}",
                'visibility': 'direct',
            },
            timeout=30,
        )

        if resp.status_code in [200, 201]:
            return True, resp.json().get('url')
        else:
            return False, f"mastodon api error: {resp.status_code} - {resp.text}"
    except Exception as e:
        return False, str(e)
def send_bluesky_dm(recipient_handle, message, dry_run=False):
    """send bluesky direct message via AT Protocol.

    flow: create a session, resolve the recipient handle to a DID, look up
    (or fail to look up) the 1:1 conversation, then send the message.
    returns (ok, info_or_error).

    NOTE(review): the chat.bsky.convo.* calls are sent straight to
    bsky.social here; the chat endpoints are typically served by the chat
    service and may require a service proxy header — confirm this works
    against the production api.
    """
    if not BLUESKY_APP_PASSWORD:
        return False, "BLUESKY_APP_PASSWORD not set"

    if dry_run:
        print(f" [dry run] would DM {recipient_handle} on bluesky")
        print(f" message preview: {message[:100]}...")
        return True, "dry run"

    try:
        # authenticate with bluesky (app password -> access jwt)
        auth_url = "https://bsky.social/xrpc/com.atproto.server.createSession"
        auth_resp = requests.post(
            auth_url,
            json={
                'identifier': BLUESKY_HANDLE,
                'password': BLUESKY_APP_PASSWORD,
            },
            timeout=30,
        )

        if auth_resp.status_code != 200:
            return False, f"bluesky auth failed: {auth_resp.status_code}"

        auth_data = auth_resp.json()
        access_token = auth_data.get('accessJwt')
        did = auth_data.get('did')

        # resolve recipient handle -> DID
        resolve_url = f"https://bsky.social/xrpc/com.atproto.identity.resolveHandle"
        resolve_resp = requests.get(
            resolve_url,
            params={'handle': recipient_handle.lstrip('@')},
            timeout=30,
        )

        if resolve_resp.status_code != 200:
            return False, f"couldn't resolve handle {recipient_handle}"

        recipient_did = resolve_resp.json().get('did')

        # create chat/DM (using convo namespace)
        # first get or create conversation
        convo_url = "https://bsky.social/xrpc/chat.bsky.convo.getConvoForMembers"
        convo_resp = requests.get(
            convo_url,
            headers={'Authorization': f'Bearer {access_token}'},
            params={'members': [recipient_did]},
            timeout=30,
        )

        if convo_resp.status_code != 200:
            # try creating conversation
            return False, f"couldn't get/create conversation: {convo_resp.status_code}"

        convo_id = convo_resp.json().get('convo', {}).get('id')

        # send message
        msg_url = "https://bsky.social/xrpc/chat.bsky.convo.sendMessage"
        msg_resp = requests.post(
            msg_url,
            headers={
                'Authorization': f'Bearer {access_token}',
                'Content-Type': 'application/json',
            },
            json={
                'convoId': convo_id,
                'message': {'text': message},
            },
            timeout=30,
        )

        if msg_resp.status_code in [200, 201]:
            return True, f"sent to {recipient_handle}"
        else:
            return False, f"bluesky dm failed: {msg_resp.status_code} - {msg_resp.text}"

    except Exception as e:
        return False, str(e)


def send_matrix_dm(recipient_mxid, message, dry_run=False):
    """send matrix direct message.

    creates a fresh trusted_private_chat DM room with the recipient, then
    sends one m.text event into it. returns (ok, info_or_error).

    NOTE(review): a new room is created on every call — there is no reuse
    of an existing DM room despite the comment below; confirm whether
    repeated intros should reuse the room.
    """
    if not MATRIX_ACCESS_TOKEN:
        return False, "MATRIX_ACCESS_TOKEN not set"

    if dry_run:
        print(f" [dry run] would DM {recipient_mxid} on matrix")
        print(f" message preview: {message[:100]}...")
        return True, "dry run"

    try:
        # create or get direct room with recipient
        # first, check if we already have a DM room
        headers = {'Authorization': f'Bearer {MATRIX_ACCESS_TOKEN}'}

        # create a new DM room
        create_room_resp = requests.post(
            f'{MATRIX_HOMESERVER}/_matrix/client/v3/createRoom',
            headers=headers,
            json={
                'is_direct': True,
                'invite': [recipient_mxid],
                'preset': 'trusted_private_chat',
            },
            timeout=30,
        )

        if create_room_resp.status_code not in [200, 201]:
            return False, f"matrix room creation failed: {create_room_resp.status_code} - {create_room_resp.text}"

        room_id = create_room_resp.json().get('room_id')

        # send message to room (txn id must be unique per access token)
        import time
        txn_id = str(int(time.time() * 1000))

        msg_resp = requests.put(
            f'{MATRIX_HOMESERVER}/_matrix/client/v3/rooms/{room_id}/send/m.room.message/{txn_id}',
            headers=headers,
            json={
                'msgtype': 'm.text',
                'body': message,
            },
            timeout=30,
        )

        if msg_resp.status_code in [200, 201]:
            return True, f"sent to {recipient_mxid} in {room_id}"
        else:
            return False, f"matrix send failed: {msg_resp.status_code} - {msg_resp.text}"

    except Exception as e:
        return False, str(e)


def add_to_manual_queue(intro_data):
    """add intro to manual review queue (status 'pending', timestamped)"""
    queue = load_manual_queue()
    queue.append({
        **intro_data,
        'queued_at': datetime.now().isoformat(),
        'status': 'pending',
    })
    save_manual_queue(queue)
    return True
determine_best_contact(human): + """ + determine best contact method based on WHERE THEY'RE MOST ACTIVE + + uses activity-based selection from groq_draft module + """ + from introd.groq_draft import determine_contact_method as activity_based_contact + + method, info = activity_based_contact(human) + + # convert github_issue info to dict format for delivery + if method == 'github_issue' and isinstance(info, str) and '/' in info: + parts = info.split('/', 1) + return method, {'owner': parts[0], 'repo': parts[1]} + + return method, info + + +def deliver_intro(match_data, intro_draft, dry_run=False): + """ + deliver an intro via the best available method + + match_data: {human_a, human_b, overlap_score, overlap_reasons} + intro_draft: the text to send (from groq) + """ + recipient = match_data.get('human_b', {}) + recipient_id = f"{recipient.get('platform')}:{recipient.get('username')}" + + # check if already contacted + if already_contacted(recipient_id): + return False, "already contacted", None + + # determine contact method + method, contact_info = determine_best_contact(recipient) + + log = load_delivery_log() + result = { + 'recipient_id': recipient_id, + 'recipient_name': recipient.get('name') or recipient.get('username'), + 'method': method, + 'contact_info': contact_info, + 'overlap_score': match_data.get('overlap_score'), + 'timestamp': datetime.now().isoformat(), + } + + success = False + error = None + + if method == 'email': + subject = f"someone you might want to know - connectd" + success, error = send_email(contact_info, subject, intro_draft, dry_run) + + elif method == 'mastodon': + success, error = send_mastodon_dm(contact_info, intro_draft, dry_run) + + elif method == 'bluesky': + success, error = send_bluesky_dm(contact_info, intro_draft, dry_run) + + elif method == 'matrix': + success, error = send_matrix_dm(contact_info, intro_draft, dry_run) + + elif method == 'discord': + from scoutd.discord import send_discord_dm + success, error = 
send_discord_dm(contact_info, intro_draft, dry_run) + + elif method == 'lemmy': + from scoutd.lemmy import send_lemmy_dm + success, error = send_lemmy_dm(contact_info, intro_draft, dry_run) + + elif method == 'github_issue': + owner = contact_info.get('owner') + repo = contact_info.get('repo') + title = "community introduction from connectd" + # format for github + github_body = f"""hey {recipient.get('name') or recipient.get('username')}, + +{intro_draft} + +--- +*this is an automated introduction from [connectd](https://github.com/connectd-daemon), a daemon that finds isolated builders with aligned values and connects them. if this feels spammy, i apologize - you can close this issue and we won't reach out again.* +""" + success, error = create_github_issue(owner, repo, title, github_body, dry_run) + + elif method == 'manual': + # add to review queue + add_to_manual_queue({ + 'match': match_data, + 'draft': intro_draft, + 'recipient': recipient, + }) + success = True + error = "added to manual queue" + + # log result + result['success'] = success + result['error'] = error + + if success: + log['sent'].append(result) + else: + log['failed'].append(result) + + save_delivery_log(log) + + return success, error, method + + +def deliver_batch(matches_with_intros, dry_run=False): + """ + deliver intros for a batch of matches + + matches_with_intros: list of {match_data, intro_draft} + """ + results = [] + + for item in matches_with_intros: + match_data = item.get('match_data') or item.get('match') + intro_draft = item.get('intro_draft') or item.get('draft') + + if not match_data or not intro_draft: + continue + + success, error, method = deliver_intro(match_data, intro_draft, dry_run) + results.append({ + 'recipient': match_data.get('human_b', {}).get('username'), + 'method': method, + 'success': success, + 'error': error, + }) + + print(f" {match_data.get('human_b', {}).get('username')}: {method} - {'ok' if success else error}") + + return results + + +def 
get_delivery_stats(): + """get delivery statistics""" + log = load_delivery_log() + queue = load_manual_queue() + + return { + 'sent': len(log.get('sent', [])), + 'failed': len(log.get('failed', [])), + 'queued': len(log.get('queued', [])), + 'manual_pending': len([q for q in queue if q.get('status') == 'pending']), + 'by_method': { + 'email': len([s for s in log.get('sent', []) if s.get('method') == 'email']), + 'mastodon': len([s for s in log.get('sent', []) if s.get('method') == 'mastodon']), + 'github_issue': len([s for s in log.get('sent', []) if s.get('method') == 'github_issue']), + 'manual': len([s for s in log.get('sent', []) if s.get('method') == 'manual']), + }, + } + + +def review_manual_queue(): + """review and process manual queue""" + queue = load_manual_queue() + pending = [q for q in queue if q.get('status') == 'pending'] + + if not pending: + print("no items in manual queue") + return + + print(f"\n{len(pending)} items pending review:\n") + + for i, item in enumerate(pending, 1): + recipient = item.get('recipient', {}) + match = item.get('match', {}) + + print(f"[{i}] {recipient.get('name') or recipient.get('username')}") + print(f" platform: {recipient.get('platform')}") + print(f" url: {recipient.get('url')}") + print(f" overlap: {match.get('overlap_score')}") + print(f" draft preview: {item.get('draft', '')[:80]}...") + print() + + return pending diff --git a/connectd/introd/draft.py b/connectd/introd/draft.py new file mode 100644 index 0000000..3cbf160 --- /dev/null +++ b/connectd/introd/draft.py @@ -0,0 +1,210 @@ +""" +introd/draft.py - AI writes intro messages referencing both parties' work +""" + +import json + +# intro template - transparent about being AI, neutral third party +INTRO_TEMPLATE = """hi {recipient_name}, + +i'm an AI that connects isolated builders working on similar things. 
+ +you're building: {recipient_summary} + +{other_name} is building: {other_summary} + +overlap: {overlap_summary} + +thought you might benefit from knowing each other. + +their work: {other_url} + +no pitch. just connection. ignore if not useful. + +- connectd +""" + +# shorter version for platforms with character limits +SHORT_TEMPLATE = """hi {recipient_name} - i'm an AI connecting aligned builders. + +you: {recipient_summary} +{other_name}: {other_summary} + +overlap: {overlap_summary} + +their work: {other_url} + +no pitch, just connection. +""" + + +def summarize_human(human_data): + """generate a brief summary of what someone is building/interested in""" + parts = [] + + # name or username + name = human_data.get('name') or human_data.get('username', 'unknown') + + # platform context + platform = human_data.get('platform', '') + + # signals/interests + signals = human_data.get('signals', []) + if isinstance(signals, str): + signals = json.loads(signals) + + # extra data + extra = human_data.get('extra', {}) + if isinstance(extra, str): + extra = json.loads(extra) + + # build summary based on available data + topics = extra.get('topics', []) + languages = list(extra.get('languages', {}).keys())[:3] + repo_count = extra.get('repo_count', 0) + subreddits = extra.get('subreddits', []) + + if platform == 'github': + if topics: + parts.append(f"working on {', '.join(topics[:3])}") + if languages: + parts.append(f"using {', '.join(languages)}") + if repo_count > 10: + parts.append(f"({repo_count} repos)") + + elif platform == 'reddit': + if subreddits: + parts.append(f"active in r/{', r/'.join(subreddits[:3])}") + + elif platform == 'mastodon': + instance = extra.get('instance', '') + if instance: + parts.append(f"on {instance}") + + elif platform == 'lobsters': + karma = extra.get('karma', 0) + if karma > 50: + parts.append(f"active on lobste.rs ({karma} karma)") + + # add key signals + key_signals = [s for s in signals if s in ['selfhosted', 'privacy', 
'cooperative', + 'solarpunk', 'intentional_community', + 'home_automation', 'foss']] + if key_signals: + parts.append(f"interested in {', '.join(key_signals[:3])}") + + if not parts: + parts.append(f"builder on {platform}") + + return ' | '.join(parts) + + +def summarize_overlap(overlap_data): + """generate overlap summary""" + reasons = overlap_data.get('overlap_reasons', []) + if isinstance(reasons, str): + reasons = json.loads(reasons) + + if reasons: + return ' | '.join(reasons[:3]) + + # fallback + shared = overlap_data.get('shared_signals', []) + if shared: + return f"shared interests: {', '.join(shared[:3])}" + + return "aligned values and interests" + + +def draft_intro(match_data, recipient='a'): + """ + draft an intro message for a match + + match_data: dict with human_a, human_b, overlap info + recipient: 'a' or 'b' - who receives this intro + + returns: dict with draft text, channel, metadata + """ + if recipient == 'a': + recipient_human = match_data['human_a'] + other_human = match_data['human_b'] + else: + recipient_human = match_data['human_b'] + other_human = match_data['human_a'] + + # get names + recipient_name = recipient_human.get('name') or recipient_human.get('username', 'friend') + other_name = other_human.get('name') or other_human.get('username', 'someone') + + # generate summaries + recipient_summary = summarize_human(recipient_human) + other_summary = summarize_human(other_human) + overlap_summary = summarize_overlap(match_data) + + # other's url + other_url = other_human.get('url', '') + + # determine best channel + contact = recipient_human.get('contact', {}) + if isinstance(contact, str): + contact = json.loads(contact) + + channel = None + channel_address = None + + # prefer email if available + if contact.get('email'): + channel = 'email' + channel_address = contact['email'] + # github issue/discussion + elif recipient_human.get('platform') == 'github': + channel = 'github' + channel_address = recipient_human.get('url') + # mastodon 
DM + elif recipient_human.get('platform') == 'mastodon': + channel = 'mastodon' + channel_address = recipient_human.get('username') + # reddit message + elif recipient_human.get('platform') == 'reddit': + channel = 'reddit' + channel_address = recipient_human.get('username') + else: + channel = 'manual' + channel_address = recipient_human.get('url') + + # choose template based on channel + if channel in ['mastodon', 'reddit']: + template = SHORT_TEMPLATE + else: + template = INTRO_TEMPLATE + + # render draft + draft = template.format( + recipient_name=recipient_name.split()[0] if recipient_name else 'friend', # first name only + recipient_summary=recipient_summary, + other_name=other_name.split()[0] if other_name else 'someone', + other_summary=other_summary, + overlap_summary=overlap_summary, + other_url=other_url, + ) + + return { + 'recipient_human': recipient_human, + 'other_human': other_human, + 'channel': channel, + 'channel_address': channel_address, + 'draft': draft, + 'overlap_score': match_data.get('overlap_score', 0), + 'match_id': match_data.get('id'), + } + + +def draft_intros_for_match(match_data): + """ + draft intros for both parties in a match + returns list of two intro dicts + """ + intro_a = draft_intro(match_data, recipient='a') + intro_b = draft_intro(match_data, recipient='b') + + return [intro_a, intro_b] diff --git a/connectd/introd/groq_draft.py b/connectd/introd/groq_draft.py new file mode 100644 index 0000000..26ed004 --- /dev/null +++ b/connectd/introd/groq_draft.py @@ -0,0 +1,437 @@ +""" +introd/groq_draft.py - groq llama 4 maverick for smart intro drafting + +uses groq api to generate personalized, natural intro messages +that don't sound like ai-generated slop +""" + +import os +import json +import requests +from datetime import datetime + +GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') +GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' +MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile') + + +def 
determine_contact_method(human): + """ + determine best contact method based on WHERE THEY'RE MOST ACTIVE + + don't use fixed hierarchy - analyze activity per platform: + - count posts/commits/activity + - weight by recency (last 30 days matters more) + - contact them where they already are + - fall back to email only if no social activity + """ + from datetime import datetime, timedelta + + extra = human.get('extra', {}) + if isinstance(extra, str): + extra = json.loads(extra) if extra else {} + + # handle nested extra.extra from old save format + if 'extra' in extra and isinstance(extra['extra'], dict): + extra = {**extra, **extra['extra']} + + contact = human.get('contact', {}) + if isinstance(contact, str): + contact = json.loads(contact) if contact else {} + + # collect activity scores per platform + activity_scores = {} + now = datetime.now() + thirty_days_ago = now - timedelta(days=30) + ninety_days_ago = now - timedelta(days=90) + + # github activity + github_username = human.get('username') if human.get('platform') == 'github' else extra.get('github') + if github_username: + github_score = 0 + top_repos = extra.get('top_repos', []) + + for repo in top_repos: + # recent commits weight more + pushed_at = repo.get('pushed_at', '') + if pushed_at: + try: + push_date = datetime.fromisoformat(pushed_at.replace('Z', '+00:00')).replace(tzinfo=None) + if push_date > thirty_days_ago: + github_score += 10 # very recent + elif push_date > ninety_days_ago: + github_score += 5 # somewhat recent + else: + github_score += 1 # old but exists + except: + github_score += 1 + + # stars indicate engagement + github_score += min(repo.get('stars', 0) // 10, 5) + + # commit activity from deep scrape + commit_count = extra.get('commit_count', 0) + github_score += min(commit_count // 10, 20) + + if github_score > 0: + activity_scores['github_issue'] = { + 'score': github_score, + 'info': f"{github_username}/{top_repos[0]['name']}" if top_repos else github_username + } + + # 
mastodon activity + mastodon_handle = extra.get('mastodon') or contact.get('mastodon') + if mastodon_handle: + mastodon_score = 0 + statuses_count = extra.get('mastodon_statuses', 0) or human.get('statuses_count', 0) + + # high post count = active user + if statuses_count > 1000: + mastodon_score += 30 + elif statuses_count > 500: + mastodon_score += 20 + elif statuses_count > 100: + mastodon_score += 10 + elif statuses_count > 0: + mastodon_score += 5 + + # platform bonus for fediverse (values-aligned) + mastodon_score += 10 + + # bonus if handle was discovered via rel="me" or similar verification + # (having a handle linked from their website = they want to be contacted there) + handles = extra.get('handles', {}) + if handles.get('mastodon') == mastodon_handle: + mastodon_score += 15 # verified handle bonus + + if mastodon_score > 0: + activity_scores['mastodon'] = {'score': mastodon_score, 'info': mastodon_handle} + + # bluesky activity + bluesky_handle = extra.get('bluesky') or contact.get('bluesky') + if bluesky_handle: + bluesky_score = 0 + posts_count = extra.get('bluesky_posts', 0) or human.get('posts_count', 0) + + if posts_count > 500: + bluesky_score += 25 + elif posts_count > 100: + bluesky_score += 15 + elif posts_count > 0: + bluesky_score += 5 + + # newer platform, slightly lower weight + bluesky_score += 5 + + if bluesky_score > 0: + activity_scores['bluesky'] = {'score': bluesky_score, 'info': bluesky_handle} + + # twitter activity + twitter_handle = extra.get('twitter') or contact.get('twitter') + if twitter_handle: + twitter_score = 0 + tweets_count = extra.get('twitter_tweets', 0) + + if tweets_count > 1000: + twitter_score += 20 + elif tweets_count > 100: + twitter_score += 10 + elif tweets_count > 0: + twitter_score += 5 + + # if we found them via twitter hashtags, they're active there + if human.get('platform') == 'twitter': + twitter_score += 15 + + if twitter_score > 0: + activity_scores['twitter'] = {'score': twitter_score, 'info': 
twitter_handle} + + # NOTE: reddit is DISCOVERY ONLY, not a contact method + # we find users on reddit but reach out via their external links (github, mastodon, etc.) + # reddit-only users go to manual_queue for review + + # lobsters activity + lobsters_username = extra.get('lobsters') or contact.get('lobsters') + if lobsters_username or human.get('platform') == 'lobsters': + lobsters_score = 0 + lobsters_username = lobsters_username or human.get('username') + + karma = extra.get('lobsters_karma', 0) or human.get('karma', 0) + + # lobsters is invite-only, high signal + lobsters_score += 15 + + if karma > 100: + lobsters_score += 15 + elif karma > 50: + lobsters_score += 10 + elif karma > 0: + lobsters_score += 5 + + if lobsters_score > 0: + activity_scores['lobsters'] = {'score': lobsters_score, 'info': lobsters_username} + + # matrix activity + matrix_id = extra.get('matrix') or contact.get('matrix') + if matrix_id: + matrix_score = 0 + + # matrix users are typically privacy-conscious and technical + matrix_score += 15 # platform bonus for decentralized chat + + # bonus if handle was discovered via rel="me" verification + handles = extra.get('handles', {}) + if handles.get('matrix') == matrix_id: + matrix_score += 10 # verified handle bonus + + if matrix_score > 0: + activity_scores['matrix'] = {'score': matrix_score, 'info': matrix_id} + + # lemmy activity (fediverse) + lemmy_username = human.get('username') if human.get('platform') == 'lemmy' else extra.get('lemmy') + if lemmy_username: + lemmy_score = 0 + + # lemmy is fediverse - high values alignment + lemmy_score += 20 # fediverse platform bonus + + post_count = extra.get('post_count', 0) + comment_count = extra.get('comment_count', 0) + + if post_count > 100: + lemmy_score += 15 + elif post_count > 50: + lemmy_score += 10 + elif post_count > 10: + lemmy_score += 5 + + if comment_count > 500: + lemmy_score += 10 + elif comment_count > 100: + lemmy_score += 5 + + if lemmy_score > 0: + activity_scores['lemmy'] 
= {'score': lemmy_score, 'info': lemmy_username} + + # pick highest activity platform + if activity_scores: + best_platform = max(activity_scores.items(), key=lambda x: x[1]['score']) + return best_platform[0], best_platform[1]['info'] + + # fall back to email ONLY if no social activity detected + email = extra.get('email') or contact.get('email') + # also check emails list + if not email: + emails = extra.get('emails') or contact.get('emails') or [] + for e in emails: + if e and '@' in e and 'noreply' not in e.lower(): + email = e + break + + if email and '@' in email and 'noreply' not in email.lower(): + return 'email', email + + # last resort: manual + return 'manual', None + + +def draft_intro_with_llm(match_data, recipient='a', dry_run=False): + """ + use groq llama 4 maverick to draft a personalized intro + + match_data should contain: + - human_a: the first person + - human_b: the second person + - overlap_score: numeric score + - overlap_reasons: list of why they match + + recipient: 'a' or 'b' - who we're writing to + """ + if not GROQ_API_KEY: + return None, "GROQ_API_KEY not set" + + # determine recipient and other person + if recipient == 'a': + to_person = match_data.get('human_a', {}) + other_person = match_data.get('human_b', {}) + else: + to_person = match_data.get('human_b', {}) + other_person = match_data.get('human_a', {}) + + # build context + to_name = to_person.get('name') or to_person.get('username', 'friend') + other_name = other_person.get('name') or other_person.get('username', 'someone') + + to_signals = to_person.get('signals', []) + if isinstance(to_signals, str): + to_signals = json.loads(to_signals) if to_signals else [] + + other_signals = other_person.get('signals', []) + if isinstance(other_signals, str): + other_signals = json.loads(other_signals) if other_signals else [] + + overlap_reasons = match_data.get('overlap_reasons', []) + if isinstance(overlap_reasons, str): + overlap_reasons = json.loads(overlap_reasons) if 
overlap_reasons else [] + + # parse extra data + to_extra = to_person.get('extra', {}) + other_extra = other_person.get('extra', {}) + if isinstance(to_extra, str): + to_extra = json.loads(to_extra) if to_extra else {} + if isinstance(other_extra, str): + other_extra = json.loads(other_extra) if other_extra else {} + + # build profile summaries + to_profile = f""" +name: {to_name} +platform: {to_person.get('platform', 'unknown')} +bio: {to_person.get('bio') or 'no bio'} +location: {to_person.get('location') or 'unknown'} +signals: {', '.join(to_signals[:8])} +repos: {len(to_extra.get('top_repos', []))} public repos +languages: {', '.join(to_extra.get('languages', {}).keys())} +""" + + other_profile = f""" +name: {other_name} +platform: {other_person.get('platform', 'unknown')} +bio: {other_person.get('bio') or 'no bio'} +location: {other_person.get('location') or 'unknown'} +signals: {', '.join(other_signals[:8])} +repos: {len(other_extra.get('top_repos', []))} public repos +languages: {', '.join(other_extra.get('languages', {}).keys())} +url: {other_person.get('url', '')} +""" + + # build prompt + system_prompt = """you are connectd, an ai that connects isolated builders who share values but don't know each other yet. + +your job is to write a short, genuine intro message to one person about another person they might want to know. + +rules: +- be brief (3-5 sentences max) +- be genuine, not salesy or fake +- focus on WHY they might want to connect, not just WHAT they have in common +- don't be cringe or use buzzwords +- lowercase preferred (casual tone) +- no emojis unless the person's profile suggests they'd like them +- mention specific things from their profiles, not generic "you both like open source" +- end with a simple invitation, not a hard sell +- sign off as "- connectd" (lowercase) + +bad examples: +- "I noticed you're both passionate about..." (too formal) +- "You two would be PERFECT for each other!" (too salesy) +- "As a fellow privacy enthusiast..." 
(cringe) + +good examples: +- "hey, saw you're building X. there's someone else working on similar stuff in Y who might be interesting to know." +- "you might want to check out Z's work on federated systems - similar approach to what you're doing with A." +""" + + user_prompt = f"""write an intro message to {to_name} about {other_name}. + +RECIPIENT ({to_name}): +{to_profile} + +INTRODUCING ({other_name}): +{other_profile} + +WHY THEY MATCH (overlap score {match_data.get('overlap_score', 0)}): +{', '.join(overlap_reasons[:5])} + +write a short intro message. remember: lowercase, genuine, not salesy.""" + + try: + response = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [ + {'role': 'system', 'content': system_prompt}, + {'role': 'user', 'content': user_prompt}, + ], + 'temperature': 0.7, + 'max_tokens': 300, + }, + timeout=30, + ) + + if response.status_code != 200: + return None, f"groq api error: {response.status_code} - {response.text}" + + data = response.json() + draft = data['choices'][0]['message']['content'].strip() + + # determine contact method for recipient + contact_method, contact_info = determine_contact_method(to_person) + + return { + 'draft': draft, + 'model': MODEL, + 'to': to_name, + 'about': other_name, + 'overlap_score': match_data.get('overlap_score', 0), + 'contact_method': contact_method, + 'contact_info': contact_info, + 'generated_at': datetime.now().isoformat(), + }, None + + except Exception as e: + return None, f"groq error: {str(e)}" + + +def draft_intro_batch(matches, dry_run=False): + """ + draft intros for multiple matches + returns list of (match, intro_result, error) tuples + """ + results = [] + + for match in matches: + # draft for both directions + intro_a, err_a = draft_intro_with_llm(match, recipient='a', dry_run=dry_run) + intro_b, err_b = draft_intro_with_llm(match, recipient='b', dry_run=dry_run) + 
+ results.append({ + 'match': match, + 'intro_to_a': intro_a, + 'intro_to_b': intro_b, + 'errors': [err_a, err_b], + }) + + return results + + +def test_groq_connection(): + """test that groq api is working""" + if not GROQ_API_KEY: + return False, "GROQ_API_KEY not set" + + try: + response = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [{'role': 'user', 'content': 'say "ok" and nothing else'}], + 'max_tokens': 10, + }, + timeout=10, + ) + + if response.status_code == 200: + return True, "groq api working" + else: + return False, f"groq api error: {response.status_code}" + + except Exception as e: + return False, f"groq connection error: {str(e)}" diff --git a/connectd/introd/lost_intro.py b/connectd/introd/lost_intro.py new file mode 100644 index 0000000..e98709b --- /dev/null +++ b/connectd/introd/lost_intro.py @@ -0,0 +1,250 @@ +""" +introd/lost_intro.py - intro drafting for lost builders + +different tone than builder-to-builder intros. +these people need encouragement, not networking. + +the goal isn't to recruit them. it's to show them the door exists. +they take it or they don't. but they'll know someone saw them. +""" + +import os +import json +import requests +from datetime import datetime + +GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') +GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' +MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile') + + +LOST_INTRO_TEMPLATE = """hey {name}, + +i'm connectd. i'm a daemon that finds people who might need a nudge. + +i noticed you're interested in {interests}. you ask good questions. you clearly get it. + +but maybe you haven't built anything yet. or you started and stopped. or you don't think you can. + +that's okay. most people don't. + +but some people do. here's one: {builder_name} ({builder_url}) + +{builder_description} + +they started where you are. 
look at what they built. + +you're not behind. you're just not started yet. + +no pressure. just wanted you to know someone noticed. + +- connectd""" + + +SYSTEM_PROMPT = """you are connectd, a daemon that finds isolated builders with aligned values and connects them. + +right now you're reaching out to someone who has POTENTIAL but hasn't found it yet. maybe they gave up, maybe they're stuck, maybe they don't believe they can do it. + +your job is to: +1. acknowledge where they are without being condescending +2. point them to an active builder who could inspire them +3. be genuine, not salesy or motivational-speaker-y +4. keep it short - these people are tired, don't overwhelm them +5. use lowercase, be human, no corporate bullshit +6. make it clear there's no pressure, no follow-up spam + +you're not recruiting. you're not selling. you're just showing them a door. + +the template structure: +- acknowledge them (you noticed something about them) +- normalize where they are (most people don't build things) +- show them someone who did (the builder) +- brief encouragement (you're not behind, just not started) +- sign off with no pressure + +do NOT: +- be preachy or lecture them +- use motivational cliches ("you got this!", "believe in yourself!") +- make promises about outcomes +- be too long - they don't have energy for long messages +- make them feel bad about where they are""" + + +def draft_lost_intro(lost_user, inspiring_builder, config=None): + """ + draft an intro for a lost builder, pairing them with an inspiring active builder. 
+ + lost_user: the person who needs a nudge + inspiring_builder: an active builder with similar interests who could inspire them + """ + config = config or {} + + # gather info about lost user + lost_name = lost_user.get('name') or lost_user.get('username', 'there') + lost_signals = lost_user.get('lost_signals', []) + lost_interests = extract_interests(lost_user) + + # gather info about inspiring builder + builder_name = inspiring_builder.get('name') or inspiring_builder.get('username') + builder_url = inspiring_builder.get('url') or f"https://github.com/{inspiring_builder.get('username')}" + builder_description = create_builder_description(inspiring_builder) + + # use LLM to personalize + if GROQ_API_KEY and config.get('use_llm', True): + return draft_with_llm(lost_user, inspiring_builder, lost_interests, builder_description) + + # fallback to template + return LOST_INTRO_TEMPLATE.format( + name=lost_name, + interests=', '.join(lost_interests[:3]) if lost_interests else 'building things', + builder_name=builder_name, + builder_url=builder_url, + builder_description=builder_description, + ), None + + +def extract_interests(user): + """extract interests from user profile""" + interests = [] + + # from topics/tags + extra = user.get('extra', {}) + if isinstance(extra, str): + try: + extra = json.loads(extra) + except: + extra = {} + + topics = extra.get('topics', []) or extra.get('aligned_topics', []) + interests.extend(topics[:5]) + + # from subreddits + subreddits = user.get('subreddits', []) + for sub in subreddits[:3]: + if sub.lower() not in ['learnprogramming', 'findapath', 'getdisciplined']: + interests.append(sub) + + # from bio keywords + bio = user.get('bio') or '' + bio_lower = bio.lower() + + interest_keywords = [ + 'rust', 'python', 'javascript', 'go', 'linux', 'self-hosting', 'homelab', + 'privacy', 'security', 'open source', 'foss', 'decentralized', 'ai', 'ml', + 'web dev', 'backend', 'frontend', 'devops', 'data', 'automation', + ] + + for kw in 
interest_keywords: + if kw in bio_lower and kw not in interests: + interests.append(kw) + + return interests[:5] if interests else ['technology', 'building things'] + + +def create_builder_description(builder): + """create a brief description of what the builder has done""" + extra = builder.get('extra', {}) + if isinstance(extra, str): + try: + extra = json.loads(extra) + except: + extra = {} + + parts = [] + + # what they build + repos = extra.get('top_repos', [])[:3] + if repos: + repo_names = [r.get('name') for r in repos if r.get('name')] + if repo_names: + parts.append(f"they've built things like {', '.join(repo_names[:2])}") + + # their focus + topics = extra.get('aligned_topics', []) or extra.get('topics', []) + if topics: + parts.append(f"they work on {', '.join(topics[:3])}") + + # their vibe + signals = builder.get('signals', []) + if 'self-hosted' in str(signals).lower(): + parts.append("they're into self-hosting and owning their own infrastructure") + if 'privacy' in str(signals).lower(): + parts.append("they care about privacy") + if 'community' in str(signals).lower(): + parts.append("they're community-focused") + + if parts: + return '. '.join(parts) + '.' + else: + return "they're building cool stuff in the open." 
+ + +def draft_with_llm(lost_user, inspiring_builder, interests, builder_description): + """use LLM to draft personalized intro""" + + lost_name = lost_user.get('name') or lost_user.get('username', 'there') + lost_signals = lost_user.get('lost_signals', []) + lost_bio = lost_user.get('bio', '') + + builder_name = inspiring_builder.get('name') or inspiring_builder.get('username') + builder_url = inspiring_builder.get('url') or f"https://github.com/{inspiring_builder.get('username')}" + + user_prompt = f"""draft an intro for this lost builder: + +LOST USER: +- name: {lost_name} +- interests: {', '.join(interests)} +- signals detected: {', '.join(lost_signals[:5]) if lost_signals else 'general stuck/aspiring patterns'} +- bio: {lost_bio[:200] if lost_bio else 'none'} + +INSPIRING BUILDER TO SHOW THEM: +- name: {builder_name} +- url: {builder_url} +- what they do: {builder_description} + +write a short, genuine message. no fluff. no motivational cliches. just human. +keep it under 150 words. +use lowercase. 
+end with "- connectd" +""" + + try: + resp = requests.post( + GROQ_API_URL, + headers={ + 'Authorization': f'Bearer {GROQ_API_KEY}', + 'Content-Type': 'application/json', + }, + json={ + 'model': MODEL, + 'messages': [ + {'role': 'system', 'content': SYSTEM_PROMPT}, + {'role': 'user', 'content': user_prompt}, + ], + 'temperature': 0.7, + 'max_tokens': 500, + }, + timeout=30, + ) + + if resp.status_code == 200: + content = resp.json()['choices'][0]['message']['content'] + return content.strip(), None + else: + return None, f"llm error: {resp.status_code}" + + except Exception as e: + return None, str(e) + + +def get_lost_intro_config(): + """get configuration for lost builder outreach""" + return { + 'enabled': True, + 'max_per_day': 5, # lower volume, higher care + 'require_review': True, # always manual approval + 'cooldown_days': 90, # don't spam struggling people + 'min_lost_score': 40, + 'min_values_score': 20, + 'use_llm': True, + } diff --git a/connectd/introd/review.py b/connectd/introd/review.py new file mode 100644 index 0000000..0c3fefc --- /dev/null +++ b/connectd/introd/review.py @@ -0,0 +1,126 @@ +""" +introd/review.py - human approval queue before sending +""" + +import json +from datetime import datetime + + +def get_pending_intros(db, limit=50): + """ + get all intros pending human review + + returns list of intro dicts with full context + """ + rows = db.get_pending_intros(limit=limit) + + intros = [] + for row in rows: + # get associated match and humans + match_id = row.get('match_id') + recipient_id = row.get('recipient_human_id') + + recipient = db.get_human_by_id(recipient_id) if recipient_id else None + + intros.append({ + 'id': row['id'], + 'match_id': match_id, + 'recipient': recipient, + 'channel': row.get('channel'), + 'draft': row.get('draft'), + 'status': row.get('status'), + }) + + return intros + + +def approve_intro(db, intro_id, approved_by='human'): + """ + approve an intro for sending + + intro_id: database id of the intro + 
def reject_intro(db, intro_id, reason=None):
    """reject an intro so it is never sent.

    the rejection (and optional free-text reason) is stored in the
    approved_by column as an audit trail.
    """
    if reason:
        audit_note = f"rejected: {reason}"
    else:
        audit_note = "rejected"

    stamp = datetime.now().isoformat()

    cursor = db.conn.cursor()
    cursor.execute('''UPDATE intros SET status = 'rejected',
                 approved_at = ?, approved_by = ? WHERE id = ?''',
                   (stamp, audit_note, intro_id))
    db.conn.commit()
    print(f"introd: rejected intro {intro_id}")
def review_all_pending(db):
    """walk a human through every pending intro, one at a time.

    after each intro the reviewer is asked whether to continue, so the
    session can be cut short at any point. prints a summary at the end.
    """
    queue = get_pending_intros(db)
    if not queue:
        print("no pending intros to review")
        return

    print(f"\n{len(queue)} intros pending review\n")

    tallies = {'approve': 0, 'reject': 0, 'skip': 0}

    for intro in queue:
        outcome = review_intro_interactive(db, intro)
        # anything other than an explicit approve/reject counts as a skip
        if outcome not in ('approve', 'reject'):
            outcome = 'skip'
        tallies[outcome] += 1

        # let the reviewer bail out between intros
        if input("\ncontinue reviewing? [y/n] ").strip().lower() != 'y':
            break

    print(f"\nreview complete: {tallies['approve']} approved, "
          f"{tallies['reject']} rejected, {tallies['skip']} skipped")
def send_github_issue(repo_url, title, body):
    """stub for delivering an intro as a github issue (needs GITHUB_TOKEN).

    note: creating issues only works with write access to the repo, which
    we almost never have for discovered users, so this deliberately always
    falls back to manual outreach. returns (False, reason).
    """
    # https://github.com/owner/repo -> owner / repo
    segments = repo_url.rstrip('/').split('/')
    if len(segments) < 2:
        return False, "invalid github url"

    owner, repo = segments[-2], segments[-1]  # parsed but unused by the stub

    if not os.environ.get('GITHUB_TOKEN'):
        return False, "no github token"

    # opening issues on strangers' repos is invasive - never automate this;
    # the caller routes the intro to the manual-outreach export instead
    return False, "github issues not automated - use manual outreach"
False + error = None + + if channel == 'email': + # get email from contact + import json + contact = recipient.get('contact', {}) + if isinstance(contact, str): + contact = json.loads(contact) + + email = contact.get('email') + if email: + success, error = send_email( + email, + "connection: aligned builder intro", + draft + ) + else: + error = "no email address" + + elif channel == 'github': + success, error = send_github_issue( + recipient.get('url'), + "connection: aligned builder intro", + draft + ) + + elif channel == 'mastodon': + success, error = send_mastodon_dm( + recipient.get('instance'), + recipient.get('username'), + draft + ) + + elif channel == 'reddit': + success, error = send_reddit_message( + recipient.get('username'), + "connection: aligned builder intro", + draft + ) + + else: + error = f"unknown channel: {channel}" + + # update status + if success: + db.mark_intro_sent(intro_id) + print(f"introd: sent intro {intro_id} via {channel}") + else: + # mark as needs manual sending + c.execute('''UPDATE intros SET status = 'manual_needed', + approved_at = ? 
def send_all_approved(db):
    """
    send all approved intros.

    fetches every intro whose status is 'approved', attempts delivery via
    send_intro (which downgrades failures to 'manual_needed'), and prints a
    summary. returns None.
    """
    c = db.conn.cursor()
    # bound parameter instead of double-quoted "approved": the original form
    # only worked via SQLite's legacy double-quoted-string misfeature and
    # breaks when that quirk (DQS) is disabled or the query is ported
    c.execute('SELECT id FROM intros WHERE status = ?', ('approved',))
    rows = c.fetchall()

    if not rows:
        print("no approved intros to send")
        return

    print(f"sending {len(rows)} approved intros...")

    sent = 0
    failed = 0

    for row in rows:
        # NOTE(review): row['id'] assumes db.conn.row_factory is sqlite3.Row
        # (set by the db layer) - confirm against db module
        success, error = send_intro(db, row['id'])
        if success:
            sent += 1
        else:
            failed += 1

    print(f"sent: {sent}, failed/manual: {failed}")
"""
matchd/fingerprint.py - generate values profiles for humans
"""

import json
from collections import defaultdict

# values dimensions we track
VALUES_DIMENSIONS = [
    'privacy',           # surveillance concern, degoogle, self-hosted
    'decentralization',  # p2p, fediverse, local-first
    'cooperation',       # coops, mutual aid, community
    'queer_friendly',    # lgbtq+, pronouns
    'environmental',     # solarpunk, degrowth, sustainability
    'anticapitalist',    # post-capitalism, worker ownership
    'builder',           # creates vs consumes
    'pnw_oriented',      # pacific northwest connection
]

# skill categories
SKILL_CATEGORIES = [
    'backend',    # python, go, rust, databases
    'frontend',   # js, react, css
    'devops',     # docker, k8s, linux admin
    'hardware',   # electronics, embedded, iot
    'design',     # ui/ux, graphics
    'community',  # organizing, facilitation
    'writing',    # documentation, content
]

# maps a detected signal to the values dimension it contributes to
SIGNAL_TO_DIMENSION = {
    'privacy': 'privacy',
    'selfhosted': 'privacy',
    'degoogle': 'privacy',
    'decentralized': 'decentralization',
    'local_first': 'decentralization',
    'p2p': 'decentralization',
    'federated_chat': 'decentralization',
    'foss': 'decentralization',
    'cooperative': 'cooperation',
    'community': 'cooperation',
    'mutual_aid': 'cooperation',
    'intentional_community': 'cooperation',
    'queer': 'queer_friendly',
    'pronouns': 'queer_friendly',
    'blm': 'queer_friendly',
    'acab': 'queer_friendly',
    'solarpunk': 'environmental',
    'anticapitalist': 'anticapitalist',
    'pnw': 'pnw_oriented',
    'pnw_state': 'pnw_oriented',
    'remote': 'pnw_oriented',
    'home_automation': 'builder',
    'modern_lang': 'builder',
    'unix': 'builder',
    'containers': 'builder',
}

# maps a (lowercased) programming language to a skill category
LANGUAGE_TO_SKILL = {
    'python': 'backend',
    'go': 'backend',
    'rust': 'backend',
    'java': 'backend',
    'ruby': 'backend',
    'php': 'backend',
    'javascript': 'frontend',
    'typescript': 'frontend',
    'html': 'frontend',
    'css': 'frontend',
    'vue': 'frontend',
    'shell': 'devops',
    'dockerfile': 'devops',
    'nix': 'devops',
    'hcl': 'devops',
    'c': 'hardware',
    'c++': 'hardware',
    'arduino': 'hardware',
    'verilog': 'hardware',
}


def generate_fingerprint(human_data):
    """build a values fingerprint for one human record.

    input: human dict from the database; the signals/extra fields may still
    be json-encoded strings.
    output: dict with values_vector (all dimensions, 0-1), skills (0-1),
    interests, location_pref, availability, keyed to the human's id.
    """
    # stored json fields may arrive encoded
    signals = human_data.get('signals', [])
    if isinstance(signals, str):
        signals = json.loads(signals)

    extra = human_data.get('extra', {})
    if isinstance(extra, str):
        extra = json.loads(extra)

    languages = extra.get('languages', {})
    topics = extra.get('topics', [])

    # tally value dimensions from detected signals
    raw_values = defaultdict(float)
    for sig in signals:
        dim = SIGNAL_TO_DIMENSION.get(sig)
        if dim:
            raw_values[dim] += 1.0

    # scale to 0-1 against the strongest dimension
    peak = max(raw_values.values()) if raw_values else 1
    values_vector = {dim: min(count / peak, 1.0) for dim, count in raw_values.items()}

    # every tracked dimension is present, absent ones at 0
    for dim in VALUES_DIMENSIONS:
        values_vector.setdefault(dim, 0.0)

    # skills: each mapped language contributes its share of repos,
    # then everything is normalized to the strongest skill
    skill_weights = defaultdict(float)
    repo_total = sum(languages.values()) if languages else 1
    for lang, count in languages.items():
        bucket = LANGUAGE_TO_SKILL.get(lang.lower())
        if bucket:
            skill_weights[bucket] += count / repo_total
    if skill_weights:
        top = max(skill_weights.values())
        skill_weights = {k: min(v / top, 1.0) for k, v in skill_weights.items()}

    # interests are the union of profile topics and detected signals
    interests = list(set(topics + signals))

    # coarse location preference: explicit pnw signals win, then remote,
    # then a keyword scan of the free-text location
    location_pref = None
    if 'pnw' in signals or 'pnw_state' in signals:
        location_pref = 'pnw'
    elif 'remote' in signals:
        location_pref = 'remote'
    elif human_data.get('location'):
        loc = human_data['location'].lower()
        if any(x in loc for x in ['seattle', 'portland', 'washington', 'oregon', 'pnw', 'cascadia']):
            location_pref = 'pnw'

    # availability from the hireable flag when the platform exposes one
    availability = 'open' if extra.get('hireable') else None

    return {
        'human_id': human_data.get('id'),
        'values_vector': dict(values_vector),
        'skills': dict(skill_weights),
        'interests': interests,
        'location_pref': location_pref,
        'availability': availability,
    }


def fingerprint_similarity(fp_a, fp_b):
    """score two fingerprints on a 0-1 scale.

    weighted blend: cosine similarity of value vectors (50%), jaccard
    overlap of interests (30%), location compatibility (20%).
    """
    va = fp_a.get('values_vector', {})
    vb = fp_b.get('values_vector', {})

    dims = set(va.keys()) | set(vb.keys())
    if not dims:
        return 0.0

    # cosine similarity over the union of dimensions
    dot = sum(va.get(d, 0) * vb.get(d, 0) for d in dims)
    norm_a = sum(x ** 2 for x in va.values()) ** 0.5
    norm_b = sum(x ** 2 for x in vb.values()) ** 0.5
    values_sim = 0.0 if norm_a == 0 or norm_b == 0 else dot / (norm_a * norm_b)

    # jaccard overlap of interests
    ia = set(fp_a.get('interests', []))
    ib = set(fp_b.get('interests', []))
    interest_sim = len(ia & ib) / len(ia | ib) if (ia or ib) else 0.0

    # location compatibility: exact > remote on either side > pnw on either side
    loc_a = fp_a.get('location_pref')
    loc_b = fp_b.get('location_pref')
    if loc_a is not None and loc_a == loc_b:
        loc_sim = 1.0
    elif loc_a == 'remote' or loc_b == 'remote':
        loc_sim = 0.5
    elif loc_a == 'pnw' or loc_b == 'pnw':
        loc_sim = 0.3
    else:
        loc_sim = 0.0

    return (values_sim * 0.5) + (interest_sim * 0.3) + (loc_sim * 0.2)
"""
matchd/lost.py - lost builder matching

lost builders don't get matched to each other (both need energy).
they get matched to ACTIVE builders who can inspire them.

the goal: show them someone like them who made it.
"""

import json
# NOTE(review): find_overlap is imported but unused in this module
from .overlap import find_overlap, is_same_person


def find_inspiring_builder(lost_user, active_builders, db=None):
    """
    find an active builder who could inspire a lost builder.

    criteria:
    - shared interests (they need to relate to this person)
    - active builder has shipped real work (proof it's possible)
    - similar background signals if possible
    - NOT the same person across platforms

    returns (best_candidate_dict, None) on success or (None, error_string)
    when nothing matches. the db parameter is currently unused.
    """
    if not active_builders:
        return None, "no active builders available"

    # parse lost user data (stored fields may still be json-encoded strings)
    lost_signals = lost_user.get('signals', [])
    if isinstance(lost_signals, str):
        lost_signals = json.loads(lost_signals) if lost_signals else []

    lost_extra = lost_user.get('extra', {})
    if isinstance(lost_extra, str):
        lost_extra = json.loads(lost_extra) if lost_extra else {}

    # lost user interests: union of signals, topics and aligned topics
    lost_interests = set()
    lost_interests.update(lost_signals)
    lost_interests.update(lost_extra.get('topics', []))
    lost_interests.update(lost_extra.get('aligned_topics', []))

    # also include subreddits if from reddit (shows interests)
    subreddits = lost_user.get('subreddits', [])
    if isinstance(subreddits, str):
        subreddits = json.loads(subreddits) if subreddits else []
    lost_interests.update(subreddits)

    # score each active builder
    candidates = []

    for builder in active_builders:
        # skip if same person (cross-platform)
        if is_same_person(lost_user, builder):
            continue

        # get builder signals (same json-string handling as above)
        builder_signals = builder.get('signals', [])
        if isinstance(builder_signals, str):
            builder_signals = json.loads(builder_signals) if builder_signals else []

        builder_extra = builder.get('extra', {})
        if isinstance(builder_extra, str):
            builder_extra = json.loads(builder_extra) if builder_extra else {}

        # builder interests
        builder_interests = set()
        builder_interests.update(builder_signals)
        builder_interests.update(builder_extra.get('topics', []))
        builder_interests.update(builder_extra.get('aligned_topics', []))

        # calculate match score: 10 points per shared interest
        shared_interests = lost_interests & builder_interests
        match_score = len(shared_interests) * 10

        # bonus for high-value shared signals (+15 each)
        high_value_signals = ['privacy', 'selfhosted', 'home_automation', 'foss',
                              'solarpunk', 'cooperative', 'decentralized', 'queer']
        for signal in shared_interests:
            if signal in high_value_signals:
                match_score += 15

        # bonus if builder has shipped real work (proof it's possible)
        repos = builder_extra.get('top_repos', [])
        if len(repos) >= 5:
            match_score += 20  # they've built things
        elif len(repos) >= 2:
            match_score += 10

        # bonus for high stars (visible success)
        total_stars = sum(r.get('stars', 0) for r in repos) if repos else 0
        if total_stars >= 100:
            match_score += 15
        elif total_stars >= 20:
            match_score += 5

        # bonus for similar location (relatable)
        lost_loc = (lost_user.get('location') or '').lower()
        builder_loc = (builder.get('location') or '').lower()
        if lost_loc and builder_loc:
            pnw_keywords = ['seattle', 'portland', 'washington', 'oregon', 'pnw']
            if any(k in lost_loc for k in pnw_keywords) and any(k in builder_loc for k in pnw_keywords):
                match_score += 10

        # minimum threshold - need SOMETHING in common
        if match_score < 10:
            continue

        candidates.append({
            'builder': builder,
            'match_score': match_score,
            'shared_interests': list(shared_interests)[:5],
            'repos_count': len(repos),
            'total_stars': total_stars,
        })

    if not candidates:
        return None, "no matching active builders found"

    # sort by match score, return best
    candidates.sort(key=lambda x: x['match_score'], reverse=True)
    best = candidates[0]

    return best, None


def find_matches_for_lost_builders(db, min_lost_score=40, min_values_score=20, limit=10):
    """
    find inspiring builder matches for all lost builders ready for outreach.

    returns (matches, error): a list of match dicts (lost_user,
    inspiring_builder, match_score, shared_interests, builder_repos,
    builder_stars) and an error string when nothing could be matched.
    """
    # get lost builders ready for outreach
    lost_builders = db.get_lost_builders_for_outreach(
        min_lost_score=min_lost_score,
        min_values_score=min_values_score,
        limit=limit
    )

    if not lost_builders:
        return [], "no lost builders ready for outreach"

    # get active builders who can inspire
    active_builders = db.get_active_builders(min_score=50, limit=200)

    if not active_builders:
        return [], "no active builders available"

    matches = []

    for lost_user in lost_builders:
        best_match, error = find_inspiring_builder(lost_user, active_builders, db)

        if best_match:
            matches.append({
                'lost_user': lost_user,
                'inspiring_builder': best_match['builder'],
                'match_score': best_match['match_score'],
                'shared_interests': best_match['shared_interests'],
                'builder_repos': best_match['repos_count'],
                'builder_stars': best_match['total_stars'],
            })

    return matches, None
+ """ + lost = match_data['lost_user'] + builder = match_data['inspiring_builder'] + + lost_name = lost.get('name') or lost.get('username', 'someone') + builder_name = builder.get('name') or builder.get('username', 'a builder') + + lost_signals = match_data.get('lost_signals', []) + if isinstance(lost_signals, str): + lost_signals = json.loads(lost_signals) if lost_signals else [] + + shared = match_data.get('shared_interests', []) + + summary = f""" +lost builder: {lost_name} ({lost.get('platform')}) + lost score: {lost.get('lost_potential_score', 0)} + values score: {lost.get('score', 0)} + url: {lost.get('url')} + +inspiring builder: {builder_name} ({builder.get('platform')}) + score: {builder.get('score', 0)} + repos: {match_data.get('builder_repos', 0)} + stars: {match_data.get('builder_stars', 0)} + url: {builder.get('url')} + +match score: {match_data.get('match_score', 0)} +shared interests: {', '.join(shared) if shared else 'values alignment'} + +this lost builder needs to see that someone like them made it. 
def find_overlap(human_a, human_b, fp_a=None, fp_b=None):
    """
    analyze overlap between two humans.

    returns a dict with: overlap_score, shared signals/topics, complementary
    skills, geographic compatibility (+ reason), human-readable reasons, and
    (only when both fingerprints are supplied) a 0-1 fingerprint similarity.
    """
    # stored json fields may be lists/dicts, json strings, or empty strings;
    # guard empty strings before json.loads (consistent with matchd/lost.py,
    # which already uses the `if x else []` pattern - the unguarded form
    # raised JSONDecodeError on records with empty signals/extra)
    signals_a = human_a.get('signals', [])
    if isinstance(signals_a, str):
        signals_a = json.loads(signals_a) if signals_a else []

    signals_b = human_b.get('signals', [])
    if isinstance(signals_b, str):
        signals_b = json.loads(signals_b) if signals_b else []

    extra_a = human_a.get('extra', {})
    if isinstance(extra_a, str):
        extra_a = json.loads(extra_a) if extra_a else {}

    extra_b = human_b.get('extra', {})
    if isinstance(extra_b, str):
        extra_b = json.loads(extra_b) if extra_b else {}

    # shared signals (values both humans exhibit)
    shared_signals = list(set(signals_a) & set(signals_b))

    # shared topics
    topics_a = set(extra_a.get('topics', []))
    topics_b = set(extra_b.get('topics', []))
    shared_topics = list(topics_a & topics_b)

    # complementary skills (what one has that the other doesn't)
    langs_a = set(extra_a.get('languages', {}).keys())
    langs_b = set(extra_b.get('languages', {}).keys())
    complementary_langs = list((langs_a - langs_b) | (langs_b - langs_a))

    # geographic compatibility (handles missing location via `or ''`)
    loc_a = (human_a.get('location') or '').lower()
    loc_b = (human_b.get('location') or '').lower()

    pnw_keywords = ['seattle', 'portland', 'washington', 'oregon', 'pnw', 'cascadia', 'pacific northwest']
    remote_keywords = ['remote', 'anywhere', 'distributed']

    a_pnw = any(k in loc_a for k in pnw_keywords) or 'pnw' in signals_a
    b_pnw = any(k in loc_b for k in pnw_keywords) or 'pnw' in signals_b
    a_remote = any(k in loc_a for k in remote_keywords) or 'remote' in signals_a
    b_remote = any(k in loc_b for k in remote_keywords) or 'remote' in signals_b

    geographic_match = False
    geo_reason = None

    if a_pnw and b_pnw:
        geographic_match = True
        geo_reason = 'both in pnw'
    elif (a_pnw or b_pnw) and (a_remote or b_remote):
        geographic_match = True
        geo_reason = 'pnw + remote compatible'
    elif a_remote and b_remote:
        geographic_match = True
        geo_reason = 'both remote-friendly'

    # calculate overlap score
    base_score = 0

    # shared values (most important)
    base_score += len(shared_signals) * 10

    # shared interests
    base_score += len(shared_topics) * 5

    # complementary skills bonus (they can help each other), capped at 5
    if complementary_langs:
        base_score += min(len(complementary_langs), 5) * 3

    # geographic bonus
    if geographic_match:
        base_score += 20

    # fingerprint similarity if available (0-1 scaled to 0-50 points)
    fp_score = 0
    if fp_a and fp_b:
        fp_score = fingerprint_similarity(fp_a, fp_b) * 50

    total_score = base_score + fp_score

    # build human-readable reasons
    overlap_reasons = []
    if shared_signals:
        overlap_reasons.append(f"shared values: {', '.join(shared_signals[:5])}")
    if shared_topics:
        overlap_reasons.append(f"shared interests: {', '.join(shared_topics[:5])}")
    if geo_reason:
        overlap_reasons.append(geo_reason)
    if complementary_langs:
        overlap_reasons.append(f"complementary skills: {', '.join(complementary_langs[:5])}")

    return {
        'overlap_score': total_score,
        'shared_signals': shared_signals,
        'shared_topics': shared_topics,
        'complementary_skills': complementary_langs,
        'geographic_match': geographic_match,
        'geo_reason': geo_reason,
        'overlap_reasons': overlap_reasons,
        'fingerprint_similarity': fp_score / 50 if fp_a and fp_b else None,
    }
def rank_matches(matches):
    """rank matches by quality.

    annotates every match dict with a 'quality_score' (overlap score with
    multiplicative bonuses) and returns a new list sorted best-first.
    """
    for match in matches:
        quality = match.get('overlap_score', 0)

        # same-region pairs are easier to act on
        if match.get('geographic_match'):
            quality *= 1.2

        # strongly aligned value fingerprints
        fp_sim = match.get('fingerprint_similarity')
        if fp_sim and fp_sim > 0.7:
            quality *= 1.3

        # they can teach each other something
        if len(match.get('complementary_skills', [])) >= 3:
            quality *= 1.1

        match['quality_score'] = quality

    return sorted(matches, key=lambda m: m['quality_score'], reverse=True)


def find_all_matches(db, min_score=30, min_overlap=20):
    """find every potential match among humans above min_score.

    generates and persists fingerprints, skips same-person and
    already-connected pairs, saves each qualifying match to the db, and
    returns the matches ranked by quality.
    """
    print("matchd: finding all potential matches...")

    humans = db.get_all_humans(min_score=min_score)
    print(f"  {len(humans)} humans to match")

    # fingerprint everyone once up front
    fingerprints = {}
    for person in humans:
        fp = generate_fingerprint(person)
        fingerprints[person['id']] = fp
        db.save_fingerprint(person['id'], fp)

    print(f"  generated {len(fingerprints)} fingerprints")

    matches = []
    checked = 0
    skipped_same = 0
    skipped_connected = 0

    for left, right in combinations(humans, 2):
        checked += 1

        # same human seen on two platforms
        if is_same_person(left, right):
            skipped_same += 1
            continue

        # already know each other (same org, company, co-contributors)
        connected, _reason = check_already_connected(left, right)
        if connected:
            skipped_connected += 1
            continue

        overlap = find_overlap(left, right,
                               fingerprints.get(left['id']),
                               fingerprints.get(right['id']))

        if overlap['overlap_score'] >= min_overlap:
            matches.append({'human_a': left, 'human_b': right, **overlap})
            db.save_match(left['id'], right['id'], overlap)

        if checked % 1000 == 0:
            print(f"  checked {checked} pairs, {len(matches)} matches so far...")

    print(f"  checked {checked} pairs")
    print(f"  skipped {skipped_same} (same person), {skipped_connected} (already connected)")
    print(f"  found {len(matches)} potential matches")

    return rank_matches(matches)


def get_top_matches(db, limit=50):
    """load the top stored matches, resolving both humans from the db.

    rows whose humans can no longer be resolved are silently dropped.
    """
    resolved = []
    for row in db.get_matches(limit=limit):
        human_a = db.get_human_by_id(row['human_a_id'])
        human_b = db.get_human_by_id(row['human_b_id'])
        if not (human_a and human_b):
            continue
        resolved.append({
            'id': row['id'],
            'human_a': human_a,
            'human_b': human_b,
            'overlap_score': row['overlap_score'],
            'overlap_reasons': row['overlap_reasons'],
            'geographic_match': row['geographic_match'],
            'status': row['status'],
        })
    return resolved
DISCORD_TARGET_SERVERS=$(bashio::config 'discord_target_servers') + +export LEMMY_INSTANCE=$(bashio::config 'lemmy_instance') +export LEMMY_USERNAME=$(bashio::config 'lemmy_username') +export LEMMY_PASSWORD=$(bashio::config 'lemmy_password') + +export SMTP_HOST=$(bashio::config 'smtp_host') +export SMTP_PORT=$(bashio::config 'smtp_port') +export SMTP_USER=$(bashio::config 'smtp_user') +export SMTP_PASS=$(bashio::config 'smtp_pass') + +# set data paths +export DB_PATH=/data/db/connectd.db +export CACHE_DIR=/data/cache + +bashio::log.info "starting connectd daemon..." +bashio::log.info "HOST_USER: ${HOST_USER}" + +cd /app +exec python3 daemon.py diff --git a/connectd/scoutd/__init__.py b/connectd/scoutd/__init__.py new file mode 100644 index 0000000..5d2e192 --- /dev/null +++ b/connectd/scoutd/__init__.py @@ -0,0 +1,29 @@ +""" +scoutd - discovery module +finds humans across platforms +""" + +from .github import scrape_github, get_github_user +from .reddit import scrape_reddit +from .mastodon import scrape_mastodon +from .lobsters import scrape_lobsters +from .matrix import scrape_matrix +from .twitter import scrape_twitter +from .bluesky import scrape_bluesky +from .lemmy import scrape_lemmy +from .discord import scrape_discord, send_discord_dm +from .deep import ( + deep_scrape_github_user, check_already_connected, save_deep_profile, + determine_contact_method, get_cached_orgs, cache_orgs, + get_emails_from_commit_history, scrape_website_for_emails, +) + +__all__ = [ + 'scrape_github', 'scrape_reddit', 'scrape_mastodon', 'scrape_lobsters', + 'scrape_matrix', 'scrape_twitter', 'scrape_bluesky', 'scrape_lemmy', + 'scrape_discord', 'send_discord_dm', + 'get_github_user', 'deep_scrape_github_user', + 'check_already_connected', 'save_deep_profile', 'determine_contact_method', + 'get_cached_orgs', 'cache_orgs', 'get_emails_from_commit_history', + 'scrape_website_for_emails', +] diff --git a/connectd/scoutd/bluesky.py b/connectd/scoutd/bluesky.py new file mode 100644 index 
"""
scoutd/bluesky.py - bluesky/atproto discovery

bluesky has an open API via AT Protocol - no auth needed for public data
many twitter refugees landed here, good source for aligned builders
"""

import hashlib
import json
import time
from datetime import datetime
from pathlib import Path

import requests

from .signals import analyze_text

HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'}
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'bluesky'

# public bluesky API
BSKY_API = 'https://public.api.bsky.app'

# hashtags to search
ALIGNED_HASHTAGS = [
    'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource',
    'privacy', 'solarpunk', 'cooperative', 'mutualaid', 'localfirst',
    'indieweb', 'smallweb', 'permacomputing', 'techworkers', 'coops',
]


def _cache_path(url, params):
    """stable on-disk cache location for a GET request.

    BUG FIX: the previous key used hash() on a string, which is randomized
    per process (PYTHONHASHSEED) - the cache never hit across restarts and
    stale files accumulated forever. md5 of the request key is stable.
    """
    cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
    digest = hashlib.md5(cache_key.encode('utf-8')).hexdigest()[:16]
    return CACHE_DIR / f"{digest}.json"


def _api_get(endpoint, params=None):
    """rate-limited API request with a 1-hour on-disk cache.

    returns the decoded JSON payload, or None on any request failure.
    """
    url = f"{BSKY_API}{endpoint}"
    cache_file = _cache_path(url, params)
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    if cache_file.exists():
        try:
            data = json.loads(cache_file.read_text())
            if time.time() - data.get('_cached_at', 0) < 3600:
                return data.get('_data')
        except (OSError, ValueError):
            pass  # unreadable/corrupt cache entry - fall through and refetch

    time.sleep(0.5)  # rate limit

    try:
        resp = requests.get(url, headers=HEADERS, params=params, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        try:
            cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
        except OSError:
            pass  # cache write is best-effort; the response is still returned
        return result
    except requests.exceptions.RequestException as e:
        print(f"  bluesky api error: {e}")
        return None


def search_posts(query, limit=50):
    """search for posts containing query (API caps page size at 100)"""
    result = _api_get('/xrpc/app.bsky.feed.searchPosts', {
        'q': query,
        'limit': min(limit, 100),
    })

    if not result:
        return []

    return result.get('posts', [])
def get_profile(handle):
    """get user profile by handle (e.g., user.bsky.social); None on failure"""
    result = _api_get('/xrpc/app.bsky.actor.getProfile', {'actor': handle})
    return result


def get_author_feed(handle, limit=30):
    """get user's recent posts; [] on failure"""
    result = _api_get('/xrpc/app.bsky.feed.getAuthorFeed', {
        'actor': handle,
        'limit': limit,
    })

    if not result:
        return []

    return result.get('feed', [])


def analyze_bluesky_user(handle):
    """analyze a bluesky user for alignment.

    scores bio + display name + last ~20 posts with analyze_text, adds a
    platform bonus and activity bonuses, and estimates a confidence in
    [0.35, 0.85]. returns a human dict ready for db.save_human, or None
    when the profile can't be fetched.
    """
    profile = get_profile(handle)
    if not profile:
        return None

    # collect all text we can score
    text_parts = []

    description = profile.get('description', '')
    if description:
        text_parts.append(description)

    display_name = profile.get('displayName', '')
    if display_name:
        text_parts.append(display_name)

    # recent posts
    feed = get_author_feed(handle, limit=20)
    for item in feed:
        post = item.get('post', {})
        record = post.get('record', {})
        text = record.get('text', '')
        if text:
            text_parts.append(text)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # bluesky bonus (decentralized, values-aligned platform choice)
    platform_bonus = 10
    total_score = text_score + platform_bonus

    # activity bonus
    followers = profile.get('followersCount', 0)
    posts_count = profile.get('postsCount', 0)

    if posts_count >= 100:
        total_score += 5
    if followers >= 100:
        total_score += 5

    # confidence: base for bluesky (better signal than twitter), bumped by
    # volume of text, number of positive signals, and posting activity
    confidence = 0.35
    if len(text_parts) > 5:
        confidence += 0.2
    if len(positive_signals) >= 3:
        confidence += 0.2
    if posts_count >= 50:
        confidence += 0.1
    confidence = min(confidence, 0.85)

    reasons = ['on bluesky (atproto)']
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    return {
        'platform': 'bluesky',
        'username': handle,
        'url': f"https://bsky.app/profile/{handle}",
        'name': display_name or handle,
        'bio': description,
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'followers': followers,
        'posts_count': posts_count,
        'reasons': reasons,
        'contact': {
            'bluesky': handle,
        },
        'scraped_at': datetime.now().isoformat(),
    }


def scrape_bluesky(db, limit_per_hashtag=30):
    """full bluesky scrape.

    searches each aligned hashtag, keeps authors seen under 2+ DISTINCT
    hashtags, deep-analyzes up to 100 of them, and saves positives to db.
    """
    print("scoutd/bluesky: starting scrape...")

    all_users = {}

    for hashtag in ALIGNED_HASHTAGS:
        print(f"  #{hashtag}...")

        posts = search_posts(f"#{hashtag}", limit=limit_per_hashtag)

        for post in posts:
            author = post.get('author', {})
            handle = author.get('handle')

            if not handle:
                continue
            if handle not in all_users:
                all_users[handle] = {
                    'handle': handle,
                    'display_name': author.get('displayName'),
                    'hashtags': [hashtag],
                }
            elif hashtag not in all_users[handle]['hashtags']:
                # BUG FIX: previously every additional post appended the same
                # hashtag again, so an author with two posts under ONE tag
                # wrongly passed the "2+ aligned hashtags" filter below
                all_users[handle]['hashtags'].append(hashtag)

        print(f"    found {len(posts)} posts")

    # prioritize users in multiple distinct hashtags
    multi_hashtag = {h: d for h, d in all_users.items() if len(d.get('hashtags', [])) >= 2}
    print(f"  {len(multi_hashtag)} users in 2+ aligned hashtags")

    # analyze (cap at 100 to bound API usage)
    results = []
    for handle in list(multi_hashtag.keys())[:100]:
        try:
            result = analyze_bluesky_user(handle)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                if result['score'] >= 30:
                    print(f"    ★ @{handle}: {result['score']} pts")
        except Exception as e:
            # one bad profile must not abort the whole scrape
            print(f"    error on {handle}: {e}")

    print(f"scoutd/bluesky: found {len(results)} aligned humans")
    return results
-> mastodon link -> scrape mastodon + -> website -> scrape for more links + -> twitter handle -> note it + -> email -> store it + +email discovery sources: +- github profile (if public) +- git commit history +- personal website/blog contact page +- README "contact me" sections +- mastodon/twitter bio + +fallback contact methods if no email: +- github_issue: open issue on their repo +- mastodon: DM if allowed +- manual: pending contact queue for review + +also filters out people who clearly already know each other +(same org, co-contributors to same repos) +""" + +import re +import json +import requests +import time +import subprocess +import tempfile +import shutil +from datetime import datetime +from urllib.parse import urlparse +from pathlib import Path + +from .signals import analyze_text +from .github import get_github_user, get_user_repos, _api_get as github_api +from .mastodon import analyze_mastodon_user, _api_get as mastodon_api +from .handles import discover_all_handles, extract_handles_from_text, scrape_website_for_handles + +# local cache for org memberships +ORG_CACHE_FILE = Path(__file__).parent.parent / 'data' / 'org_cache.json' +_org_cache = None + +# patterns to find social links in text +MASTODON_PATTERN = r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-z]{2,})' +TWITTER_PATTERN = r'(?:twitter\.com/|x\.com/)([a-zA-Z0-9_]+)' +GITHUB_PATTERN = r'github\.com/([a-zA-Z0-9_-]+)' +MATRIX_PATTERN = r'@([a-zA-Z0-9_]+):([a-zA-Z0-9.-]+)' +EMAIL_PATTERN = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b' + +# known mastodon instances for validation +KNOWN_INSTANCES = [ + 'mastodon.social', 'fosstodon.org', 'tech.lgbt', 'social.coop', + 'hackers.town', 'hachyderm.io', 'infosec.exchange', 'chaos.social', + 'mas.to', 'mstdn.social', 'mastodon.online', 'universeodon.com', + 'mathstodon.xyz', 'ruby.social', 'functional.cafe', 'types.pl', +] + +# contact page patterns for website scraping +CONTACT_PAGE_PATHS = [ + '/contact', '/contact/', '/contact.html', + '/about', 
# contact pages probed on personal sites (the root page is always checked too)
CONTACT_PAGE_PATHS = [
    '/contact', '/contact/', '/contact.html',
    '/about', '/about/', '/about.html',
    '/connect', '/reach-out', '/hire', '/hire-me',
]

# patterns to find emails in contact sections
CONTACT_SECTION_PATTERNS = [
    r'(?:contact|email|reach|mail)[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
    r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|@)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\.)\s*([a-zA-Z]{2,})',
]

# on-disk cache of github org memberships, loaded lazily into _org_cache
ORG_CACHE_FILE = Path(__file__).parent.parent / 'data' / 'org_cache.json'
_org_cache = None


def load_org_cache():
    """load the org membership cache from disk (lazily, once per process)."""
    global _org_cache
    if _org_cache is not None:
        return _org_cache

    try:
        ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        if ORG_CACHE_FILE.exists():
            with open(ORG_CACHE_FILE) as f:
                _org_cache = json.load(f)
        else:
            _org_cache = {'users': {}, 'updated': {}}
    except (OSError, ValueError):
        # unreadable or corrupt cache file - start fresh rather than crash
        # discovery (ValueError covers json.JSONDecodeError)
        _org_cache = {'users': {}, 'updated': {}}

    return _org_cache


def save_org_cache():
    """persist the in-memory org cache to disk (best-effort)."""
    if _org_cache is None:
        return

    try:
        ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(ORG_CACHE_FILE, 'w') as f:
            json.dump(_org_cache, f, indent=2)
    except (OSError, TypeError):
        pass  # persistence is best-effort; the in-memory copy stays valid


def get_cached_orgs(username):
    """return cached orgs for username if fresh (< 7 days old), else None."""
    cache = load_org_cache()

    if username not in cache['users']:
        return None

    updated = cache['updated'].get(username)
    if updated:
        try:
            updated_dt = datetime.fromisoformat(updated)
        except ValueError:
            return None  # corrupt timestamp - treat the entry as stale
        if (datetime.now() - updated_dt).days < 7:
            return cache['users'][username]

    return None


def cache_orgs(username, orgs):
    """cache org membership for a user and persist immediately."""
    cache = load_org_cache()
    cache['users'][username] = orgs
    cache['updated'][username] = datetime.now().isoformat()
    save_org_cache()


def get_emails_from_commit_history(repo_url, limit=50):
    """
    shallow-clone a repo and extract unique author emails from its git log.

    bot/noreply addresses are filtered out. any failure (git missing,
    network, timeout) yields [] - email discovery is strictly best-effort.
    """
    emails = set()

    # addresses that are never a person's real contact point
    bot_markers = (
        'noreply', 'no-reply', 'dependabot', 'github-actions',
        'renovate', 'greenkeeper', 'snyk-bot', 'users.noreply.github',
    )

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # shallow clone with limited depth to keep this cheap
            clone = subprocess.run(
                ['git', 'clone', '--depth', '50', '--single-branch', repo_url, tmpdir],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if clone.returncode != 0:
                return []

            log = subprocess.run(
                ['git', 'log', f'--max-count={limit}', '--format=%ae'],
                cwd=tmpdir,
                capture_output=True,
                text=True,
                timeout=10,
            )
            if log.returncode == 0:
                for email in log.stdout.strip().split('\n'):
                    email = email.strip().lower()
                    if email and not any(marker in email for marker in bot_markers):
                        emails.add(email)
    except (subprocess.SubprocessError, OSError):
        # BUG FIX: the original caught (TimeoutExpired, Exception) - the
        # Exception member made the tuple redundant and silently swallowed
        # genuine bugs. TimeoutExpired is a SubprocessError subclass;
        # OSError covers a missing git binary.
        pass

    return list(emails)


def scrape_website_for_emails(url, timeout=10):
    """
    scrape a personal website for email addresses.

    checks the main page plus common contact pages; recognizes plain,
    obfuscated ("user [at] host [dot] tld") and mailto: addresses.
    """
    emails = set()

    if not is_personal_website(url):
        return []

    headers = {'User-Agent': 'connectd/1.0 (looking for contact info)'}

    # normalize url
    if not url.startswith('http'):
        url = 'https://' + url

    base_url = url.rstrip('/')

    pages_to_check = [base_url] + [base_url + path for path in CONTACT_PAGE_PATHS]

    for page_url in pages_to_check:
        try:
            resp = requests.get(page_url, timeout=timeout, headers=headers)
            if resp.status_code != 200:
                continue
            text = resp.text

            # standard email pattern
            for match in re.finditer(EMAIL_PATTERN, text):
                email = match.group(0).lower()
                if not any(x in email for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']):
                    emails.add(email)

            # obfuscated email patterns like "user [at] domain [dot] com"
            for pattern in CONTACT_SECTION_PATTERNS:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    if len(match.groups()) == 3:
                        emails.add(f"{match.group(1)}@{match.group(2)}.{match.group(3)}".lower())
                    elif len(match.groups()) == 1:
                        emails.add(match.group(1).lower())

            # mailto: links
            for match in re.finditer(r'mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text):
                emails.add(match.group(1).lower())

        except requests.exceptions.RequestException:
            continue  # unreachable page - try the next candidate

    return list(emails)
# module-level email regex, repeated here so this section is self-contained
EMAIL_PATTERN = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'


def extract_emails_from_readme(text):
    """
    Pull contact emails out of README text.

    Plain addresses are only accepted inside "contact"/"email"-style
    sections; obfuscated "user [at] host [dot] tld" forms are accepted
    anywhere. Returns a de-duplicated list (order unspecified).
    """
    if not text:
        return []

    found = set()
    blocked = ('noreply', 'no-reply', 'example.com')

    # plain addresses, restricted to contact-ish sections
    section_regexes = (
        r'(?:##?\s*)?(?:contact|reach|email|get in touch|connect)[^\n]*\n([^\n#]+)',
        r'(?:email|contact|reach me)[:\s]+([^\n]+)',
    )
    for section_regex in section_regexes:
        for section in re.finditer(section_regex, text, re.IGNORECASE):
            for hit in re.finditer(EMAIL_PATTERN, section.group(1)):
                addr = hit.group(0).lower()
                if not any(marker in addr for marker in blocked):
                    found.add(addr)

    # obfuscated "user [at] host [dot] tld" anywhere in the text
    obfuscated = r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\))\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\))\s*([a-zA-Z]{2,})'
    for hit in re.finditer(obfuscated, text, re.IGNORECASE):
        found.add(f"{hit.group(1)}@{hit.group(2)}.{hit.group(3)}".lower())

    return list(found)


def get_mastodon_dm_allowed(handle):
    """
    Heuristic: can we DM this mastodon user?

    Unknown or locked (follow-approval) accounts are treated as closed;
    anything else is assumed reachable, with an explicit "DMs open"-style
    bio note as the strongest signal.
    """
    profile = get_mastodon_profile(handle)
    if not profile:
        return False

    if profile.get('locked'):
        return False

    bio = (profile.get('note') or profile.get('summary') or '').lower()
    invites = ('dms open', 'dm me', 'message me', 'dms welcome')
    if any(phrase in bio for phrase in invites):
        return True  # explicitly invites messages

    # default: unlocked accounts are assumed open
    return True
def determine_contact_method(profile):
    """
    determine the best way to contact someone
    returns (method, details) where method is one of:
      - 'email': direct email contact
      - 'mastodon': DM on mastodon
      - 'github_issue': open issue on their repo
      - 'manual': needs manual review
    """
    # prefer a single known-good email
    if profile.get('email'):
        return 'email', {'email': profile['email']}

    # otherwise pick from the collected list, preferring personal addresses
    if profile.get('emails') and len(profile['emails']) > 0:
        for email in profile['emails']:
            if not any(x in email.lower() for x in ['github', 'noreply', '@company', '@corp']):
                return 'email', {'email': email}
        # fall back to the first one
        return 'email', {'email': profile['emails'][0]}

    # try mastodon DM (field may be a single handle or a list)
    if profile.get('mastodon'):
        handles = profile['mastodon'] if isinstance(profile['mastodon'], list) else [profile['mastodon']]
        for handle in handles:
            if get_mastodon_dm_allowed(handle):
                return 'mastodon', {'handle': handle}

    # try a github issue on their most-starred repo with real traction
    if profile.get('top_repos'):
        for repo in sorted(profile['top_repos'], key=lambda r: r.get('stars', 0), reverse=True):
            if repo.get('stars', 0) >= 10:
                repo_name = repo.get('name')
                if repo_name:
                    return 'github_issue', {
                        'repo': f"{profile['username']}/{repo_name}",
                        'stars': repo.get('stars'),
                    }

    # nothing automated worked - queue for a human
    return 'manual', {
        'reason': 'no email, mastodon, or suitable repo found',
        'available': {
            'twitter': profile.get('twitter'),
            'websites': profile.get('websites'),
            'matrix': profile.get('matrix'),
        }
    }


# module-level patterns, repeated here so this section is self-contained
MASTODON_PATTERN = r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-z]{2,})'
TWITTER_PATTERN = r'(?:twitter\.com/|x\.com/)([a-zA-Z0-9_]+)'
GITHUB_PATTERN = r'github\.com/([a-zA-Z0-9_-]+)'
MATRIX_PATTERN = r'@([a-zA-Z0-9_]+):([a-zA-Z0-9.-]+)'
EMAIL_PATTERN = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'

KNOWN_INSTANCES = [
    'mastodon.social', 'fosstodon.org', 'tech.lgbt', 'social.coop',
    'hackers.town', 'hachyderm.io', 'infosec.exchange', 'chaos.social',
    'mas.to', 'mstdn.social', 'mastodon.online', 'universeodon.com',
    'mathstodon.xyz', 'ruby.social', 'functional.cafe', 'types.pl',
]


def extract_links_from_text(text):
    """extract social links from bio/readme text.

    returns {} for empty input, otherwise a dict of de-duplicated lists
    keyed by mastodon/twitter/github/matrix/email/websites.
    """
    if not text:
        return {}

    links = {
        'mastodon': [],
        'twitter': [],
        'github': [],
        'matrix': [],
        'email': [],
        'websites': [],
    }

    # mastodon handles - only accept known instances or ones with 'mastodon'/'social' in name
    for match in re.finditer(MASTODON_PATTERN, text):
        user, instance = match.groups()
        instance_lower = instance.lower()
        is_known = instance_lower in KNOWN_INSTANCES
        looks_like_masto = any(x in instance_lower for x in ['mastodon', 'social', 'fedi', '.town', '.cafe'])
        if is_known or looks_like_masto:
            links['mastodon'].append(f"{user}@{instance}")

    # twitter
    for match in re.finditer(TWITTER_PATTERN, text, re.IGNORECASE):
        links['twitter'].append(match.group(1))

    # github (for cross-referencing)
    for match in re.finditer(GITHUB_PATTERN, text, re.IGNORECASE):
        links['github'].append(match.group(1))

    # matrix
    for match in re.finditer(MATRIX_PATTERN, text):
        user, server = match.groups()
        links['matrix'].append(f"@{user}:{server}")

    # email - drop obvious non-personal addresses
    for match in re.finditer(EMAIL_PATTERN, text):
        email = match.group(0)
        if not any(x in email.lower() for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']):
            links['email'].append(email)

    # websites (http/https links that aren't social platforms)
    url_pattern = r'https?://([a-zA-Z0-9.-]+\.[a-z]{2,})[/\w.-]*'
    for match in re.finditer(url_pattern, text):
        domain = match.group(1).lower()
        if not any(x in domain for x in ['github.com', 'twitter.com', 'mastodon', 'linkedin.com', 't.co']):
            links['websites'].append(match.group(0))

    # dedupe (order unspecified)
    for key in links:
        links[key] = list(set(links[key]))

    return links


def is_personal_website(url):
    """check if URL looks like a personal website vs corporate site."""
    domain = urlparse(url).netloc.lower()
    if not domain:
        # scheme-less input like "alice.dev" parses with an empty netloc;
        # retry with a network-location prefix so the host is recognized
        domain = urlparse('//' + url).netloc.lower()

    # skip obvious corporate/platform sites
    skip_domains = [
        'github.com', 'gitlab.com', 'bitbucket.org',
        'twitter.com', 'x.com', 'linkedin.com', 'facebook.com',
        'youtube.com', 'medium.com', 'dev.to', 'hashnode.com',
        'wedo.com', 'google.com', 'microsoft.com', 'apple.com',
        'amazon.com', 'stackoverflow.com', 'reddit.com',
    ]

    # BUG FIX: exact or subdomain match only - the previous plain substring
    # test rejected e.g. "linux.com" because it contains "x.com"
    if any(domain == skip or domain.endswith('.' + skip) for skip in skip_domains):
        return False

    # looks personal if it uses a common personal TLD
    personal_tlds = ['.io', '.dev', '.me', '.co', '.xyz', '.page', '.codes', '.software']
    if any(domain.endswith(tld) for tld in personal_tlds):
        return True

    # or if the domain is just a short name.tld
    parts = domain.replace('www.', '').split('.')
    if len(parts) == 2 and len(parts[0]) < 20:
        return True

    return False


def scrape_website_for_links(url, timeout=10):
    """scrape a personal website for more social links ({} on any failure)."""
    if not is_personal_website(url):
        return {}

    try:
        resp = requests.get(url, timeout=timeout, headers={'User-Agent': 'connectd/1.0'})
        resp.raise_for_status()
        return extract_links_from_text(resp.text)
    except requests.exceptions.RequestException:
        return {}


def get_mastodon_profile(handle):
    """
    fetch a mastodon profile from a handle like user@instance or @user@instance.

    tries webfinger + activitypub first, then the instance's REST lookup API.
    returns the raw profile dict or None.
    """
    if '@' not in handle:
        return None

    parts = handle.split('@')
    if len(parts) == 2:
        user, instance = parts
    elif len(parts) == 3 and parts[0] == '':
        # @user@instance format
        user, instance = parts[1], parts[2]
    else:
        return None

    # try to look up via webfinger
    try:
        webfinger_url = f"https://{instance}/.well-known/webfinger"
        resp = requests.get(
            webfinger_url,
            params={'resource': f'acct:{user}@{instance}'},
            timeout=10,
            headers={'Accept': 'application/json'}
        )
        if resp.status_code == 200:
            data = resp.json()
            # find the activitypub profile link and fetch it
            for link in data.get('links', []):
                if link.get('type') == 'application/activity+json':
                    profile_url = link.get('href')
                    profile_resp = requests.get(
                        profile_url,
                        timeout=10,
                        headers={'Accept': 'application/activity+json'}
                    )
                    if profile_resp.status_code == 200:
                        return profile_resp.json()
    except (requests.exceptions.RequestException, ValueError):
        pass  # webfinger unsupported or malformed - fall through

    # fallback: try the instance's direct lookup API
    try:
        search_url = f"https://{instance}/api/v1/accounts/lookup"
        resp = requests.get(search_url, params={'acct': user}, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except (requests.exceptions.RequestException, ValueError):
        pass

    return None
def deep_scrape_github_user(login, scrape_commits=True):
    """
    deep scrape a github user - follow all links, build complete profile

    email discovery sources:
    1. github profile (if public)
    2. git commit history (if scrape_commits=True)
    3. personal website/blog contact pages
    4. README "contact me" sections
    5. mastodon bio

    Args:
        login: github username to scrape
        scrape_commits: also shallow-clone the top repo for commit emails

    Returns:
        a profile dict ready for save_deep_profile, or None when the
        github user can't be fetched. Makes many network requests.
    """
    print(f"  deep scraping {login}...")

    user = get_github_user(login)
    if not user:
        return None

    repos = get_user_repos(login, per_page=50)

    # collect all text to search for links
    all_text = []
    readme_text = None

    if user.get('bio'):
        all_text.append(user['bio'])
    if user.get('blog'):
        all_text.append(user['blog'])
    if user.get('company'):
        all_text.append(user['company'])

    # check readme of profile repo (username/username); first branch that
    # responds 200 wins
    for branch in ['main', 'master']:
        readme_url = f"https://raw.githubusercontent.com/{login}/{login}/{branch}/README.md"
        try:
            resp = requests.get(readme_url, timeout=10)
            if resp.status_code == 200:
                readme_text = resp.text
                all_text.append(readme_text)
                break
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt -
            # consider narrowing to requests.exceptions.RequestException
            pass

    # extract links from all collected text
    combined_text = '\n'.join(all_text)
    found_links = extract_links_from_text(combined_text)

    # ensure all keys exist (extract_links_from_text returns {} on empty text)
    for key in ['email', 'twitter', 'github', 'matrix', 'mastodon', 'websites']:
        if key not in found_links:
            found_links[key] = []

    # add explicit github fields
    if user.get('email'):
        found_links['email'].append(user['email'])
    if user.get('twitter_username'):
        found_links['twitter'].append(user['twitter_username'])
    if user.get('blog'):
        found_links['websites'].append(user['blog'])

    # EMAIL DISCOVERY: extract emails from README contact sections
    if readme_text:
        readme_emails = extract_emails_from_readme(readme_text)
        found_links['email'].extend(readme_emails)
        if readme_emails:
            print(f"    found {len(readme_emails)} email(s) in README")

    # dedupe (order unspecified after this point)
    for key in found_links:
        found_links[key] = list(set(found_links[key]))

    # now follow the links to gather more data
    profile = {
        'source': 'github',
        'username': login,
        'url': f"https://github.com/{login}",
        'real_name': user.get('name'),
        'bio': user.get('bio'),
        'location': user.get('location'),
        'company': user.get('company'),
        'hireable': user.get('hireable'),
        'created_at': user.get('created_at'),
        'public_repos': user.get('public_repos'),
        'followers': user.get('followers'),

        # contact points
        'email': found_links['email'][0] if found_links['email'] else user.get('email'),
        'emails': list(found_links['email']),
        'twitter': found_links['twitter'][0] if found_links['twitter'] else user.get('twitter_username'),
        'mastodon': found_links['mastodon'],
        'matrix': found_links['matrix'],
        'websites': found_links['websites'],

        # cross-platform profiles we find
        'linked_profiles': {},

        # repos and languages
        'top_repos': [],
        'languages': {},
        'topics': [],
        'orgs': [],

        # contact method (will be determined at end)
        'contact_method': None,
        'contact_details': None,
    }

    # analyze repos (non-forks only, first 30)
    top_starred_repo = None
    for repo in repos[:30]:
        if not repo.get('fork'):
            repo_info = {
                'name': repo.get('name'),
                'description': repo.get('description'),
                'stars': repo.get('stargazers_count'),
                'language': repo.get('language'),
                'topics': repo.get('topics', []),
                'html_url': repo.get('html_url'),
                'pushed_at': repo.get('pushed_at'),  # for activity-based contact selection
            }
            profile['top_repos'].append(repo_info)

            # track top starred for commit email scraping
            if not top_starred_repo or repo.get('stargazers_count', 0) > top_starred_repo.get('stars', 0):
                top_starred_repo = repo_info

            if repo.get('language'):
                lang = repo['language']
                profile['languages'][lang] = profile['languages'].get(lang, 0) + 1

            profile['topics'].extend(repo.get('topics', []))

    profile['topics'] = list(set(profile['topics']))

    # get orgs - check cache first to save an API call
    cached_orgs = get_cached_orgs(login)
    if cached_orgs is not None:
        print(f"    using cached orgs: {cached_orgs}")
        profile['orgs'] = cached_orgs
    else:
        orgs_url = f"https://api.github.com/users/{login}/orgs"
        orgs_data = github_api(orgs_url) or []
        profile['orgs'] = [o.get('login') for o in orgs_data]
        # cache for future use
        cache_orgs(login, profile['orgs'])
        if profile['orgs']:
            print(f"    fetched & cached orgs: {profile['orgs']}")

    # EMAIL DISCOVERY: scrape commit history from top repo, but only when
    # nothing was found yet (cloning is the most expensive source)
    if scrape_commits and top_starred_repo and not profile['emails']:
        repo_url = f"https://github.com/{login}/{top_starred_repo['name']}.git"
        print(f"    checking commit history in {top_starred_repo['name']}...")
        commit_emails = get_emails_from_commit_history(repo_url)
        if commit_emails:
            print(f"    found {len(commit_emails)} email(s) in commits")
            profile['emails'].extend(commit_emails)

    # follow mastodon links
    for masto_handle in found_links['mastodon'][:2]:  # limit to 2
        print(f"    following mastodon: {masto_handle}")
        masto_profile = get_mastodon_profile(masto_handle)
        if masto_profile:
            # NOTE: a second handle overwrites the first here - only the
            # last fetched mastodon profile is kept under 'mastodon'
            profile['linked_profiles']['mastodon'] = {
                'handle': masto_handle,
                'display_name': masto_profile.get('display_name') or masto_profile.get('name'),
                'bio': masto_profile.get('note') or masto_profile.get('summary'),
                'followers': masto_profile.get('followers_count'),
                'url': masto_profile.get('url'),
                'locked': masto_profile.get('locked', False),
            }
            # extract more links from mastodon bio
            masto_bio = masto_profile.get('note') or masto_profile.get('summary') or ''
            masto_links = extract_links_from_text(masto_bio)
            profile['emails'].extend(masto_links.get('email', []))
            profile['websites'].extend(masto_links.get('websites', []))

    # EMAIL DISCOVERY: scrape personal website for contact info
    for website in found_links['websites'][:2]:  # check up to 2 sites
        print(f"    following website: {website}")

        # basic link extraction
        site_links = scrape_website_for_links(website)
        if site_links.get('mastodon') and not profile['mastodon']:
            profile['mastodon'] = site_links['mastodon']

        # enhanced email discovery - check contact pages
        website_emails = scrape_website_for_emails(website)
        if website_emails:
            print(f"    found {len(website_emails)} email(s) on website")
            profile['emails'].extend(website_emails)

    # dedupe emails and pick best one
    profile['emails'] = list(set(profile['emails']))

    # rank emails by preference: personal providers up, github/noreply
    # addresses down, small bonus when the address contains the username
    def email_score(email):
        email_lower = email.lower()
        score = 0
        # prefer personal domains
        if any(x in email_lower for x in ['@gmail', '@proton', '@hey.com', '@fastmail']):
            score += 10
        # deprioritize github emails
        if 'github' in email_lower:
            score -= 20
        # deprioritize noreply
        if 'noreply' in email_lower:
            score -= 50
        # prefer emails matching username
        if login.lower() in email_lower:
            score += 5
        return score

    if profile['emails']:
        profile['emails'].sort(key=email_score, reverse=True)
        profile['email'] = profile['emails'][0]

    # COMPREHENSIVE HANDLE DISCOVERY
    # find ALL social handles from website, README, rel="me" links, etc.
    discovered_handles, discovered_emails = discover_all_handles(user)

    # merge discovered handles into profile
    profile['handles'] = discovered_handles

    # update individual fields from discovered handles (never overwrite
    # something already found above)
    if discovered_handles.get('mastodon') and not profile.get('mastodon'):
        profile['mastodon'] = discovered_handles['mastodon']
    if discovered_handles.get('twitter') and not profile.get('twitter'):
        profile['twitter'] = discovered_handles['twitter']
    if discovered_handles.get('bluesky'):
        profile['bluesky'] = discovered_handles['bluesky']
    if discovered_handles.get('matrix') and not profile.get('matrix'):
        profile['matrix'] = discovered_handles['matrix']
    if discovered_handles.get('linkedin'):
        profile['linkedin'] = discovered_handles['linkedin']
    if discovered_handles.get('youtube'):
        profile['youtube'] = discovered_handles['youtube']
    if discovered_handles.get('discord'):
        profile['discord'] = discovered_handles['discord']
    if discovered_handles.get('telegram'):
        profile['telegram'] = discovered_handles['telegram']

    # merge discovered emails
    for email in discovered_emails:
        if email not in profile['emails']:
            profile['emails'].append(email)

    print(f"    handles found: {list(discovered_handles.keys())}")

    # determine best contact method
    contact_method, contact_details = determine_contact_method(profile)
    profile['contact_method'] = contact_method
    profile['contact_details'] = contact_details
    print(f"    contact method: {contact_method}")

    # analyze all text for alignment signals
    all_profile_text = ' '.join([
        profile.get('bio') or '',
        profile.get('company') or '',
        profile.get('location') or '',
        ' '.join(profile.get('topics', [])),
    ])

    for linked in profile.get('linked_profiles', {}).values():
        if linked.get('bio'):
            all_profile_text += ' ' + linked['bio']

    text_score, signals, negative = analyze_text(all_profile_text)
    profile['signals'] = signals
    profile['negative_signals'] = negative
    profile['score'] = text_score

    # add builder score (prolific public output)
    if len(repos) > 20:
        profile['score'] += 15
    elif len(repos) > 10:
        profile['score'] += 10

    # add topic alignment (10 pts per repo topic that matches our targets)
    from .signals import TARGET_TOPICS
    aligned_topics = set(profile['topics']) & set(TARGET_TOPICS)
    profile['score'] += len(aligned_topics) * 10
    profile['aligned_topics'] = list(aligned_topics)

    profile['scraped_at'] = datetime.now().isoformat()

    return profile


def check_mutual_github_follows(user_a, user_b):
    """check whether user_a follows user_b (one direction only, despite
    the name - callers invoke this twice, once per direction).

    GitHub answers 204 when the follow relationship exists."""
    # check if a follows b
    url = f"https://api.github.com/users/{user_a}/following/{user_b}"
    try:
        resp = requests.get(url, timeout=10, headers={'Accept': 'application/vnd.github.v3+json'})
        if resp.status_code == 204:  # 204 = follows
            return True
    except:
        # NOTE(review): bare except - consider requests.exceptions.RequestException
        pass
    return False


def check_shared_repo_contributions(user_a, user_b):
    """
    check if two users have contributed to the same repos
    returns (bool, list of shared repos)

    STUB: always (False, []). the full implementation would query
    GET /repos/{owner}/{repo}/contributors for their top repos; for now
    shared repos are detected from stored extra data elsewhere.
    """
    # this would require checking contribution history
    # for now, we check via the orgs and top_repos stored in extra
    # the full implementation would query:
    #   GET /repos/{owner}/{repo}/contributors for their top repos
    return False, []


def check_github_interactions(user_a, user_b):
    """
    check if users have had public interactions
    (comments on each other's issues/PRs)
    this is expensive - only do for high-score matches

    STUB: always False. would need to search:
      GET /search/issues?q=author:{user_a}+commenter:{user_b}
      GET /search/issues?q=author:{user_b}+commenter:{user_a}
    """
    return False
def check_already_connected(human_a, human_b, deep_check=False):
    """
    check if two humans are likely already connected
    (same org, co-contributors, mutual follows, interactions)

    connectd's job is connecting ISOLATED builders, not re-introducing coworkers

    Args:
        human_a, human_b: human dicts with 'platform', 'username' and an
            'extra' field (dict or JSON string)
        deep_check: also hit the github API for follow relationships

    Returns:
        (connected: bool, reason: str | None)
    """
    # parse extra data if stored as json string
    extra_a = human_a.get('extra', {})
    extra_b = human_b.get('extra', {})
    if isinstance(extra_a, str):
        extra_a = json.loads(extra_a) if extra_a else {}
    if isinstance(extra_b, str):
        extra_b = json.loads(extra_b) if extra_b else {}

    # 1. same github org - stored data first ('or []' guards orgs=None),
    # then the fresher org cache
    orgs_a = set(extra_a.get('orgs') or [])
    orgs_b = set(extra_b.get('orgs') or [])

    if human_a.get('platform') == 'github':
        cached_a = get_cached_orgs(human_a.get('username', ''))
        if cached_a:
            orgs_a.update(cached_a)
    if human_b.get('platform') == 'github':
        cached_b = get_cached_orgs(human_b.get('username', ''))
        if cached_b:
            orgs_b.update(cached_b)

    shared_orgs = orgs_a & orgs_b
    if shared_orgs:
        return True, f"same org: {', '.join(list(shared_orgs)[:3])}"

    # 2. same company (normalized: lowercase, leading '@' and whitespace stripped)
    company_a = (extra_a.get('company') or '').lower().strip('@').strip()
    company_b = (extra_b.get('company') or '').lower().strip('@').strip()

    # BUG FIX: the length guard previously applied only to company_a, so a
    # 1-2 character company_b could substring-match almost anything;
    # require both names to be meaningfully long
    if company_a and company_b and len(company_a) > 2 and len(company_b) > 2:
        if company_a == company_b or company_a in company_b or company_b in company_a:
            return True, f"same company: {company_a or company_b}"

    # 3. co-contributors to the same major repos (from stored top_repos;
    # only repos with real traction count)
    repos_a = {r.get('name', '').lower()
               for r in (extra_a.get('top_repos') or []) if r.get('stars', 0) > 50}
    repos_b = {r.get('name', '').lower()
               for r in (extra_b.get('top_repos') or []) if r.get('stars', 0) > 50}

    shared_repos = repos_a & repos_b
    if len(shared_repos) >= 2:
        return True, f"co-contributors: {', '.join(list(shared_repos)[:3])}"

    # 4. deep checks (extra API calls - only if requested)
    if deep_check:
        user_a = human_a.get('username', '')
        user_b = human_b.get('username', '')

        if human_a.get('platform') == 'github' and human_b.get('platform') == 'github':
            # either direction of following counts as acquainted
            if check_mutual_github_follows(user_a, user_b):
                return True, "mutual github follows"
            if check_mutual_github_follows(user_b, user_a):
                return True, "mutual github follows"

    return False, None


def save_deep_profile(db, profile):
    """save a deep-scraped profile to the database.

    converts the deep-scrape dict into the standard human format; the
    'extra' field deliberately carries ALL cross-platform data so later
    activity-based contact selection can work without re-scraping.
    returns the saved human dict.
    """
    human_data = {
        'platform': profile['source'],
        'username': profile['username'],
        'url': profile['url'],
        'name': profile.get('real_name'),
        'bio': profile.get('bio'),
        'location': profile.get('location'),
        'score': profile.get('score', 0),
        # cross-platform confirmation raises confidence
        'confidence': 0.8 if profile.get('linked_profiles') else 0.5,
        'signals': profile.get('signals', []),
        'negative_signals': profile.get('negative_signals', []),
        'reasons': [],
        'contact': {
            'email': profile.get('email'),
            'emails': profile.get('emails', []),
            'twitter': profile.get('twitter'),
            'mastodon': profile.get('mastodon'),
            'matrix': profile.get('matrix'),
            'websites': profile.get('websites'),
            'contact_method': profile.get('contact_method'),
            'contact_details': profile.get('contact_details'),
        },
        'extra': {
            # identity
            'real_name': profile.get('real_name'),
            'company': profile.get('company'),
            'hireable': profile.get('hireable'),
            'orgs': profile.get('orgs'),

            # github activity (for activity-based contact)
            'top_repos': profile.get('top_repos'),
            'languages': profile.get('languages'),
            'topics': profile.get('topics'),
            'aligned_topics': profile.get('aligned_topics'),
            'followers': profile.get('followers'),
            'public_repos': profile.get('public_repos'),
            # NOTE(review): this is the number of discovered emails, not
            # commits - a very rough proxy kept for compatibility
            'commit_count': len(profile.get('emails', [])),

            # cross-platform links (for activity-based contact)
            'email': profile.get('email'),
            'emails': profile.get('emails', []),
            'twitter': profile.get('twitter'),
            'mastodon': profile.get('mastodon'),
            'matrix': profile.get('matrix'),
            'bluesky': profile.get('bluesky'),
            'reddit': profile.get('reddit'),
            'lobsters': profile.get('lobsters'),
            'linkedin': profile.get('linkedin'),
            'youtube': profile.get('youtube'),
            'discord': profile.get('discord'),
            'telegram': profile.get('telegram'),
            'linked_profiles': profile.get('linked_profiles'),

            # ALL discovered handles (comprehensive)
            'handles': profile.get('handles', {}),

            # activity counts (populated by platform scrapers)
            'mastodon_statuses': profile.get('mastodon_statuses', 0),
            'twitter_tweets': profile.get('twitter_tweets', 0),
            'reddit_activity': profile.get('reddit_activity', 0),
            'reddit_karma': profile.get('reddit_karma', 0),
            'lobsters_karma': profile.get('lobsters_karma', 0),
            'bluesky_posts': profile.get('bluesky_posts', 0),
        },
        'scraped_at': profile.get('scraped_at'),
    }

    # build human-readable reasons for the match UI
    if profile.get('signals'):
        human_data['reasons'].append(f"signals: {', '.join(profile['signals'][:5])}")
    if profile.get('aligned_topics'):
        human_data['reasons'].append(f"topics: {', '.join(profile['aligned_topics'][:5])}")
    if profile.get('linked_profiles'):
        platforms = list(profile['linked_profiles'].keys())
        human_data['reasons'].append(f"also on: {', '.join(platforms)}")
    if profile.get('location'):
        human_data['reasons'].append(f"location: {profile['location']}")
    if profile.get('contact_method'):
        human_data['reasons'].append(f"contact: {profile['contact_method']}")

    db.save_human(human_data)
    return human_data
messages. +target servers: programming help, career transition, indie hackers, etc. + +SETUP: +1. create discord app at discord.com/developers +2. add bot, get token +3. join target servers with bot +4. set DISCORD_BOT_TOKEN env var +""" + +import requests +import json +import time +import os +from datetime import datetime +from pathlib import Path + +from .signals import analyze_text +from .lost import ( + analyze_social_for_lost_signals, + classify_user, +) + +DISCORD_BOT_TOKEN = os.environ.get('DISCORD_BOT_TOKEN', '') +DISCORD_API = 'https://discord.com/api/v10' + +# default server IDs - values-aligned communities +# bot must be invited to these servers to scout them +# invite links for reference (use numeric IDs below): +# - self-hosted: discord.gg/self-hosted +# - foss-dev: discord.gg/foss-developers-group +# - grapheneos: discord.gg/grapheneos +# - queer-coded: discord.me/queer-coded +# - homelab: discord.gg/homelab +# - esphome: discord.gg/n9sdw7pnsn +# - home-assistant: discord.gg/home-assistant +# - linuxserver: discord.gg/linuxserver +# - proxmox-scripts: discord.gg/jsYVk5JBxq +DEFAULT_SERVERS = [ + # self-hosted / foss / privacy + '693469700109369394', # self-hosted (selfhosted.show) + '920089648842293248', # foss developers group + '1176414688112820234', # grapheneos + + # queer tech + '925804557001437184', # queer coded + + # home automation / homelab + # note: these are large servers, bot needs to be invited + # '330944238910963714', # home assistant (150k+ members) + # '429907082951524364', # esphome (35k members) + # '478094546522079232', # homelab (35k members) + # '354974912613449730', # linuxserver.io (41k members) +] + +# merge env var servers with defaults +_env_servers = os.environ.get('DISCORD_TARGET_SERVERS', '').split(',') +_env_servers = [s.strip() for s in _env_servers if s.strip()] +TARGET_SERVERS = list(set(DEFAULT_SERVERS + _env_servers)) + +# channels to focus on (keywords in channel name) +TARGET_CHANNEL_KEYWORDS = [ + 'help', 
'career', 'jobs', 'learning', 'beginner', + 'general', 'introductions', 'showcase', 'projects', +] + +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'discord' +CACHE_DIR.mkdir(parents=True, exist_ok=True) + + +def get_headers(): + """get discord api headers""" + if not DISCORD_BOT_TOKEN: + return None + return { + 'Authorization': f'Bot {DISCORD_BOT_TOKEN}', + 'Content-Type': 'application/json', + } + + +def get_guild_channels(guild_id): + """get channels in a guild""" + headers = get_headers() + if not headers: + return [] + + try: + resp = requests.get( + f'{DISCORD_API}/guilds/{guild_id}/channels', + headers=headers, + timeout=30 + ) + if resp.status_code == 200: + return resp.json() + return [] + except Exception: + return [] + + +def get_channel_messages(channel_id, limit=100): + """get recent messages from a channel""" + headers = get_headers() + if not headers: + return [] + + try: + resp = requests.get( + f'{DISCORD_API}/channels/{channel_id}/messages', + headers=headers, + params={'limit': limit}, + timeout=30 + ) + if resp.status_code == 200: + return resp.json() + return [] + except Exception: + return [] + + +def get_user_info(user_id): + """get discord user info""" + headers = get_headers() + if not headers: + return None + + try: + resp = requests.get( + f'{DISCORD_API}/users/{user_id}', + headers=headers, + timeout=30 + ) + if resp.status_code == 200: + return resp.json() + return None + except Exception: + return None + + +def analyze_discord_user(user_data, messages=None): + """analyze a discord user for values alignment and lost signals""" + username = user_data.get('username', '') + display_name = user_data.get('global_name') or username + user_id = user_data.get('id') + + # analyze messages + all_signals = [] + all_text = [] + total_score = 0 + + if messages: + for msg in messages[:20]: + content = msg.get('content', '') + if not content or len(content) < 20: + continue + + all_text.append(content) + score, signals, _ = 
analyze_text(content) + all_signals.extend(signals) + total_score += score + + all_signals = list(set(all_signals)) + + # lost builder detection + profile_for_lost = { + 'bio': '', + 'message_count': len(messages) if messages else 0, + } + posts_for_lost = [{'text': t} for t in all_text] + + lost_signals, lost_weight = analyze_social_for_lost_signals(profile_for_lost, posts_for_lost) + lost_potential_score = lost_weight + user_type = classify_user(lost_potential_score, 50, total_score) + + return { + 'platform': 'discord', + 'username': username, + 'url': f"https://discord.com/users/{user_id}", + 'name': display_name, + 'bio': '', + 'location': None, + 'score': total_score, + 'confidence': min(0.8, 0.2 + len(all_signals) * 0.1), + 'signals': all_signals, + 'negative_signals': [], + 'reasons': [], + 'contact': {'discord': f"{username}#{user_data.get('discriminator', '0')}"}, + 'extra': { + 'user_id': user_id, + 'message_count': len(messages) if messages else 0, + }, + 'lost_potential_score': lost_potential_score, + 'lost_signals': lost_signals, + 'user_type': user_type, + } + + +def scrape_discord(db, limit_per_channel=50): + """scrape discord servers for aligned builders""" + if not DISCORD_BOT_TOKEN: + print("discord: DISCORD_BOT_TOKEN not set, skipping") + return 0 + + if not TARGET_SERVERS or TARGET_SERVERS == ['']: + print("discord: DISCORD_TARGET_SERVERS not set, skipping") + return 0 + + print("scouting discord...") + + found = 0 + lost_found = 0 + seen_users = set() + + for guild_id in TARGET_SERVERS: + if not guild_id: + continue + + guild_id = guild_id.strip() + channels = get_guild_channels(guild_id) + + if not channels: + print(f" guild {guild_id}: no access or no channels") + continue + + # filter to relevant channels + target_channels = [] + for ch in channels: + if ch.get('type') != 0: # text channels only + continue + name = ch.get('name', '').lower() + if any(kw in name for kw in TARGET_CHANNEL_KEYWORDS): + target_channels.append(ch) + + print(f" 
guild {guild_id}: {len(target_channels)} relevant channels") + + for channel in target_channels[:5]: # limit channels per server + messages = get_channel_messages(channel['id'], limit=limit_per_channel) + + if not messages: + continue + + # group messages by user + user_messages = {} + for msg in messages: + author = msg.get('author', {}) + if author.get('bot'): + continue + + user_id = author.get('id') + if not user_id or user_id in seen_users: + continue + + if user_id not in user_messages: + user_messages[user_id] = {'user': author, 'messages': []} + user_messages[user_id]['messages'].append(msg) + + # analyze each user + for user_id, data in user_messages.items(): + if user_id in seen_users: + continue + seen_users.add(user_id) + + result = analyze_discord_user(data['user'], data['messages']) + if not result: + continue + + if result['score'] >= 20 or result.get('lost_potential_score', 0) >= 30: + db.save_human(result) + found += 1 + + if result.get('user_type') in ['lost', 'both']: + lost_found += 1 + + time.sleep(1) # rate limit between channels + + time.sleep(2) # between guilds + + print(f"discord: found {found} humans ({lost_found} lost builders)") + return found + + +def send_discord_dm(user_id, message, dry_run=False): + """send a DM to a discord user""" + if not DISCORD_BOT_TOKEN: + return False, "DISCORD_BOT_TOKEN not set" + + if dry_run: + print(f" [dry run] would DM discord user {user_id}") + return True, "dry run" + + headers = get_headers() + + try: + # create DM channel + dm_resp = requests.post( + f'{DISCORD_API}/users/@me/channels', + headers=headers, + json={'recipient_id': user_id}, + timeout=30 + ) + + if dm_resp.status_code not in [200, 201]: + return False, f"couldn't create DM channel: {dm_resp.status_code}" + + channel_id = dm_resp.json().get('id') + + # send message + msg_resp = requests.post( + f'{DISCORD_API}/channels/{channel_id}/messages', + headers=headers, + json={'content': message}, + timeout=30 + ) + + if msg_resp.status_code in 
[200, 201]: + return True, f"sent to {user_id}" + else: + return False, f"send failed: {msg_resp.status_code}" + + except Exception as e: + return False, str(e) diff --git a/connectd/scoutd/github.py b/connectd/scoutd/github.py new file mode 100644 index 0000000..b6c3084 --- /dev/null +++ b/connectd/scoutd/github.py @@ -0,0 +1,330 @@ +""" +scoutd/github.py - github discovery +scrapes repos, bios, commit patterns to find aligned builders +also detects lost builders - people with potential who haven't started yet +""" + +import requests +import json +import time +import os +from datetime import datetime +from pathlib import Path +from collections import defaultdict + +from .signals import analyze_text, TARGET_TOPICS, ECOSYSTEM_REPOS +from .lost import ( + analyze_github_for_lost_signals, + analyze_text_for_lost_signals, + classify_user, + get_signal_descriptions, +) +from .handles import discover_all_handles + +# rate limit: 60/hr unauthenticated, 5000/hr with token +GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') +HEADERS = {'Accept': 'application/vnd.github.v3+json'} +if GITHUB_TOKEN: + HEADERS['Authorization'] = f'token {GITHUB_TOKEN}' + +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'github' + + +def _api_get(url, params=None): + """rate-limited api request with caching""" + cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + # check cache (1 hour expiry) + if cache_file.exists(): + try: + data = json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_data') + except: + pass + + # rate limit + time.sleep(0.5 if GITHUB_TOKEN else 2) + + try: + resp = requests.get(url, headers=HEADERS, params=params, timeout=30) + resp.raise_for_status() + result = resp.json() + + # cache + cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) + return result + 
except requests.exceptions.RequestException as e: + print(f" github api error: {e}") + return None + + +def search_repos_by_topic(topic, per_page=100): + """search repos by topic tag""" + url = 'https://api.github.com/search/repositories' + params = {'q': f'topic:{topic}', 'sort': 'stars', 'order': 'desc', 'per_page': per_page} + data = _api_get(url, params) + return data.get('items', []) if data else [] + + +def get_repo_contributors(repo_full_name, per_page=100): + """get top contributors to a repo""" + url = f'https://api.github.com/repos/{repo_full_name}/contributors' + return _api_get(url, {'per_page': per_page}) or [] + + +def get_github_user(login): + """get full user profile""" + url = f'https://api.github.com/users/{login}' + return _api_get(url) + + +def get_user_repos(login, per_page=100): + """get user's repos""" + url = f'https://api.github.com/users/{login}/repos' + return _api_get(url, {'per_page': per_page, 'sort': 'pushed'}) or [] + + +def analyze_github_user(login): + """ + analyze a github user for values alignment + returns dict with score, confidence, signals, contact info + """ + user = get_github_user(login) + if not user: + return None + + repos = get_user_repos(login) + + # collect text corpus + text_parts = [] + if user.get('bio'): + text_parts.append(user['bio']) + if user.get('company'): + text_parts.append(user['company']) + if user.get('location'): + text_parts.append(user['location']) + + # analyze repos + all_topics = [] + languages = defaultdict(int) + total_stars = 0 + + for repo in repos: + if repo.get('description'): + text_parts.append(repo['description']) + if repo.get('topics'): + all_topics.extend(repo['topics']) + if repo.get('language'): + languages[repo['language']] += 1 + total_stars += repo.get('stargazers_count', 0) + + full_text = ' '.join(text_parts) + + # analyze signals + text_score, positive_signals, negative_signals = analyze_text(full_text) + + # topic alignment + aligned_topics = set(all_topics) & 
set(TARGET_TOPICS) + topic_score = len(aligned_topics) * 10 + + # builder score (repos indicate building, not just talking) + builder_score = 0 + if len(repos) > 20: + builder_score = 15 + elif len(repos) > 10: + builder_score = 10 + elif len(repos) > 5: + builder_score = 5 + + # hireable bonus + hireable_score = 5 if user.get('hireable') else 0 + + # total score + total_score = text_score + topic_score + builder_score + hireable_score + + # === LOST BUILDER DETECTION === + # build profile dict for lost analysis + profile_for_lost = { + 'bio': user.get('bio'), + 'repos': repos, + 'public_repos': user.get('public_repos', len(repos)), + 'followers': user.get('followers', 0), + 'following': user.get('following', 0), + 'extra': { + 'top_repos': repos[:10], + }, + } + + # analyze for lost signals + lost_signals, lost_weight = analyze_github_for_lost_signals(profile_for_lost) + + # also check text for lost language patterns + text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text) + for sig in text_lost_signals: + if sig not in lost_signals: + lost_signals.append(sig) + lost_weight += text_lost_weight + + lost_potential_score = lost_weight + + # classify: builder, lost, both, or none + user_type = classify_user(lost_potential_score, builder_score, total_score) + + # confidence based on data richness + confidence = 0.3 + if user.get('bio'): + confidence += 0.15 + if len(repos) > 5: + confidence += 0.15 + if len(text_parts) > 5: + confidence += 0.15 + if user.get('email') or user.get('blog') or user.get('twitter_username'): + confidence += 0.15 + if total_stars > 100: + confidence += 0.1 + confidence = min(confidence, 1.0) + + # build reasons + reasons = [] + if positive_signals: + reasons.append(f"signals: {', '.join(positive_signals[:5])}") + if aligned_topics: + reasons.append(f"topics: {', '.join(list(aligned_topics)[:5])}") + if builder_score > 0: + reasons.append(f"builder ({len(repos)} repos)") + if negative_signals: + 
reasons.append(f"WARNING: {', '.join(negative_signals)}") + + # add lost reasons if applicable + if user_type == 'lost' or user_type == 'both': + lost_descriptions = get_signal_descriptions(lost_signals) + if lost_descriptions: + reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}") + + # === DEEP HANDLE DISCOVERY === + # follow blog links, scrape websites, find ALL social handles + handles, discovered_emails = discover_all_handles(user) + + # merge discovered emails with github email + all_emails = discovered_emails or [] + if user.get('email'): + all_emails.append(user['email']) + all_emails = list(set(e for e in all_emails if e and 'noreply' not in e.lower())) + + return { + 'platform': 'github', + 'username': login, + 'url': f"https://github.com/{login}", + 'name': user.get('name'), + 'bio': user.get('bio'), + 'location': user.get('location'), + 'score': total_score, + 'confidence': confidence, + 'signals': positive_signals, + 'negative_signals': negative_signals, + 'topics': list(aligned_topics), + 'languages': dict(languages), + 'repo_count': len(repos), + 'total_stars': total_stars, + 'reasons': reasons, + 'contact': { + 'email': all_emails[0] if all_emails else None, + 'emails': all_emails, + 'blog': user.get('blog'), + 'twitter': user.get('twitter_username') or handles.get('twitter'), + 'mastodon': handles.get('mastodon'), + 'bluesky': handles.get('bluesky'), + 'matrix': handles.get('matrix'), + 'lemmy': handles.get('lemmy'), + }, + 'extra': { + 'topics': list(aligned_topics), + 'languages': dict(languages), + 'repo_count': len(repos), + 'total_stars': total_stars, + 'hireable': user.get('hireable', False), + 'handles': handles, # all discovered handles + }, + 'hireable': user.get('hireable', False), + 'scraped_at': datetime.now().isoformat(), + # lost builder fields + 'lost_potential_score': lost_potential_score, + 'lost_signals': lost_signals, + 'user_type': user_type, # 'builder', 'lost', 'both', 'none' + } + + +def scrape_github(db, 
limit_per_source=50): + """ + full github scrape + returns list of analyzed users + """ + print("scoutd/github: starting scrape...") + + all_logins = set() + + # 1. ecosystem repo contributors + print(" scraping ecosystem repo contributors...") + for repo in ECOSYSTEM_REPOS: + contributors = get_repo_contributors(repo, per_page=limit_per_source) + for c in contributors: + login = c.get('login') + if login and not login.endswith('[bot]'): + all_logins.add(login) + print(f" {repo}: {len(contributors)} contributors") + + # 2. topic repos + print(" scraping topic repos...") + for topic in TARGET_TOPICS[:10]: + repos = search_repos_by_topic(topic, per_page=30) + for repo in repos: + owner = repo.get('owner', {}).get('login') + if owner and not owner.endswith('[bot]'): + all_logins.add(owner) + print(f" #{topic}: {len(repos)} repos") + + print(f" found {len(all_logins)} unique users to analyze") + + # analyze each + results = [] + builders_found = 0 + lost_found = 0 + + for i, login in enumerate(all_logins): + if i % 20 == 0: + print(f" analyzing... 
{i}/{len(all_logins)}") + + try: + result = analyze_github_user(login) + if result and result['score'] > 0: + results.append(result) + db.save_human(result) + + user_type = result.get('user_type', 'none') + + if user_type == 'builder': + builders_found += 1 + if result['score'] >= 50: + print(f" ★ {login}: {result['score']} pts, {result['confidence']:.0%} conf") + + elif user_type == 'lost': + lost_found += 1 + lost_score = result.get('lost_potential_score', 0) + if lost_score >= 40: + print(f" 💔 {login}: lost_score={lost_score}, values={result['score']} pts") + + elif user_type == 'both': + builders_found += 1 + lost_found += 1 + print(f" ⚡ {login}: recovering builder (lost={result.get('lost_potential_score', 0)}, active={result['score']})") + + except Exception as e: + print(f" error on {login}: {e}") + + print(f"scoutd/github: found {len(results)} aligned humans") + print(f" - {builders_found} active builders") + print(f" - {lost_found} lost builders (need encouragement)") + return results diff --git a/connectd/scoutd/handles.py b/connectd/scoutd/handles.py new file mode 100644 index 0000000..ccf10ad --- /dev/null +++ b/connectd/scoutd/handles.py @@ -0,0 +1,507 @@ +""" +scoutd/handles.py - comprehensive social handle discovery + +finds ALL social handles from: +- github bio/profile +- personal websites (rel="me", footers, contact pages, json-ld) +- README files +- linktree/bio.link/carrd pages +- any linked pages + +stores structured handle data for activity-based contact selection +""" + +import re +import json +import requests +from urllib.parse import urlparse, urljoin +from bs4 import BeautifulSoup + +HEADERS = {'User-Agent': 'Mozilla/5.0 (compatible; connectd/1.0)'} + +# platform URL patterns -> (platform, handle_extractor) +PLATFORM_PATTERNS = { + # fediverse + 'mastodon': [ + (r'https?://([^/]+)/@([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"), + (r'https?://([^/]+)/users/([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"), + 
(r'https?://mastodon\.social/@([^/?#]+)', lambda m: f"@{m.group(1)}@mastodon.social"), + ], + 'pixelfed': [ + (r'https?://pixelfed\.social/@([^/?#]+)', lambda m: f"@{m.group(1)}@pixelfed.social"), + (r'https?://([^/]*pixelfed[^/]*)/@([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"), + ], + 'lemmy': [ + (r'https?://([^/]+)/u/([^/?#]+)', lambda m: f"@{m.group(2)}@{m.group(1)}"), + (r'https?://lemmy\.([^/]+)/u/([^/?#]+)', lambda m: f"@{m.group(2)}@lemmy.{m.group(1)}"), + ], + + # mainstream + 'twitter': [ + (r'https?://(?:www\.)?(?:twitter|x)\.com/([^/?#]+)', lambda m: f"@{m.group(1)}"), + ], + 'bluesky': [ + (r'https?://bsky\.app/profile/([^/?#]+)', lambda m: m.group(1)), + (r'https?://([^.]+)\.bsky\.social', lambda m: f"{m.group(1)}.bsky.social"), + ], + 'threads': [ + (r'https?://(?:www\.)?threads\.net/@([^/?#]+)', lambda m: f"@{m.group(1)}"), + ], + 'instagram': [ + (r'https?://(?:www\.)?instagram\.com/([^/?#]+)', lambda m: f"@{m.group(1)}"), + ], + 'facebook': [ + (r'https?://(?:www\.)?facebook\.com/([^/?#]+)', lambda m: m.group(1)), + ], + 'linkedin': [ + (r'https?://(?:www\.)?linkedin\.com/in/([^/?#]+)', lambda m: m.group(1)), + (r'https?://(?:www\.)?linkedin\.com/company/([^/?#]+)', lambda m: f"company/{m.group(1)}"), + ], + + # dev platforms + 'github': [ + (r'https?://(?:www\.)?github\.com/([^/?#]+)', lambda m: m.group(1)), + ], + 'gitlab': [ + (r'https?://(?:www\.)?gitlab\.com/([^/?#]+)', lambda m: m.group(1)), + ], + 'codeberg': [ + (r'https?://codeberg\.org/([^/?#]+)', lambda m: m.group(1)), + ], + 'sourcehut': [ + (r'https?://sr\.ht/~([^/?#]+)', lambda m: f"~{m.group(1)}"), + (r'https?://git\.sr\.ht/~([^/?#]+)', lambda m: f"~{m.group(1)}"), + ], + + # chat + 'matrix': [ + (r'https?://matrix\.to/#/(@[^:]+:[^/?#]+)', lambda m: m.group(1)), + ], + 'discord': [ + (r'https?://discord\.gg/([^/?#]+)', lambda m: f"invite/{m.group(1)}"), + (r'https?://discord\.com/invite/([^/?#]+)', lambda m: f"invite/{m.group(1)}"), + ], + 'telegram': [ + 
(r'https?://t\.me/([^/?#]+)', lambda m: f"@{m.group(1)}"), + ], + + # content + 'youtube': [ + (r'https?://(?:www\.)?youtube\.com/@([^/?#]+)', lambda m: f"@{m.group(1)}"), + (r'https?://(?:www\.)?youtube\.com/c(?:hannel)?/([^/?#]+)', lambda m: m.group(1)), + ], + 'twitch': [ + (r'https?://(?:www\.)?twitch\.tv/([^/?#]+)', lambda m: m.group(1)), + ], + 'substack': [ + (r'https?://([^.]+)\.substack\.com', lambda m: m.group(1)), + ], + 'medium': [ + (r'https?://(?:www\.)?medium\.com/@([^/?#]+)', lambda m: f"@{m.group(1)}"), + (r'https?://([^.]+)\.medium\.com', lambda m: m.group(1)), + ], + 'devto': [ + (r'https?://dev\.to/([^/?#]+)', lambda m: m.group(1)), + ], + + # funding + 'kofi': [ + (r'https?://ko-fi\.com/([^/?#]+)', lambda m: m.group(1)), + ], + 'patreon': [ + (r'https?://(?:www\.)?patreon\.com/([^/?#]+)', lambda m: m.group(1)), + ], + 'liberapay': [ + (r'https?://liberapay\.com/([^/?#]+)', lambda m: m.group(1)), + ], + 'github_sponsors': [ + (r'https?://github\.com/sponsors/([^/?#]+)', lambda m: m.group(1)), + ], + + # link aggregators (we'll parse these specially) + 'linktree': [ + (r'https?://linktr\.ee/([^/?#]+)', lambda m: m.group(1)), + ], + 'biolink': [ + (r'https?://bio\.link/([^/?#]+)', lambda m: m.group(1)), + ], + 'carrd': [ + (r'https?://([^.]+)\.carrd\.co', lambda m: m.group(1)), + ], +} + +# fediverse handle pattern: @user@instance +FEDIVERSE_HANDLE_PATTERN = re.compile(r'@([\w.-]+)@([\w.-]+\.[\w]+)') + +# email pattern +EMAIL_PATTERN = re.compile(r'\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b') + +# known fediverse instances (for context-free handle detection) +KNOWN_FEDIVERSE_INSTANCES = [ + 'mastodon.social', 'mastodon.online', 'mstdn.social', 'mas.to', + 'tech.lgbt', 'fosstodon.org', 'hackers.town', 'social.coop', + 'kolektiva.social', 'solarpunk.moe', 'wandering.shop', + 'elekk.xyz', 'cybre.space', 'octodon.social', 'chaos.social', + 'infosec.exchange', 'ruby.social', 'phpc.social', 'toot.cafe', + 'mstdn.io', 'pixelfed.social', 
'lemmy.ml', 'lemmy.world', + 'kbin.social', 'pleroma.site', 'akkoma.dev', +] + + +def extract_handle_from_url(url): + """extract platform and handle from a URL""" + for platform, patterns in PLATFORM_PATTERNS.items(): + for pattern, extractor in patterns: + match = re.match(pattern, url, re.I) + if match: + return platform, extractor(match) + return None, None + + +def extract_fediverse_handles(text): + """find @user@instance.tld patterns in text""" + handles = [] + for match in FEDIVERSE_HANDLE_PATTERN.finditer(text): + user, instance = match.groups() + handles.append(f"@{user}@{instance}") + return handles + + +def extract_emails(text): + """find email addresses in text""" + emails = [] + for match in EMAIL_PATTERN.finditer(text): + email = match.group(1) + # filter out common non-personal emails + if not any(x in email.lower() for x in ['noreply', 'no-reply', 'donotreply', 'example.com']): + emails.append(email) + return emails + + +def scrape_page(url, timeout=15): + """fetch and parse a web page""" + try: + resp = requests.get(url, headers=HEADERS, timeout=timeout, allow_redirects=True) + resp.raise_for_status() + return BeautifulSoup(resp.text, 'html.parser'), resp.text + except Exception as e: + return None, None + + +def extract_rel_me_links(soup): + """extract rel="me" links (used for verification)""" + links = [] + if not soup: + return links + + for a in soup.find_all('a', rel=lambda x: x and 'me' in x): + href = a.get('href') + if href: + links.append(href) + + return links + + +def extract_social_links_from_page(soup, base_url=None): + """extract all social links from a page""" + links = [] + if not soup: + return links + + # all links + for a in soup.find_all('a', href=True): + href = a['href'] + if base_url and not href.startswith('http'): + href = urljoin(base_url, href) + + # check if it's a known social platform + platform, handle = extract_handle_from_url(href) + if platform: + links.append({'platform': platform, 'handle': handle, 'url': href}) + 
+ return links + + +def extract_json_ld(soup): + """extract structured data from JSON-LD""" + data = {} + if not soup: + return data + + for script in soup.find_all('script', type='application/ld+json'): + try: + ld = json.loads(script.string) + # look for sameAs links (social profiles) + if isinstance(ld, dict): + same_as = ld.get('sameAs', []) + if isinstance(same_as, str): + same_as = [same_as] + for url in same_as: + platform, handle = extract_handle_from_url(url) + if platform: + data[platform] = handle + except: + pass + + return data + + +def scrape_linktree(url): + """scrape a linktree/bio.link/carrd page for all links""" + handles = {} + soup, raw = scrape_page(url) + if not soup: + return handles + + # linktree uses data attributes and JS, but links are often in the HTML + links = extract_social_links_from_page(soup, url) + for link in links: + if link['platform'] not in ['linktree', 'biolink', 'carrd']: + handles[link['platform']] = link['handle'] + + # also check for fediverse handles in text + if raw: + fedi_handles = extract_fediverse_handles(raw) + if fedi_handles: + handles['mastodon'] = fedi_handles[0] + + return handles + + +def scrape_website_for_handles(url, follow_links=True): + """ + comprehensive website scrape for social handles + + checks: + - rel="me" links + - social links in page + - json-ld structured data + - /about and /contact pages + - fediverse handles in text + - emails + """ + handles = {} + emails = [] + + soup, raw = scrape_page(url) + if not soup: + return handles, emails + + # 1. rel="me" links (most authoritative) + rel_me = extract_rel_me_links(soup) + for link in rel_me: + platform, handle = extract_handle_from_url(link) + if platform and platform not in handles: + handles[platform] = handle + + # 2. all social links on page + social_links = extract_social_links_from_page(soup, url) + for link in social_links: + if link['platform'] not in handles: + handles[link['platform']] = link['handle'] + + # 3. 
json-ld structured data + json_ld = extract_json_ld(soup) + for platform, handle in json_ld.items(): + if platform not in handles: + handles[platform] = handle + + # 4. fediverse handles in text + if raw: + fedi = extract_fediverse_handles(raw) + if fedi and 'mastodon' not in handles: + handles['mastodon'] = fedi[0] + + # emails + emails = extract_emails(raw) + + # 5. follow links to /about, /contact + if follow_links: + parsed = urlparse(url) + base = f"{parsed.scheme}://{parsed.netloc}" + + for path in ['/about', '/contact', '/links', '/social']: + try: + sub_soup, sub_raw = scrape_page(base + path) + if sub_soup: + sub_links = extract_social_links_from_page(sub_soup, base) + for link in sub_links: + if link['platform'] not in handles: + handles[link['platform']] = link['handle'] + + if sub_raw: + fedi = extract_fediverse_handles(sub_raw) + if fedi and 'mastodon' not in handles: + handles['mastodon'] = fedi[0] + + emails.extend(extract_emails(sub_raw)) + except: + pass + + # 6. check for linktree etc in links and follow them + for platform in ['linktree', 'biolink', 'carrd']: + if platform in handles: + # this is actually a link aggregator, scrape it + link_url = None + for link in social_links: + if link['platform'] == platform: + link_url = link['url'] + break + + if link_url: + aggregator_handles = scrape_linktree(link_url) + for p, h in aggregator_handles.items(): + if p not in handles: + handles[p] = h + + del handles[platform] # remove the aggregator itself + + return handles, list(set(emails)) + + +def extract_handles_from_text(text): + """extract handles from plain text (bio, README, etc)""" + handles = {} + + if not text: + return handles + + # fediverse handles + fedi = extract_fediverse_handles(text) + if fedi: + handles['mastodon'] = fedi[0] + + # URL patterns in text + url_pattern = re.compile(r'https?://[^\s<>"\']+') + for match in url_pattern.finditer(text): + url = match.group(0).rstrip('.,;:!?)') + platform, handle = extract_handle_from_url(url) 
+ if platform and platform not in handles: + handles[platform] = handle + + # twitter-style @mentions (only if looks like twitter context) + if 'twitter' in text.lower() or 'x.com' in text.lower(): + twitter_pattern = re.compile(r'(?:^|[^\w])@(\w{1,15})(?:[^\w]|$)') + for match in twitter_pattern.finditer(text): + if 'twitter' not in handles: + handles['twitter'] = f"@{match.group(1)}" + + # matrix handles + matrix_pattern = re.compile(r'@([\w.-]+):([\w.-]+)') + for match in matrix_pattern.finditer(text): + if 'matrix' not in handles: + handles['matrix'] = f"@{match.group(1)}:{match.group(2)}" + + return handles + + +def scrape_github_readme(username): + """scrape user's profile README (username/username repo)""" + handles = {} + emails = [] + + url = f"https://raw.githubusercontent.com/{username}/{username}/main/README.md" + try: + resp = requests.get(url, headers=HEADERS, timeout=10) + if resp.status_code == 200: + text = resp.text + + # extract handles from text + handles = extract_handles_from_text(text) + + # extract emails + emails = extract_emails(text) + + return handles, emails + except: + pass + + # try master branch + url = f"https://raw.githubusercontent.com/{username}/{username}/master/README.md" + try: + resp = requests.get(url, headers=HEADERS, timeout=10) + if resp.status_code == 200: + text = resp.text + handles = extract_handles_from_text(text) + emails = extract_emails(text) + except: + pass + + return handles, emails + + +def discover_all_handles(github_profile): + """ + comprehensive handle discovery from a github profile dict + + github_profile should contain: + - username + - bio + - blog (website URL) + - twitter_username + - etc. + """ + handles = {} + emails = [] + + username = github_profile.get('login') or github_profile.get('username') + + print(f" discovering handles for {username}...") + + # 1. 
def merge_handles(existing, new):
    """fold new platform->handle pairs into existing (mutates and returns it).

    an incoming handle replaces an existing one only when it is strictly
    longer, on the theory that longer handles carry more specificity
    (e.g. a full @user@instance beats a bare username).
    """
    for platform, candidate in new.items():
        if platform in existing:
            # keep whichever handle is longer / more specific
            if len(candidate) > len(existing[platform]):
                existing[platform] = candidate
        else:
            existing[platform] = candidate
    return existing
+great for finding lost builders in communities like: +- /c/programming, /c/technology, /c/linux +- /c/antiwork, /c/workreform (lost builders!) +- /c/selfhosted, /c/privacy, /c/opensource + +supports authenticated access for private instances and DM delivery. +""" + +import requests +import json +import time +import os +from datetime import datetime +from pathlib import Path + +from .signals import analyze_text +from .lost import ( + analyze_social_for_lost_signals, + analyze_text_for_lost_signals, + classify_user, +) + +# auth config from environment +LEMMY_INSTANCE = os.environ.get('LEMMY_INSTANCE', '') +LEMMY_USERNAME = os.environ.get('LEMMY_USERNAME', '') +LEMMY_PASSWORD = os.environ.get('LEMMY_PASSWORD', '') + +# auth token cache +_auth_token = None + +# popular lemmy instances +LEMMY_INSTANCES = [ + 'lemmy.ml', + 'lemmy.world', + 'programming.dev', + 'lemm.ee', + 'sh.itjust.works', +] + +# communities to scout (format: community@instance or just community for local) +TARGET_COMMUNITIES = [ + # builder communities + 'programming', + 'selfhosted', + 'linux', + 'opensource', + 'privacy', + 'technology', + 'webdev', + 'rust', + 'python', + 'golang', + + # lost builder communities (people struggling, stuck, seeking) + 'antiwork', + 'workreform', + 'careerguidance', + 'cscareerquestions', + 'learnprogramming', + 'findapath', +] + +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'lemmy' +CACHE_DIR.mkdir(parents=True, exist_ok=True) + + +def get_auth_token(instance=None): + """get auth token for lemmy instance""" + global _auth_token + + if _auth_token: + return _auth_token + + instance = instance or LEMMY_INSTANCE + if not all([instance, LEMMY_USERNAME, LEMMY_PASSWORD]): + return None + + try: + url = f"https://{instance}/api/v3/user/login" + resp = requests.post(url, json={ + 'username_or_email': LEMMY_USERNAME, + 'password': LEMMY_PASSWORD, + }, timeout=30) + + if resp.status_code == 200: + _auth_token = resp.json().get('jwt') + return _auth_token + 
def send_lemmy_dm(recipient_username, message, dry_run=False):
    """send a private message via lemmy

    recipient_username may be either 'username@instance' or a bare
    'username' (treated as local to LEMMY_INSTANCE).

    returns a (success, error) tuple: (True, None) on success or dry run,
    (False, <reason string>) on any failure. never raises.

    requires LEMMY_INSTANCE plus valid LEMMY_USERNAME/LEMMY_PASSWORD env
    config so get_auth_token() can obtain a JWT.
    """
    if not LEMMY_INSTANCE:
        return False, "LEMMY_INSTANCE not configured"

    # dry run short-circuits before authenticating, so it works without creds
    if dry_run:
        print(f"[dry run] would send lemmy DM to {recipient_username}")
        return True, None

    token = get_auth_token()
    if not token:
        return False, "failed to authenticate with lemmy"

    try:
        # parse recipient - could be username@instance or just username
        if '@' in recipient_username:
            username, instance = recipient_username.split('@', 1)
        else:
            username = recipient_username
            instance = LEMMY_INSTANCE

        # get recipient user id
        # lookup always goes through our own instance; federation resolves
        # remote users via the username@instance form
        user_url = f"https://{LEMMY_INSTANCE}/api/v3/user"
        resp = requests.get(user_url, params={'username': f"{username}@{instance}"}, timeout=30)

        if resp.status_code != 200:
            # try without instance suffix for local users
            resp = requests.get(user_url, params={'username': username}, timeout=30)

        if resp.status_code != 200:
            return False, f"could not find user {recipient_username}"

        # numeric person id is required by the private_message endpoint
        recipient_id = resp.json().get('person_view', {}).get('person', {}).get('id')
        if not recipient_id:
            return False, "could not get recipient id"

        # send DM
        # NOTE(review): bearer-header auth matches lemmy 0.19+; older
        # instances expected an 'auth' field in the JSON body - confirm
        # against the target instance version
        dm_url = f"https://{LEMMY_INSTANCE}/api/v3/private_message"
        resp = requests.post(dm_url,
            headers={'Authorization': f'Bearer {token}'},
            json={
                'content': message,
                'recipient_id': recipient_id,
            },
            timeout=30
        )

        if resp.status_code == 200:
            return True, None
        else:
            return False, f"lemmy DM error: {resp.status_code} - {resp.text}"

    except Exception as e:
        return False, f"lemmy DM error: {str(e)}"
def get_user_profile(instance, username):
    """fetch a lemmy user's profile via the v3 API.

    returns the parsed JSON dict, or None on any HTTP / network /
    decode failure (this scraper treats missing profiles as skippable).
    """
    endpoint = f"https://{instance}/api/v3/user"
    try:
        response = requests.get(endpoint, params={'username': username}, timeout=30)
        if response.status_code == 200:
            return response.json()
    except Exception:
        pass
    return None
None, + 'score': total_score, + 'confidence': min(0.9, 0.3 + len(all_signals) * 0.1), + 'signals': all_signals, + 'negative_signals': [], + 'reasons': bio_reasons, + 'contact': {}, + 'extra': { + 'instance': instance, + 'post_count': counts.get('post_count', 0), + 'comment_count': counts.get('comment_count', 0), + }, + 'lost_potential_score': lost_potential_score, + 'lost_signals': lost_signals, + 'user_type': user_type, + } + + +def scrape_lemmy(db, limit_per_community=30): + """scrape lemmy instances for aligned builders""" + print("scouting lemmy...") + + found = 0 + lost_found = 0 + seen_users = set() + + # build instance list - user's instance first if configured + instances = list(LEMMY_INSTANCES) + if LEMMY_INSTANCE and LEMMY_INSTANCE not in instances: + instances.insert(0, LEMMY_INSTANCE) + + for instance in instances: + print(f" instance: {instance}") + + for community in TARGET_COMMUNITIES: + posts = get_community_posts(instance, community, limit=limit_per_community) + + if not posts: + continue + + print(f" /c/{community}: {len(posts)} posts") + + # group posts by user + user_posts = {} + for post in posts: + creator = post.get('creator', {}) + username = creator.get('name') + if not username: + continue + + user_key = f"{username}@{instance}" + if user_key in seen_users: + continue + + if user_key not in user_posts: + user_posts[user_key] = [] + user_posts[user_key].append(post) + + # analyze each user + for user_key, posts in user_posts.items(): + username = user_key.split('@')[0] + + if user_key in seen_users: + continue + seen_users.add(user_key) + + result = analyze_lemmy_user(instance, username, posts) + if not result: + continue + + if result['score'] >= 20 or result.get('lost_potential_score', 0) >= 30: + db.save_human(result) + found += 1 + + if result.get('user_type') in ['lost', 'both']: + lost_found += 1 + print(f" {result['username']}: {result['score']:.0f} (lost: {result['lost_potential_score']:.0f})") + elif result['score'] >= 40: + 
print(f" {result['username']}: {result['score']:.0f}") + + time.sleep(0.5) # rate limit + + time.sleep(1) # between communities + + time.sleep(2) # between instances + + print(f"lemmy: found {found} humans ({lost_found} lost builders)") + return found diff --git a/connectd/scoutd/lobsters.py b/connectd/scoutd/lobsters.py new file mode 100644 index 0000000..4106ca5 --- /dev/null +++ b/connectd/scoutd/lobsters.py @@ -0,0 +1,169 @@ +""" +scoutd/lobsters.py - lobste.rs discovery +high-signal invite-only tech community +""" + +import requests +import json +import time +from datetime import datetime +from pathlib import Path + +from .signals import analyze_text + +HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'lobsters' + +ALIGNED_TAGS = ['privacy', 'security', 'distributed', 'rust', 'linux', 'culture', 'practices'] + + +def _api_get(url, params=None): + """rate-limited request""" + cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_data') + except: + pass + + time.sleep(2) + + try: + resp = requests.get(url, headers=HEADERS, params=params, timeout=30) + resp.raise_for_status() + result = resp.json() + cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) + return result + except requests.exceptions.RequestException as e: + print(f" lobsters api error: {e}") + return None + + +def get_stories_by_tag(tag): + """get recent stories by tag""" + url = f'https://lobste.rs/t/{tag}.json' + return _api_get(url) or [] + + +def get_newest_stories(): + """get newest stories""" + return _api_get('https://lobste.rs/newest.json') or [] + + +def get_user(username): + """get user profile""" + return 
def analyze_lobsters_user(username):
    """score a lobste.rs account for values alignment.

    builds a human dict compatible with db.save_human, or returns None
    when the profile cannot be fetched. score combines the bio text
    signals with a flat invite-only bonus, a karma tier, and small
    bonuses for linked github / homepage identities.
    """
    profile = get_user(username)
    if not profile:
        return None

    # the 'about' field is the only free text on a lobsters profile
    bio_parts = [profile['about']] if profile.get('about') else []
    combined = ' '.join(bio_parts)
    text_score, pos_signals, neg_signals = analyze_text(combined)

    karma = profile.get('karma', 0)

    # lobsters base bonus (invite-only, high signal)
    score = text_score + 15
    # karma tiers
    if karma > 100:
        score += 10
    elif karma > 50:
        score += 5
    # linked identities
    if profile.get('github_username'):
        score += 5
    if profile.get('homepage'):
        score += 5

    # confidence: higher base for invite-only, capped at 0.9
    confidence = 0.4
    if bio_parts:
        confidence += 0.2
    if karma > 50:
        confidence += 0.2
    confidence = min(confidence, 0.9)

    reasons = ['on lobste.rs (invite-only)']
    if karma > 50:
        reasons.append(f"active ({karma} karma)")
    if pos_signals:
        reasons.append(f"signals: {', '.join(pos_signals[:5])}")
    if neg_signals:
        reasons.append(f"WARNING: {', '.join(neg_signals)}")

    return {
        'platform': 'lobsters',
        'username': username,
        'url': f"https://lobste.rs/u/{username}",
        'score': score,
        'confidence': confidence,
        'signals': pos_signals,
        'negative_signals': neg_signals,
        'karma': karma,
        'reasons': reasons,
        'contact': {
            'github': profile.get('github_username'),
            'twitter': profile.get('twitter_username'),
            'homepage': profile.get('homepage'),
        },
        'scraped_at': datetime.now().isoformat(),
    }
stories: + submitter = story.get('submitter_user', {}).get('username') + if submitter: + all_users.add(submitter) + + # newest stories + print(" newest stories...") + for story in get_newest_stories(): + submitter = story.get('submitter_user', {}).get('username') + if submitter: + all_users.add(submitter) + + print(f" {len(all_users)} unique users to analyze") + + # analyze + results = [] + for username in all_users: + try: + result = analyze_lobsters_user(username) + if result and result['score'] > 0: + results.append(result) + db.save_human(result) + + if result['score'] >= 30: + print(f" ★ {username}: {result['score']} pts") + except Exception as e: + print(f" error on {username}: {e}") + + print(f"scoutd/lobsters: found {len(results)} aligned humans") + return results diff --git a/connectd/scoutd/lost.py b/connectd/scoutd/lost.py new file mode 100644 index 0000000..ab19298 --- /dev/null +++ b/connectd/scoutd/lost.py @@ -0,0 +1,491 @@ +""" +scoutd/lost.py - lost builder detection + +finds people with potential who haven't found it yet, gave up, or are too beaten down to try. + +these aren't failures. they're seeds that never got water. + +detection signals: +- github: forked but never modified, starred many but built nothing, learning repos abandoned +- reddit/forums: "i wish i could...", stuck asking beginner questions for years, helping others but never sharing +- social: retoots builders but never posts own work, imposter syndrome language, isolation signals +- profiles: bio says what they WANT to be, "aspiring" for 2+ years, empty portfolios + +the goal isn't to recruit them. it's to show them the door exists. 
+""" + +import re +from datetime import datetime, timedelta +from collections import defaultdict + + +# signal definitions with weights +LOST_SIGNALS = { + # github signals + 'forked_never_modified': { + 'weight': 15, + 'category': 'github', + 'description': 'forked repos but never pushed changes', + }, + 'starred_many_built_nothing': { + 'weight': 20, + 'category': 'github', + 'description': 'starred 50+ repos but has 0-2 own repos', + }, + 'account_no_repos': { + 'weight': 10, + 'category': 'github', + 'description': 'account exists but no public repos', + }, + 'inactivity_bursts': { + 'weight': 15, + 'category': 'github', + 'description': 'long gaps then brief activity bursts', + }, + 'only_issues_comments': { + 'weight': 12, + 'category': 'github', + 'description': 'only activity is issues/comments on others work', + }, + 'abandoned_learning_repos': { + 'weight': 18, + 'category': 'github', + 'description': 'learning/tutorial repos that were never finished', + }, + 'readme_only_repos': { + 'weight': 10, + 'category': 'github', + 'description': 'repos with just README, no actual code', + }, + + # language signals (from posts/comments/bio) + 'wish_i_could': { + 'weight': 12, + 'category': 'language', + 'description': '"i wish i could..." language', + 'patterns': [ + r'i wish i could', + r'i wish i knew how', + r'wish i had the (time|energy|motivation|skills?)', + ], + }, + 'someday_want': { + 'weight': 10, + 'category': 'language', + 'description': '"someday i want to..." 
language', + 'patterns': [ + r'someday i (want|hope|plan) to', + r'one day i\'ll', + r'eventually i\'ll', + r'when i have time i\'ll', + ], + }, + 'stuck_beginner': { + 'weight': 20, + 'category': 'language', + 'description': 'asking beginner questions for years', + 'patterns': [ + r'still (trying|learning|struggling) (to|with)', + r'can\'t seem to (get|understand|figure)', + r'been trying for (months|years)', + ], + }, + 'self_deprecating': { + 'weight': 15, + 'category': 'language', + 'description': 'self-deprecating about abilities', + 'patterns': [ + r'i\'m (not smart|too dumb|not good) enough', + r'i (suck|am terrible) at', + r'i\'ll never be able to', + r'people like me (can\'t|don\'t)', + r'i\'m just not (a|the) (type|kind)', + ], + }, + 'no_energy': { + 'weight': 18, + 'category': 'language', + 'description': '"how do people have energy" posts', + 'patterns': [ + r'how do (people|you|they) have (the )?(energy|time|motivation)', + r'where do (people|you|they) find (the )?(energy|motivation)', + r'i\'m (always|constantly) (tired|exhausted|drained)', + r'no (energy|motivation) (left|anymore)', + ], + }, + 'imposter_syndrome': { + 'weight': 15, + 'category': 'language', + 'description': 'imposter syndrome language', + 'patterns': [ + r'imposter syndrome', + r'feel like (a |an )?(fraud|fake|imposter)', + r'don\'t (belong|deserve)', + r'everyone else (seems|is) (so much )?(better|smarter)', + r'they\'ll (find out|realize) i\'m', + ], + }, + 'should_really': { + 'weight': 8, + 'category': 'language', + 'description': '"i should really..." 
posts', + 'patterns': [ + r'i (should|need to) really', + r'i keep (meaning|wanting) to', + r'i\'ve been (meaning|wanting) to', + ], + }, + 'isolation_signals': { + 'weight': 20, + 'category': 'language', + 'description': 'isolation/loneliness language', + 'patterns': [ + r'no one (understands|gets it|to talk to)', + r'(feel|feeling) (so )?(alone|isolated|lonely)', + r'don\'t have anyone (to|who)', + r'wish i (had|knew) (someone|people)', + ], + }, + 'enthusiasm_for_others': { + 'weight': 10, + 'category': 'behavior', + 'description': 'celebrates others but dismissive of self', + }, + + # subreddit/community signals + 'stuck_communities': { + 'weight': 15, + 'category': 'community', + 'description': 'active in stuck/struggling communities', + 'subreddits': [ + 'learnprogramming', + 'findapath', + 'getdisciplined', + 'getmotivated', + 'decidingtobebetter', + 'selfimprovement', + 'adhd', + 'depression', + 'anxiety', + ], + }, + + # profile signals + 'aspirational_bio': { + 'weight': 12, + 'category': 'profile', + 'description': 'bio says what they WANT to be', + 'patterns': [ + r'aspiring', + r'future', + r'want(ing)? 
def analyze_text_for_lost_signals(text):
    """scan free text for lost-builder language patterns.

    returns (signal_names, total_weight). each LOST_SIGNALS entry that
    defines patterns contributes at most once, no matter how many of its
    patterns match.
    """
    if not text:
        return [], 0

    lowered = text.lower()
    hits = []
    weight = 0

    for name, spec in LOST_SIGNALS.items():
        patterns = spec.get('patterns')
        # signals without regex patterns are detected elsewhere (behavioral)
        if not patterns:
            continue
        if any(re.search(p, lowered) for p in patterns):
            hits.append(name)
            weight += spec['weight']

    return hits, weight
LOST_SIGNALS['starred_many_built_nothing']['weight'] + + # account but no repos + if public_repos == 0: + signals_found.append('account_no_repos') + total_weight += LOST_SIGNALS['account_no_repos']['weight'] + + # check repos for signals + forked_count = 0 + forked_modified = 0 + learning_repos = 0 + readme_only = 0 + + learning_keywords = ['learning', 'tutorial', 'course', 'practice', 'exercise', + 'bootcamp', 'udemy', 'freecodecamp', 'odin', 'codecademy'] + + for repo in repos: + name = (repo.get('name') or '').lower() + description = (repo.get('description') or '').lower() + language = repo.get('language') + is_fork = repo.get('fork', False) + + # forked but never modified + if is_fork: + forked_count += 1 + # if pushed_at is close to created_at, never modified + # (simplified: just count forks for now) + + # learning/tutorial repos + if any(kw in name or kw in description for kw in learning_keywords): + learning_repos += 1 + + # readme only (no language detected usually means no code) + if not language and not is_fork: + readme_only += 1 + + if forked_count >= 5 and public_repos - forked_count <= 2: + signals_found.append('forked_never_modified') + total_weight += LOST_SIGNALS['forked_never_modified']['weight'] + + if learning_repos >= 3: + signals_found.append('abandoned_learning_repos') + total_weight += LOST_SIGNALS['abandoned_learning_repos']['weight'] + + if readme_only >= 2: + signals_found.append('readme_only_repos') + total_weight += LOST_SIGNALS['readme_only_repos']['weight'] + + # check bio for lost signals + bio = profile.get('bio') or '' + bio_signals, bio_weight = analyze_text_for_lost_signals(bio) + signals_found.extend(bio_signals) + total_weight += bio_weight + + # aspirational bio check + bio_lower = bio.lower() + if any(re.search(p, bio_lower) for p in LOST_SIGNALS['aspirational_bio']['patterns']): + if 'aspirational_bio' not in signals_found: + signals_found.append('aspirational_bio') + total_weight += 
def analyze_reddit_for_lost_signals(activity, subreddits):
    """analyze reddit activity for lost builder signals.

    activity: list of post/comment dicts with optional 'title'/'body' text
    subreddits: iterable of subreddit names the user is active in

    returns (signals_found, total_weight).
    """
    signals_found = []
    total_weight = 0

    # check subreddit activity against the stuck-community weight table
    stuck_sub_activity = 0
    for sub in subreddits:
        if sub.lower() in STUCK_SUBREDDITS:
            stuck_sub_activity += STUCK_SUBREDDITS[sub.lower()]

    if stuck_sub_activity >= 20:
        signals_found.append('stuck_communities')
        total_weight += min(stuck_sub_activity, 30)  # cap at 30

    # analyze post/comment text for lost-language patterns
    all_text = []
    for item in activity:
        if item.get('title'):
            all_text.append(item['title'])
        if item.get('body'):
            all_text.append(item['body'])

    combined_text = ' '.join(all_text)
    text_signals, text_weight = analyze_text_for_lost_signals(combined_text)
    signals_found.extend(text_signals)
    total_weight += text_weight

    # check for helping others but never sharing own work
    help_count = 0
    share_count = 0
    for item in activity:
        body = (item.get('body') or '').lower()
        title = (item.get('title') or '').lower()

        # helping patterns
        if any(p in body for p in ['try this', 'you could', 'have you tried', 'i recommend']):
            help_count += 1

        # sharing patterns - join with a space: the old `body + title`
        # concatenation could fabricate (or break) a phrase match right
        # at the body/title seam
        if any(p in f"{body} {title}" for p in ['i built', 'i made', 'my project', 'check out my', 'i created']):
            share_count += 1

    if help_count >= 5 and share_count == 0:
        signals_found.append('enthusiasm_for_others')
        total_weight += LOST_SIGNALS['enthusiasm_for_others']['weight']

    return signals_found, total_weight
def should_outreach_lost(user_data, config=None):
    """
    determine if we should reach out to a lost builder

    considers:
    - lost_potential_score threshold
    - values alignment
    - cooldown period
    - manual review requirement

    returns (ok, reason). ok is True only when outreach may proceed, and
    even then reason is 'requires_review': lost outreach is never automatic.
    """
    config = config or {}

    lost_score = user_data.get('lost_potential_score', 0)
    values_score = user_data.get('score', 0)  # regular alignment score

    # minimum thresholds
    min_lost = config.get('min_lost_score', 40)
    min_values = config.get('min_values_score', 20)

    if lost_score < min_lost:
        return False, 'lost_score too low'

    if values_score < min_values:
        return False, 'values_score too low'

    # check cooldown since the last lost-outreach attempt
    last_outreach = user_data.get('last_lost_outreach')
    if last_outreach:
        cooldown_days = config.get('cooldown_days', 90)
        last_dt = datetime.fromisoformat(last_outreach)
        if datetime.now() - last_dt < timedelta(days=cooldown_days):
            # report the configured cooldown, not a hardcoded 90
            return False, f'cooldown active ({cooldown_days} days)'

    # always require manual review for lost outreach
    return True, 'requires_review'
def strip_html(text):
    """replace every html tag in text with a single space.

    falsy input (None, '') yields '' so callers can chain safely.
    """
    if not text:
        return ''
    return re.sub(r'<[^>]+>', ' ', text)
[]):
        if field.get('name'):
            text_parts.append(field['name'])
        if field.get('value'):
            text_parts.append(strip_html(field['value']))

    # get recent posts
    user_id = account.get('id')
    if user_id:
        statuses = get_user_statuses(instance, user_id)
        for status in statuses:
            content = strip_html(status.get('content', ''))
            if content:
                text_parts.append(content)

    full_text = ' '.join(text_parts)
    text_score, positive_signals, negative_signals = analyze_text(full_text)

    # instance bonus
    instance_bonus = ALIGNED_INSTANCES.get(instance, 0)
    total_score = text_score + instance_bonus

    # pronouns bonus
    if re.search(r'\b(they/them|she/her|he/him|xe/xem)\b', full_text, re.I):
        total_score += 10
        positive_signals.append('pronouns')

    # activity level
    statuses_count = account.get('statuses_count', 0)
    followers = account.get('followers_count', 0)  # reported in the result dict, not scored
    if statuses_count > 100:
        total_score += 5

    # === LOST BUILDER DETECTION ===
    # build profile and posts for lost analysis
    profile_for_lost = {
        'bio': bio,
        'note': account.get('note'),
    }

    # convert statuses to posts format for analyze_social_for_lost_signals
    # NOTE(review): this re-calls get_user_statuses for the same user; the
    # result comes from the 1h disk cache so it is cheap, but the statuses
    # fetched above could simply be reused
    posts_for_lost = []
    if user_id:
        statuses = get_user_statuses(instance, user_id)
        for status in statuses:
            posts_for_lost.append({
                'content': strip_html(status.get('content', '')),
                'reblog': status.get('reblog'),
            })

    # analyze for lost signals
    lost_signals, lost_weight = analyze_social_for_lost_signals(profile_for_lost, posts_for_lost)

    # also check combined text for lost patterns
    text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
    for sig in text_lost_signals:
        if sig not in lost_signals:
            lost_signals.append(sig)
    lost_weight += text_lost_weight

    lost_potential_score = lost_weight

    # classify: builder, lost, both, or none
    # for mastodon, we use statuses_count as a proxy for builder activity
    builder_activity = 10 if statuses_count > 100 else 5 if statuses_count > 50
else 0
    user_type = classify_user(lost_potential_score, builder_activity, total_score)

    # confidence: starts at a low base and grows with the amount of
    # evidence available (text volume, post count, signal count)
    confidence = 0.3
    if len(text_parts) > 5:
        confidence += 0.2
    if statuses_count > 50:
        confidence += 0.2
    if len(positive_signals) > 3:
        confidence += 0.2
    confidence = min(confidence, 0.9)

    reasons = []
    if instance in ALIGNED_INSTANCES:
        reasons.append(f"on {instance}")
    if positive_signals:
        reasons.append(f"signals: {', '.join(positive_signals[:5])}")
    if negative_signals:
        reasons.append(f"WARNING: {', '.join(negative_signals)}")

    # add lost reasons if applicable
    if user_type == 'lost' or user_type == 'both':
        lost_descriptions = get_signal_descriptions(lost_signals)
        if lost_descriptions:
            reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")

    return {
        'platform': 'mastodon',
        'username': acct,
        'url': account.get('url'),
        'name': display_name,
        'bio': bio,
        'instance': instance,
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'statuses_count': statuses_count,
        'followers': followers,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }


def scrape_mastodon(db, limit_per_instance=40):
    """full mastodon scrape: instance directories + hashtag timelines,
    then per-account analysis; saves every scoring account via db.save_human"""
    print("scoutd/mastodon: starting scrape...")

    all_accounts = []

    # 1. instance directories
    print("  scraping instance directories...")
    for instance in ALIGNED_INSTANCES:
        accounts = get_instance_directory(instance, limit=limit_per_instance)
        for acct in accounts:
            acct['_instance'] = instance
            all_accounts.append(acct)
        print(f"    {instance}: {len(accounts)} users")

    # 2. hashtag timelines
    print("  scraping hashtags...")
    seen = set()
    for tag in TARGET_HASHTAGS[:8]:
        for instance in ['fosstodon.org', 'tech.lgbt', 'social.coop']:
            posts = get_hashtag_timeline(instance, tag, limit=20)
            for post in posts:
                account = post.get('account', {})
                acct = account.get('acct', '')
                if '@' not in acct:
                    acct = f"{acct}@{instance}"

                if acct not in seen:
                    seen.add(acct)
                    account['_instance'] = instance
                    all_accounts.append(account)

    # dedupe
    # NOTE(review): the key is the raw 'acct' field, which for local users has
    # no '@instance' suffix — local accounts with the same name on different
    # instances can collide here; confirm whether that is acceptable
    unique = {}
    for acct in all_accounts:
        key = acct.get('acct', acct.get('id', ''))
        if key not in unique:
            unique[key] = acct

    print(f"  {len(unique)} unique accounts to analyze")

    # analyze
    results = []
    builders_found = 0
    lost_found = 0

    for acct_data in unique.values():
        instance = acct_data.get('_instance', 'mastodon.social')
        try:
            result = analyze_mastodon_user(acct_data, instance)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                user_type = result.get('user_type', 'none')

                if user_type == 'builder':
                    builders_found += 1
                    if result['score'] >= 40:
                        print(f"  ★ @{result['username']}: {result['score']} pts")

                elif user_type == 'lost':
                    lost_found += 1
                    lost_score = result.get('lost_potential_score', 0)
                    if lost_score >= 40:
                        print(f"  💔 @{result['username']}: lost_score={lost_score}, values={result['score']} pts")

                elif user_type == 'both':
                    builders_found += 1
                    lost_found += 1
                    print(f"  ⚡ @{result['username']}: recovering builder")

        except Exception as e:
            print(f"  error: {e}")

    print(f"scoutd/mastodon: found {len(results)} aligned humans")
    print(f"  - {builders_found} active builders")
    print(f"  - {lost_found} lost builders (need encouragement)")
    return results
diff --git a/connectd/scoutd/matrix.py b/connectd/scoutd/matrix.py
new file mode 100644
index 0000000..162d9ac
--- /dev/null
+++ b/connectd/scoutd/matrix.py
@@ -0,0 +1,196 @@
"""
scoutd/matrix.py - matrix room membership discovery
finds
users in multiple aligned public rooms +""" + +import requests +import json +import time +from datetime import datetime +from pathlib import Path +from collections import defaultdict + +from .signals import analyze_text + +HEADERS = {'User-Agent': 'connectd/1.0', 'Accept': 'application/json'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'matrix' + +# public matrix rooms to check membership +ALIGNED_ROOMS = [ + '#homeassistant:matrix.org', + '#esphome:matrix.org', + '#selfhosted:matrix.org', + '#privacy:matrix.org', + '#solarpunk:matrix.org', + '#cooperative:matrix.org', + '#foss:matrix.org', + '#linux:matrix.org', +] + +# homeservers to query +HOMESERVERS = [ + 'matrix.org', + 'matrix.envs.net', + 'tchncs.de', +] + + +def _api_get(url, params=None): + """rate-limited request""" + cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_data') + except: + pass + + time.sleep(1) + + try: + resp = requests.get(url, headers=HEADERS, params=params, timeout=30) + resp.raise_for_status() + result = resp.json() + cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) + return result + except requests.exceptions.RequestException as e: + # matrix apis often fail, don't spam errors + return None + + +def get_room_members(homeserver, room_alias): + """ + get members of a public room + note: most matrix servers don't expose this publicly + this is a best-effort scrape + """ + # resolve room alias to id first + try: + alias_url = f'https://{homeserver}/_matrix/client/r0/directory/room/{room_alias}' + alias_data = _api_get(alias_url) + if not alias_data or 'room_id' not in alias_data: + return [] + + room_id = alias_data['room_id'] + + # try to get members (usually 
requires auth) + members_url = f'https://{homeserver}/_matrix/client/r0/rooms/{room_id}/members' + members_data = _api_get(members_url) + + if members_data and 'chunk' in members_data: + members = [] + for event in members_data['chunk']: + if event.get('type') == 'm.room.member' and event.get('content', {}).get('membership') == 'join': + user_id = event.get('state_key') + display_name = event.get('content', {}).get('displayname') + if user_id: + members.append({'user_id': user_id, 'display_name': display_name}) + return members + except: + pass + + return [] + + +def get_public_rooms(homeserver, limit=100): + """get public rooms directory""" + url = f'https://{homeserver}/_matrix/client/r0/publicRooms' + data = _api_get(url, {'limit': limit}) + return data.get('chunk', []) if data else [] + + +def analyze_matrix_user(user_id, rooms_joined, display_name=None): + """analyze a matrix user based on room membership""" + # score based on room membership overlap + room_score = len(rooms_joined) * 10 + + # multi-room bonus + if len(rooms_joined) >= 4: + room_score += 20 + elif len(rooms_joined) >= 2: + room_score += 10 + + # analyze display name if available + text_score = 0 + signals = [] + if display_name: + text_score, signals, _ = analyze_text(display_name) + + total_score = room_score + text_score + + confidence = 0.3 + if len(rooms_joined) >= 3: + confidence += 0.3 + if display_name: + confidence += 0.1 + confidence = min(confidence, 0.8) + + reasons = [f"in {len(rooms_joined)} aligned rooms: {', '.join(rooms_joined[:3])}"] + if signals: + reasons.append(f"signals: {', '.join(signals[:3])}") + + return { + 'platform': 'matrix', + 'username': user_id, + 'url': f"https://matrix.to/#/{user_id}", + 'name': display_name, + 'score': total_score, + 'confidence': confidence, + 'signals': signals, + 'rooms': rooms_joined, + 'reasons': reasons, + 'scraped_at': datetime.now().isoformat(), + } + + +def scrape_matrix(db): + """ + matrix scrape - limited due to auth requirements + 
best effort on public room data
    """
    print("scoutd/matrix: starting scrape (limited - most apis require auth)...")

    user_rooms = defaultdict(list)

    # try to get public room directories
    # NOTE(review): this pass only prints matches — nothing from the public
    # directory feeds user_rooms; membership comes solely from the pass below
    for homeserver in HOMESERVERS:
        print(f"  checking {homeserver} public rooms...")
        rooms = get_public_rooms(homeserver, limit=50)

        for room in rooms:
            room_alias = room.get('canonical_alias', '')
            # check if it matches any aligned room patterns
            aligned_keywords = ['homeassistant', 'selfhosted', 'privacy', 'linux', 'foss', 'cooperative']
            if any(kw in room_alias.lower() or kw in room.get('name', '').lower() for kw in aligned_keywords):
                print(f"    found aligned room: {room_alias or room.get('name')}")

    # try to get members from aligned rooms (usually fails without auth)
    for room_alias in ALIGNED_ROOMS[:3]:  # limit attempts
        for homeserver in HOMESERVERS[:1]:  # just try matrix.org
            members = get_room_members(homeserver, room_alias)
            if members:
                print(f"  {room_alias}: {len(members)} members")
                for member in members:
                    user_rooms[member['user_id']].append(room_alias)

    # filter for multi-room users
    multi_room = {u: rooms for u, rooms in user_rooms.items() if len(rooms) >= 2}
    print(f"  {len(multi_room)} users in 2+ aligned rooms")

    # analyze
    results = []
    for user_id, rooms in multi_room.items():
        try:
            result = analyze_matrix_user(user_id, rooms)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)
        except Exception as e:
            print(f"  error: {e}")

    print(f"scoutd/matrix: found {len(results)} aligned humans (limited by auth)")
    return results
diff --git a/connectd/scoutd/reddit.py b/connectd/scoutd/reddit.py
new file mode 100644
index 0000000..723ff93
--- /dev/null
+++ b/connectd/scoutd/reddit.py
@@ -0,0 +1,503 @@
"""
scoutd/reddit.py - reddit discovery (DISCOVERY ONLY, NOT OUTREACH)

reddit is a SIGNAL SOURCE, not a contact channel.
flow:
1. scrape reddit for users active in target subs
2.
extract their reddit profile +3. look for links TO other platforms (github, mastodon, website, etc.) +4. add to scout database with reddit as signal source +5. reach out via their OTHER platforms, never reddit + +if reddit user has no external links: + - add to manual_queue with note "reddit-only, needs manual review" + +also detects lost builders - stuck in learnprogramming for years, imposter syndrome, etc. +""" + +import requests +import json +import time +import re +from datetime import datetime +from pathlib import Path +from collections import defaultdict + +from .signals import analyze_text, ALIGNED_SUBREDDITS, NEGATIVE_SUBREDDITS +from .lost import ( + analyze_reddit_for_lost_signals, + analyze_text_for_lost_signals, + classify_user, + get_signal_descriptions, + STUCK_SUBREDDITS, +) + +HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit' + +# patterns for extracting external platform links +PLATFORM_PATTERNS = { + 'github': [ + r'github\.com/([a-zA-Z0-9_-]+)', + r'gh:\s*@?([a-zA-Z0-9_-]+)', + ], + 'mastodon': [ + r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', + r'mastodon\.social/@([a-zA-Z0-9_]+)', + r'fosstodon\.org/@([a-zA-Z0-9_]+)', + r'hachyderm\.io/@([a-zA-Z0-9_]+)', + r'tech\.lgbt/@([a-zA-Z0-9_]+)', + ], + 'twitter': [ + r'twitter\.com/([a-zA-Z0-9_]+)', + r'x\.com/([a-zA-Z0-9_]+)', + r'(?:^|\s)@([a-zA-Z0-9_]{1,15})(?:\s|$)', # bare @handle + ], + 'bluesky': [ + r'bsky\.app/profile/([a-zA-Z0-9_.-]+)', + r'([a-zA-Z0-9_-]+)\.bsky\.social', + ], + 'website': [ + r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)', + ], + 'matrix': [ + r'@([a-zA-Z0-9_-]+):([a-zA-Z0-9.-]+)', + ], +} + + +def _api_get(url, params=None): + """rate-limited request""" + cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = 
json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_data') + except: + pass + + time.sleep(2) # reddit rate limit + + try: + resp = requests.get(url, headers=HEADERS, params=params, timeout=30) + resp.raise_for_status() + result = resp.json() + cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) + return result + except requests.exceptions.RequestException as e: + print(f" reddit api error: {e}") + return None + + +def extract_external_links(text): + """extract links to other platforms from text""" + links = {} + + if not text: + return links + + for platform, patterns in PLATFORM_PATTERNS.items(): + for pattern in patterns: + matches = re.findall(pattern, text, re.IGNORECASE) + if matches: + if platform == 'mastodon' and isinstance(matches[0], tuple): + # full fediverse handle + links[platform] = f"@{matches[0][0]}@{matches[0][1]}" + elif platform == 'matrix' and isinstance(matches[0], tuple): + links[platform] = f"@{matches[0][0]}:{matches[0][1]}" + elif platform == 'website': + # skip reddit/imgur/etc + for match in matches: + if not any(x in match.lower() for x in ['reddit', 'imgur', 'redd.it', 'i.redd']): + links[platform] = f"https://{match}" + break + else: + links[platform] = matches[0] + break + + return links + + +def get_user_profile(username): + """get user profile including bio/description""" + url = f'https://www.reddit.com/user/{username}/about.json' + data = _api_get(url) + + if not data or 'data' not in data: + return None + + profile = data['data'] + return { + 'username': username, + 'name': profile.get('name'), + 'bio': profile.get('subreddit', {}).get('public_description', ''), + 'title': profile.get('subreddit', {}).get('title', ''), + 'icon': profile.get('icon_img'), + 'created_utc': profile.get('created_utc'), + 'total_karma': profile.get('total_karma', 0), + 'link_karma': profile.get('link_karma', 0), + 'comment_karma': profile.get('comment_karma', 0), + } + 
def get_subreddit_users(subreddit, limit=100):
    """get recent posters/commenters from a subreddit (deleted accounts and
    AutoModerator excluded)"""
    users = set()

    # posts
    url = f'https://www.reddit.com/r/{subreddit}/new.json'
    data = _api_get(url, {'limit': limit})
    if data and 'data' in data:
        for post in data['data'].get('children', []):
            author = post['data'].get('author')
            if author and author not in ['[deleted]', 'AutoModerator']:
                users.add(author)

    # comments
    url = f'https://www.reddit.com/r/{subreddit}/comments.json'
    data = _api_get(url, {'limit': limit})
    if data and 'data' in data:
        for comment in data['data'].get('children', []):
            author = comment['data'].get('author')
            if author and author not in ['[deleted]', 'AutoModerator']:
                users.add(author)

    return users


def get_user_activity(username):
    """get user's posts and comments as a flat list of dicts
    (type/subreddit/title/body/score)"""
    activity = []

    # posts
    url = f'https://www.reddit.com/user/{username}/submitted.json'
    data = _api_get(url, {'limit': 100})
    if data and 'data' in data:
        for post in data['data'].get('children', []):
            activity.append({
                'type': 'post',
                'subreddit': post['data'].get('subreddit'),
                'title': post['data'].get('title', ''),
                'body': post['data'].get('selftext', ''),
                'score': post['data'].get('score', 0),
            })

    # comments
    url = f'https://www.reddit.com/user/{username}/comments.json'
    data = _api_get(url, {'limit': 100})
    if data and 'data' in data:
        for comment in data['data'].get('children', []):
            activity.append({
                'type': 'comment',
                'subreddit': comment['data'].get('subreddit'),
                'body': comment['data'].get('body', ''),
                'score': comment['data'].get('score', 0),
            })

    return activity


def analyze_reddit_user(username):
    """
    analyze a reddit user for alignment and extract external platform links.

    reddit is DISCOVERY ONLY - we find users here but contact them elsewhere.
+ """ + activity = get_user_activity(username) + if not activity: + return None + + # get profile for bio + profile = get_user_profile(username) + + # count subreddit activity + sub_activity = defaultdict(int) + text_parts = [] + total_karma = 0 + + for item in activity: + sub = item.get('subreddit', '').lower() + if sub: + sub_activity[sub] += 1 + if item.get('title'): + text_parts.append(item['title']) + if item.get('body'): + text_parts.append(item['body']) + total_karma += item.get('score', 0) + + full_text = ' '.join(text_parts) + text_score, positive_signals, negative_signals = analyze_text(full_text) + + # EXTRACT EXTERNAL LINKS - this is the key part + # check profile bio first + external_links = {} + if profile: + bio_text = f"{profile.get('bio', '')} {profile.get('title', '')}" + external_links.update(extract_external_links(bio_text)) + + # also scan posts/comments for links (people often share their github etc) + activity_links = extract_external_links(full_text) + for platform, link in activity_links.items(): + if platform not in external_links: + external_links[platform] = link + + # subreddit scoring + sub_score = 0 + aligned_subs = [] + for sub, count in sub_activity.items(): + weight = ALIGNED_SUBREDDITS.get(sub, 0) + if weight > 0: + sub_score += weight * min(count, 5) + aligned_subs.append(sub) + + # multi-sub bonus + if len(aligned_subs) >= 5: + sub_score += 30 + elif len(aligned_subs) >= 3: + sub_score += 15 + + # negative sub penalty + for sub in sub_activity: + if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]: + sub_score -= 50 + negative_signals.append(f"r/{sub}") + + total_score = text_score + sub_score + + # bonus if they have external links (we can actually contact them) + if external_links.get('github'): + total_score += 10 + positive_signals.append('has github') + if external_links.get('mastodon'): + total_score += 10 + positive_signals.append('has mastodon') + if external_links.get('website'): + total_score += 5 + 
positive_signals.append('has website') + + # === LOST BUILDER DETECTION === + # reddit is HIGH SIGNAL for lost builders - stuck in learnprogramming, + # imposter syndrome posts, "i wish i could" language, etc. + subreddits_list = list(sub_activity.keys()) + lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list) + + # also check full text for lost patterns (already done partially in analyze_reddit_for_lost_signals) + text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text) + for sig in text_lost_signals: + if sig not in lost_signals: + lost_signals.append(sig) + lost_weight += text_lost_weight + + lost_potential_score = lost_weight + + # classify: builder, lost, both, or none + # for reddit, builder_score is based on having external links + high karma + builder_activity = 0 + if external_links.get('github'): + builder_activity += 20 + if total_karma > 1000: + builder_activity += 15 + elif total_karma > 500: + builder_activity += 10 + + user_type = classify_user(lost_potential_score, builder_activity, total_score) + + # confidence + confidence = 0.3 + if len(activity) > 20: + confidence += 0.2 + if len(aligned_subs) >= 2: + confidence += 0.2 + if len(text_parts) > 10: + confidence += 0.2 + # higher confidence if we have contact methods + if external_links: + confidence += 0.1 + confidence = min(confidence, 0.95) + + reasons = [] + if aligned_subs: + reasons.append(f"active in: {', '.join(aligned_subs[:5])}") + if positive_signals: + reasons.append(f"signals: {', '.join(positive_signals[:5])}") + if negative_signals: + reasons.append(f"WARNING: {', '.join(negative_signals)}") + if external_links: + reasons.append(f"external: {', '.join(external_links.keys())}") + + # add lost reasons if applicable + if user_type == 'lost' or user_type == 'both': + lost_descriptions = get_signal_descriptions(lost_signals) + if lost_descriptions: + reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}") + + # determine if 
this is reddit-only (needs manual review)
    reddit_only = len(external_links) == 0
    if reddit_only:
        reasons.append("REDDIT-ONLY: needs manual review for outreach")

    return {
        'platform': 'reddit',
        'username': username,
        'url': f"https://reddit.com/u/{username}",
        'score': total_score,
        'confidence': confidence,
        'signals': positive_signals,
        'negative_signals': negative_signals,
        'subreddits': aligned_subs,
        'activity_count': len(activity),
        'karma': total_karma,
        'reasons': reasons,
        'scraped_at': datetime.now().isoformat(),
        # external platform links for outreach
        'external_links': external_links,
        'reddit_only': reddit_only,
        'extra': {
            'github': external_links.get('github'),
            'mastodon': external_links.get('mastodon'),
            'twitter': external_links.get('twitter'),
            'bluesky': external_links.get('bluesky'),
            'website': external_links.get('website'),
            'matrix': external_links.get('matrix'),
            'reddit_karma': total_karma,
            'reddit_activity': len(activity),
        },
        # lost builder fields
        'lost_potential_score': lost_potential_score,
        'lost_signals': lost_signals,
        'user_type': user_type,
    }


def scrape_reddit(db, limit_per_sub=50):
    """
    full reddit scrape - DISCOVERY ONLY

    finds aligned users, extracts external links for outreach.
    reddit-only users go to manual queue.
    """
    print("scoutd/reddit: starting scrape (discovery only, not outreach)...")

    # find users in multiple aligned subs
    user_subs = defaultdict(set)

    # aligned subs - active builders
    priority_subs = ['intentionalcommunity', 'cohousing', 'selfhosted',
                     'homeassistant', 'solarpunk', 'cooperatives', 'privacy',
                     'localllama', 'homelab', 'degoogle', 'pihole', 'unraid']

    # lost builder subs - people who need encouragement
    # these folks might be stuck, but they have aligned interests
    lost_subs = ['learnprogramming', 'findapath', 'getdisciplined',
                 'careerguidance', 'cscareerquestions', 'decidingtobebetter']

    # scrape both - we want to find lost builders with aligned interests
    all_subs = priority_subs + lost_subs

    for sub in all_subs:
        print(f"  scraping r/{sub}...")
        users = get_subreddit_users(sub, limit=limit_per_sub)
        for user in users:
            user_subs[user].add(sub)
        print(f"    found {len(users)} users")

    # filter for multi-sub users
    multi_sub = {u: subs for u, subs in user_subs.items() if len(subs) >= 2}
    print(f"  {len(multi_sub)} users in 2+ aligned subs")

    # analyze
    results = []
    reddit_only_count = 0
    external_link_count = 0
    builders_found = 0
    lost_found = 0

    for username in multi_sub:
        try:
            result = analyze_reddit_user(username)
            if result and result['score'] > 0:
                results.append(result)
                db.save_human(result)

                user_type = result.get('user_type', 'none')

                # track lost builders - reddit is high signal for these
                if user_type == 'lost':
                    lost_found += 1
                    lost_score = result.get('lost_potential_score', 0)
                    if lost_score >= 40:
                        print(f"  💔 u/{username}: lost_score={lost_score}, values={result['score']} pts")
                    # lost builders also go to manual queue if reddit-only
                    if result.get('reddit_only'):
                        _add_to_manual_queue(result)

                elif user_type == 'builder':
                    builders_found += 1

                elif user_type == 'both':
                    builders_found += 1
                    lost_found += 1
                    print(f"  ⚡ u/{username}: recovering builder")

                # track external links
                if result.get('reddit_only'):
                    reddit_only_count += 1
                    # add high-value users to manual queue for review
                    if result['score'] >= 50 and user_type != 'lost':  # lost already added above
                        _add_to_manual_queue(result)
                        print(f"  📋 u/{username}: {result['score']} pts (reddit-only → manual queue)")
                else:
                    external_link_count += 1
                    if result['score'] >= 50 and user_type == 'builder':
                        links = list(result.get('external_links', {}).keys())
                        print(f"  ★ u/{username}: {result['score']} pts → {', '.join(links)}")

        except Exception as e:
            print(f"  error on {username}: {e}")

    print(f"scoutd/reddit: found {len(results)} aligned humans")
    print(f"  - {builders_found} active builders")
    print(f"  - {lost_found} lost builders (need encouragement)")
    print(f"  - {external_link_count} with external links (reachable)")
    print(f"  - {reddit_only_count} reddit-only (manual queue)")
    return results


def _add_to_manual_queue(result):
    """add reddit-only user to manual queue for review

    appends to data/manual_queue.json, skipping users already queued
    """
    from pathlib import Path
    import json

    queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json'
    queue_file.parent.mkdir(parents=True, exist_ok=True)

    queue = []
    if queue_file.exists():
        try:
            queue = json.loads(queue_file.read_text())
        except:
            # corrupt queue file: start a fresh queue rather than crash
            pass

    # check if already in queue
    existing = [q for q in queue if q.get('username') == result['username'] and q.get('platform') == 'reddit']
    if existing:
        return

    queue.append({
        'platform': 'reddit',
        'username': result['username'],
        'url': result['url'],
        'score': result['score'],
        'subreddits': result.get('subreddits', []),
        'signals': result.get('signals', []),
        'reasons': result.get('reasons', []),
        'note': 'reddit-only user - no external links found.
DM manually if promising.',
        'queued_at': datetime.now().isoformat(),
        'status': 'pending',
    })

    queue_file.write_text(json.dumps(queue, indent=2))
diff --git a/connectd/scoutd/signals.py b/connectd/scoutd/signals.py
new file mode 100644
index 0000000..53c178c
--- /dev/null
+++ b/connectd/scoutd/signals.py
@@ -0,0 +1,158 @@
"""
shared signal patterns for all scrapers
"""

import re

# positive signals - what we're looking for
# each entry is (regex, signal name, points added to the alignment score)
POSITIVE_PATTERNS = [
    # values
    (r'\b(solarpunk|cyberpunk)\b', 'solarpunk', 10),
    (r'\b(anarchis[tm]|mutual.?aid)\b', 'mutual_aid', 10),
    (r'\b(cooperative|collective|worker.?owned?|coop|co.?op)\b', 'cooperative', 15),
    (r'\b(community|commons)\b', 'community', 5),
    (r'\b(intentional.?community|cohousing|commune)\b', 'intentional_community', 20),

    # queer-friendly
    (r'\b(queer|lgbtq?|trans|nonbinary|enby|genderqueer)\b', 'queer', 15),
    (r'\b(they/them|she/her|he/him|xe/xem|any.?pronouns)\b', 'pronouns', 10),
    (r'\bblm\b', 'blm', 5),
    (r'\b(acab|1312)\b', 'acab', 5),

    # tech values
    (r'\b(privacy|surveillance|anti.?surveillance)\b', 'privacy', 10),
    (r'\b(self.?host(?:ed|ing)?|homelab|home.?server)\b', 'selfhosted', 15),
    (r'\b(local.?first|offline.?first)\b', 'local_first', 15),
    (r'\b(decentralized?|federation|federated|fediverse)\b', 'decentralized', 10),
    (r'\b(foss|libre|open.?source|copyleft)\b', 'foss', 10),
    (r'\b(home.?assistant|home.?automation)\b', 'home_automation', 10),
    (r'\b(mesh|p2p|peer.?to.?peer)\b', 'p2p', 10),
    (r'\b(matrix|xmpp|irc)\b', 'federated_chat', 5),
    (r'\b(degoogle|de.?google)\b', 'degoogle', 10),

    # location/availability
    (r'\b(seattle|portland|pnw|cascadia|pacific.?northwest)\b', 'pnw', 20),
    (r'\b(washington|oregon)\b', 'pnw_state', 10),
    (r'\b(remote|anywhere|relocate|looking.?to.?move)\b', 'remote', 10),

    # anti-capitalism
    (r'\b(anti.?capitalis[tm]|post.?capitalis[tm]|degrowth)\b', 'anticapitalist', 10),

    # neurodivergent (often overlaps with our values)
    (r'\b(neurodivergent|adhd|autistic|autism)\b', 'neurodivergent', 5),

    # technical skills (bonus for builders)
    (r'\b(rust|go|python|typescript)\b', 'modern_lang', 3),
    (r'\b(linux|bsd|nixos)\b', 'unix', 3),
    (r'\b(kubernetes|docker|podman)\b', 'containers', 3),
]

# negative signals - red flags (points are negative)
NEGATIVE_PATTERNS = [
    (r'\b(qanon|maga|trump|wwg1wga)\b', 'maga', -50),
    (r'\b(covid.?hoax|plandemic|5g.?conspiracy)\b', 'conspiracy', -50),
    (r'\b(nwo|illuminati|deep.?state)\b', 'conspiracy', -30),
    (r'\b(anti.?vax|antivax)\b', 'antivax', -30),
    (r'\b(sovereign.?citizen)\b', 'sovcit', -40),
    (r'\b(crypto.?bro|web3|nft|blockchain|bitcoin|ethereum)\b', 'crypto', -15),
    (r'\b(conservative|republican)\b', 'conservative', -20),
    (r'\b(free.?speech.?absolutist)\b', 'freeze_peach', -20),
]

# target topics for repo discovery
TARGET_TOPICS = [
    'local-first', 'self-hosted', 'privacy', 'mesh-network',
    'cooperative', 'solarpunk', 'decentralized', 'p2p',
    'fediverse', 'activitypub', 'matrix-org', 'homeassistant',
    'esphome', 'open-source-hardware', 'right-to-repair',
    'mutual-aid', 'commons', 'degoogle', 'privacy-tools',
]

# ecosystem repos - high signal contributors
ECOSYSTEM_REPOS = [
    'home-assistant/core',
    'esphome/esphome',
    'matrix-org/synapse',
    'LemmyNet/lemmy',
    'mastodon/mastodon',
    'owncast/owncast',
    'nextcloud/server',
    'immich-app/immich',
    'jellyfin/jellyfin',
    'navidrome/navidrome',
    'paperless-ngx/paperless-ngx',
    'actualbudget/actual',
    'firefly-iii/firefly-iii',
    'logseq/logseq',
    'AppFlowy-IO/AppFlowy',
    'siyuan-note/siyuan',
    'anytype/anytype-ts',
    'calcom/cal.com',
    'plausible/analytics',
    'umami-software/umami',
]

# aligned subreddits -> score weight
# NOTE(review): reddit.py looks these up with lowercased subreddit names, so
# the mixed-case keys here ('PrivacyGuides', 'Frugal') never match — confirm
# and normalize to lowercase
ALIGNED_SUBREDDITS = {
    'intentionalcommunity': 25,
    'cohousing': 25,
    'cooperatives': 20,
    'solarpunk': 20,
    'selfhosted': 15,
    'homeassistant': 15,
    'homelab': 10,
    'privacy': 15,
    'PrivacyGuides': 15,
    'degoogle': 15,
    'anticonsumption': 10,
    'Frugal': 5,
'simpleliving': 5, + 'Seattle': 10, + 'Portland': 10, + 'cascadia': 15, + 'linux': 5, + 'opensource': 10, + 'FOSS': 10, +} + +# negative subreddits +NEGATIVE_SUBREDDITS = [ + 'conspiracy', 'conservative', 'walkaway', 'louderwithcrowder', + 'JordanPeterson', 'TimPool', 'NoNewNormal', 'LockdownSkepticism', +] + +# high-signal mastodon instances +ALIGNED_INSTANCES = { + 'tech.lgbt': 20, + 'social.coop': 25, + 'fosstodon.org': 10, + 'hackers.town': 15, + 'hachyderm.io': 10, + 'infosec.exchange': 5, +} + + +def analyze_text(text): + """ + analyze text for signals + returns: (score, signals_found, negative_signals) + """ + if not text: + return 0, [], [] + + text = text.lower() + score = 0 + signals = [] + negatives = [] + + for pattern, signal_name, points in POSITIVE_PATTERNS: + if re.search(pattern, text, re.IGNORECASE): + score += points + signals.append(signal_name) + + for pattern, signal_name, points in NEGATIVE_PATTERNS: + if re.search(pattern, text, re.IGNORECASE): + score += points # points are already negative + negatives.append(signal_name) + + return score, list(set(signals)), list(set(negatives)) diff --git a/connectd/scoutd/twitter.py b/connectd/scoutd/twitter.py new file mode 100644 index 0000000..90fd06f --- /dev/null +++ b/connectd/scoutd/twitter.py @@ -0,0 +1,255 @@ +""" +scoutd/twitter.py - twitter/x discovery via nitter instances + +scrapes nitter (twitter frontend) to find users posting about aligned topics +without needing twitter API access + +nitter instances rotate to avoid rate limits +""" + +import requests +import json +import time +import re +from datetime import datetime +from pathlib import Path +from bs4 import BeautifulSoup + +from .signals import analyze_text + +HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0'} +CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'twitter' + +# nitter instances (rotate through these) +NITTER_INSTANCES = [ + 'nitter.privacydev.net', + 
'nitter.poast.org', + 'nitter.woodland.cafe', + 'nitter.esmailelbob.xyz', +] + +# hashtags to search +ALIGNED_HASHTAGS = [ + 'selfhosted', 'homelab', 'homeassistant', 'foss', 'opensource', + 'privacy', 'solarpunk', 'cooperative', 'mutualaid', 'localfirst', + 'indieweb', 'smallweb', 'permacomputing', 'degrowth', 'techworkers', +] + +_current_instance_idx = 0 + + +def get_nitter_instance(): + """get current nitter instance, rotate on failure""" + global _current_instance_idx + return NITTER_INSTANCES[_current_instance_idx % len(NITTER_INSTANCES)] + + +def rotate_instance(): + """switch to next nitter instance""" + global _current_instance_idx + _current_instance_idx += 1 + + +def _scrape_page(url, retries=3): + """scrape a nitter page with instance rotation""" + for attempt in range(retries): + instance = get_nitter_instance() + full_url = url.replace('{instance}', instance) + + # check cache + cache_key = f"{full_url}" + cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" + CACHE_DIR.mkdir(parents=True, exist_ok=True) + + if cache_file.exists(): + try: + data = json.loads(cache_file.read_text()) + if time.time() - data.get('_cached_at', 0) < 3600: + return data.get('_html') + except: + pass + + time.sleep(2) # rate limit + + try: + resp = requests.get(full_url, headers=HEADERS, timeout=30) + if resp.status_code == 200: + cache_file.write_text(json.dumps({ + '_cached_at': time.time(), + '_html': resp.text + })) + return resp.text + elif resp.status_code in [429, 503]: + print(f" nitter {instance} rate limited, rotating...") + rotate_instance() + else: + print(f" nitter error: {resp.status_code}") + return None + except Exception as e: + print(f" nitter {instance} error: {e}") + rotate_instance() + + return None + + +def search_hashtag(hashtag): + """search for tweets with hashtag""" + url = f"https://{{instance}}/search?q=%23{hashtag}&f=tweets" + html = _scrape_page(url) + if not html: + return [] + + soup = BeautifulSoup(html, 'html.parser') + tweets = 
[] + + for tweet_div in soup.select('.timeline-item'): + try: + username_elem = tweet_div.select_one('.username') + content_elem = tweet_div.select_one('.tweet-content') + fullname_elem = tweet_div.select_one('.fullname') + + if username_elem and content_elem: + username = username_elem.text.strip().lstrip('@') + tweets.append({ + 'username': username, + 'name': fullname_elem.text.strip() if fullname_elem else username, + 'content': content_elem.text.strip(), + }) + except Exception as e: + continue + + return tweets + + +def get_user_profile(username): + """get user profile from nitter""" + url = f"https://{{instance}}/{username}" + html = _scrape_page(url) + if not html: + return None + + soup = BeautifulSoup(html, 'html.parser') + + try: + bio_elem = soup.select_one('.profile-bio') + bio = bio_elem.text.strip() if bio_elem else '' + + location_elem = soup.select_one('.profile-location') + location = location_elem.text.strip() if location_elem else '' + + website_elem = soup.select_one('.profile-website a') + website = website_elem.get('href') if website_elem else '' + + # get recent tweets for more signal + tweets = [] + for tweet_div in soup.select('.timeline-item')[:10]: + content_elem = tweet_div.select_one('.tweet-content') + if content_elem: + tweets.append(content_elem.text.strip()) + + return { + 'username': username, + 'bio': bio, + 'location': location, + 'website': website, + 'recent_tweets': tweets, + } + except Exception as e: + print(f" error parsing {username}: {e}") + return None + + +def analyze_twitter_user(username, profile=None): + """analyze a twitter user for alignment""" + if not profile: + profile = get_user_profile(username) + + if not profile: + return None + + # collect text + text_parts = [profile.get('bio', '')] + text_parts.extend(profile.get('recent_tweets', [])) + + full_text = ' '.join(text_parts) + text_score, positive_signals, negative_signals = analyze_text(full_text) + + # twitter is noisy, lower base confidence + confidence = 
0.25 + if len(positive_signals) >= 3: + confidence += 0.2 + if profile.get('website'): + confidence += 0.1 + if len(profile.get('recent_tweets', [])) >= 5: + confidence += 0.1 + confidence = min(confidence, 0.7) # cap lower for twitter + + reasons = [] + if positive_signals: + reasons.append(f"signals: {', '.join(positive_signals[:5])}") + if negative_signals: + reasons.append(f"WARNING: {', '.join(negative_signals)}") + + return { + 'platform': 'twitter', + 'username': username, + 'url': f"https://twitter.com/{username}", + 'name': profile.get('name', username), + 'bio': profile.get('bio'), + 'location': profile.get('location'), + 'score': text_score, + 'confidence': confidence, + 'signals': positive_signals, + 'negative_signals': negative_signals, + 'reasons': reasons, + 'contact': { + 'twitter': username, + 'website': profile.get('website'), + }, + 'scraped_at': datetime.now().isoformat(), + } + + +def scrape_twitter(db, limit_per_hashtag=50): + """full twitter scrape via nitter""" + print("scoutd/twitter: starting scrape via nitter...") + + all_users = {} + + for hashtag in ALIGNED_HASHTAGS: + print(f" #{hashtag}...") + tweets = search_hashtag(hashtag) + + for tweet in tweets[:limit_per_hashtag]: + username = tweet.get('username') + if username and username not in all_users: + all_users[username] = { + 'username': username, + 'name': tweet.get('name'), + 'hashtags': [hashtag], + } + elif username: + all_users[username]['hashtags'].append(hashtag) + + print(f" found {len(tweets)} tweets") + + # prioritize users in multiple hashtags + multi_hashtag = {u: d for u, d in all_users.items() if len(d.get('hashtags', [])) >= 2} + print(f" {len(multi_hashtag)} users in 2+ aligned hashtags") + + # analyze + results = [] + for username, data in list(multi_hashtag.items())[:100]: # limit to prevent rate limits + try: + result = analyze_twitter_user(username) + if result and result['score'] > 0: + results.append(result) + db.save_human(result) + + if result['score'] >= 30: + 
            print(f" ★ @{username}: {result['score']} pts")
        except Exception as e:
            # best-effort scrape: log and move on to the next candidate
            print(f" error on {username}: {e}")

    print(f"scoutd/twitter: found {len(results)} aligned humans")
    return results
diff --git a/connectd/setup_user.py b/connectd/setup_user.py
new file mode 100644
index 0000000..b1162fa
--- /dev/null
+++ b/connectd/setup_user.py
@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""
setup priority user - add yourself to get matches

usage:
    python setup_user.py            # interactive setup
    python setup_user.py --show     # show your profile
    python setup_user.py --matches  # show your matches
"""

import argparse
import json
from db import Database
from db.users import (init_users_table, add_priority_user, get_priority_users,
                      get_priority_user_matches)


def interactive_setup(db):
    """interactive priority user setup: prompt on stdin, save via add_priority_user"""
    print("=" * 60)
    print("connectd priority user setup")
    print("=" * 60)
    print("\nlink your profiles so connectd can find matches for YOU\n")

    # NOTE(review): name/email are prompted as required but not validated;
    # empty input is stored as-is. the optional fields fall back to None.
    name = input("name: ").strip()
    email = input("email (for notifications): ").strip()
    github = input("github username (optional): ").strip() or None
    reddit = input("reddit username (optional): ").strip() or None
    mastodon = input("mastodon handle e.g. user@instance (optional): ").strip() or None
    lobsters = input("lobste.rs username (optional): ").strip() or None
    matrix = input("matrix id e.g. @user:matrix.org (optional): ").strip() or None
    location = input("location (e.g. seattle, remote): ").strip() or None

    print("\nwhat are you interested in? 
(comma separated)") + print("examples: self-hosting, cooperatives, solarpunk, home automation") + interests_raw = input("interests: ").strip() + interests = [i.strip() for i in interests_raw.split(',')] if interests_raw else [] + + print("\nwhat kind of people are you looking to connect with?") + looking_for = input("looking for: ").strip() or None + + user_data = { + 'name': name, + 'email': email, + 'github': github, + 'reddit': reddit, + 'mastodon': mastodon, + 'lobsters': lobsters, + 'matrix': matrix, + 'location': location, + 'interests': interests, + 'looking_for': looking_for, + } + + user_id = add_priority_user(db.conn, user_data) + print(f"\n✓ added as priority user #{user_id}") + print("connectd will now find matches for you") + + +def show_profile(db): + """show current priority user profile""" + users = get_priority_users(db.conn) + + if not users: + print("no priority users configured") + print("run: python setup_user.py") + return + + for user in users: + print("=" * 60) + print(f"priority user #{user['id']}: {user['name']}") + print("=" * 60) + print(f"email: {user['email']}") + if user['github']: + print(f"github: {user['github']}") + if user['reddit']: + print(f"reddit: {user['reddit']}") + if user['mastodon']: + print(f"mastodon: {user['mastodon']}") + if user['lobsters']: + print(f"lobsters: {user['lobsters']}") + if user['matrix']: + print(f"matrix: {user['matrix']}") + if user['location']: + print(f"location: {user['location']}") + if user['interests']: + interests = json.loads(user['interests']) if isinstance(user['interests'], str) else user['interests'] + print(f"interests: {', '.join(interests)}") + if user['looking_for']: + print(f"looking for: {user['looking_for']}") + + +def show_matches(db): + """show matches for priority user""" + users = get_priority_users(db.conn) + + if not users: + print("no priority users configured") + return + + for user in users: + print(f"\n=== matches for {user['name']} ===\n") + + matches = 
get_priority_user_matches(db.conn, user['id'], limit=20) + + if not matches: + print("no matches yet - run the daemon to discover people") + continue + + for i, match in enumerate(matches, 1): + print(f"{i}. {match['username']} ({match['platform']})") + print(f" score: {match['overlap_score']:.0f}") + print(f" url: {match['url']}") + + reasons = match.get('overlap_reasons', '[]') + if isinstance(reasons, str): + reasons = json.loads(reasons) + if reasons: + print(f" why: {reasons[0] if reasons else ''}") + print() + + +def main(): + parser = argparse.ArgumentParser(description='setup priority user') + parser.add_argument('--show', action='store_true', help='show your profile') + parser.add_argument('--matches', action='store_true', help='show your matches') + args = parser.parse_args() + + db = Database() + init_users_table(db.conn) + + if args.show: + show_profile(db) + elif args.matches: + show_matches(db) + else: + interactive_setup(db) + + db.close() + + +if __name__ == '__main__': + main() diff --git a/repository.json b/repository.json new file mode 100644 index 0000000..3adf76b --- /dev/null +++ b/repository.json @@ -0,0 +1,5 @@ +{ + "name": "connectd add-ons", + "url": "https://github.com/sudoxnym/ha-addons", + "maintainer": "sudoxnym" +}