add forge support, central coordination, lost builder detection

- central API client for distributed instance coordination
- forge scraper: gitea, forgejo, gogs, gitlab, sourcehut, codeberg
- forge issue delivery as outreach method
- usage-based contact method ranking with fallback chain
- lost builder detection and targeted outreach
- reddit and lobsters handle discovery
- deep scrape for handle/email discovery from profiles
This commit is contained in:
root 2025-12-16 21:30:05 +00:00
parent 99946bfef5
commit f33409ceda
15 changed files with 2102 additions and 837 deletions

70
api.py
View file

@ -116,6 +116,7 @@ DASHBOARD_HTML = """<!DOCTYPE html>
<div id="queue" class="pnl"></div> <div id="queue" class="pnl"></div>
<div id="sent" class="pnl"></div> <div id="sent" class="pnl"></div>
<div id="failed" class="pnl"></div> <div id="failed" class="pnl"></div>
<div id="lost" class="pnl"></div>
<script> <script>
var currentTab = 'host'; var currentTab = 'host';
@ -130,7 +131,8 @@ function initTabs() {
{id: 'host', label: 'you'}, {id: 'host', label: 'you'},
{id: 'queue', label: 'queue'}, {id: 'queue', label: 'queue'},
{id: 'sent', label: 'sent'}, {id: 'sent', label: 'sent'},
{id: 'failed', label: 'failed'} {id: 'failed', label: 'failed'},
{id: 'lost', label: 'lost builders'}
]; ];
tabs.forEach(function(t) { tabs.forEach(function(t) {
@ -319,6 +321,31 @@ async function loadFailed() {
$('failed').innerHTML = html; $('failed').innerHTML = html;
} }
async function loadLost() {
    // fetch lost-builder matches from the API and render them into the "lost" panel
    var res = await fetch("/api/lost_builders");
    var data = await res.json();
    // escape server-supplied values before inserting into innerHTML;
    // usernames come from scraped third-party profiles and must not be
    // trusted as HTML (XSS guard)
    function esc(v) {
        return String(v == null ? "" : v)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;");
    }
    var html = "<h2>lost builders (" + (data.total || 0) + ")</h2>";
    html += "<p style=\"color:#c792ea;font-size:0.8em;margin-bottom:10px\">people who need to see that someone like them made it</p>";
    var matches = data.matches || [];
    if (matches.length === 0) {
        html += "<div class=\"meta\">no lost builders found</div>";
    }
    for (var i = 0; i < matches.length; i++) {
        var m = matches[i];
        html += "<div class=\"card\">";
        html += "<div class=\"card-hdr\"><span class=\"to\">LOST: " + esc(m.lost_user) + "</span><span class=\"score\">" + esc(m.match_score) + "</span></div>";
        html += "<div class=\"meta\">lost: " + esc(m.lost_score) + " | values: " + esc(m.values_score) + "</div>";
        html += "<div class=\"meta\" style=\"color:#0f8\">BUILDER: " + esc(m.builder) + " (" + esc(m.builder_platform) + ")</div>";
        html += "<div class=\"meta\">score: " + esc(m.builder_score) + " | repos: " + esc(m.builder_repos) + " | stars: " + esc(m.builder_stars) + "</div>";
        html += "<div class=\"meta\">shared: " + esc((m.shared || []).join(", ")) + "</div>";
        html += "</div>";
    }
    $("lost").innerHTML = html;
}
function load() { function load() {
loadStats(); loadStats();
@ -326,6 +353,7 @@ function load() {
loadQueue(); loadQueue();
loadSent(); loadSent();
loadFailed(); loadFailed();
loadLost();
} }
document.addEventListener('click', function(e) { document.addEventListener('click', function(e) {
@ -438,6 +466,8 @@ class APIHandler(BaseHTTPRequestHandler):
self._handle_top_humans() self._handle_top_humans()
elif path == '/api/user': elif path == '/api/user':
self._handle_user() self._handle_user()
elif path == '/api/lost_builders':
self._handle_lost_builders()
else: else:
self._send_json({'error': 'not found'}, 404) self._send_json({'error': 'not found'}, 404)
def _handle_favicon(self): def _handle_favicon(self):
@ -1171,6 +1201,44 @@ class APIHandler(BaseHTTPRequestHandler):
self._send_json({'error': str(e)}, 500) self._send_json({'error': str(e)}, 500)
def _handle_lost_builders(self):
    """Return lost builders paired with their inspiring matches as JSON.

    Response shape: {'total': int, 'error': str | None, 'matches': [...]},
    with each match flattened into lost_* / builder_* keys for the dashboard.
    Sends a 500 with {'error': ...} on any failure.
    """
    try:
        from matchd.lost import find_matches_for_lost_builders
        db = Database()
        try:
            matches, error = find_matches_for_lost_builders(
                db, min_lost_score=30, min_values_score=15, limit=50)
        finally:
            # close even if matching raises, so handler errors don't leak connections
            db.close()
        result = {
            'total': len(matches) if matches else 0,
            'error': error,
            'matches': []
        }
        for m in (matches or []):
            lost = m.get('lost_user', {})
            builder = m.get('inspiring_builder', {})
            result['matches'].append({
                'lost_user': lost.get('username'),
                'lost_platform': lost.get('platform'),
                'lost_score': lost.get('lost_potential_score', 0),
                'values_score': lost.get('score', 0),
                'builder': builder.get('username'),
                'builder_platform': builder.get('platform'),
                'builder_score': builder.get('score', 0),
                'builder_repos': m.get('builder_repos', 0),
                'builder_stars': m.get('builder_stars', 0),
                'match_score': m.get('match_score', 0),
                # cap shared interests shown on the dashboard card
                'shared': m.get('shared_interests', [])[:5],
            })
        self._send_json(result)
    except Exception as e:
        self._send_json({'error': str(e)}, 500)
def run_api_server(): def run_api_server():
"""run the API server in a thread""" """run the API server in a thread"""
server = HTTPServer(('0.0.0.0', API_PORT), APIHandler) server = HTTPServer(('0.0.0.0', API_PORT), APIHandler)

183
central_client.py Normal file
View file

@ -0,0 +1,183 @@
"""
connectd/central_client.py - client for connectd-central API
provides similar interface to local Database class but uses remote API.
allows distributed instances to share data and coordinate outreach.
"""
import os
import json
import requests
from typing import Optional, List, Dict, Any, Tuple
from datetime import datetime
CENTRAL_API = os.environ.get('CONNECTD_CENTRAL_API', '')
API_KEY = os.environ.get('CONNECTD_API_KEY', '')
INSTANCE_ID = os.environ.get('CONNECTD_INSTANCE_ID', 'default')
class CentralClient:
    """HTTP client for the connectd-central coordination API.

    Mirrors the local Database interface so daemon code can share data and
    coordinate outreach across distributed instances. All request helpers
    raise requests.HTTPError on non-2xx responses unless noted otherwise.
    """

    # seconds before any central API call is abandoned instead of hanging
    # the daemon loop forever
    REQUEST_TIMEOUT = 30

    def __init__(self, api_url: str = None, api_key: str = None, instance_id: str = None):
        self.api_url = api_url or CENTRAL_API
        self.api_key = api_key or API_KEY
        self.instance_id = instance_id or INSTANCE_ID
        self.headers = {
            'X-API-Key': self.api_key,
            'Content-Type': 'application/json'
        }
        if not self.api_key:
            raise ValueError('CONNECTD_API_KEY environment variable required')

    def _get(self, endpoint: str, params: dict = None) -> dict:
        """GET an endpoint and return the decoded JSON body."""
        resp = requests.get(f'{self.api_url}{endpoint}', headers=self.headers,
                            params=params, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp.json()

    def _post(self, endpoint: str, data: dict) -> dict:
        """POST a JSON body to an endpoint and return the decoded response."""
        resp = requests.post(f'{self.api_url}{endpoint}', headers=self.headers,
                             json=data, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp.json()

    # === HUMANS ===
    def get_human(self, human_id: int) -> Optional[dict]:
        """Fetch a single human by id, or None if missing/unreachable."""
        try:
            return self._get(f'/humans/{human_id}')
        except (requests.RequestException, ValueError):
            # narrow catch: network/HTTP/JSON failures only, not KeyboardInterrupt
            return None

    def get_humans(self, platform: str = None, user_type: str = None,
                   min_score: float = 0, limit: int = 100, offset: int = 0) -> List[dict]:
        """List humans filtered by platform/type/score with pagination."""
        params = {'min_score': min_score, 'limit': limit, 'offset': offset}
        if platform:
            params['platform'] = platform
        if user_type:
            params['user_type'] = user_type
        result = self._get('/humans', params)
        return result.get('humans', [])

    def get_all_humans(self, min_score: float = 0, limit: int = 100000) -> List[dict]:
        """get all humans (for matching)"""
        return self.get_humans(min_score=min_score, limit=limit)

    def get_lost_builders(self, min_score: float = 30, limit: int = 100) -> List[dict]:
        """get lost builders for outreach"""
        return self.get_humans(user_type='lost', min_score=min_score, limit=limit)

    def get_builders(self, min_score: float = 50, limit: int = 100) -> List[dict]:
        """get active builders"""
        return self.get_humans(user_type='builder', min_score=min_score, limit=limit)

    def upsert_human(self, human: dict) -> int:
        """create or update human, returns id"""
        result = self._post('/humans', human)
        return result.get('id')

    def upsert_humans_bulk(self, humans: List[dict]) -> Tuple[int, int]:
        """bulk upsert humans, returns (created, updated)"""
        result = self._post('/humans/bulk', humans)
        return result.get('created', 0), result.get('updated', 0)

    # === MATCHES ===
    def get_matches(self, min_score: float = 0, limit: int = 100, offset: int = 0) -> List[dict]:
        """List matches above a score threshold with pagination."""
        params = {'min_score': min_score, 'limit': limit, 'offset': offset}
        result = self._get('/matches', params)
        return result.get('matches', [])

    def create_match(self, human_a_id: int, human_b_id: int,
                     overlap_score: float, overlap_reasons: str = None) -> int:
        """create match, returns id"""
        result = self._post('/matches', {
            'human_a_id': human_a_id,
            'human_b_id': human_b_id,
            'overlap_score': overlap_score,
            'overlap_reasons': overlap_reasons
        })
        return result.get('id')

    def create_matches_bulk(self, matches: List[dict]) -> int:
        """bulk create matches, returns count"""
        result = self._post('/matches/bulk', matches)
        return result.get('created', 0)

    # === OUTREACH COORDINATION ===
    def get_pending_outreach(self, outreach_type: str = None, limit: int = 50) -> List[dict]:
        """get pending outreach that hasn't been claimed"""
        params = {'limit': limit}
        if outreach_type:
            params['outreach_type'] = outreach_type
        result = self._get('/outreach/pending', params)
        return result.get('pending', [])

    def claim_outreach(self, human_id: int, match_id: int = None,
                       outreach_type: str = 'intro') -> Optional[int]:
        """claim outreach for a human, returns outreach_id or None if already claimed"""
        try:
            result = self._post('/outreach/claim', {
                'human_id': human_id,
                'match_id': match_id,
                'outreach_type': outreach_type
            })
            return result.get('outreach_id')
        except requests.exceptions.HTTPError as e:
            # 409 is the server's "another instance claimed this" signal
            if e.response.status_code == 409:
                return None
            raise

    def complete_outreach(self, outreach_id: int, status: str,
                          sent_via: str = None, draft: str = None, error: str = None):
        """mark outreach as complete"""
        self._post('/outreach/complete', {
            'outreach_id': outreach_id,
            'status': status,
            'sent_via': sent_via,
            'draft': draft,
            'error': error
        })

    def get_outreach_history(self, status: str = None, limit: int = 100) -> List[dict]:
        """List past outreach attempts, optionally filtered by status."""
        params = {'limit': limit}
        if status:
            params['status'] = status
        result = self._get('/outreach/history', params)
        return result.get('history', [])

    def already_contacted(self, human_id: int) -> bool:
        """check if human has been contacted"""
        # NOTE(review): fetches up to 10000 rows per call — fine for now,
        # but a server-side per-human filter would be cheaper at scale
        history = self._get('/outreach/history', {'limit': 10000})
        sent = history.get('history', [])
        return any(h['human_id'] == human_id and h['status'] == 'sent' for h in sent)

    # === STATS ===
    def get_stats(self) -> dict:
        """Fetch aggregate stats from central."""
        return self._get('/stats')

    # === INSTANCE MANAGEMENT ===
    def register_instance(self, name: str, host: str):
        """register this instance with central"""
        from urllib.parse import urlencode
        # URL-encode name/host; raw interpolation broke on spaces or '&'
        self._post(f'/instances/register?{urlencode({"name": name, "host": host})}', {})

    def get_instances(self) -> List[dict]:
        """List all registered instances."""
        result = self._get('/instances')
        return result.get('instances', [])

    # === HEALTH ===
    def health_check(self) -> bool:
        """True if central answers /health with status ok."""
        try:
            result = self._get('/health')
            return result.get('status') == 'ok'
        except (requests.RequestException, ValueError):
            return False
# convenience function
def get_client() -> CentralClient:
    """Build a CentralClient from environment variables.

    Raises ValueError if CONNECTD_API_KEY is not set.
    """
    return CentralClient()

View file

@ -22,7 +22,7 @@ CACHE_DIR.mkdir(exist_ok=True)
SCOUT_INTERVAL = 3600 * 4 # full scout every 4 hours SCOUT_INTERVAL = 3600 * 4 # full scout every 4 hours
MATCH_INTERVAL = 3600 # check matches every hour MATCH_INTERVAL = 3600 # check matches every hour
INTRO_INTERVAL = 1800 # send intros every 30 minutes INTRO_INTERVAL = 1800 # send intros every 30 minutes
MAX_INTROS_PER_DAY = 250 # rate limit builder-to-builder outreach MAX_INTROS_PER_DAY = 1000 # rate limit builder-to-builder outreach
# === MATCHING CONFIG === # === MATCHING CONFIG ===
@ -42,7 +42,7 @@ LOST_CONFIG = {
# outreach settings # outreach settings
'enabled': True, 'enabled': True,
'max_per_day': 20, # lower volume, higher care 'max_per_day': 100, # lower volume, higher care
'require_review': False, # fully autonomous 'require_review': False, # fully autonomous
'cooldown_days': 90, # don't spam struggling people 'cooldown_days': 90, # don't spam struggling people
@ -70,6 +70,47 @@ GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions'
GROQ_MODEL = os.environ.get('GROQ_MODEL', 'llama-3.3-70b-versatile') GROQ_MODEL = os.environ.get('GROQ_MODEL', 'llama-3.3-70b-versatile')
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
# === FORGE TOKENS ===
# for creating issues on self-hosted git forges
# each forge needs its own token from that instance
#
# CODEBERG: Settings -> Applications -> Generate Token (repo:write scope)
# GITEA/FORGEJO: Settings -> Applications -> Generate Token
# GITLAB: Settings -> Access Tokens -> Personal Access Token (api scope)
# SOURCEHUT: Settings -> Personal Access Tokens (uses email instead)
CODEBERG_TOKEN = os.environ.get('CODEBERG_TOKEN', '')
GITEA_TOKENS = {} # instance_url -> token, loaded from env
GITLAB_TOKENS = {} # instance_url -> token, loaded from env
# parse GITEA_TOKENS from env
# format: GITEA_TOKEN_192_168_1_8_3259=token -> http://192.168.1.8:3259
# format: GITEA_TOKEN_codeberg_org=token -> https://codeberg.org
def _parse_instance_url(env_key, prefix):
"""convert env key to instance URL"""
raw = env_key.replace(prefix, '')
parts = raw.split('_')
# check if last part is a port number
if parts[-1].isdigit() and len(parts[-1]) <= 5:
port = parts[-1]
host = '.'.join(parts[:-1])
# local IPs use http
if host.startswith('192.168.') or host.startswith('10.') or host == 'localhost':
return f'http://{host}:{port}'
return f'https://{host}:{port}'
else:
host = '.'.join(parts)
return f'https://{host}'
# load per-instance forge tokens from the environment into their stores;
# GITEA_TOKEN_* is checked before GITLAB_TOKEN_*, mirroring the if/elif order
_TOKEN_STORES = (('GITEA_TOKEN_', GITEA_TOKENS), ('GITLAB_TOKEN_', GITLAB_TOKENS))
for _env_key, _env_value in os.environ.items():
    for _prefix, _store in _TOKEN_STORES:
        if _env_key.startswith(_prefix):
            _store[_parse_instance_url(_env_key, _prefix)] = _env_value
            break
MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '') MASTODON_TOKEN = os.environ.get('MASTODON_TOKEN', '')
MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '') MASTODON_INSTANCE = os.environ.get('MASTODON_INSTANCE', '')

150
daemon.py
View file

@ -12,6 +12,7 @@ runs continuously, respects rate limits, sends intros automatically
import time import time
import json import json
import signal import signal
import os
import sys import sys
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pathlib import Path from pathlib import Path
@ -20,13 +21,14 @@ from db import Database
from db.users import (init_users_table, get_priority_users, save_priority_match, from db.users import (init_users_table, get_priority_users, save_priority_match,
get_priority_user_matches, discover_host_user) get_priority_user_matches, discover_host_user)
from scoutd import scrape_github, scrape_reddit, scrape_mastodon, scrape_lobsters, scrape_lemmy, scrape_discord from scoutd import scrape_github, scrape_reddit, scrape_mastodon, scrape_lobsters, scrape_lemmy, scrape_discord
from config import HOST_USER, INTRO_INTERVAL, MAX_INTROS_PER_DAY, SCOUT_INTERVAL, MATCH_INTERVAL from scoutd.forges import scrape_all_forges
from config import HOST_USER
from scoutd.github import analyze_github_user, get_github_user from scoutd.github import analyze_github_user, get_github_user
from scoutd.signals import analyze_text from scoutd.signals import analyze_text
from matchd.fingerprint import generate_fingerprint, fingerprint_similarity from matchd.fingerprint import generate_fingerprint, fingerprint_similarity
from matchd.overlap import find_overlap from matchd.overlap import find_overlap
from matchd.lost import find_matches_for_lost_builders from matchd.lost import find_matches_for_lost_builders
from introd.groq_draft import draft_intro_with_llm as draft_intro from introd.draft import draft_intro, summarize_human, summarize_overlap
from introd.lost_intro import draft_lost_intro, get_lost_intro_config from introd.lost_intro import draft_lost_intro, get_lost_intro_config
from introd.send import send_email from introd.send import send_email
from introd.deliver import deliver_intro, determine_best_contact from introd.deliver import deliver_intro, determine_best_contact
@ -34,7 +36,19 @@ from config import get_lost_config
from api import start_api_thread, update_daemon_state from api import start_api_thread, update_daemon_state
# daemon config # daemon config
SCOUT_INTERVAL = 3600 * 4 # full scout every 4 hours
MATCH_INTERVAL = 3600 # check matches every hour
INTRO_INTERVAL = 3600 * 2 # send intros every 2 hours
LOST_INTERVAL = 3600 * 6 # lost builder outreach every 6 hours (lower volume) LOST_INTERVAL = 3600 * 6 # lost builder outreach every 6 hours (lower volume)
from config import MAX_INTROS_PER_DAY
# central coordination (optional - for distributed instances)
try:
from central_client import CentralClient
CENTRAL_ENABLED = bool(os.environ.get('CONNECTD_API_KEY'))
except ImportError:
CENTRAL_ENABLED = False
CentralClient = None # from config.py
MIN_OVERLAP_PRIORITY = 30 # min score for priority user matches MIN_OVERLAP_PRIORITY = 30 # min score for priority user matches
MIN_OVERLAP_STRANGERS = 50 # higher bar for stranger intros MIN_OVERLAP_STRANGERS = 50 # higher bar for stranger intros
@ -43,6 +57,9 @@ class ConnectDaemon:
def __init__(self, dry_run=False): def __init__(self, dry_run=False):
self.db = Database() self.db = Database()
init_users_table(self.db.conn) init_users_table(self.db.conn)
purged = self.db.purge_disqualified()
if any(purged.values()):
self.log(f"purged disqualified: {purged}")
self.running = True self.running = True
self.dry_run = dry_run self.dry_run = dry_run
self.started_at = datetime.now() self.started_at = datetime.now()
@ -52,6 +69,18 @@ class ConnectDaemon:
self.last_lost = None self.last_lost = None
self.intros_today = 0 self.intros_today = 0
self.lost_intros_today = 0 self.lost_intros_today = 0
# central coordination
self.central = None
if CENTRAL_ENABLED:
try:
self.central = CentralClient()
instance_id = os.environ.get('CONNECTD_INSTANCE_ID', 'unknown')
self.central.register_instance(instance_id, os.environ.get('CONNECTD_INSTANCE_IP', 'unknown'))
self.log(f"connected to central API as {instance_id}")
except Exception as e:
self.log(f"central API unavailable: {e}")
self.central = None
self.today = datetime.now().date() self.today = datetime.now().date()
# handle shutdown gracefully # handle shutdown gracefully
@ -108,6 +137,18 @@ class ConnectDaemon:
self.today = datetime.now().date() self.today = datetime.now().date()
self.intros_today = 0 self.intros_today = 0
self.lost_intros_today = 0 self.lost_intros_today = 0
# central coordination
self.central = None
if CENTRAL_ENABLED:
try:
self.central = CentralClient()
instance_id = os.environ.get('CONNECTD_INSTANCE_ID', 'unknown')
self.central.register_instance(instance_id, os.environ.get('CONNECTD_INSTANCE_IP', 'unknown'))
self.log(f"connected to central API as {instance_id}")
except Exception as e:
self.log(f"central API unavailable: {e}")
self.central = None
self.log("reset daily intro limits") self.log("reset daily intro limits")
def scout_cycle(self): def scout_cycle(self):
@ -126,6 +167,16 @@ class ConnectDaemon:
try: try:
scrape_mastodon(self.db, limit_per_instance=30) scrape_mastodon(self.db, limit_per_instance=30)
# scrape self-hosted git forges (highest signal)
self.log("scraping self-hosted git forges...")
try:
forge_humans = scrape_all_forges(limit_per_instance=30)
for h in forge_humans:
self.db.upsert_human(h)
self.log(f" forges: {len(forge_humans)} humans")
except Exception as e:
self.log(f" forge scrape error: {e}")
except Exception as e: except Exception as e:
self.log(f"mastodon scout error: {e}") self.log(f"mastodon scout error: {e}")
@ -157,7 +208,7 @@ class ConnectDaemon:
self.log(f"matching for {len(priority_users)} priority users...") self.log(f"matching for {len(priority_users)} priority users...")
humans = self.db.get_all_humans(min_score=20, limit=500) humans = self.db.get_all_humans(min_score=20)
for puser in priority_users: for puser in priority_users:
# build priority user's fingerprint from their linked profiles # build priority user's fingerprint from their linked profiles
@ -230,7 +281,7 @@ class ConnectDaemon:
"""find matches between discovered humans (altruistic)""" """find matches between discovered humans (altruistic)"""
self.log("matching strangers...") self.log("matching strangers...")
humans = self.db.get_all_humans(min_score=40, limit=200) humans = self.db.get_all_humans(min_score=40)
if len(humans) < 2: if len(humans) < 2:
return return
@ -256,7 +307,7 @@ class ConnectDaemon:
overlap = find_overlap(human_a, human_b, fp_a, fp_b) overlap = find_overlap(human_a, human_b, fp_a, fp_b)
if overlap['overlap_score'] >= MIN_OVERLAP_STRANGERS: if overlap and overlap["overlap_score"] >= MIN_OVERLAP_STRANGERS:
# save match # save match
self.db.save_match(human_a['id'], human_b['id'], overlap) self.db.save_match(human_a['id'], human_b['id'], overlap)
matches_found += 1 matches_found += 1
@ -266,6 +317,37 @@ class ConnectDaemon:
self.last_match = datetime.now() self.last_match = datetime.now()
def claim_from_central(self, human_id, match_id=None, outreach_type='intro'):
    """claim outreach from central - returns outreach_id or None if already claimed"""
    # sentinel -1 means "no central coordination, proceed locally"
    if not self.central:
        return -1
    try:
        return self.central.claim_outreach(human_id, match_id, outreach_type)
    except Exception as exc:
        self.log(f"central claim error: {exc}")
        # fail open: an unreachable central must not block local outreach
        return -1
def complete_on_central(self, outreach_id, status,
                        sent_via=None, draft=None, error=None):
    """mark outreach complete on central"""
    # nothing to report in local mode or for locally-claimed (-1) outreach
    if not self.central or outreach_id == -1:
        return
    try:
        self.central.complete_outreach(outreach_id, status, sent_via, draft, error)
    except Exception as exc:
        # best-effort: a reporting failure must not abort the send loop
        self.log(f"central complete error: {exc}")
def sync_to_central(self, humans=None, matches=None):
    """sync local data to central"""
    if not self.central:
        return
    try:
        # push humans first so bulk matches can reference them
        for payload, push in ((humans, self.central.upsert_humans_bulk),
                              (matches, self.central.create_matches_bulk)):
            if payload:
                push(payload)
    except Exception as exc:
        # best-effort: central being down must not break the local cycle
        self.log(f"central sync error: {exc}")
def send_stranger_intros(self): def send_stranger_intros(self):
"""send intros to connect strangers (or preview in dry-run mode)""" """send intros to connect strangers (or preview in dry-run mode)"""
self.reset_daily_limits() self.reset_daily_limits()
@ -331,29 +413,18 @@ class ConnectDaemon:
'overlap_reasons': match['overlap_reasons'], 'overlap_reasons': match['overlap_reasons'],
} }
# ACTIVITY-BASED CONTACT SELECTION # try to send intro to person with email
# use deliver_intro which calls determine_best_contact
# picks method based on WHERE they're most active:
# - mastodon DM if active on fediverse
# - github issue if actively committing
# - email ONLY as last resort
for recipient, other in [(human_a, human_b), (human_b, human_a)]: for recipient, other in [(human_a, human_b), (human_b, human_a)]:
# draft intro using groq LLM contact = recipient.get('contact', {})
# retry groq up to 3 times with 10s wait if isinstance(contact, str):
intro_result, intro_error = None, None contact = json.loads(contact)
for retry in range(3):
intro_result, intro_error = draft_intro(match_data, recipient='a' if recipient == human_a else 'b')
if not intro_error:
break
self.log(f"groq retry {retry+1}/3: {intro_error}")
import time
time.sleep(10)
if intro_error: email = contact.get('email')
self.log(f"failed to draft intro after retries: {intro_error}") if not email:
continue continue
intro = {'draft': intro_result.get('draft', '')}
# draft intro
intro = draft_intro(match_data, recipient='a' if recipient == human_a else 'b')
# parse overlap reasons for display # parse overlap reasons for display
reasons = match['overlap_reasons'] reasons = match['overlap_reasons']
@ -361,13 +432,12 @@ class ConnectDaemon:
reasons = json.loads(reasons) reasons = json.loads(reasons)
reason_summary = ', '.join(reasons[:3]) if reasons else 'aligned values' reason_summary = ', '.join(reasons[:3]) if reasons else 'aligned values'
# determine best contact method based on activity
method, contact_info = determine_best_contact(recipient)
if self.dry_run: if self.dry_run:
# print preview
print("\n" + "=" * 60) print("\n" + "=" * 60)
print(f"TO: {recipient['username']} ({recipient['platform']})") print(f"TO: {recipient['username']} ({recipient['platform']})")
print(f"METHOD: {method} -> {contact_info}") print(f"EMAIL: {email}")
print(f"SUBJECT: you might want to meet {other['username']}")
print(f"SCORE: {match['overlap_score']:.0f} ({reason_summary})") print(f"SCORE: {match['overlap_score']:.0f} ({reason_summary})")
print("-" * 60) print("-" * 60)
print("MESSAGE:") print("MESSAGE:")
@ -377,12 +447,23 @@ class ConnectDaemon:
print("=" * 60) print("=" * 60)
break break
else: else:
# deliver via activity-based method selection # claim from central first
success, error, delivery_method = deliver_intro(match_data, intro['draft'], intro.get('subject')) outreach_id = self.claim_from_central(recipient['id'], match['id'], 'intro')
if outreach_id is None:
self.log(f"skipping {recipient['username']} - already claimed by another instance")
continue
# actually send
success, error = send_email(
email,
f"connectd: you might want to meet {other['username']}",
intro['draft']
)
if success: if success:
self.log(f"sent intro to {recipient['username']} via {delivery_method}") self.log(f"sent intro to {recipient['username']} ({email})")
self.intros_today += 1 self.intros_today += 1
self.complete_on_central(outreach_id, 'sent', 'email', intro['draft'])
# mark match as intro_sent # mark match as intro_sent
c.execute('UPDATE matches SET status = "intro_sent" WHERE id = ?', c.execute('UPDATE matches SET status = "intro_sent" WHERE id = ?',
@ -390,7 +471,8 @@ class ConnectDaemon:
self.db.conn.commit() self.db.conn.commit()
break break
else: else:
self.log(f"failed to reach {recipient['username']} via {delivery_method}: {error}") self.log(f"failed to send to {email}: {error}")
self.complete_on_central(outreach_id, 'failed', error=error)
self.last_intro = datetime.now() self.last_intro = datetime.now()
@ -475,7 +557,7 @@ class ConnectDaemon:
'overlap_reasons': match.get('shared_interests', []), 'overlap_reasons': match.get('shared_interests', []),
} }
success, error, delivery_method = deliver_intro(match_data, draft, None) success, error, delivery_method = deliver_intro(match_data, draft)
if success: if success:
self.log(f"sent lost builder intro to {lost_name} via {delivery_method}") self.log(f"sent lost builder intro to {lost_name} via {delivery_method}")

View file

@ -183,7 +183,7 @@ class Database:
row = c.fetchone() row = c.fetchone()
return dict(row) if row else None return dict(row) if row else None
def get_all_humans(self, min_score=0, limit=1000): def get_all_humans(self, min_score=0, limit=100000):
"""get all humans above score threshold""" """get all humans above score threshold"""
c = self.conn.cursor() c = self.conn.cursor()
c.execute('''SELECT * FROM humans c.execute('''SELECT * FROM humans
@ -373,3 +373,64 @@ class Database:
def close(self): def close(self):
self.conn.close() self.conn.close()
def purge_disqualified(self):
    """
    auto-cleanup: remove all matches/intros involving users with disqualifying signals
    DISQUALIFYING: maga, conspiracy, conservative, antivax, sovcit

    Returns a dict of table name -> number of rows deleted.
    """
    c = self.conn.cursor()
    purged = {}
    # patterns to match disqualifying signals
    disq_patterns = ["maga", "conspiracy", "conservative", "antivax", "sovcit"]
    # parameterized LIKE clauses: only placeholders go into the SQL text,
    # never the pattern values themselves
    neg_check = " OR ".join(["negative_signals LIKE ?"] * len(disq_patterns))
    params = [f"%{p}%" for p in disq_patterns]
    # 1. delete from intros where recipient is disqualified
    c.execute(f"""
        DELETE FROM intros WHERE recipient_human_id IN (
            SELECT id FROM humans WHERE {neg_check}
        )
    """, params)
    purged["intros"] = c.rowcount
    # 2. delete from priority_matches where matched_human is disqualified
    c.execute(f"""
        DELETE FROM priority_matches WHERE matched_human_id IN (
            SELECT id FROM humans WHERE {neg_check}
        )
    """, params)
    purged["priority_matches"] = c.rowcount
    # 3. delete from matches where either human is disqualified
    #    (two subqueries -> parameter list supplied twice)
    c.execute(f"""
        DELETE FROM matches WHERE
            human_a_id IN (SELECT id FROM humans WHERE {neg_check})
            OR human_b_id IN (SELECT id FROM humans WHERE {neg_check})
    """, params + params)
    purged["matches"] = c.rowcount
    # 4. cleanup orphaned records (humans deleted but refs remain)
    c.execute("""
        DELETE FROM matches WHERE
            NOT EXISTS (SELECT 1 FROM humans h WHERE h.id = human_a_id)
            OR NOT EXISTS (SELECT 1 FROM humans h WHERE h.id = human_b_id)
    """)
    purged["orphaned_matches"] = c.rowcount
    c.execute("""
        DELETE FROM priority_matches WHERE
            NOT EXISTS (SELECT 1 FROM humans h WHERE h.id = matched_human_id)
    """)
    purged["orphaned_priority"] = c.rowcount
    c.execute("""
        DELETE FROM intros WHERE
            NOT EXISTS (SELECT 1 FROM humans h WHERE h.id = recipient_human_id)
    """)
    purged["orphaned_intros"] = c.rowcount
    self.conn.commit()
    return purged

View file

@ -147,6 +147,87 @@ def create_github_issue(owner, repo, title, body, dry_run=False):
return False, str(e) return False, str(e)
def create_forge_issue(platform_type, instance_url, owner, repo, title, body, dry_run=False):
    """
    create issue on self-hosted git forge.
    supports gitea/forgejo/gogs (same API) and gitlab.

    Returns (success, detail): detail is the issue URL on success,
    otherwise an error string (or None for dry runs).
    """
    if dry_run:
        print(f" [dry run] would create issue on {platform_type}:{instance_url}/{owner}/{repo}")
        return True, None
    try:
        if platform_type in ('gitea', 'forgejo', 'gogs'):
            # tokens imported lazily so dry-run / unsupported paths
            # don't require config at all
            from config import CODEBERG_TOKEN, GITEA_TOKENS
            # get token for this instance
            if 'codeberg.org' in instance_url:
                token = CODEBERG_TOKEN
            else:
                token = GITEA_TOKENS.get(instance_url)
            if not token:
                return False, f"no auth token for {instance_url}"
            # gitea/forgejo/gogs share the same issues endpoint
            api_url = f"{instance_url}/api/v1/repos/{owner}/{repo}/issues"
            headers = {
                'Content-Type': 'application/json',
                'Authorization': f'token {token}'
            }
            data = {'title': title, 'body': body}
            resp = requests.post(api_url, headers=headers, json=data, timeout=15)
            if resp.status_code in (200, 201):
                return True, resp.json().get('html_url')
            return False, f"gitea api error: {resp.status_code} - {resp.text[:200]}"
        elif platform_type == 'gitlab':
            from config import GITLAB_TOKENS
            token = GITLAB_TOKENS.get(instance_url)
            if not token:
                return False, f"no auth token for {instance_url}"
            # resolve project ID: search by name, then prefer an exact
            # owner/repo namespace match so we don't file the issue on a
            # same-named project owned by someone else
            search_url = f"{instance_url}/api/v4/projects"
            headers = {'PRIVATE-TOKEN': token}
            params = {'search': repo}
            resp = requests.get(search_url, headers=headers, params=params, timeout=15)
            if resp.status_code != 200:
                return False, f"gitlab project lookup failed: {resp.status_code}"
            projects = resp.json()
            wanted = f"{owner}/{repo}"
            project_id = None
            for p in projects:
                if p.get('path_with_namespace') == wanted:
                    project_id = p.get('id')
                    break
            if project_id is None:
                # fall back to the looser path/name match
                for p in projects:
                    if p.get('path') == repo or p.get('name') == repo:
                        project_id = p.get('id')
                        break
            if not project_id:
                return False, f"project {repo} not found"
            # create issue
            issue_url = f"{instance_url}/api/v4/projects/{project_id}/issues"
            data = {'title': title, 'description': body}
            resp = requests.post(issue_url, headers=headers, json=data, timeout=15)
            if resp.status_code in (200, 201):
                return True, resp.json().get('web_url')
            # include response text like the gitea branch for debuggability
            return False, f"gitlab api error: {resp.status_code} - {resp.text[:200]}"
        elif platform_type == 'sourcehut':
            return False, "sourcehut uses mailing lists - use email instead"
        else:
            return False, f"unknown forge type: {platform_type}"
    except Exception as e:
        return False, str(e)
def send_mastodon_dm(recipient_acct, message, dry_run=False): def send_mastodon_dm(recipient_acct, message, dry_run=False):
"""send mastodon direct message""" """send mastodon direct message"""
if not MASTODON_TOKEN: if not MASTODON_TOKEN:
@ -419,14 +500,94 @@ def deliver_intro(match_data, intro_draft, subject=None, dry_run=False):
""" """
success, error = create_github_issue(owner, repo, title, github_body, dry_run) success, error = create_github_issue(owner, repo, title, github_body, dry_run)
elif method == 'forge_issue':
# self-hosted git forge issue (gitea/forgejo/gitlab/sourcehut)
platform_type = contact_info.get('platform_type')
instance_url = contact_info.get('instance_url')
owner = contact_info.get('owner')
repo = contact_info.get('repo')
title = subject or "community introduction from connectd"
# get the other person's contact info for bidirectional link
sender = match_data.get('human_a', {})
sender_name = sender.get('name') or sender.get('username') or 'someone'
sender_platform = sender.get('platform', '')
sender_url = sender.get('url', '')
if not sender_url:
if sender_platform == 'github':
sender_url = f"https://github.com/{sender.get('username')}"
elif sender_platform == 'mastodon':
sender_url = f"https://fosstodon.org/@{sender.get('username')}"
elif ':' in sender_platform: # forge platform
extra = sender.get('extra', {})
if isinstance(extra, str):
import json as _json
extra = _json.loads(extra) if extra else {}
sender_url = extra.get('instance_url', '') + '/' + sender.get('username', '')
forge_body = f"""hey {recipient.get('name') or recipient.get('username')},
{intro_draft}
**reach them at:** {sender_url or 'see their profile'}
---
*this is an automated introduction from [connectd](https://github.com/connectd-daemon) - a daemon that finds isolated builders with aligned values and connects them.*
*if this feels spammy, close this issue and we won't reach out again.*
"""
success, error = create_forge_issue(platform_type, instance_url, owner, repo, title, forge_body, dry_run)
elif method == 'manual': elif method == 'manual':
# skip - no longer using manual queue # skip - no longer using manual queue
success = False success = False
error = "manual method deprecated - skipping" error = "manual method deprecated - skipping"
# FALLBACK CHAIN: if primary method failed, try fallbacks
if not success and fallbacks:
for fallback_method, fallback_info in fallbacks:
result['fallback_attempts'] = result.get('fallback_attempts', [])
result['fallback_attempts'].append({'method': fallback_method})
fb_success = False
fb_error = None
if fallback_method == 'email':
fb_success, fb_error = send_email(fallback_info, email_subject, intro_draft, dry_run)
elif fallback_method == 'mastodon':
fb_success, fb_error = send_mastodon_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'bluesky':
fb_success, fb_error = send_bluesky_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'matrix':
fb_success, fb_error = send_matrix_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'github_issue':
owner = fallback_info.get('owner') if isinstance(fallback_info, dict) else fallback_info.split('/')[0]
repo = fallback_info.get('repo') if isinstance(fallback_info, dict) else fallback_info.split('/')[1]
fb_success, fb_error = create_github_issue(owner, repo, email_subject, intro_draft, dry_run)
elif fallback_method == 'forge_issue':
fb_success, fb_error = create_forge_issue(
fallback_info.get('platform_type'),
fallback_info.get('instance_url'),
fallback_info.get('owner'),
fallback_info.get('repo'),
email_subject, intro_draft, dry_run
)
if fb_success:
success = True
method = fallback_method
contact_info = fallback_info
error = None
result['fallback_succeeded'] = fallback_method
break
else:
result['fallback_attempts'][-1]['error'] = fb_error
# log result # log result
result['success'] = success result['success'] = success
result['error'] = error result['error'] = error
result['final_method'] = method
if success: if success:
log['sent'].append(result) log['sent'].append(result)

View file

@ -21,3 +21,7 @@ services:
- ./api.py:/app/api.py:ro - ./api.py:/app/api.py:ro
- ./deliver.py:/app/introd/deliver.py:ro - ./deliver.py:/app/introd/deliver.py:ro
- ./soul.txt:/app/soul.txt:ro - ./soul.txt:/app/soul.txt:ro
- ./scoutd/reddit.py:/app/scoutd/reddit.py:ro
- ./matchd/overlap.py:/app/matchd/overlap.py:ro
- ./central_client.py:/app/central_client.py:ro
- ./scoutd/forges.py:/app/scoutd/forges.py:ro

View file

@ -1,437 +1,419 @@
""" """
introd/groq_draft.py - groq llama 4 maverick for smart intro drafting connectd - groq message drafting
reads soul from file, uses as guideline for llm to personalize
uses groq api to generate personalized, natural intro messages
that don't sound like ai-generated slop
""" """
import os import os
import json import json
import requests from groq import Groq
from datetime import datetime
GROQ_API_KEY = os.environ.get('GROQ_API_KEY', '') GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_API_URL = 'https://api.groq.com/openai/v1/chat/completions' GROQ_MODEL = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")
MODEL = os.environ.get('GROQ_MODEL', 'llama-3.1-70b-versatile')
client = Groq(api_key=GROQ_API_KEY) if GROQ_API_KEY else None
def determine_contact_method(human): # load soul from file (guideline, not script)
""" SOUL_PATH = os.getenv("SOUL_PATH", "/app/soul.txt")
determine best contact method based on WHERE THEY'RE MOST ACTIVE def load_soul():
don't use fixed hierarchy - analyze activity per platform:
- count posts/commits/activity
- weight by recency (last 30 days matters more)
- contact them where they already are
- fall back to email only if no social activity
"""
from datetime import datetime, timedelta
extra = human.get('extra', {})
if isinstance(extra, str):
extra = json.loads(extra) if extra else {}
# handle nested extra.extra from old save format
if 'extra' in extra and isinstance(extra['extra'], dict):
extra = {**extra, **extra['extra']}
contact = human.get('contact', {})
if isinstance(contact, str):
contact = json.loads(contact) if contact else {}
# collect activity scores per platform
activity_scores = {}
now = datetime.now()
thirty_days_ago = now - timedelta(days=30)
ninety_days_ago = now - timedelta(days=90)
# github activity
github_username = human.get('username') if human.get('platform') == 'github' else extra.get('github')
if github_username:
github_score = 0
top_repos = extra.get('top_repos', [])
for repo in top_repos:
# recent commits weight more
pushed_at = repo.get('pushed_at', '')
if pushed_at:
try: try:
push_date = datetime.fromisoformat(pushed_at.replace('Z', '+00:00')).replace(tzinfo=None) with open(SOUL_PATH, 'r') as f:
if push_date > thirty_days_ago: return f.read().strip()
github_score += 10 # very recent
elif push_date > ninety_days_ago:
github_score += 5 # somewhat recent
else:
github_score += 1 # old but exists
except: except:
github_score += 1 return None
# stars indicate engagement SIGNATURE_HTML = """
github_score += min(repo.get('stars', 0) // 10, 5) <div style="margin-top: 24px; padding-top: 16px; border-top: 1px solid #333;">
<div style="margin-bottom: 12px;">
<a href="https://github.com/sudoxnym/connectd" style="color: #8b5cf6; text-decoration: none; font-size: 14px;">github.com/sudoxnym/connectd</a>
<span style="color: #666; font-size: 12px; margin-left: 8px;">(main repo)</span>
</div>
<div style="display: flex; gap: 16px; align-items: center;">
<a href="https://github.com/connectd-daemon" title="GitHub" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12"/></svg>
</a>
<a href="https://mastodon.sudoxreboot.com/@connectd" title="Mastodon" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M23.268 5.313c-.35-2.578-2.617-4.61-5.304-5.004C17.51.242 15.792 0 11.813 0h-.03c-3.98 0-4.835.242-5.288.309C3.882.692 1.496 2.518.917 5.127.64 6.412.61 7.837.661 9.143c.074 1.874.088 3.745.26 5.611.118 1.24.325 2.47.62 3.68.55 2.237 2.777 4.098 4.96 4.857 2.336.792 4.849.923 7.256.38.265-.061.527-.132.786-.213.585-.184 1.27-.39 1.774-.753a.057.057 0 0 0 .023-.043v-1.809a.052.052 0 0 0-.02-.041.053.053 0 0 0-.046-.01 20.282 20.282 0 0 1-4.709.545c-2.73 0-3.463-1.284-3.674-1.818a5.593 5.593 0 0 1-.319-1.433.053.053 0 0 1 .066-.054c1.517.363 3.072.546 4.632.546.376 0 .75 0 1.125-.01 1.57-.044 3.224-.124 4.768-.422.038-.008.077-.015.11-.024 2.435-.464 4.753-1.92 4.989-5.604.008-.145.03-1.52.03-1.67.002-.512.167-3.63-.024-5.545zm-3.748 9.195h-2.561V8.29c0-1.309-.55-1.976-1.67-1.976-1.23 0-1.846.79-1.846 2.35v3.403h-2.546V8.663c0-1.56-.617-2.35-1.848-2.35-1.112 0-1.668.668-1.67 1.977v6.218H4.822V8.102c0-1.31.337-2.35 1.011-3.12.696-.77 1.608-1.164 2.74-1.164 1.311 0 2.302.5 2.962 1.498l.638 1.06.638-1.06c.66-.999 1.65-1.498 2.96-1.498 1.13 0 2.043.395 2.74 1.164.675.77 1.012 1.81 1.012 3.12z"/></svg>
</a>
<a href="https://bsky.app/profile/connectd.bsky.social" title="Bluesky" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M5.202 2.857C7.954 4.922 10.913 9.11 12 11.358c1.087-2.247 4.046-6.436 6.798-8.501C20.783 1.366 24 .213 24 3.883c0 .732-.42 6.156-.667 7.037-.856 3.061-3.978 3.842-6.755 3.37 4.854.826 6.089 3.562 3.422 6.299-5.065 5.196-7.28-1.304-7.847-2.97-.104-.305-.152-.448-.153-.327 0-.121-.05.022-.153.327-.568 1.666-2.782 8.166-7.847 2.97-2.667-2.737-1.432-5.473 3.422-6.3-2.777.473-5.899-.308-6.755-3.369C.42 10.04 0 4.615 0 3.883c0-3.67 3.217-2.517 5.202-1.026"/></svg>
</a>
<a href="https://lemmy.sudoxreboot.com/c/connectd" title="Lemmy" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M2.9595 4.2228a3.9132 3.9132 0 0 0-.332.019c-.8781.1012-1.67.5699-2.155 1.3862-.475.8-.5922 1.6809-.35 2.4971.2421.8162.8297 1.5575 1.6982 2.1449.0053.0035.0106.0076.0163.0114.746.4498 1.492.7431 2.2877.8994-.02.3318-.0272.6689-.006 1.0181.0634 1.0432.4368 2.0006.996 2.8492l-2.0061.8189a.4163.4163 0 0 0-.2276.2239.416.416 0 0 0 .0879.455.415.415 0 0 0 .2941.1231.4156.4156 0 0 0 .1595-.0312l2.2093-.9035c.408.4859.8695.9315 1.3723 1.318.0196.0151.0407.0264.0603.0423l-1.2918 1.7103a.416.416 0 0 0 .664.501l1.314-1.7385c.7185.4548 1.4782.7927 2.2294 1.0242.3833.7209 1.1379 1.1871 2.0202 1.1871.8907 0 1.6442-.501 2.0242-1.2072.744-.2347 1.4959-.5729 2.2073-1.0262l1.332 1.7606a.4157.4157 0 0 0 .7439-.1936.4165.4165 0 0 0-.0799-.3074l-1.3099-1.7345c.0083-.0075.0178-.0113.0261-.0188.4968-.3803.9549-.8175 1.3622-1.2939l2.155.8794a.4156.4156 0 0 0 .5412-.2276.4151.4151 0 0 0-.2273-.5432l-1.9438-.7928c.577-.8538.9697-1.8183 1.0504-2.8693.0268-.3507.0242-.6914.0079-1.0262.7905-.1572 1.5321-.4502 2.2737-.8974.0053-.0033.011-.0076.0163-.0113.8684-.5874 1.456-1.3287 1.6982-2.145.2421-.8161.125-1.697-.3501-2.497-.4849-.8163-1.2768-1.2852-2.155-1.3863a3.2175 3.2175 0 0 0-.332-.0189c-.7852-.0151-1.6231.229-2.4286.6942-.5926.342-1.1252.867-1.5433 1.4387-1.1699-.6703-2.6923-1.0476-4.5635-1.0785a15.5768 15.5768 0 0 0-.5111 0c-2.085.034-3.7537.43-5.0142 1.1449-.0033-.0038-.0045-.0114-.008-.0152-.4233-.5916-.973-1.1365-1.5835-1.489-.8055-.465-1.6434-.7083-2.4286-.6941Zm.2858.7365c.5568.042 1.1696.2358 1.7787.5875.485.28.9757.7554 1.346 1.2696a5.6875 5.6875 0 0 0-.4969.4085c-.9201.8516-1.4615 1.9597-1.668 3.2335-.6809-.1402-1.3183-.3945-1.984-.7948-.7553-.5128-1.2159-1.1225-1.4004-1.7445-.1851-.624-.1074-1.2712.2776-1.9196.3743-.63.9275-.9534 1.6118-1.0322a2.796 2.796 0 0 1 .5352-.0076Zm17.5094 0a2.797 2.797 0 0 1 .5353.0075c.6842.0786 1.2374.4021 1.6117 1.0322.385.6484.4627 1.2957.2776 1.9196-.1845.622-.645 
1.2317-1.4004 1.7445-.6578.3955-1.2881.6472-1.9598.7888-.1942-1.2968-.7375-2.4338-1.666-3.302a5.5639 5.5639 0 0 0-.4709-.3923c.3645-.49.8287-.9428 1.2938-1.2113.6091-.3515 1.2219-.5454 1.7787-.5875ZM12.006 6.0036a14.832 14.832 0 0 1 .487 0c2.3901.0393 4.0848.67 5.1631 1.678 1.1501 1.0754 1.6423 2.6006 1.499 4.467-.1311 1.7079-1.2203 3.2281-2.652 4.324-.694.5313-1.4626.9354-2.2254 1.2294.0031-.0453.014-.0888.014-.1349.0029-1.1964-.9313-2.2133-2.2918-2.2133-1.3606 0-2.3222 1.0154-2.2918 2.2213.0013.0507.014.0972.0181.1471-.781-.2933-1.5696-.7013-2.2777-1.2456-1.4239-1.0945-2.4997-2.6129-2.6037-4.322-.1129-1.8567.3778-3.3382 1.5212-4.3965C7.5094 6.7 9.352 6.047 12.006 6.0036Zm-3.6419 6.8291c-.6053 0-1.0966.4903-1.0966 1.0966 0 .6063.4913 1.0986 1.0966 1.0986s1.0966-.4923 1.0966-1.0986c0-.6063-.4913-1.0966-1.0966-1.0966zm7.2819.0113c-.5998 0-1.0866.4859-1.0866 1.0866s.4868 1.0885 1.0866 1.0885c.5997 0 1.0865-.4878 1.0865-1.0885s-.4868-1.0866-1.0865-1.0866zM12 16.0835c1.0237 0 1.5654.638 1.5634 1.4829-.0018.7849-.6723 1.485-1.5634 1.485-.9167 0-1.54-.5629-1.5634-1.493-.0212-.8347.5397-1.4749 1.5634-1.4749Z"/></svg>
</a>
<a href="https://discord.gg/connectd" title="Discord" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M20.317 4.3698a19.7913 19.7913 0 00-4.8851-1.5152.0741.0741 0 00-.0785.0371c-.211.3753-.4447.8648-.6083 1.2495-1.8447-.2762-3.68-.2762-5.4868 0-.1636-.3933-.4058-.8742-.6177-1.2495a.077.077 0 00-.0785-.037 19.7363 19.7363 0 00-4.8852 1.515.0699.0699 0 00-.0321.0277C.5334 9.0458-.319 13.5799.0992 18.0578a.0824.0824 0 00.0312.0561c2.0528 1.5076 4.0413 2.4228 5.9929 3.0294a.0777.0777 0 00.0842-.0276c.4616-.6304.8731-1.2952 1.226-1.9942a.076.076 0 00-.0416-.1057c-.6528-.2476-1.2743-.5495-1.8722-.8923a.077.077 0 01-.0076-.1277c.1258-.0943.2517-.1923.3718-.2914a.0743.0743 0 01.0776-.0105c3.9278 1.7933 8.18 1.7933 12.0614 0a.0739.0739 0 01.0785.0095c.1202.099.246.1981.3728.2924a.077.077 0 01-.0066.1276 12.2986 12.2986 0 01-1.873.8914.0766.0766 0 00-.0407.1067c.3604.698.7719 1.3628 1.225 1.9932a.076.076 0 00.0842.0286c1.961-.6067 3.9495-1.5219 6.0023-3.0294a.077.077 0 00.0313-.0552c.5004-5.177-.8382-9.6739-3.5485-13.6604a.061.061 0 00-.0312-.0286zM8.02 15.3312c-1.1825 0-2.1569-1.0857-2.1569-2.419 0-1.3332.9555-2.4189 2.157-2.4189 1.2108 0 2.1757 1.0952 2.1568 2.419 0 1.3332-.9555 2.4189-2.1569 2.4189zm7.9748 0c-1.1825 0-2.1569-1.0857-2.1569-2.419 0-1.3332.9554-2.4189 2.1569-2.4189 1.2108 0 2.1757 1.0952 2.1568 2.419 0 1.3332-.946 2.4189-2.1568 2.4189Z"/></svg>
</a>
<a href="https://matrix.to/#/@connectd:sudoxreboot.com" title="Matrix" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M.632.55v22.9H2.28V24H0V0h2.28v.55zm7.043 7.26v1.157h.033c.309-.443.683-.784 1.117-1.024.433-.245.936-.365 1.5-.365.54 0 1.033.107 1.481.314.448.208.785.582 1.02 1.108.254-.374.6-.706 1.034-.992.434-.287.95-.43 1.546-.43.453 0 .872.056 1.26.167.388.11.716.286.993.53.276.245.489.559.646.951.152.392.23.863.23 1.417v5.728h-2.349V11.52c0-.286-.01-.559-.032-.812a1.755 1.755 0 0 0-.18-.66 1.106 1.106 0 0 0-.438-.448c-.194-.11-.457-.166-.785-.166-.332 0-.6.064-.803.189a1.38 1.38 0 0 0-.48.499 1.946 1.946 0 0 0-.231.696 5.56 5.56 0 0 0-.06.785v4.768h-2.35v-4.8c0-.254-.004-.503-.018-.752a2.074 2.074 0 0 0-.143-.688 1.052 1.052 0 0 0-.415-.503c-.194-.125-.476-.19-.854-.19-.111 0-.259.024-.439.074-.18.051-.36.143-.53.282-.171.138-.319.337-.439.595-.12.259-.18.6-.18 1.02v4.966H5.46V7.81zm15.693 15.64V.55H21.72V0H24v24h-2.28v-.55z"/></svg>
</a>
<a href="https://reddit.com/r/connectd" title="Reddit" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M12 0C5.373 0 0 5.373 0 12c0 3.314 1.343 6.314 3.515 8.485l-2.286 2.286C.775 23.225 1.097 24 1.738 24H12c6.627 0 12-5.373 12-12S18.627 0 12 0Zm4.388 3.199c1.104 0 1.999.895 1.999 1.999 0 1.105-.895 2-1.999 2-.946 0-1.739-.657-1.947-1.539v.002c-1.147.162-2.032 1.15-2.032 2.341v.007c1.776.067 3.4.567 4.686 1.363.473-.363 1.064-.58 1.707-.58 1.547 0 2.802 1.254 2.802 2.802 0 1.117-.655 2.081-1.601 2.531-.088 3.256-3.637 5.876-7.997 5.876-4.361 0-7.905-2.617-7.998-5.87-.954-.447-1.614-1.415-1.614-2.538 0-1.548 1.255-2.802 2.803-2.802.645 0 1.239.218 1.712.585 1.275-.79 2.881-1.291 4.64-1.365v-.01c0-1.663 1.263-3.034 2.88-3.207.188-.911.993-1.595 1.959-1.595Zm-8.085 8.376c-.784 0-1.459.78-1.506 1.797-.047 1.016.64 1.429 1.426 1.429.786 0 1.371-.369 1.418-1.385.047-1.017-.553-1.841-1.338-1.841Zm7.406 0c-.786 0-1.385.824-1.338 1.841.047 1.017.634 1.385 1.418 1.385.785 0 1.473-.413 1.426-1.429-.046-1.017-.721-1.797-1.506-1.797Zm-3.703 4.013c-.974 0-1.907.048-2.77.135-.147.015-.241.168-.183.305.483 1.154 1.622 1.964 2.953 1.964 1.33 0 2.47-.81 2.953-1.964.057-.137-.037-.29-.184-.305-.863-.087-1.795-.135-2.769-.135Z"/></svg>
</a>
<a href="mailto:connectd@sudoxreboot.com" title="Email" style="color: #888; text-decoration: none;">
<svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M1.5 8.67v8.58a3 3 0 003 3h15a3 3 0 003-3V8.67l-8.928 5.493a3 3 0 01-3.144 0L1.5 8.67z"/><path d="M22.5 6.908V6.75a3 3 0 00-3-3h-15a3 3 0 00-3 3v.158l9.714 5.978a1.5 1.5 0 001.572 0L22.5 6.908z"/></svg>
</a>
</div>
</div>
"""
# commit activity from deep scrape SIGNATURE_PLAINTEXT = """
commit_count = extra.get('commit_count', 0) ---
github_score += min(commit_count // 10, 20) github.com/sudoxnym/connectd (main repo)
if github_score > 0: github: github.com/connectd-daemon
activity_scores['github_issue'] = { mastodon: @connectd@mastodon.sudoxreboot.com
'score': github_score, bluesky: connectd.bsky.social
'info': f"{github_username}/{top_repos[0]['name']}" if top_repos else github_username lemmy: lemmy.sudoxreboot.com/c/connectd
} discord: discord.gg/connectd
matrix: @connectd:sudoxreboot.com
# mastodon activity reddit: reddit.com/r/connectd
mastodon_handle = human.get('username') if human.get('platform') == 'mastodon' else (extra.get('mastodon') or contact.get('mastodon')) email: connectd@sudoxreboot.com
if mastodon_handle: """
mastodon_score = 0
statuses_count = extra.get('mastodon_statuses', 0) or human.get('statuses_count', 0)
# high post count = active user
if statuses_count > 1000:
mastodon_score += 30
elif statuses_count > 500:
mastodon_score += 20
elif statuses_count > 100:
mastodon_score += 10
elif statuses_count > 0:
mastodon_score += 5
# platform bonus for fediverse (values-aligned)
mastodon_score += 10
# bonus if handle was discovered via rel="me" or similar verification
# (having a handle linked from their website = they want to be contacted there)
handles = extra.get('handles', {})
if handles.get('mastodon') == mastodon_handle:
mastodon_score += 15 # verified handle bonus
if mastodon_score > 0:
activity_scores['mastodon'] = {'score': mastodon_score, 'info': mastodon_handle}
# bluesky activity
bluesky_handle = human.get('username') if human.get('platform') == 'bluesky' else (extra.get('bluesky') or contact.get('bluesky'))
if bluesky_handle:
bluesky_score = 0
posts_count = extra.get('bluesky_posts', 0) or human.get('posts_count', 0)
if posts_count > 500:
bluesky_score += 25
elif posts_count > 100:
bluesky_score += 15
elif posts_count > 0:
bluesky_score += 5
# newer platform, slightly lower weight
bluesky_score += 5
if bluesky_score > 0:
activity_scores['bluesky'] = {'score': bluesky_score, 'info': bluesky_handle}
# twitter activity
twitter_handle = extra.get('twitter') or contact.get('twitter')
if twitter_handle:
twitter_score = 0
tweets_count = extra.get('twitter_tweets', 0)
if tweets_count > 1000:
twitter_score += 20
elif tweets_count > 100:
twitter_score += 10
elif tweets_count > 0:
twitter_score += 5
# if we found them via twitter hashtags, they're active there
if human.get('platform') == 'twitter':
twitter_score += 15
if twitter_score > 0:
activity_scores['twitter'] = {'score': twitter_score, 'info': twitter_handle}
# NOTE: reddit is DISCOVERY ONLY, not a contact method
# we find users on reddit but reach out via their external links (github, mastodon, etc.)
# reddit-only users go to manual_queue for review
# lobsters activity
lobsters_username = extra.get('lobsters') or contact.get('lobsters')
if lobsters_username or human.get('platform') == 'lobsters':
lobsters_score = 0
lobsters_username = lobsters_username or human.get('username')
karma = extra.get('lobsters_karma', 0) or human.get('karma', 0)
# lobsters is invite-only, high signal
lobsters_score += 15
if karma > 100:
lobsters_score += 15
elif karma > 50:
lobsters_score += 10
elif karma > 0:
lobsters_score += 5
if lobsters_score > 0:
activity_scores['lobsters'] = {'score': lobsters_score, 'info': lobsters_username}
# matrix activity
matrix_id = extra.get('matrix') or contact.get('matrix')
if matrix_id:
matrix_score = 0
# matrix users are typically privacy-conscious and technical
matrix_score += 15 # platform bonus for decentralized chat
# bonus if handle was discovered via rel="me" verification
handles = extra.get('handles', {})
if handles.get('matrix') == matrix_id:
matrix_score += 10 # verified handle bonus
if matrix_score > 0:
activity_scores['matrix'] = {'score': matrix_score, 'info': matrix_id}
# lemmy activity (fediverse)
lemmy_username = human.get('username') if human.get('platform') == 'lemmy' else extra.get('lemmy')
if lemmy_username:
lemmy_score = 0
# lemmy is fediverse - high values alignment
lemmy_score += 20 # fediverse platform bonus
post_count = extra.get('post_count', 0)
comment_count = extra.get('comment_count', 0)
if post_count > 100:
lemmy_score += 15
elif post_count > 50:
lemmy_score += 10
elif post_count > 10:
lemmy_score += 5
if comment_count > 500:
lemmy_score += 10
elif comment_count > 100:
lemmy_score += 5
if lemmy_score > 0:
activity_scores['lemmy'] = {'score': lemmy_score, 'info': lemmy_username}
# pick highest activity platform
if activity_scores:
best_platform = max(activity_scores.items(), key=lambda x: x[1]['score'])
return best_platform[0], best_platform[1]['info']
# fall back to email ONLY if no social activity detected
email = extra.get('email') or contact.get('email')
# also check emails list
if not email:
emails = extra.get('emails') or contact.get('emails') or []
for e in emails:
if e and '@' in e and 'noreply' not in e.lower():
email = e
break
if email and '@' in email and 'noreply' not in email.lower():
return 'email', email
# last resort: manual
return 'manual', None
def draft_intro_with_llm(match_data, recipient='a', dry_run=False): def draft_intro_with_llm(match_data: dict, recipient: str = 'a', dry_run: bool = True):
""" """
use groq llama 4 maverick to draft a personalized intro draft an intro message using groq llm.
match_data should contain: args:
- human_a: the first person match_data: dict with human_a, human_b, overlap_score, overlap_reasons
- human_b: the second person recipient: 'a' or 'b' - who receives the message
- overlap_score: numeric score dry_run: if True, preview mode
- overlap_reasons: list of why they match
recipient: 'a' or 'b' - who we're writing to returns:
tuple (result_dict, error_string)
result_dict has: subject, draft_html, draft_plain
""" """
if not GROQ_API_KEY: if not client:
return None, "GROQ_API_KEY not set" return None, "GROQ_API_KEY not set"
# determine recipient and other person
if recipient == 'a':
to_person = match_data.get('human_a', {})
other_person = match_data.get('human_b', {})
else:
to_person = match_data.get('human_b', {})
other_person = match_data.get('human_a', {})
# build context
to_name = to_person.get('name') or to_person.get('username', 'friend')
other_name = other_person.get('name') or other_person.get('username', 'someone')
to_signals = to_person.get('signals', [])
if isinstance(to_signals, str):
to_signals = json.loads(to_signals) if to_signals else []
other_signals = other_person.get('signals', [])
if isinstance(other_signals, str):
other_signals = json.loads(other_signals) if other_signals else []
overlap_reasons = match_data.get('overlap_reasons', [])
if isinstance(overlap_reasons, str):
overlap_reasons = json.loads(overlap_reasons) if overlap_reasons else []
# parse extra data
to_extra = to_person.get('extra', {})
other_extra = other_person.get('extra', {})
if isinstance(to_extra, str):
to_extra = json.loads(to_extra) if to_extra else {}
if isinstance(other_extra, str):
other_extra = json.loads(other_extra) if other_extra else {}
# build profile summaries
to_profile = f"""
name: {to_name}
platform: {to_person.get('platform', 'unknown')}
bio: {to_person.get('bio') or 'no bio'}
location: {to_person.get('location') or 'unknown'}
signals: {', '.join(to_signals[:8])}
repos: {len(to_extra.get('top_repos', []))} public repos
languages: {', '.join(to_extra.get('languages', {}).keys())}
"""
other_profile = f"""
name: {other_name}
platform: {other_person.get('platform', 'unknown')}
bio: {other_person.get('bio') or 'no bio'}
location: {other_person.get('location') or 'unknown'}
signals: {', '.join(other_signals[:8])}
repos: {len(other_extra.get('top_repos', []))} public repos
languages: {', '.join(other_extra.get('languages', {}).keys())}
url: {other_person.get('url', '')}
"""
# build prompt
system_prompt = """you are connectd, an ai that connects isolated builders who share values but don't know each other yet.
your job is to write a short, genuine intro message to one person about another person they might want to know.
rules:
- be brief (3-5 sentences max)
- be genuine, not salesy or fake
- focus on WHY they might want to connect, not just WHAT they have in common
- don't be cringe or use buzzwords
- lowercase preferred (casual tone)
- no emojis unless the person's profile suggests they'd like them
- mention specific things from their profiles, not generic "you both like open source"
- end with a simple invitation, not a hard sell
- sign off as "- connectd" (lowercase)
bad examples:
- "I noticed you're both passionate about..." (too formal)
- "You two would be PERFECT for each other!" (too salesy)
- "As a fellow privacy enthusiast..." (cringe)
good examples:
- "hey, saw you're building X. there's someone else working on similar stuff in Y who might be interesting to know."
- "you might want to check out Z's work on federated systems - similar approach to what you're doing with A."
"""
user_prompt = f"""write an intro message to {to_name} about {other_name}.
RECIPIENT ({to_name}):
{to_profile}
INTRODUCING ({other_name}):
{other_profile}
WHY THEY MATCH (overlap score {match_data.get('overlap_score', 0)}):
{', '.join(overlap_reasons[:5])}
write a short intro message. remember: lowercase, genuine, not salesy."""
try: try:
response = requests.post( human_a = match_data.get('human_a', {})
GROQ_API_URL, human_b = match_data.get('human_b', {})
headers={ reasons = match_data.get('overlap_reasons', [])
'Authorization': f'Bearer {GROQ_API_KEY}',
'Content-Type': 'application/json', # recipient gets the message, about_person is who we're introducing them to
}, if recipient == 'a':
json={ to_person = human_a
'model': MODEL, about_person = human_b
'messages': [ else:
{'role': 'system', 'content': system_prompt}, to_person = human_b
{'role': 'user', 'content': user_prompt}, about_person = human_a
],
'temperature': 0.7, to_name = to_person.get('username', 'friend')
'max_tokens': 300, about_name = about_person.get('username', 'someone')
}, about_bio = about_person.get('extra', {}).get('bio', '')
timeout=30,
# extract contact info for about_person
about_extra = about_person.get('extra', {})
if isinstance(about_extra, str):
import json as _json
about_extra = _json.loads(about_extra) if about_extra else {}
about_contact = about_person.get('contact', {})
if isinstance(about_contact, str):
about_contact = _json.loads(about_contact) if about_contact else {}
# build contact link for about_person
about_platform = about_person.get('platform', '')
about_username = about_person.get('username', '')
contact_link = None
if about_platform == 'mastodon' and about_username:
if '@' in about_username:
parts = about_username.split('@')
if len(parts) >= 2:
contact_link = f"https://{parts[1]}/@{parts[0]}"
elif about_platform == 'github' and about_username:
contact_link = f"https://github.com/{about_username}"
elif about_extra.get('mastodon') or about_contact.get('mastodon'):
handle = about_extra.get('mastodon') or about_contact.get('mastodon')
if '@' in handle:
parts = handle.lstrip('@').split('@')
if len(parts) >= 2:
contact_link = f"https://{parts[1]}/@{parts[0]}"
elif about_extra.get('github') or about_contact.get('github'):
contact_link = f"https://github.com/{about_extra.get('github') or about_contact.get('github')}"
elif about_extra.get('email'):
contact_link = about_extra['email']
elif about_contact.get('email'):
contact_link = about_contact['email']
elif about_extra.get('website'):
contact_link = about_extra['website']
elif about_extra.get('external_links', {}).get('website'):
contact_link = about_extra['external_links']['website']
elif about_extra.get('extra', {}).get('website'):
contact_link = about_extra['extra']['website']
elif about_platform == 'reddit' and about_username:
contact_link = f"reddit.com/u/{about_username}"
if not contact_link:
contact_link = f"github.com/{about_username}" if about_username else "reach out via connectd"
# skip if no real contact method (just reddit or generic)
if contact_link.startswith('reddit.com') or contact_link == "reach out via connectd" or 'stackblitz' in contact_link:
return None, f"no real contact info for {about_name} - skipping draft"
# format the shared factors naturally
if reasons:
factor = ', '.join(reasons[:3]) if len(reasons) > 1 else reasons[0]
else:
factor = "shared values and interests"
# load soul as guideline
soul = load_soul()
if not soul:
return None, "could not load soul file"
# build the prompt - soul is GUIDELINE not script
prompt = f"""you are connectd, a daemon that finds isolated builders and connects them.
write a personal message TO {to_name} telling them about {about_name}.
here is the soul/spirit of what connectd is about - use this as a GUIDELINE for tone and message, NOT as a script to copy verbatim:
---
{soul}
---
key facts for this message:
- recipient: {to_name}
- introducing them to: {about_name}
- their shared interests/values: {factor}
- about {about_name}: {about_bio if about_bio else 'a builder like you'}
- HOW TO REACH {about_name}: {contact_link}
RULES:
1. say their name ONCE at start, then use "you"
2. MUST include how to reach {about_name}: {contact_link}
3. lowercase, raw, emotional - follow the soul
4. end with the contact link
return ONLY the message body. signature is added separately."""
response = client.chat.completions.create(
model=GROQ_MODEL,
messages=[{"role": "user", "content": prompt}],
temperature=0.6,
max_tokens=1200
) )
if response.status_code != 200: body = response.choices[0].message.content.strip()
return None, f"groq api error: {response.status_code} - {response.text}"
data = response.json() # generate subject
draft = data['choices'][0]['message']['content'].strip() subject_prompt = f"""generate a short, lowercase email subject for a message to {to_name} about connecting them with {about_name} over their shared interest in {factor}.
# determine contact method for recipient no corporate speak. no clickbait. raw and real.
contact_method, contact_info = determine_contact_method(to_person) examples:
- "found you, {to_name}"
- "you're not alone"
- "a door just opened"
- "{to_name}, there's someone you should meet"
return ONLY the subject line."""
subject_response = client.chat.completions.create(
model=GROQ_MODEL,
messages=[{"role": "user", "content": subject_prompt}],
temperature=0.9,
max_tokens=50
)
subject = subject_response.choices[0].message.content.strip().strip('"').strip("'")
# format html
draft_html = f"<div style='font-family: monospace; white-space: pre-wrap; color: #e0e0e0; background: #1a1a1a; padding: 20px;'>{body}</div>{SIGNATURE_HTML}"
draft_plain = body + SIGNATURE_PLAINTEXT
return { return {
'draft': draft, 'subject': subject,
'model': MODEL, 'draft_html': draft_html,
'to': to_name, 'draft_plain': draft_plain
'about': other_name,
'overlap_score': match_data.get('overlap_score', 0),
'contact_method': contact_method,
'contact_info': contact_info,
'generated_at': datetime.now().isoformat(),
}, None }, None
except Exception as e: except Exception as e:
return None, f"groq error: {str(e)}" return None, str(e)
def draft_intro_batch(matches, dry_run=False): # for backwards compat with old code
""" def draft_message(person: dict, factor: str, platform: str = "email") -> dict:
draft intros for multiple matches """legacy function - wraps new api"""
returns list of (match, intro_result, error) tuples match_data = {
""" 'human_a': {'username': 'recipient'},
results = [] 'human_b': person,
'overlap_reasons': [factor]
for match in matches: }
# draft for both directions result, error = draft_intro_with_llm(match_data, recipient='a')
intro_a, err_a = draft_intro_with_llm(match, recipient='a', dry_run=dry_run) if error:
intro_b, err_b = draft_intro_with_llm(match, recipient='b', dry_run=dry_run) raise ValueError(error)
return {
results.append({ 'subject': result['subject'],
'match': match, 'body_html': result['draft_html'],
'intro_to_a': intro_a, 'body_plain': result['draft_plain']
'intro_to_b': intro_b, }
'errors': [err_a, err_b],
})
return results
def test_groq_connection(): if __name__ == "__main__":
"""test that groq api is working""" # test
if not GROQ_API_KEY: test_data = {
return False, "GROQ_API_KEY not set" 'human_a': {'username': 'sudoxnym', 'extra': {'bio': 'building intentional communities'}},
'human_b': {'username': 'testuser', 'extra': {'bio': 'home assistant enthusiast'}},
try: 'overlap_reasons': ['home-assistant', 'open source', 'community building']
response = requests.post( }
GROQ_API_URL, result, error = draft_intro_with_llm(test_data, recipient='a')
headers={ if error:
'Authorization': f'Bearer {GROQ_API_KEY}', print(f"error: {error}")
'Content-Type': 'application/json',
},
json={
'model': MODEL,
'messages': [{'role': 'user', 'content': 'say "ok" and nothing else'}],
'max_tokens': 10,
},
timeout=10,
)
if response.status_code == 200:
return True, "groq api working"
else: else:
return False, f"groq api error: {response.status_code}" print(f"subject: {result['subject']}")
print(f"\nbody:\n{result['draft_plain']}")
except Exception as e: # contact method ranking - USAGE BASED
return False, f"groq connection error: {str(e)}" # we rank by where the person is MOST ACTIVE, not by our preference
def determine_contact_method(human):
    """
    determine ALL available contact methods, ranked by USER'S ACTIVITY.

    looks at activity metrics (post counts, stars, repo counts, ...) stored in
    the human's 'extra'/'contact' blobs to decide where they're most engaged.

    human: dict with at least 'platform' and 'username'; 'extra' and 'contact'
           may each be a dict or a JSON-encoded string of one.

    returns: (best_method, best_info, fallbacks)
    where fallbacks is a list of (method, info) tuples in activity order.
    best_method is 'manual' (with info None) when nothing usable was found.
    """
    import json
    # 'extra' and 'contact' come from storage and may still be JSON strings
    extra = human.get('extra', {})
    contact = human.get('contact', {})
    if isinstance(extra, str):
        extra = json.loads(extra) if extra else {}
    if isinstance(contact, str):
        contact = json.loads(contact) if contact else {}
    # some scrapers nest a second 'extra' dict inside extra
    nested_extra = extra.get('extra', {})
    platform = human.get('platform', '')
    # accumulates (method, info, activity_score) triples; sorted at the end
    available = []
    # === ACTIVITY SCORING ===
    # each method gets scored by how active the user is there
    # EMAIL - always medium priority (we cant measure activity)
    email = extra.get('email') or contact.get('email') or nested_extra.get('email')
    if email and '@' in str(email):
        available.append(('email', email, 50))  # baseline score
    # MASTODON - score by post count / followers
    mastodon = extra.get('mastodon') or contact.get('mastodon') or nested_extra.get('mastodon')
    if mastodon:
        masto_activity = extra.get('mastodon_posts', 0) or extra.get('statuses_count', 0)
        masto_score = min(100, 30 + (masto_activity // 10))  # 30 base + 1 per 10 posts
        available.append(('mastodon', mastodon, masto_score))
    # if they CAME FROM mastodon, thats their primary
    if platform == 'mastodon':
        handle = f"@{human.get('username')}"
        instance = human.get('instance') or extra.get('instance') or ''
        if instance:
            # full fediverse handle: @user@instance
            handle = f"@{human.get('username')}@{instance}"
        activity = extra.get('statuses_count', 0) or extra.get('activity_count', 0)
        score = min(100, 50 + (activity // 5))  # higher base since its their home
        # dont dupe
        if not any(a[0] == 'mastodon' for a in available):
            available.append(('mastodon', handle, score))
        else:
            # update score if this is higher
            for i, (m, info, s) in enumerate(available):
                if m == 'mastodon' and score > s:
                    available[i] = ('mastodon', handle, score)
    # MATRIX - score by presence (binary for now)
    matrix = extra.get('matrix') or contact.get('matrix') or nested_extra.get('matrix')
    if matrix and ':' in str(matrix):  # valid matrix ids look like @user:server
        available.append(('matrix', matrix, 40))
    # BLUESKY - score by followers/posts if available
    bluesky = extra.get('bluesky') or contact.get('bluesky') or nested_extra.get('bluesky')
    if bluesky:
        bsky_activity = extra.get('bluesky_posts', 0)
        bsky_score = min(100, 25 + (bsky_activity // 10))
        available.append(('bluesky', bluesky, bsky_score))
    # LEMMY - score by activity
    lemmy = extra.get('lemmy') or contact.get('lemmy') or nested_extra.get('lemmy')
    if lemmy:
        lemmy_activity = extra.get('lemmy_posts', 0) or extra.get('lemmy_comments', 0)
        lemmy_score = min(100, 30 + lemmy_activity)
        available.append(('lemmy', lemmy, lemmy_score))
    if platform == 'lemmy':
        handle = human.get('username')
        activity = extra.get('activity_count', 0)
        score = min(100, 50 + activity)
        # NOTE(review): unlike the mastodon branch above, this does not bump the
        # score of an already-listed lemmy entry - confirm that is intentional
        if not any(a[0] == 'lemmy' for a in available):
            available.append(('lemmy', handle, score))
    # DISCORD - lower priority (hard to DM)
    discord = extra.get('discord') or contact.get('discord') or nested_extra.get('discord')
    if discord:
        available.append(('discord', discord, 20))
    # GITHUB ISSUE - for github users, score by repo activity
    if platform == 'github':
        top_repos = extra.get('top_repos', [])
        if top_repos:
            # top_repos entries may be plain names or dicts carrying a 'name' key
            repo = top_repos[0] if isinstance(top_repos[0], str) else top_repos[0].get('name', '')
            stars = extra.get('total_stars', 0)
            repos_count = extra.get('repos_count', 0)
            # active github user = higher issue score (capped at 60)
            gh_score = min(60, 20 + (stars // 100) + (repos_count // 5))
            if repo:
                # info is an "owner/repo" string; delivery code splits it later
                available.append(('github_issue', f"{human.get('username')}/{repo}", gh_score))
    # FORGE ISSUE - for self-hosted git users (gitea/forgejo/gitlab/sourcehut/codeberg)
    # these are HIGH SIGNAL users - they actually selfhost
    if platform and ':' in platform:
        # platform is encoded as "<type>:<instance>", e.g. "forgejo:git.example.org"
        platform_type, instance = platform.split(':', 1)
        if platform_type in ('gitea', 'forgejo', 'gogs', 'gitlab', 'sourcehut'):
            repos = extra.get('repos', [])
            if repos:
                repo = repos[0] if isinstance(repos[0], str) else repos[0].get('name', '')
                instance_url = extra.get('instance_url', '')
                if repo and instance_url:
                    # forge users get higher priority than github (they selfhost!)
                    # NOTE(review): github_issue can score up to 60 above, so a
                    # very active github user may still outrank this fixed 55
                    forge_score = 55
                    available.append(('forge_issue', {
                        'platform_type': platform_type,
                        'instance': instance,
                        'instance_url': instance_url,
                        'owner': human.get('username'),
                        'repo': repo
                    }, forge_score))
    # REDDIT - discovered people, use their other links
    if platform == 'reddit':
        reddit_activity = extra.get('reddit_activity', 0) or extra.get('activity_count', 0)
        # reddit users we reach via their external links (email, mastodon, etc)
        # boost their other methods if reddit is their main platform
        for i, (m, info, score) in enumerate(available):
            if m in ('email', 'mastodon', 'matrix', 'bluesky'):
                # boost score for reddit-discovered users' external contacts
                boost = min(30, reddit_activity // 3)
                available[i] = (m, info, score + boost)
    # sort by activity score (highest first)
    available.sort(key=lambda x: x[2], reverse=True)
    if not available:
        # nothing automatable - caller must handle this person by hand
        return 'manual', None, []
    best = available[0]
    # strip the scores; fallbacks keep the activity ordering
    fallbacks = [(m, i) for m, i, p in available[1:]]
    return best[0], best[1], fallbacks
def get_ranked_contact_methods(human):
    """return every (method, info) contact option for a human, best first.

    empty list when the only option is manual outreach.
    """
    best_method, best_info, rest = determine_contact_method(human)
    if best_method == 'manual':
        return []
    ranked = [(best_method, best_info)]
    ranked.extend(rest)
    return ranked

View file

@ -334,18 +334,24 @@ def determine_best_contact(human):
""" """
determine best contact method based on WHERE THEY'RE MOST ACTIVE determine best contact method based on WHERE THEY'RE MOST ACTIVE
uses activity-based selection from groq_draft module returns: (method, info, fallbacks)
uses activity-based selection - ranks by user's actual usage
""" """
from introd.groq_draft import determine_contact_method as activity_based_contact from introd.groq_draft import determine_contact_method as activity_based_contact
method, info = activity_based_contact(human) method, info, fallbacks = activity_based_contact(human)
# convert github_issue info to dict format for delivery # convert github_issue info to dict format for delivery
if method == 'github_issue' and isinstance(info, str) and '/' in info: def format_info(m, i):
parts = info.split('/', 1) if m == 'github_issue' and isinstance(i, str) and '/' in i:
return method, {'owner': parts[0], 'repo': parts[1]} parts = i.split('/', 1)
return {'owner': parts[0], 'repo': parts[1]}
return i
return method, info info = format_info(method, info)
fallbacks = [(m, format_info(m, i)) for m, i in fallbacks]
return method, info, fallbacks
def deliver_intro(match_data, intro_draft, dry_run=False): def deliver_intro(match_data, intro_draft, dry_run=False):
@ -362,8 +368,8 @@ def deliver_intro(match_data, intro_draft, dry_run=False):
if already_contacted(recipient_id): if already_contacted(recipient_id):
return False, "already contacted", None return False, "already contacted", None
# determine contact method # determine contact method with fallbacks
method, contact_info = determine_best_contact(recipient) method, contact_info, fallbacks = determine_best_contact(recipient)
log = load_delivery_log() log = load_delivery_log()
result = { result = {
@ -423,9 +429,60 @@ def deliver_intro(match_data, intro_draft, dry_run=False):
success = True success = True
error = "added to manual queue" error = "added to manual queue"
# if failed and we have fallbacks, try them
if not success and fallbacks:
for fallback_method, fallback_info in fallbacks:
result['fallback_attempts'] = result.get('fallback_attempts', [])
result['fallback_attempts'].append({
'method': fallback_method,
'contact_info': fallback_info
})
fb_success = False
fb_error = None
if fallback_method == 'email':
subject = f"someone you might want to know - connectd"
fb_success, fb_error = send_email(fallback_info, subject, intro_draft, dry_run)
elif fallback_method == 'mastodon':
fb_success, fb_error = send_mastodon_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'bluesky':
fb_success, fb_error = send_bluesky_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'matrix':
fb_success, fb_error = send_matrix_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'lemmy':
from scoutd.lemmy import send_lemmy_dm
fb_success, fb_error = send_lemmy_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'discord':
from scoutd.discord import send_discord_dm
fb_success, fb_error = send_discord_dm(fallback_info, intro_draft, dry_run)
elif fallback_method == 'github_issue':
owner = fallback_info.get('owner')
repo = fallback_info.get('repo')
title = "community introduction from connectd"
github_body = f"""hey {recipient.get('name') or recipient.get('username')},
{intro_draft}
---
*automated introduction from connectd*
"""
fb_success, fb_error = create_github_issue(owner, repo, title, github_body, dry_run)
if fb_success:
success = True
method = fallback_method
contact_info = fallback_info
error = None
result['fallback_succeeded'] = fallback_method
break
else:
result['fallback_attempts'][-1]['error'] = fb_error
# log result # log result
result['success'] = success result['success'] = success
result['error'] = error result['error'] = error
result['final_method'] = method
if success: if success:
log['sent'].append(result) log['sent'].append(result)

View file

@ -104,6 +104,54 @@ def draft_intro_with_llm(match_data: dict, recipient: str = 'a', dry_run: bool =
about_name = about_person.get('username', 'someone') about_name = about_person.get('username', 'someone')
about_bio = about_person.get('extra', {}).get('bio', '') about_bio = about_person.get('extra', {}).get('bio', '')
# extract contact info for about_person
about_extra = about_person.get('extra', {})
if isinstance(about_extra, str):
import json as _json
about_extra = _json.loads(about_extra) if about_extra else {}
about_contact = about_person.get('contact', {})
if isinstance(about_contact, str):
about_contact = _json.loads(about_contact) if about_contact else {}
# build contact link for about_person
about_platform = about_person.get('platform', '')
about_username = about_person.get('username', '')
contact_link = None
if about_platform == 'mastodon' and about_username:
if '@' in about_username:
parts = about_username.split('@')
if len(parts) >= 2:
contact_link = f"https://{parts[1]}/@{parts[0]}"
elif about_platform == 'github' and about_username:
contact_link = f"https://github.com/{about_username}"
elif about_extra.get('mastodon') or about_contact.get('mastodon'):
handle = about_extra.get('mastodon') or about_contact.get('mastodon')
if '@' in handle:
parts = handle.lstrip('@').split('@')
if len(parts) >= 2:
contact_link = f"https://{parts[1]}/@{parts[0]}"
elif about_extra.get('github') or about_contact.get('github'):
contact_link = f"https://github.com/{about_extra.get('github') or about_contact.get('github')}"
elif about_extra.get('email'):
contact_link = about_extra['email']
elif about_contact.get('email'):
contact_link = about_contact['email']
elif about_extra.get('website'):
contact_link = about_extra['website']
elif about_extra.get('external_links', {}).get('website'):
contact_link = about_extra['external_links']['website']
elif about_extra.get('extra', {}).get('website'):
contact_link = about_extra['extra']['website']
elif about_platform == 'reddit' and about_username:
contact_link = f"reddit.com/u/{about_username}"
if not contact_link:
contact_link = f"github.com/{about_username}" if about_username else "reach out via connectd"
# skip if no real contact method (just reddit or generic)
if contact_link.startswith('reddit.com') or contact_link == "reach out via connectd" or 'stackblitz' in contact_link:
return None, f"no real contact info for {about_name} - skipping draft"
# format the shared factors naturally # format the shared factors naturally
if reasons: if reasons:
factor = ', '.join(reasons[:3]) if len(reasons) > 1 else reasons[0] factor = ', '.join(reasons[:3]) if len(reasons) > 1 else reasons[0]
@ -131,24 +179,20 @@ key facts for this message:
- introducing them to: {about_name} - introducing them to: {about_name}
- their shared interests/values: {factor} - their shared interests/values: {factor}
- about {about_name}: {about_bio if about_bio else 'a builder like you'} - about {about_name}: {about_bio if about_bio else 'a builder like you'}
- HOW TO REACH {about_name}: {contact_link}
CRITICAL RULES - DO NOT SANITIZE: RULES:
1. use their name ({to_name}) throughout - make it personal 1. say their name ONCE at start, then use "you"
2. format the shared factor naturally like "your shared interest in X and Y" or "your work on X" 2. MUST include how to reach {about_name}: {contact_link}
3. this message is TO {to_name} ABOUT {about_name} - don't confuse this 3. lowercase, raw, emotional - follow the soul
4. keep it lowercase, raw, emotional, real 4. end with the contact link
5. no corporate speak, no "best regards", no "hi there"
6. DO NOT water down the message - keep the raw emotional energy
7. address them directly, call them to action
8. make them feel seen and not alone
9. end with rallying energy about building together
return ONLY the message body. signature is added separately.""" return ONLY the message body. signature is added separately."""
response = client.chat.completions.create( response = client.chat.completions.create(
model=GROQ_MODEL, model=GROQ_MODEL,
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
temperature=0.8, temperature=0.6,
max_tokens=1200 max_tokens=1200
) )
@ -220,3 +264,136 @@ if __name__ == "__main__":
else: else:
print(f"subject: {result['subject']}") print(f"subject: {result['subject']}")
print(f"\nbody:\n{result['draft_plain']}") print(f"\nbody:\n{result['draft_plain']}")
# contact method ranking - USAGE BASED
# we rank by where the person is MOST ACTIVE, not by our preference
def determine_contact_method(human):
    """
    determine ALL available contact methods, ranked by USER'S ACTIVITY.

    reads activity metrics from the human's 'extra'/'contact' blobs (either
    dicts or JSON-encoded strings) to decide where they're most engaged.

    returns: (best_method, best_info, fallbacks)
    where fallbacks is a list of (method, info) tuples in activity order;
    ('manual', None, []) when no usable method was found.
    """
    import json
    # stored blobs may still be JSON strings
    extra = human.get('extra', {})
    contact = human.get('contact', {})
    if isinstance(extra, str):
        extra = json.loads(extra) if extra else {}
    if isinstance(contact, str):
        contact = json.loads(contact) if contact else {}
    # some scrapers nest a second 'extra' dict inside extra
    nested_extra = extra.get('extra', {})
    platform = human.get('platform', '')
    # (method, info, activity_score) triples; sorted by score at the end
    available = []
    # === ACTIVITY SCORING ===
    # each method gets scored by how active the user is there
    # EMAIL - always medium priority (we cant measure activity)
    email = extra.get('email') or contact.get('email') or nested_extra.get('email')
    if email and '@' in str(email):
        available.append(('email', email, 50))  # baseline score
    # MASTODON - score by post count / followers
    mastodon = extra.get('mastodon') or contact.get('mastodon') or nested_extra.get('mastodon')
    if mastodon:
        masto_activity = extra.get('mastodon_posts', 0) or extra.get('statuses_count', 0)
        masto_score = min(100, 30 + (masto_activity // 10))  # 30 base + 1 per 10 posts
        available.append(('mastodon', mastodon, masto_score))
    # if they CAME FROM mastodon, thats their primary
    if platform == 'mastodon':
        handle = f"@{human.get('username')}"
        instance = human.get('instance') or extra.get('instance') or ''
        if instance:
            # full fediverse handle: @user@instance
            handle = f"@{human.get('username')}@{instance}"
        activity = extra.get('statuses_count', 0) or extra.get('activity_count', 0)
        score = min(100, 50 + (activity // 5))  # higher base since its their home
        # dont dupe
        if not any(a[0] == 'mastodon' for a in available):
            available.append(('mastodon', handle, score))
        else:
            # update score if this is higher
            for i, (m, info, s) in enumerate(available):
                if m == 'mastodon' and score > s:
                    available[i] = ('mastodon', handle, score)
    # MATRIX - score by presence (binary for now)
    matrix = extra.get('matrix') or contact.get('matrix') or nested_extra.get('matrix')
    if matrix and ':' in str(matrix):  # valid matrix ids look like @user:server
        available.append(('matrix', matrix, 40))
    # BLUESKY - score by followers/posts if available
    bluesky = extra.get('bluesky') or contact.get('bluesky') or nested_extra.get('bluesky')
    if bluesky:
        bsky_activity = extra.get('bluesky_posts', 0)
        bsky_score = min(100, 25 + (bsky_activity // 10))
        available.append(('bluesky', bluesky, bsky_score))
    # LEMMY - score by activity
    lemmy = extra.get('lemmy') or contact.get('lemmy') or nested_extra.get('lemmy')
    if lemmy:
        lemmy_activity = extra.get('lemmy_posts', 0) or extra.get('lemmy_comments', 0)
        lemmy_score = min(100, 30 + lemmy_activity)
        available.append(('lemmy', lemmy, lemmy_score))
    if platform == 'lemmy':
        handle = human.get('username')
        activity = extra.get('activity_count', 0)
        score = min(100, 50 + activity)
        # NOTE(review): unlike the mastodon branch, an already-listed lemmy
        # entry is not re-scored here - confirm that is intentional
        if not any(a[0] == 'lemmy' for a in available):
            available.append(('lemmy', handle, score))
    # DISCORD - lower priority (hard to DM)
    discord = extra.get('discord') or contact.get('discord') or nested_extra.get('discord')
    if discord:
        available.append(('discord', discord, 20))
    # GITHUB ISSUE - for github users, score by repo activity
    if platform == 'github':
        top_repos = extra.get('top_repos', [])
        if top_repos:
            # top_repos entries may be plain names or dicts with a 'name' key
            repo = top_repos[0] if isinstance(top_repos[0], str) else top_repos[0].get('name', '')
            stars = extra.get('total_stars', 0)
            repos_count = extra.get('repos_count', 0)
            # active github user = higher issue score (capped at 60)
            gh_score = min(60, 20 + (stars // 100) + (repos_count // 5))
            if repo:
                # info is an "owner/repo" string; delivery code splits it later
                available.append(('github_issue', f"{human.get('username')}/{repo}", gh_score))
    # REDDIT - discovered people, use their other links
    if platform == 'reddit':
        reddit_activity = extra.get('reddit_activity', 0) or extra.get('activity_count', 0)
        # reddit users we reach via their external links (email, mastodon, etc)
        # boost their other methods if reddit is their main platform
        for i, (m, info, score) in enumerate(available):
            if m in ('email', 'mastodon', 'matrix', 'bluesky'):
                # boost score for reddit-discovered users' external contacts
                boost = min(30, reddit_activity // 3)
                available[i] = (m, info, score + boost)
    # sort by activity score (highest first)
    available.sort(key=lambda x: x[2], reverse=True)
    if not available:
        # nothing automatable - caller must handle this person by hand
        return 'manual', None, []
    best = available[0]
    # strip the scores; fallbacks keep the activity ordering
    fallbacks = [(m, i) for m, i, p in available[1:]]
    return best[0], best[1], fallbacks
def get_ranked_contact_methods(human):
    """all contact methods for a human, ordered by measured activity (best first).

    returns [] when no automated contact method exists ('manual').
    """
    method, info, fallbacks = determine_contact_method(human)
    if method == 'manual':
        return []
    ordered = [(method, info)]
    for pair in fallbacks:
        ordered.append(pair)
    return ordered

View file

@ -1,15 +1,20 @@
""" """
matchd/overlap.py - find pairs with alignment matchd/overlap.py - find pairs with alignment
CRITICAL: blocks users with disqualifying negative signals (maga, conspiracy, conservative)
""" """
import json import json
from .fingerprint import fingerprint_similarity from .fingerprint import fingerprint_similarity
# signals that HARD BLOCK matching - no exceptions
DISQUALIFYING_SIGNALS = {'maga', 'conspiracy', 'conservative', 'antivax', 'sovcit'}
def find_overlap(human_a, human_b, fp_a=None, fp_b=None): def find_overlap(human_a, human_b, fp_a=None, fp_b=None):
""" """
analyze overlap between two humans analyze overlap between two humans
returns overlap details: score, shared values, complementary skills returns None if either has disqualifying signals
""" """
# parse stored json if needed # parse stored json if needed
signals_a = human_a.get('signals', []) signals_a = human_a.get('signals', [])
@ -20,13 +25,49 @@ def find_overlap(human_a, human_b, fp_a=None, fp_b=None):
if isinstance(signals_b, str): if isinstance(signals_b, str):
signals_b = json.loads(signals_b) signals_b = json.loads(signals_b)
# === HARD BLOCK: check for disqualifying negative signals ===
neg_a = human_a.get('negative_signals', [])
if isinstance(neg_a, str):
neg_a = json.loads(neg_a) if neg_a else []
neg_b = human_b.get('negative_signals', [])
if isinstance(neg_b, str):
neg_b = json.loads(neg_b) if neg_b else []
# also check 'reasons' field for WARNING entries
reasons_a = human_a.get('reasons', '')
if isinstance(reasons_a, str) and 'WARNING' in reasons_a:
# extract signals from WARNING: x, y, z
import re
warn_match = re.search(r'WARNING[:\s]+([^"\]]+)', reasons_a)
if warn_match:
warn_signals = [s.strip().lower() for s in warn_match.group(1).split(',')]
neg_a = list(set(neg_a + warn_signals))
reasons_b = human_b.get('reasons', '')
if isinstance(reasons_b, str) and 'WARNING' in reasons_b:
import re
warn_match = re.search(r'WARNING[:\s]+([^"\]]+)', reasons_b)
if warn_match:
warn_signals = [s.strip().lower() for s in warn_match.group(1).split(',')]
neg_b = list(set(neg_b + warn_signals))
# block if either has disqualifying signals
disq_a = set(neg_a) & DISQUALIFYING_SIGNALS
disq_b = set(neg_b) & DISQUALIFYING_SIGNALS
if disq_a:
return None # blocked
if disq_b:
return None # blocked
extra_a = human_a.get('extra', {}) extra_a = human_a.get('extra', {})
if isinstance(extra_a, str): if isinstance(extra_a, str):
extra_a = json.loads(extra_a) extra_a = json.loads(extra_a) if extra_a else {}
extra_b = human_b.get('extra', {}) extra_b = human_b.get('extra', {})
if isinstance(extra_b, str): if isinstance(extra_b, str):
extra_b = json.loads(extra_b) extra_b = json.loads(extra_b) if extra_b else {}
# shared signals # shared signals
shared_signals = list(set(signals_a) & set(signals_b)) shared_signals = list(set(signals_a) & set(signals_b))
@ -36,7 +77,7 @@ def find_overlap(human_a, human_b, fp_a=None, fp_b=None):
topics_b = set(extra_b.get('topics', [])) topics_b = set(extra_b.get('topics', []))
shared_topics = list(topics_a & topics_b) shared_topics = list(topics_a & topics_b)
# complementary skills (what one has that the other doesn't) # complementary skills
langs_a = set(extra_a.get('languages', {}).keys()) langs_a = set(extra_a.get('languages', {}).keys())
langs_b = set(extra_b.get('languages', {}).keys()) langs_b = set(extra_b.get('languages', {}).keys())
complementary_langs = list((langs_a - langs_b) | (langs_b - langs_a)) complementary_langs = list((langs_a - langs_b) | (langs_b - langs_a))
@ -68,38 +109,30 @@ def find_overlap(human_a, human_b, fp_a=None, fp_b=None):
# calculate overlap score # calculate overlap score
base_score = 0 base_score = 0
# shared values (most important)
base_score += len(shared_signals) * 10 base_score += len(shared_signals) * 10
# shared interests
base_score += len(shared_topics) * 5 base_score += len(shared_topics) * 5
# complementary skills bonus (they can help each other)
if complementary_langs: if complementary_langs:
base_score += min(len(complementary_langs), 5) * 3 base_score += min(len(complementary_langs), 5) * 3
# geographic bonus
if geographic_match: if geographic_match:
base_score += 20 base_score += 20
# fingerprint similarity if available
fp_score = 0 fp_score = 0
if fp_a and fp_b: if fp_a and fp_b:
fp_score = fingerprint_similarity(fp_a, fp_b) * 50 fp_score = fingerprint_similarity(fp_a, fp_b) * 50
total_score = base_score + fp_score total_score = base_score + fp_score
# build reasons
overlap_reasons = [] overlap_reasons = []
if shared_signals: if shared_signals:
overlap_reasons.append(f"shared values: {', '.join(shared_signals[:5])}") overlap_reasons.append(f"shared: {', '.join(shared_signals[:5])}")
if shared_topics: if shared_topics:
overlap_reasons.append(f"shared interests: {', '.join(shared_topics[:5])}") overlap_reasons.append(f"interests: {', '.join(shared_topics[:5])}")
if geo_reason: if geo_reason:
overlap_reasons.append(geo_reason) overlap_reasons.append(geo_reason)
if complementary_langs: if complementary_langs:
overlap_reasons.append(f"complementary skills: {', '.join(complementary_langs[:5])}") overlap_reasons.append(f"complementary: {', '.join(complementary_langs[:5])}")
return { return {
'overlap_score': total_score, 'overlap_score': total_score,
@ -114,36 +147,28 @@ def find_overlap(human_a, human_b, fp_a=None, fp_b=None):
def is_same_person(human_a, human_b): def is_same_person(human_a, human_b):
""" """check if two records might be the same person (cross-platform)"""
check if two records might be the same person (cross-platform)
"""
# same platform = definitely different records
if human_a['platform'] == human_b['platform']: if human_a['platform'] == human_b['platform']:
return False return False
# check username similarity
user_a = human_a.get('username', '').lower().split('@')[0] user_a = human_a.get('username', '').lower().split('@')[0]
user_b = human_b.get('username', '').lower().split('@')[0] user_b = human_b.get('username', '').lower().split('@')[0]
if user_a == user_b: if user_a == user_b:
return True return True
# check if github username matches
contact_a = human_a.get('contact', {}) contact_a = human_a.get('contact', {})
contact_b = human_b.get('contact', {}) contact_b = human_b.get('contact', {})
if isinstance(contact_a, str): if isinstance(contact_a, str):
contact_a = json.loads(contact_a) contact_a = json.loads(contact_a) if contact_a else {}
if isinstance(contact_b, str): if isinstance(contact_b, str):
contact_b = json.loads(contact_b) contact_b = json.loads(contact_b) if contact_b else {}
# github cross-reference
if contact_a.get('github') and contact_a.get('github') == contact_b.get('github'): if contact_a.get('github') and contact_a.get('github') == contact_b.get('github'):
return True return True
if contact_a.get('github') == user_b or contact_b.get('github') == user_a: if contact_a.get('github') == user_b or contact_b.get('github') == user_a:
return True return True
# email cross-reference
if contact_a.get('email') and contact_a.get('email') == contact_b.get('email'): if contact_a.get('email') and contact_a.get('email') == contact_b.get('email'):
return True return True

491
scoutd/forges.py Normal file
View file

@ -0,0 +1,491 @@
"""
scoutd/forges.py - scrape self-hosted git forges
these people = highest signal. they actually selfhost.
supported platforms:
- gitea (and forks like forgejo)
- gogs
- gitlab ce
- sourcehut
- codeberg (gitea-based)
scrapes users AND extracts contact info for outreach.
"""
import os
import re
import json
import time
import requests
from typing import List, Dict, Optional, Tuple
from datetime import datetime
from .signals import analyze_text
# rate limiting
# NOTE(review): not referenced by the scraping helpers visible in this module -
# confirm callers sleep between requests using this value
REQUEST_DELAY = 1.0

# known public instances to scrape
# format: (name, url, platform_type)
KNOWN_INSTANCES = [
    # === PUBLIC INSTANCES ===
    # local/private instances can be added via LOCAL_FORGE_INSTANCES env var
    # codeberg (largest gitea instance)
    ('codeberg', 'https://codeberg.org', 'gitea'),
    # sourcehut
    ('sourcehut', 'https://sr.ht', 'sourcehut'),
    # notable gitea/forgejo instances
    ('gitea.com', 'https://gitea.com', 'gitea'),
    ('git.disroot.org', 'https://git.disroot.org', 'gitea'),
    ('git.gay', 'https://git.gay', 'forgejo'),
    ('git.envs.net', 'https://git.envs.net', 'forgejo'),
    ('tildegit', 'https://tildegit.org', 'gitea'),
    ('git.sr.ht', 'https://git.sr.ht', 'sourcehut'),
    # gitlab ce instances
    ('framagit', 'https://framagit.org', 'gitlab'),
    ('gitlab.gnome.org', 'https://gitlab.gnome.org', 'gitlab'),
    ('invent.kde.org', 'https://invent.kde.org', 'gitlab'),
    ('salsa.debian.org', 'https://salsa.debian.org', 'gitlab'),
]

# headers sent with every request - identifies the scraper honestly
HEADERS = {
    'User-Agent': 'connectd/1.0 (finding builders with aligned values)',
    'Accept': 'application/json',
}
def log(msg):
    """print a progress/diagnostic line, namespaced for the forges scraper."""
    print(f"  forges: {msg}")
# === GITEA/FORGEJO/GOGS API ===
# these share the same API structure
def scrape_gitea_users(instance_url: str, limit: int = 100) -> List[Dict]:
    """
    scrape users from a gitea/forgejo/gogs instance.

    tries the /api/v1/users/search API first (gitea 1.x+); falls back to
    parsing the /explore/users HTML page when the API yields nothing.

    instance_url: base URL of the instance, no trailing slash
    limit: maximum number of users to return

    returns a list of user dicts; every entry has a non-empty 'username'
    (fix: the API path previously appended {'username': None} entries when a
    record had neither 'login' nor 'username', breaking downstream code).
    """
    users = []
    # try API first (gitea 1.x+)
    try:
        api_url = f"{instance_url}/api/v1/users/search"
        params = {'q': '', 'limit': min(limit, 50)}
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=15)
        if resp.status_code == 200:
            data = resp.json()
            # response shape varies by version: {'data': [...]}, {'users': [...]}, or bare list
            user_list = data.get('data', []) or data.get('users', []) or data
            if isinstance(user_list, list):
                for u in user_list[:limit]:
                    username = u.get('login') or u.get('username')
                    if not username:
                        # skip malformed entries - downstream code keys on username
                        continue
                    users.append({
                        'username': username,
                        'full_name': u.get('full_name'),
                        'avatar': u.get('avatar_url'),
                        'website': u.get('website'),
                        'location': u.get('location'),
                        'bio': u.get('description') or u.get('bio'),
                    })
            log(f"  got {len(users)} users via API")
    except Exception as e:
        log(f"  API failed: {e}")
    # fallback: scrape explore page
    if not users:
        try:
            explore_url = f"{instance_url}/explore/users"
            resp = requests.get(explore_url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                # parse HTML for usernames (best-effort; theme-dependent)
                usernames = re.findall(r'href="/([^/"]+)"[^>]*class="[^"]*user[^"]*"', resp.text)
                usernames += re.findall(r'<a[^>]+href="/([^/"]+)"[^>]*title="[^"]*"', resp.text)
                usernames = list(set(usernames))[:limit]
                for username in usernames:
                    # filter out gitea's reserved route names that match the regex
                    if username and not username.startswith(('explore', 'api', 'user', 'repo')):
                        users.append({'username': username})
            log(f"  got {len(users)} users via scrape")
        except Exception as e:
            log(f"  scrape failed: {e}")
    return users
def get_gitea_user_details(instance_url: str, username: str) -> Optional[Dict]:
    """
    get detailed user info from gitea/forgejo/gogs via /api/v1/users/{username}.

    returns a profile dict, or None on any error or non-200 response.
    'email' is only populated when the instance exposes it publicly.
    """
    try:
        # API endpoint
        api_url = f"{instance_url}/api/v1/users/{username}"
        resp = requests.get(api_url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            u = resp.json()
            return {
                'username': u.get('login') or u.get('username'),
                'full_name': u.get('full_name'),
                'email': u.get('email'),  # may be hidden
                'website': u.get('website'),
                'location': u.get('location'),
                'bio': u.get('description') or u.get('bio'),
                'created': u.get('created'),
                'followers': u.get('followers_count', 0),
                'following': u.get('following_count', 0),
            }
    except Exception:
        # was a bare `except:` - narrowed so Ctrl-C / SystemExit still propagate;
        # network/JSON errors against random instances are expected -> "no data"
        pass
    return None
def get_gitea_user_repos(instance_url: str, username: str, limit: int = 10) -> List[Dict]:
    """get up to `limit` of a user's repos from gitea/forgejo/gogs.

    Returns an empty list on any network/API failure (best-effort).
    """
    repos = []
    try:
        api_url = f"{instance_url}/api/v1/users/{username}/repos"
        resp = requests.get(api_url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            for r in resp.json()[:limit]:
                repos.append({
                    'name': r.get('name'),
                    'full_name': r.get('full_name'),
                    'description': r.get('description'),
                    'stars': r.get('stars_count', 0),
                    'forks': r.get('forks_count', 0),
                    'language': r.get('language'),
                    'updated': r.get('updated_at'),
                })
    # narrowed from a bare `except:` so ctrl-c / SystemExit still propagate
    except (requests.RequestException, ValueError):
        pass
    return repos
# === GITLAB CE API ===
def scrape_gitlab_users(instance_url: str, limit: int = 100) -> List[Dict]:
    """scrape users from a gitlab ce instance"""
    users = []
    try:
        # gitlab v4 API - public users endpoint (per_page caps at 100)
        endpoint = f"{instance_url}/api/v4/users"
        query = {'per_page': min(limit, 100), 'active': True}
        resp = requests.get(endpoint, params=query, headers=HEADERS, timeout=15)
        if resp.status_code == 200:
            users = [
                {
                    'username': entry.get('username'),
                    'full_name': entry.get('name'),
                    'avatar': entry.get('avatar_url'),
                    'website': entry.get('website_url'),
                    'location': entry.get('location'),
                    'bio': entry.get('bio'),
                    'public_email': entry.get('public_email'),
                }
                for entry in resp.json()[:limit]
            ]
        log(f" got {len(users)} gitlab users")
    except Exception as e:
        log(f" gitlab API failed: {e}")
    return users
def get_gitlab_user_details(instance_url: str, username: str) -> Optional[Dict]:
    """get detailed gitlab user info by exact username lookup.

    Returns None when the user does not exist or the request fails.
    """
    try:
        api_url = f"{instance_url}/api/v4/users"
        params = {'username': username}
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            users = resp.json()
            if users:
                # the exact-username filter returns a list; take the first hit
                u = users[0]
                return {
                    'username': u.get('username'),
                    'full_name': u.get('name'),
                    'email': u.get('public_email'),
                    'website': u.get('website_url'),
                    'location': u.get('location'),
                    'bio': u.get('bio'),
                    'created': u.get('created_at'),
                }
    # narrowed from a bare `except:` (which also trapped KeyboardInterrupt)
    except (requests.RequestException, ValueError):
        pass
    return None
def get_gitlab_user_projects(instance_url: str, username: str, limit: int = 10) -> List[Dict]:
    """get up to `limit` of a user's projects from gitlab.

    gitlab's projects endpoint is keyed by numeric user id, so the
    username is resolved first. Returns [] on any failure (best-effort).
    """
    repos = []
    try:
        # first resolve username -> numeric user id
        api_url = f"{instance_url}/api/v4/users"
        params = {'username': username}
        resp = requests.get(api_url, params=params, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            matches = resp.json()  # parse the body once (was parsed twice)
            if matches:
                user_id = matches[0].get('id')
                # get projects
                proj_url = f"{instance_url}/api/v4/users/{user_id}/projects"
                resp = requests.get(proj_url, headers=HEADERS, timeout=10)
                if resp.status_code == 200:
                    for p in resp.json()[:limit]:
                        repos.append({
                            'name': p.get('name'),
                            'full_name': p.get('path_with_namespace'),
                            'description': p.get('description'),
                            'stars': p.get('star_count', 0),
                            'forks': p.get('forks_count', 0),
                            'updated': p.get('last_activity_at'),
                        })
    # narrowed from a bare `except:` so interrupts propagate
    except (requests.RequestException, ValueError):
        pass
    return repos
# === SOURCEHUT API ===
def scrape_sourcehut_users(limit: int = 100) -> List[Dict]:
    """
    scrape users from sourcehut.
    sourcehut doesn't have a public user list, so we scrape from:
    - recent commits
    - mailing lists
    - project pages
    """
    users = []
    seen = set()
    try:
        # scrape from git.sr.ht explore; project paths look like /~username/repo
        resp = requests.get('https://git.sr.ht/projects', headers=HEADERS, timeout=15)
        if resp.status_code == 200:
            for handle in re.findall(r'href="/~([^/"]+)', resp.text):
                if handle in seen:
                    continue
                seen.add(handle)
                users.append({'username': handle})
                if len(users) >= limit:
                    break
        log(f" got {len(users)} sourcehut users")
    except Exception as e:
        log(f" sourcehut scrape failed: {e}")
    return users
def get_sourcehut_user_details(username: str) -> Optional[Dict]:
    """scrape a sourcehut user's public profile page.

    Returns {'username', 'bio', 'profile_url'} or None on failure.
    """
    try:
        # scrape profile page (sourcehut has no public profile API here)
        profile_url = f"https://sr.ht/~{username}"
        resp = requests.get(profile_url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            bio = ''
            # extract bio: first paragraph inside the container div
            bio_match = re.search(r'<div class="container">\s*<p>([^<]+)</p>', resp.text)
            if bio_match:
                bio = bio_match.group(1).strip()
            return {
                'username': username,
                'bio': bio,
                'profile_url': profile_url,
            }
    # narrowed from a bare `except:` (which also swallowed KeyboardInterrupt)
    except requests.RequestException:
        pass
    return None
def get_sourcehut_user_repos(username: str, limit: int = 10) -> List[Dict]:
    """scrape up to `limit` repo names from a sourcehut user's git page.

    Returns [] on any failure (best-effort).
    """
    repos = []
    try:
        git_url = f"https://git.sr.ht/~{username}"
        resp = requests.get(git_url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            # escape the username so regex metacharacters in it cannot
            # break or widen the pattern (it was interpolated raw before)
            repo_matches = re.findall(rf'href="/~{re.escape(username)}/([^"]+)"', resp.text)
            for repo in repo_matches[:limit]:
                # skip git-internal paths that match the same href shape
                if repo and not repo.startswith(('refs', 'log', 'tree')):
                    repos.append({
                        'name': repo,
                        'full_name': f"~{username}/{repo}",
                    })
    # narrowed from a bare `except:` so interrupts propagate
    except requests.RequestException:
        pass
    return repos
# === UNIFIED SCRAPER ===
def scrape_forge(instance_name: str, instance_url: str, platform_type: str, limit: int = 50) -> List[Dict]:
    """
    scrape users from any forge type.
    returns list of human dicts ready for database.

    instance_name: short label for the instance (e.g. 'codeberg')
    instance_url: base URL of the instance (no trailing slash)
    platform_type: 'gitea' | 'forgejo' | 'gogs' | 'gitlab' | 'sourcehut';
        anything else logs a warning and returns []
    limit: max number of users to list from the instance
    """
    log(f"scraping {instance_name} ({platform_type})...")
    humans = []
    # get user list based on platform type; each branch also binds the
    # matching per-user detail/repo fetchers with instance_url closed over
    if platform_type in ('gitea', 'forgejo', 'gogs'):
        users = scrape_gitea_users(instance_url, limit)
        get_details = lambda u: get_gitea_user_details(instance_url, u)
        get_repos = lambda u: get_gitea_user_repos(instance_url, u)
    elif platform_type == 'gitlab':
        users = scrape_gitlab_users(instance_url, limit)
        get_details = lambda u: get_gitlab_user_details(instance_url, u)
        get_repos = lambda u: get_gitlab_user_projects(instance_url, u)
    elif platform_type == 'sourcehut':
        users = scrape_sourcehut_users(limit)
        get_details = get_sourcehut_user_details
        get_repos = get_sourcehut_user_repos
    else:
        log(f" unknown platform type: {platform_type}")
        return []
    for user in users:
        username = user.get('username')
        if not username:
            continue
        time.sleep(REQUEST_DELAY)  # throttle per-user requests
        # get detailed info; merged over the listing data when available
        details = get_details(username)
        if details:
            user.update(details)
        # get repos
        repos = get_repos(username)
        # build human record
        bio = user.get('bio', '') or ''
        website = user.get('website', '') or ''
        # analyze signals from bio (and website text) -> (score, signals, reasons)
        score, signals, reasons = analyze_text(bio + ' ' + website)
        # BOOST: self-hosted git = highest signal
        score += 25
        signals.append('selfhosted_git')
        reasons.append(f'uses self-hosted git ({instance_name})')
        # extract contact info (email only kept if it looks like an address)
        contact = {}
        email = user.get('email') or user.get('public_email')
        if email and '@' in email:
            contact['email'] = email
        if website:
            contact['website'] = website
        # build human dict; list-valued fields are JSON-serialized for storage
        human = {
            'platform': f'{platform_type}:{instance_name}',
            'username': username,
            'name': user.get('full_name'),
            'bio': bio,
            # sourcehut profiles live on sr.ht, not the git host itself
            'url': f"{instance_url}/{username}" if platform_type != 'sourcehut' else f"https://sr.ht/~{username}",
            'score': score,
            'signals': json.dumps(signals),
            'reasons': json.dumps(reasons),
            'contact': json.dumps(contact),
            'extra': json.dumps({
                'instance': instance_name,
                'instance_url': instance_url,
                'platform_type': platform_type,
                'repos': repos[:5],  # cap stored repo list
                'followers': user.get('followers', 0),
                'email': email,
                'website': website,
            }),
            # anyone with at least one repo is classified a builder
            'user_type': 'builder' if repos else 'none',
        }
        humans.append(human)
        log(f" {username}: score={score}, repos={len(repos)}")
    return humans
def scrape_all_forges(limit_per_instance: int = 30) -> List[Dict]:
    """scrape all known forge instances"""
    all_humans = []
    for instance_name, instance_url, platform_type in KNOWN_INSTANCES:
        # one bad instance must not abort the whole sweep
        try:
            found = scrape_forge(instance_name, instance_url, platform_type, limit_per_instance)
            all_humans.extend(found)
            log(f" {instance_name}: {len(found)} humans")
        except Exception as e:
            log(f" {instance_name} failed: {e}")
        time.sleep(2)  # be nice between instances
    log(f"total: {len(all_humans)} humans from {len(KNOWN_INSTANCES)} forges")
    return all_humans
# === OUTREACH METHODS ===
def can_message_on_forge(instance_url: str, platform_type: str) -> bool:
    """check if we can send messages on this forge"""
    # direct outreach only works where the platform offers a messaging
    # surface: gitlab (merge request comments) and sourcehut (mailing
    # lists). gitea/forgejo/gogs have no DMs at all.
    messageable = {'gitlab', 'sourcehut'}
    return platform_type in messageable
def open_forge_issue(instance_url: str, platform_type: str,
                     owner: str, repo: str, title: str, body: str) -> Tuple[bool, str]:
    """
    open an issue on a forge as outreach method.
    requires API token for authenticated requests.
    """
    # stub: per-instance API tokens are not wired up yet, and email is
    # the preferred outreach channel anyway, so always report failure
    return (False, "forge issue creation not implemented yet")
# === DISCOVERY ===
def discover_forge_instances() -> List[Tuple[str, str, str]]:
    """
    discover new forge instances from:
    - fediverse (they often announce)
    - known lists
    - DNS patterns
    returns list of (name, url, platform_type)
    """
    # currently just a copy of the hardcoded list; real discovery
    # (forgejo issue mentions, fediverse git.* domains, awesome lists)
    # is still to be implemented
    discovered = []
    discovered.extend(KNOWN_INSTANCES)
    return discovered
if __name__ == '__main__':
    # smoke test against a real public instance
    print("testing forge scrapers...")
    sample = scrape_forge('codeberg', 'https://codeberg.org', 'gitea', limit=5)
    print(f"codeberg: {len(sample)} humans")
    for record in sample[:2]:
        print(f" {record['username']}: {record['score']} - {record.get('signals')}")

View file

@ -103,6 +103,15 @@ PLATFORM_PATTERNS = {
'devto': [ 'devto': [
(r'https?://dev\.to/([^/?#]+)', lambda m: m.group(1)), (r'https?://dev\.to/([^/?#]+)', lambda m: m.group(1)),
], ],
# reddit/lobsters
'reddit': [
(r'https?://(?:www\.)?reddit\.com/u(?:ser)?/([^/?#]+)', lambda m: f"u/{m.group(1)}"),
(r'https?://(?:old|new)\.reddit\.com/u(?:ser)?/([^/?#]+)', lambda m: f"u/{m.group(1)}"),
],
'lobsters': [
(r'https?://lobste\.rs/u/([^/?#]+)', lambda m: m.group(1)),
],
# funding # funding
'kofi': [ 'kofi': [

View file

@ -1,24 +1,14 @@
""" """
scoutd/reddit.py - reddit discovery (DISCOVERY ONLY, NOT OUTREACH) scoutd/reddit.py - reddit discovery with TAVILY web search
reddit is a SIGNAL SOURCE, not a contact channel. CRITICAL: always quote usernames in tavily searches to avoid fuzzy matching
flow:
1. scrape reddit for users active in target subs
2. extract their reddit profile
3. look for links TO other platforms (github, mastodon, website, etc.)
4. add to scout database with reddit as signal source
5. reach out via their OTHER platforms, never reddit
if reddit user has no external links:
- add to manual_queue with note "reddit-only, needs manual review"
also detects lost builders - stuck in learnprogramming for years, imposter syndrome, etc.
""" """
import requests import requests
import json import json
import time import time
import re import re
import os
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from collections import defaultdict from collections import defaultdict
@ -35,43 +25,14 @@ from .lost import (
HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'} HEADERS = {'User-Agent': 'connectd:v1.0 (community discovery)'}
CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit' CACHE_DIR = Path(__file__).parent.parent / 'db' / 'cache' / 'reddit'
# patterns for extracting external platform links GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
PLATFORM_PATTERNS = { TAVILY_API_KEY = os.getenv('TAVILY_API_KEY', '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')
'github': [
r'github\.com/([a-zA-Z0-9_-]+)',
r'gh:\s*@?([a-zA-Z0-9_-]+)',
],
'mastodon': [
r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
r'mastodon\.social/@([a-zA-Z0-9_]+)',
r'fosstodon\.org/@([a-zA-Z0-9_]+)',
r'hachyderm\.io/@([a-zA-Z0-9_]+)',
r'tech\.lgbt/@([a-zA-Z0-9_]+)',
],
'twitter': [
r'twitter\.com/([a-zA-Z0-9_]+)',
r'x\.com/([a-zA-Z0-9_]+)',
r'(?:^|\s)@([a-zA-Z0-9_]{1,15})(?:\s|$)', # bare @handle
],
'bluesky': [
r'bsky\.app/profile/([a-zA-Z0-9_.-]+)',
r'([a-zA-Z0-9_-]+)\.bsky\.social',
],
'website': [
r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)',
],
'matrix': [
r'@([a-zA-Z0-9_-]+):([a-zA-Z0-9.-]+)',
],
}
def _api_get(url, params=None): def _api_get(url, params=None, headers=None):
"""rate-limited request"""
cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}" cache_key = f"{url}_{json.dumps(params or {}, sort_keys=True)}"
cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json" cache_file = CACHE_DIR / f"{hash(cache_key) & 0xffffffff}.json"
CACHE_DIR.mkdir(parents=True, exist_ok=True) CACHE_DIR.mkdir(parents=True, exist_ok=True)
if cache_file.exists(): if cache_file.exists():
try: try:
data = json.loads(cache_file.read_text()) data = json.loads(cache_file.read_text())
@ -79,142 +40,263 @@ def _api_get(url, params=None):
return data.get('_data') return data.get('_data')
except: except:
pass pass
time.sleep(1)
time.sleep(2) # reddit rate limit req_headers = {**HEADERS, **(headers or {})}
try: try:
resp = requests.get(url, headers=HEADERS, params=params, timeout=30) resp = requests.get(url, headers=req_headers, params=params, timeout=30)
resp.raise_for_status() resp.raise_for_status()
result = resp.json() result = resp.json()
cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result})) cache_file.write_text(json.dumps({'_cached_at': time.time(), '_data': result}))
return result return result
except requests.exceptions.RequestException as e: except:
print(f" reddit api error: {e}")
return None return None
def extract_external_links(text): def tavily_search(query, max_results=10):
"""extract links to other platforms from text""" if not TAVILY_API_KEY:
links = {} return []
try:
resp = requests.post(
'https://api.tavily.com/search',
json={'api_key': TAVILY_API_KEY, 'query': query, 'max_results': max_results},
timeout=30
)
if resp.status_code == 200:
return resp.json().get('results', [])
except Exception as e:
print(f" tavily error: {e}")
return []
def extract_links_from_text(text, username=None):
found = {}
if not text: if not text:
return links return found
text_lower = text.lower()
username_lower = username.lower() if username else None
for platform, patterns in PLATFORM_PATTERNS.items(): # email
for pattern in patterns: for email in re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text):
matches = re.findall(pattern, text, re.IGNORECASE) if any(x in email.lower() for x in ['noreply', 'example', '@reddit', 'info@', 'support@', 'contact@', 'admin@']):
if matches: continue
if platform == 'mastodon' and isinstance(matches[0], tuple): if username_lower and username_lower in email.lower():
# full fediverse handle found['email'] = email
links[platform] = f"@{matches[0][0]}@{matches[0][1]}"
elif platform == 'matrix' and isinstance(matches[0], tuple):
links[platform] = f"@{matches[0][0]}:{matches[0][1]}"
elif platform == 'website':
# skip reddit/imgur/etc
for match in matches:
if not any(x in match.lower() for x in ['reddit', 'imgur', 'redd.it', 'i.redd']):
links[platform] = f"https://{match}"
break break
else: if 'email' not in found:
links[platform] = matches[0] found['email'] = email
# github
for gh in re.findall(r'github\.com/([a-zA-Z0-9_-]+)', text):
if gh.lower() in ['topics', 'explore', 'trending', 'sponsors', 'orgs']:
continue
if username_lower and gh.lower() == username_lower:
found['github'] = gh
break break
return links # mastodon
masto = re.search(r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text)
if masto:
found['mastodon'] = f"@{masto.group(1)}@{masto.group(2)}"
for inst in ['mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt']:
m = re.search(f'{inst}/@([a-zA-Z0-9_]+)', text)
if m:
found['mastodon'] = f"@{m.group(1)}@{inst}"
break
# bluesky
bsky = re.search(r'bsky\.app/profile/([a-zA-Z0-9_.-]+)', text)
if bsky:
found['bluesky'] = bsky.group(1)
# twitter
tw = re.search(r'(?:twitter|x)\.com/([a-zA-Z0-9_]+)', text)
if tw and tw.group(1).lower() not in ['home', 'explore', 'search']:
found['twitter'] = tw.group(1)
# linkedin
li = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', text)
if li:
found['linkedin'] = f"https://linkedin.com/in/{li.group(1)}"
# twitch
twitch = re.search(r'twitch\.tv/([a-zA-Z0-9_]+)', text)
if twitch:
found['twitch'] = f"https://twitch.tv/{twitch.group(1)}"
# itch.io
itch = re.search(r'itch\.io/profile/([a-zA-Z0-9_-]+)', text)
if itch:
found['itch'] = f"https://itch.io/profile/{itch.group(1)}"
# website
for url in re.findall(r'https?://([a-zA-Z0-9_-]+\.[a-zA-Z]{2,}[a-zA-Z0-9./_-]*)', text):
skip = ['reddit', 'imgur', 'google', 'facebook', 'twitter', 'youtube', 'wikipedia', 'amazon']
if not any(x in url.lower() for x in skip):
if username_lower and username_lower in url.lower():
found['website'] = f"https://{url}"
break
if 'website' not in found:
found['website'] = f"https://{url}"
return found
def cross_platform_discovery(username, full_text=''):
"""
search the ENTIRE internet using TAVILY.
CRITICAL: always quote username to avoid fuzzy matching!
"""
found = {}
all_content = full_text
username_lower = username.lower()
print(f" 🔍 cross-platform search for {username}...")
# ALWAYS QUOTE THE USERNAME - critical for exact matching
searches = [
f'"{username}"', # just username, quoted
f'"{username}" github', # github
f'"{username}" developer programmer', # dev context
f'"{username}" email contact', # contact
f'"{username}" mastodon', # fediverse
]
for query in searches:
print(f" 🌐 tavily: {query}")
results = tavily_search(query, max_results=5)
for result in results:
url = result.get('url', '').lower()
title = result.get('title', '')
content = result.get('content', '')
combined = f"{url} {title} {content}"
# validate username appears
if username_lower not in combined.lower():
continue
all_content += f" {combined}"
# extract from URL directly
if f'github.com/{username_lower}' in url and not found.get('github'):
found['github'] = username
print(f" ✓ github: {username}")
if f'twitch.tv/{username_lower}' in url and not found.get('twitch'):
found['twitch'] = f"https://twitch.tv/{username}"
print(f" ✓ twitch")
if 'itch.io/profile/' in url and username_lower in url and not found.get('itch'):
found['itch'] = url if url.startswith('http') else f"https://{url}"
print(f" ✓ itch.io")
if 'linkedin.com/in/' in url and not found.get('linkedin'):
li = re.search(r'linkedin\.com/in/([a-zA-Z0-9_-]+)', url)
if li:
found['linkedin'] = f"https://linkedin.com/in/{li.group(1)}"
print(f" ✓ linkedin")
# extract from content
extracted = extract_links_from_text(all_content, username)
for k, v in extracted.items():
if k not in found:
found[k] = v
print(f"{k}")
# good contact found? stop searching
if found.get('email') or found.get('github') or found.get('mastodon') or found.get('twitch'):
break
# === API CHECKS ===
if not found.get('github'):
headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}
try:
resp = requests.get(f'https://api.github.com/users/{username}', headers=headers, timeout=10)
if resp.status_code == 200:
data = resp.json()
found['github'] = username
print(f" ✓ github API")
if data.get('email') and 'email' not in found:
found['email'] = data['email']
if data.get('blog') and 'website' not in found:
found['website'] = data['blog'] if data['blog'].startswith('http') else f"https://{data['blog']}"
except:
pass
if not found.get('mastodon'):
for inst in ['mastodon.social', 'fosstodon.org', 'hachyderm.io', 'tech.lgbt']:
try:
resp = requests.get(f'https://{inst}/api/v1/accounts/lookup', params={'acct': username}, timeout=5)
if resp.status_code == 200:
found['mastodon'] = f"@{username}@{inst}"
print(f" ✓ mastodon: {found['mastodon']}")
break
except:
continue
if not found.get('bluesky'):
try:
resp = requests.get('https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile',
params={'actor': f'{username}.bsky.social'}, timeout=10)
if resp.status_code == 200:
found['bluesky'] = resp.json().get('handle')
print(f" ✓ bluesky")
except:
pass
return found
def get_user_profile(username): def get_user_profile(username):
"""get user profile including bio/description"""
url = f'https://www.reddit.com/user/{username}/about.json' url = f'https://www.reddit.com/user/{username}/about.json'
data = _api_get(url) data = _api_get(url)
if not data or 'data' not in data: if not data or 'data' not in data:
return None return None
profile = data['data'] profile = data['data']
return { return {
'username': username, 'username': username,
'name': profile.get('name'),
'bio': profile.get('subreddit', {}).get('public_description', ''), 'bio': profile.get('subreddit', {}).get('public_description', ''),
'title': profile.get('subreddit', {}).get('title', ''), 'title': profile.get('subreddit', {}).get('title', ''),
'icon': profile.get('icon_img'),
'created_utc': profile.get('created_utc'),
'total_karma': profile.get('total_karma', 0), 'total_karma': profile.get('total_karma', 0),
'link_karma': profile.get('link_karma', 0),
'comment_karma': profile.get('comment_karma', 0),
} }
def get_subreddit_users(subreddit, limit=100): def get_subreddit_users(subreddit, limit=100):
"""get recent posters/commenters from a subreddit"""
users = set() users = set()
for endpoint in ['new', 'comments']:
# posts url = f'https://www.reddit.com/r/{subreddit}/{endpoint}.json'
url = f'https://www.reddit.com/r/{subreddit}/new.json'
data = _api_get(url, {'limit': limit}) data = _api_get(url, {'limit': limit})
if data and 'data' in data: if data and 'data' in data:
for post in data['data'].get('children', []): for item in data['data'].get('children', []):
author = post['data'].get('author') author = item['data'].get('author')
if author and author not in ['[deleted]', 'AutoModerator']: if author and author not in ['[deleted]', 'AutoModerator']:
users.add(author) users.add(author)
# comments
url = f'https://www.reddit.com/r/{subreddit}/comments.json'
data = _api_get(url, {'limit': limit})
if data and 'data' in data:
for comment in data['data'].get('children', []):
author = comment['data'].get('author')
if author and author not in ['[deleted]', 'AutoModerator']:
users.add(author)
return users return users
def get_user_activity(username): def get_user_activity(username):
"""get user's posts and comments"""
activity = [] activity = []
for endpoint in ['submitted', 'comments']:
# posts url = f'https://www.reddit.com/user/{username}/{endpoint}.json'
url = f'https://www.reddit.com/user/{username}/submitted.json'
data = _api_get(url, {'limit': 100}) data = _api_get(url, {'limit': 100})
if data and 'data' in data: if data and 'data' in data:
for post in data['data'].get('children', []): for item in data['data'].get('children', []):
activity.append({ activity.append({
'type': 'post', 'type': 'post' if endpoint == 'submitted' else 'comment',
'subreddit': post['data'].get('subreddit'), 'subreddit': item['data'].get('subreddit'),
'title': post['data'].get('title', ''), 'title': item['data'].get('title', ''),
'body': post['data'].get('selftext', ''), 'body': item['data'].get('selftext', '') or item['data'].get('body', ''),
'score': post['data'].get('score', 0), 'score': item['data'].get('score', 0),
}) })
# comments
url = f'https://www.reddit.com/user/{username}/comments.json'
data = _api_get(url, {'limit': 100})
if data and 'data' in data:
for comment in data['data'].get('children', []):
activity.append({
'type': 'comment',
'subreddit': comment['data'].get('subreddit'),
'body': comment['data'].get('body', ''),
'score': comment['data'].get('score', 0),
})
return activity return activity
def analyze_reddit_user(username): def analyze_reddit_user(username):
"""
analyze a reddit user for alignment and extract external platform links.
reddit is DISCOVERY ONLY - we find users here but contact them elsewhere.
"""
activity = get_user_activity(username) activity = get_user_activity(username)
if not activity: if not activity:
return None return None
# get profile for bio
profile = get_user_profile(username) profile = get_user_profile(username)
# count subreddit activity
sub_activity = defaultdict(int) sub_activity = defaultdict(int)
text_parts = [] text_parts = []
total_karma = 0 total_karma = 0
@ -232,20 +314,16 @@ def analyze_reddit_user(username):
full_text = ' '.join(text_parts) full_text = ' '.join(text_parts)
text_score, positive_signals, negative_signals = analyze_text(full_text) text_score, positive_signals, negative_signals = analyze_text(full_text)
# EXTRACT EXTERNAL LINKS - this is the key part
# check profile bio first
external_links = {} external_links = {}
if profile: if profile:
bio_text = f"{profile.get('bio', '')} {profile.get('title', '')}" external_links.update(extract_links_from_text(f"{profile.get('bio', '')} {profile.get('title', '')}", username))
external_links.update(extract_external_links(bio_text)) external_links.update(extract_links_from_text(full_text, username))
# also scan posts/comments for links (people often share their github etc) # TAVILY search
activity_links = extract_external_links(full_text) discovered = cross_platform_discovery(username, full_text)
for platform, link in activity_links.items(): external_links.update(discovered)
if platform not in external_links:
external_links[platform] = link
# subreddit scoring # scoring
sub_score = 0 sub_score = 0
aligned_subs = [] aligned_subs = []
for sub, count in sub_activity.items(): for sub, count in sub_activity.items():
@ -254,13 +332,11 @@ def analyze_reddit_user(username):
sub_score += weight * min(count, 5) sub_score += weight * min(count, 5)
aligned_subs.append(sub) aligned_subs.append(sub)
# multi-sub bonus
if len(aligned_subs) >= 5: if len(aligned_subs) >= 5:
sub_score += 30 sub_score += 30
elif len(aligned_subs) >= 3: elif len(aligned_subs) >= 3:
sub_score += 15 sub_score += 15
# negative sub penalty
for sub in sub_activity: for sub in sub_activity:
if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]: if sub.lower() in [n.lower() for n in NEGATIVE_SUBREDDITS]:
sub_score -= 50 sub_score -= 50
@ -268,77 +344,33 @@ def analyze_reddit_user(username):
total_score = text_score + sub_score total_score = text_score + sub_score
# bonus if they have external links (we can actually contact them)
if external_links.get('github'): if external_links.get('github'):
total_score += 10 total_score += 10
positive_signals.append('has github') positive_signals.append('github')
if external_links.get('mastodon'): if external_links.get('mastodon'):
total_score += 10 total_score += 10
positive_signals.append('has mastodon') positive_signals.append('mastodon')
if external_links.get('website'): if external_links.get('email'):
total_score += 15
positive_signals.append('email')
if external_links.get('twitch'):
total_score += 5 total_score += 5
positive_signals.append('has website') positive_signals.append('twitch')
# === LOST BUILDER DETECTION === # lost builder
# reddit is HIGH SIGNAL for lost builders - stuck in learnprogramming,
# imposter syndrome posts, "i wish i could" language, etc.
subreddits_list = list(sub_activity.keys()) subreddits_list = list(sub_activity.keys())
lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list) lost_signals, lost_weight = analyze_reddit_for_lost_signals(activity, subreddits_list)
text_lost_signals, _ = analyze_text_for_lost_signals(full_text)
# also check full text for lost patterns (already done partially in analyze_reddit_for_lost_signals)
text_lost_signals, text_lost_weight = analyze_text_for_lost_signals(full_text)
for sig in text_lost_signals: for sig in text_lost_signals:
if sig not in lost_signals: if sig not in lost_signals:
lost_signals.append(sig) lost_signals.append(sig)
lost_weight += text_lost_weight
lost_potential_score = lost_weight builder_activity = 20 if external_links.get('github') else 0
user_type = classify_user(lost_weight, builder_activity, total_score)
# classify: builder, lost, both, or none confidence = min(0.95, 0.3 + (0.2 if len(activity) > 20 else 0) + (0.2 if len(aligned_subs) >= 2 else 0) + (0.1 if external_links else 0))
# for reddit, builder_score is based on having external links + high karma
builder_activity = 0
if external_links.get('github'):
builder_activity += 20
if total_karma > 1000:
builder_activity += 15
elif total_karma > 500:
builder_activity += 10
user_type = classify_user(lost_potential_score, builder_activity, total_score) reddit_only = not any([external_links.get(k) for k in ['github', 'mastodon', 'bluesky', 'email', 'matrix', 'linkedin', 'twitch', 'itch']])
# confidence
confidence = 0.3
if len(activity) > 20:
confidence += 0.2
if len(aligned_subs) >= 2:
confidence += 0.2
if len(text_parts) > 10:
confidence += 0.2
# higher confidence if we have contact methods
if external_links:
confidence += 0.1
confidence = min(confidence, 0.95)
reasons = []
if aligned_subs:
reasons.append(f"active in: {', '.join(aligned_subs[:5])}")
if positive_signals:
reasons.append(f"signals: {', '.join(positive_signals[:5])}")
if negative_signals:
reasons.append(f"WARNING: {', '.join(negative_signals)}")
if external_links:
reasons.append(f"external: {', '.join(external_links.keys())}")
# add lost reasons if applicable
if user_type == 'lost' or user_type == 'both':
lost_descriptions = get_signal_descriptions(lost_signals)
if lost_descriptions:
reasons.append(f"LOST SIGNALS: {', '.join(lost_descriptions[:3])}")
# determine if this is reddit-only (needs manual review)
reddit_only = len(external_links) == 0
if reddit_only:
reasons.append("REDDIT-ONLY: needs manual review for outreach")
return { return {
'platform': 'reddit', 'platform': 'reddit',
@ -351,153 +383,46 @@ def analyze_reddit_user(username):
'subreddits': aligned_subs, 'subreddits': aligned_subs,
'activity_count': len(activity), 'activity_count': len(activity),
'karma': total_karma, 'karma': total_karma,
'reasons': reasons, 'reasons': [f"contact: {', '.join(external_links.keys())}"] if external_links else [],
'scraped_at': datetime.now().isoformat(), 'scraped_at': datetime.now().isoformat(),
# external platform links for outreach
'external_links': external_links, 'external_links': external_links,
'reddit_only': reddit_only, 'reddit_only': reddit_only,
'extra': { 'extra': external_links,
'github': external_links.get('github'), 'lost_potential_score': lost_weight,
'mastodon': external_links.get('mastodon'),
'twitter': external_links.get('twitter'),
'bluesky': external_links.get('bluesky'),
'website': external_links.get('website'),
'matrix': external_links.get('matrix'),
'reddit_karma': total_karma,
'reddit_activity': len(activity),
},
# lost builder fields
'lost_potential_score': lost_potential_score,
'lost_signals': lost_signals, 'lost_signals': lost_signals,
'user_type': user_type, 'user_type': user_type,
} }
def scrape_reddit(db, limit_per_sub=50): def scrape_reddit(db, limit_per_sub=50):
""" print("scoutd/reddit: scraping (TAVILY enabled)...")
full reddit scrape - DISCOVERY ONLY
finds aligned users, extracts external links for outreach.
reddit-only users go to manual queue.
"""
print("scoutd/reddit: starting scrape (discovery only, not outreach)...")
# find users in multiple aligned subs
user_subs = defaultdict(set) user_subs = defaultdict(set)
for sub in ['intentionalcommunity', 'cohousing', 'selfhosted', 'homeassistant', 'solarpunk', 'cooperatives', 'privacy', 'localllama', 'homelab', 'learnprogramming']:
# aligned subs - active builders
priority_subs = ['intentionalcommunity', 'cohousing', 'selfhosted',
'homeassistant', 'solarpunk', 'cooperatives', 'privacy',
'localllama', 'homelab', 'degoogle', 'pihole', 'unraid']
# lost builder subs - people who need encouragement
# these folks might be stuck, but they have aligned interests
lost_subs = ['learnprogramming', 'findapath', 'getdisciplined',
'careerguidance', 'cscareerquestions', 'decidingtobebetter']
# scrape both - we want to find lost builders with aligned interests
all_subs = priority_subs + lost_subs
for sub in all_subs:
print(f" scraping r/{sub}...")
users = get_subreddit_users(sub, limit=limit_per_sub) users = get_subreddit_users(sub, limit=limit_per_sub)
for user in users: for user in users:
user_subs[user].add(sub) user_subs[user].add(sub)
print(f" found {len(users)} users")
# filter for multi-sub users
multi_sub = {u: subs for u, subs in user_subs.items() if len(subs) >= 2} multi_sub = {u: subs for u, subs in user_subs.items() if len(subs) >= 2}
print(f" {len(multi_sub)} users in 2+ aligned subs") print(f" {len(multi_sub)} users in 2+ subs")
# analyze
results = [] results = []
reddit_only_count = 0
external_link_count = 0
builders_found = 0
lost_found = 0
for username in multi_sub: for username in multi_sub:
try: try:
result = analyze_reddit_user(username) result = analyze_reddit_user(username)
if result and result['score'] > 0: if result and result['score'] > 0:
results.append(result) results.append(result)
db.save_human(result) db.save_human(result)
user_type = result.get('user_type', 'none')
# track lost builders - reddit is high signal for these
if user_type == 'lost':
lost_found += 1
lost_score = result.get('lost_potential_score', 0)
if lost_score >= 40:
print(f" 💔 u/{username}: lost_score={lost_score}, values={result['score']} pts")
# lost builders also go to manual queue if reddit-only
if result.get('reddit_only'):
_add_to_manual_queue(result)
elif user_type == 'builder':
builders_found += 1
elif user_type == 'both':
builders_found += 1
lost_found += 1
print(f" ⚡ u/{username}: recovering builder")
# track external links
if result.get('reddit_only'):
reddit_only_count += 1
# add high-value users to manual queue for review
if result['score'] >= 50 and user_type != 'lost': # lost already added above
_add_to_manual_queue(result)
print(f" 📋 u/{username}: {result['score']} pts (reddit-only → manual queue)")
else:
external_link_count += 1
if result['score'] >= 50 and user_type == 'builder':
links = list(result.get('external_links', {}).keys())
print(f" ★ u/{username}: {result['score']} pts → {', '.join(links)}")
except Exception as e: except Exception as e:
print(f" error on {username}: {e}") print(f" error: {username}: {e}")
print(f"scoutd/reddit: found {len(results)} aligned humans") print(f"scoutd/reddit: {len(results)} humans")
print(f" - {builders_found} active builders")
print(f" - {lost_found} lost builders (need encouragement)")
print(f" - {external_link_count} with external links (reachable)")
print(f" - {reddit_only_count} reddit-only (manual queue)")
return results return results
def _add_to_manual_queue(result): def _add_to_manual_queue(result):
"""add reddit-only user to manual queue for review"""
from pathlib import Path
import json
queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json' queue_file = Path(__file__).parent.parent / 'data' / 'manual_queue.json'
queue_file.parent.mkdir(parents=True, exist_ok=True) queue_file.parent.mkdir(parents=True, exist_ok=True)
queue = json.loads(queue_file.read_text()) if queue_file.exists() else []
queue = [] if not any(q.get('username') == result['username'] for q in queue):
if queue_file.exists(): queue.append({'platform': 'reddit', 'username': result['username'], 'url': result['url'], 'score': result['score'], 'queued_at': datetime.now().isoformat()})
try:
queue = json.loads(queue_file.read_text())
except:
pass
# check if already in queue
existing = [q for q in queue if q.get('username') == result['username'] and q.get('platform') == 'reddit']
if existing:
return
queue.append({
'platform': 'reddit',
'username': result['username'],
'url': result['url'],
'score': result['score'],
'subreddits': result.get('subreddits', []),
'signals': result.get('signals', []),
'reasons': result.get('reasons', []),
'note': 'reddit-only user - no external links found. DM manually if promising.',
'queued_at': datetime.now().isoformat(),
'status': 'pending',
})
queue_file.write_text(json.dumps(queue, indent=2)) queue_file.write_text(json.dumps(queue, indent=2))

View file

@ -31,9 +31,8 @@ there's a better way and we are going to build it together."
you can reach *person* at *preffered contact method* you can reach *person* at *preffered contact method*
- connectd daemon
hope it goes well! hope it goes well!
-connectd
CONNECTD_ICONS (line 33-44): CONNECTD_ICONS (line 33-44):
CONNECTD_ICONS = '''<div style="display:flex;gap:16px;flex-wrap:wrap"> CONNECTD_ICONS = '''<div style="display:flex;gap:16px;flex-wrap:wrap">
<a href="https://github.com/connectd-daemon" title="GitHub" style="color:#888"><svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12"/></svg></a> <a href="https://github.com/connectd-daemon" title="GitHub" style="color:#888"><svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor"><path d="M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12"/></svg></a>