ha-addons/connectd/scoutd/deep.py
2025-12-15 11:06:57 -06:00

966 lines
34 KiB
Python

"""
scoutd/deep.py - deep profile discovery
when we find someone, follow ALL their links to build complete picture
github profile -> mastodon link -> scrape mastodon
-> website -> scrape for more links
-> twitter handle -> note it
-> email -> store it
email discovery sources:
- github profile (if public)
- git commit history
- personal website/blog contact page
- README "contact me" sections
- mastodon/twitter bio
fallback contact methods if no email:
- github_issue: open issue on their repo
- mastodon: DM if allowed
- manual: pending contact queue for review
also filters out people who clearly already know each other
(same org, co-contributors to same repos)
"""
import re
import json
import requests
import time
import subprocess
import tempfile
import shutil
from datetime import datetime
from urllib.parse import urlparse
from pathlib import Path
from .signals import analyze_text
from .github import get_github_user, get_user_repos, _api_get as github_api
from .mastodon import analyze_mastodon_user, _api_get as mastodon_api
from .handles import discover_all_handles, extract_handles_from_text, scrape_website_for_handles
# local cache for org memberships
ORG_CACHE_FILE = Path(__file__).parent.parent / 'data' / 'org_cache.json'
_org_cache = None
# patterns to find social links in text
MASTODON_PATTERN = r'@([a-zA-Z0-9_]+)@([a-zA-Z0-9.-]+\.[a-z]{2,})'
TWITTER_PATTERN = r'(?:twitter\.com/|x\.com/)([a-zA-Z0-9_]+)'
GITHUB_PATTERN = r'github\.com/([a-zA-Z0-9_-]+)'
MATRIX_PATTERN = r'@([a-zA-Z0-9_]+):([a-zA-Z0-9.-]+)'
EMAIL_PATTERN = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
# known mastodon instances for validation
KNOWN_INSTANCES = [
'mastodon.social', 'fosstodon.org', 'tech.lgbt', 'social.coop',
'hackers.town', 'hachyderm.io', 'infosec.exchange', 'chaos.social',
'mas.to', 'mstdn.social', 'mastodon.online', 'universeodon.com',
'mathstodon.xyz', 'ruby.social', 'functional.cafe', 'types.pl',
]
# contact page patterns for website scraping
CONTACT_PAGE_PATHS = [
'/contact', '/contact/', '/contact.html',
'/about', '/about/', '/about.html',
'/connect', '/reach-out', '/hire', '/hire-me',
]
# patterns to find emails in contact sections
CONTACT_SECTION_PATTERNS = [
r'(?:contact|email|reach|mail)[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|@)\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|\.)\s*([a-zA-Z]{2,})',
]
def load_org_cache():
"""load org membership cache from disk"""
global _org_cache
if _org_cache is not None:
return _org_cache
try:
ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
if ORG_CACHE_FILE.exists():
with open(ORG_CACHE_FILE) as f:
_org_cache = json.load(f)
else:
_org_cache = {'users': {}, 'updated': {}}
except:
_org_cache = {'users': {}, 'updated': {}}
return _org_cache
def save_org_cache():
"""save org membership cache to disk"""
global _org_cache
if _org_cache is None:
return
try:
ORG_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(ORG_CACHE_FILE, 'w') as f:
json.dump(_org_cache, f, indent=2)
except:
pass
def get_cached_orgs(username):
"""get orgs from cache if available and fresh (< 7 days old)"""
cache = load_org_cache()
if username not in cache['users']:
return None
updated = cache['updated'].get(username)
if updated:
updated_dt = datetime.fromisoformat(updated)
if (datetime.now() - updated_dt).days < 7:
return cache['users'][username]
return None
def cache_orgs(username, orgs):
"""cache org membership for a user"""
cache = load_org_cache()
cache['users'][username] = orgs
cache['updated'][username] = datetime.now().isoformat()
save_org_cache()
def get_emails_from_commit_history(repo_url, limit=50):
"""
clone a repo (shallow) and extract unique committer emails from git log
"""
emails = set()
try:
# create temp dir
with tempfile.TemporaryDirectory() as tmpdir:
# shallow clone with limited depth
result = subprocess.run(
['git', 'clone', '--depth', '50', '--single-branch', repo_url, tmpdir],
capture_output=True,
text=True,
timeout=30
)
if result.returncode != 0:
return []
# get unique emails from commit log
result = subprocess.run(
['git', 'log', f'--max-count={limit}', '--format=%ae'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=10
)
if result.returncode == 0:
for email in result.stdout.strip().split('\n'):
email = email.strip().lower()
# filter out bot/noreply emails
if email and not any(x in email for x in [
'noreply', 'no-reply', 'dependabot', 'github-actions',
'renovate', 'greenkeeper', 'snyk-bot', 'users.noreply.github'
]):
emails.add(email)
except (subprocess.TimeoutExpired, Exception):
pass
return list(emails)
def scrape_website_for_emails(url, timeout=10):
"""
scrape a personal website for email addresses
checks main page and common contact pages
"""
emails = set()
if not is_personal_website(url):
return []
headers = {'User-Agent': 'connectd/1.0 (looking for contact info)'}
# normalize url
if not url.startswith('http'):
url = 'https://' + url
base_url = url.rstrip('/')
# pages to check
pages_to_check = [base_url] + [base_url + path for path in CONTACT_PAGE_PATHS]
for page_url in pages_to_check:
try:
resp = requests.get(page_url, timeout=timeout, headers=headers)
if resp.status_code == 200:
text = resp.text
# standard email pattern
for match in re.finditer(EMAIL_PATTERN, text):
email = match.group(0).lower()
if not any(x in email for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']):
emails.add(email)
# obfuscated email patterns like "user [at] domain [dot] com"
for pattern in CONTACT_SECTION_PATTERNS:
for match in re.finditer(pattern, text, re.IGNORECASE):
if len(match.groups()) == 3:
email = f"{match.group(1)}@{match.group(2)}.{match.group(3)}".lower()
emails.add(email)
elif len(match.groups()) == 1:
emails.add(match.group(1).lower())
# mailto: links
for match in re.finditer(r'mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', text):
emails.add(match.group(1).lower())
except:
continue
return list(emails)
def extract_emails_from_readme(text):
"""
extract emails from README text, looking for contact sections
"""
emails = set()
if not text:
return []
# look for contact-related sections
contact_patterns = [
r'(?:##?\s*)?(?:contact|reach|email|get in touch|connect)[^\n]*\n([^\n#]+)',
r'(?:email|contact|reach me)[:\s]+([^\n]+)',
]
for pattern in contact_patterns:
for match in re.finditer(pattern, text, re.IGNORECASE):
section = match.group(1)
# extract emails from this section
for email_match in re.finditer(EMAIL_PATTERN, section):
email = email_match.group(0).lower()
if not any(x in email for x in ['noreply', 'no-reply', 'example.com']):
emails.add(email)
# also check for obfuscated emails
for match in re.finditer(r'([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\))\s*([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\))\s*([a-zA-Z]{2,})', text, re.IGNORECASE):
email = f"{match.group(1)}@{match.group(2)}.{match.group(3)}".lower()
emails.add(email)
return list(emails)
def get_mastodon_dm_allowed(handle):
"""check if a mastodon user allows DMs"""
profile = get_mastodon_profile(handle)
if not profile:
return False
# check if they're locked (requires follow approval)
if profile.get('locked'):
return False
# check bio for "DMs open" type messages
bio = (profile.get('note') or profile.get('summary') or '').lower()
if any(x in bio for x in ['dms open', 'dm me', 'message me', 'dms welcome']):
return True
# default: assume open if not locked
return True
def determine_contact_method(profile):
"""
determine the best way to contact someone
returns (method, details) where method is one of:
- 'email': direct email contact
- 'github_issue': open issue on their repo
- 'mastodon': DM on mastodon
- 'manual': needs manual review
"""
# prefer email
if profile.get('email'):
return 'email', {'email': profile['email']}
# check for multiple emails to pick from
if profile.get('emails') and len(profile['emails']) > 0:
# prefer non-github, non-work emails
for email in profile['emails']:
if not any(x in email.lower() for x in ['github', 'noreply', '@company', '@corp']):
return 'email', {'email': email}
# fall back to first one
return 'email', {'email': profile['emails'][0]}
# try mastodon DM
if profile.get('mastodon'):
handles = profile['mastodon'] if isinstance(profile['mastodon'], list) else [profile['mastodon']]
for handle in handles:
if get_mastodon_dm_allowed(handle):
return 'mastodon', {'handle': handle}
# try github issue on their most-starred repo
if profile.get('top_repos'):
# find repo with issues enabled and good stars
for repo in sorted(profile['top_repos'], key=lambda r: r.get('stars', 0), reverse=True):
if repo.get('stars', 0) >= 10:
repo_name = repo.get('name')
if repo_name:
return 'github_issue', {
'repo': f"{profile['username']}/{repo_name}",
'stars': repo.get('stars'),
}
# manual review needed
return 'manual', {
'reason': 'no email, mastodon, or suitable repo found',
'available': {
'twitter': profile.get('twitter'),
'websites': profile.get('websites'),
'matrix': profile.get('matrix'),
}
}
def extract_links_from_text(text):
"""extract social links from bio/readme text"""
if not text:
return {}
links = {
'mastodon': [],
'twitter': [],
'github': [],
'matrix': [],
'email': [],
'websites': [],
}
# mastodon handles - only accept known instances or ones with 'mastodon'/'social' in name
for match in re.finditer(MASTODON_PATTERN, text):
user, instance = match.groups()
instance_lower = instance.lower()
# validate it's a known instance or looks like one
is_known = instance_lower in KNOWN_INSTANCES
looks_like_masto = any(x in instance_lower for x in ['mastodon', 'social', 'fedi', '.town', '.cafe'])
if is_known or looks_like_masto:
links['mastodon'].append(f"{user}@{instance}")
# twitter
for match in re.finditer(TWITTER_PATTERN, text, re.IGNORECASE):
links['twitter'].append(match.group(1))
# github (for cross-referencing)
for match in re.finditer(GITHUB_PATTERN, text, re.IGNORECASE):
links['github'].append(match.group(1))
# matrix
for match in re.finditer(MATRIX_PATTERN, text):
user, server = match.groups()
links['matrix'].append(f"@{user}:{server}")
# email
for match in re.finditer(EMAIL_PATTERN, text):
email = match.group(0)
# filter out obvious non-personal emails
if not any(x in email.lower() for x in ['noreply', 'no-reply', 'example.com', 'users.noreply']):
links['email'].append(email)
# websites (http/https links that aren't social platforms)
url_pattern = r'https?://([a-zA-Z0-9.-]+\.[a-z]{2,})[/\w.-]*'
for match in re.finditer(url_pattern, text):
domain = match.group(1).lower()
if not any(x in domain for x in ['github.com', 'twitter.com', 'mastodon', 'linkedin.com', 't.co']):
links['websites'].append(match.group(0))
# dedupe
for key in links:
links[key] = list(set(links[key]))
return links
def is_personal_website(url):
"""check if URL looks like a personal website vs corporate site"""
domain = urlparse(url).netloc.lower()
# skip obvious corporate/platform sites
skip_domains = [
'github.com', 'gitlab.com', 'bitbucket.org',
'twitter.com', 'x.com', 'linkedin.com', 'facebook.com',
'youtube.com', 'medium.com', 'dev.to', 'hashnode.com',
'wedo.com', 'google.com', 'microsoft.com', 'apple.com',
'amazon.com', 'stackoverflow.com', 'reddit.com',
]
if any(skip in domain for skip in skip_domains):
return False
# looks personal if: short domain, has common personal TLDs, contains username-like string
personal_tlds = ['.io', '.dev', '.me', '.co', '.xyz', '.page', '.codes', '.software']
if any(domain.endswith(tld) for tld in personal_tlds):
return True
# if domain is just name.com or similar
parts = domain.replace('www.', '').split('.')
if len(parts) == 2 and len(parts[0]) < 20:
return True
return False
def scrape_website_for_links(url, timeout=10):
"""scrape a personal website for more social links"""
if not is_personal_website(url):
return {}
try:
resp = requests.get(url, timeout=timeout, headers={'User-Agent': 'connectd/1.0'})
resp.raise_for_status()
return extract_links_from_text(resp.text)
except:
return {}
def get_mastodon_profile(handle):
"""
fetch mastodon profile from handle like user@instance
returns profile data or None
"""
if '@' not in handle:
return None
parts = handle.split('@')
if len(parts) == 2:
user, instance = parts
elif len(parts) == 3 and parts[0] == '':
# @user@instance format
user, instance = parts[1], parts[2]
else:
return None
# try to look up via webfinger
try:
webfinger_url = f"https://{instance}/.well-known/webfinger"
resp = requests.get(
webfinger_url,
params={'resource': f'acct:{user}@{instance}'},
timeout=10,
headers={'Accept': 'application/json'}
)
if resp.status_code == 200:
data = resp.json()
# find the profile link
for link in data.get('links', []):
if link.get('type') == 'application/activity+json':
profile_url = link.get('href')
# fetch the profile
profile_resp = requests.get(
profile_url,
timeout=10,
headers={'Accept': 'application/activity+json'}
)
if profile_resp.status_code == 200:
return profile_resp.json()
except:
pass
# fallback: try direct API
try:
search_url = f"https://{instance}/api/v1/accounts/lookup"
resp = requests.get(search_url, params={'acct': user}, timeout=10)
if resp.status_code == 200:
return resp.json()
except:
pass
return None
def deep_scrape_github_user(login, scrape_commits=True):
"""
deep scrape a github user - follow all links, build complete profile
email discovery sources:
1. github profile (if public)
2. git commit history (if scrape_commits=True)
3. personal website/blog contact pages
4. README "contact me" sections
5. mastodon bio
"""
print(f" deep scraping {login}...")
user = get_github_user(login)
if not user:
return None
repos = get_user_repos(login, per_page=50)
# collect all text to search for links
all_text = []
readme_text = None
if user.get('bio'):
all_text.append(user['bio'])
if user.get('blog'):
all_text.append(user['blog'])
if user.get('company'):
all_text.append(user['company'])
# check readme of profile repo (username/username)
for branch in ['main', 'master']:
readme_url = f"https://raw.githubusercontent.com/{login}/{login}/{branch}/README.md"
try:
resp = requests.get(readme_url, timeout=10)
if resp.status_code == 200:
readme_text = resp.text
all_text.append(readme_text)
break
except:
pass
# extract links from all collected text
combined_text = '\n'.join(all_text)
found_links = extract_links_from_text(combined_text)
# ensure all keys exist
for key in ['email', 'twitter', 'github', 'matrix', 'mastodon', 'websites']:
if key not in found_links:
found_links[key] = []
# add explicit github fields
if user.get('email'):
found_links['email'].append(user['email'])
if user.get('twitter_username'):
found_links['twitter'].append(user['twitter_username'])
if user.get('blog'):
found_links['websites'].append(user['blog'])
# EMAIL DISCOVERY: extract emails from README contact sections
if readme_text:
readme_emails = extract_emails_from_readme(readme_text)
found_links['email'].extend(readme_emails)
if readme_emails:
print(f" found {len(readme_emails)} email(s) in README")
# dedupe
for key in found_links:
found_links[key] = list(set(found_links[key]))
# now follow the links to gather more data
profile = {
'source': 'github',
'username': login,
'url': f"https://github.com/{login}",
'real_name': user.get('name'),
'bio': user.get('bio'),
'location': user.get('location'),
'company': user.get('company'),
'hireable': user.get('hireable'),
'created_at': user.get('created_at'),
'public_repos': user.get('public_repos'),
'followers': user.get('followers'),
# contact points
'email': found_links['email'][0] if found_links['email'] else user.get('email'),
'emails': list(found_links['email']),
'twitter': found_links['twitter'][0] if found_links['twitter'] else user.get('twitter_username'),
'mastodon': found_links['mastodon'],
'matrix': found_links['matrix'],
'websites': found_links['websites'],
# cross-platform profiles we find
'linked_profiles': {},
# repos and languages
'top_repos': [],
'languages': {},
'topics': [],
'orgs': [],
# contact method (will be determined at end)
'contact_method': None,
'contact_details': None,
}
# analyze repos
top_starred_repo = None
for repo in repos[:30]:
if not repo.get('fork'):
repo_info = {
'name': repo.get('name'),
'description': repo.get('description'),
'stars': repo.get('stargazers_count'),
'language': repo.get('language'),
'topics': repo.get('topics', []),
'html_url': repo.get('html_url'),
'pushed_at': repo.get('pushed_at'), # for activity-based contact selection
}
profile['top_repos'].append(repo_info)
# track top starred for commit email scraping
if not top_starred_repo or repo.get('stargazers_count', 0) > top_starred_repo.get('stars', 0):
top_starred_repo = repo_info
if repo.get('language'):
lang = repo['language']
profile['languages'][lang] = profile['languages'].get(lang, 0) + 1
profile['topics'].extend(repo.get('topics', []))
profile['topics'] = list(set(profile['topics']))
# get orgs - check cache first
cached_orgs = get_cached_orgs(login)
if cached_orgs is not None:
print(f" using cached orgs: {cached_orgs}")
profile['orgs'] = cached_orgs
else:
orgs_url = f"https://api.github.com/users/{login}/orgs"
orgs_data = github_api(orgs_url) or []
profile['orgs'] = [o.get('login') for o in orgs_data]
# cache for future use
cache_orgs(login, profile['orgs'])
if profile['orgs']:
print(f" fetched & cached orgs: {profile['orgs']}")
# EMAIL DISCOVERY: scrape commit history from top repo
if scrape_commits and top_starred_repo and not profile['emails']:
repo_url = f"https://github.com/{login}/{top_starred_repo['name']}.git"
print(f" checking commit history in {top_starred_repo['name']}...")
commit_emails = get_emails_from_commit_history(repo_url)
if commit_emails:
print(f" found {len(commit_emails)} email(s) in commits")
profile['emails'].extend(commit_emails)
# follow mastodon links
for masto_handle in found_links['mastodon'][:2]: # limit to 2
print(f" following mastodon: {masto_handle}")
masto_profile = get_mastodon_profile(masto_handle)
if masto_profile:
profile['linked_profiles']['mastodon'] = {
'handle': masto_handle,
'display_name': masto_profile.get('display_name') or masto_profile.get('name'),
'bio': masto_profile.get('note') or masto_profile.get('summary'),
'followers': masto_profile.get('followers_count'),
'url': masto_profile.get('url'),
'locked': masto_profile.get('locked', False),
}
# extract more links from mastodon bio
masto_bio = masto_profile.get('note') or masto_profile.get('summary') or ''
masto_links = extract_links_from_text(masto_bio)
profile['emails'].extend(masto_links.get('email', []))
profile['websites'].extend(masto_links.get('websites', []))
# EMAIL DISCOVERY: scrape personal website for contact info
for website in found_links['websites'][:2]: # check up to 2 sites
print(f" following website: {website}")
# basic link extraction
site_links = scrape_website_for_links(website)
if site_links.get('mastodon') and not profile['mastodon']:
profile['mastodon'] = site_links['mastodon']
# enhanced email discovery - check contact pages
website_emails = scrape_website_for_emails(website)
if website_emails:
print(f" found {len(website_emails)} email(s) on website")
profile['emails'].extend(website_emails)
# dedupe emails and pick best one
profile['emails'] = list(set(profile['emails']))
# rank emails by preference
def email_score(email):
email_lower = email.lower()
score = 0
# prefer personal domains
if any(x in email_lower for x in ['@gmail', '@proton', '@hey.com', '@fastmail']):
score += 10
# deprioritize github emails
if 'github' in email_lower:
score -= 20
# deprioritize noreply
if 'noreply' in email_lower:
score -= 50
# prefer emails matching username
if login.lower() in email_lower:
score += 5
return score
if profile['emails']:
profile['emails'].sort(key=email_score, reverse=True)
profile['email'] = profile['emails'][0]
# COMPREHENSIVE HANDLE DISCOVERY
# find ALL social handles from website, README, rel="me" links, etc.
discovered_handles, discovered_emails = discover_all_handles(user)
# merge discovered handles into profile
profile['handles'] = discovered_handles
# update individual fields from discovered handles
if discovered_handles.get('mastodon') and not profile.get('mastodon'):
profile['mastodon'] = discovered_handles['mastodon']
if discovered_handles.get('twitter') and not profile.get('twitter'):
profile['twitter'] = discovered_handles['twitter']
if discovered_handles.get('bluesky'):
profile['bluesky'] = discovered_handles['bluesky']
if discovered_handles.get('matrix') and not profile.get('matrix'):
profile['matrix'] = discovered_handles['matrix']
if discovered_handles.get('linkedin'):
profile['linkedin'] = discovered_handles['linkedin']
if discovered_handles.get('youtube'):
profile['youtube'] = discovered_handles['youtube']
if discovered_handles.get('discord'):
profile['discord'] = discovered_handles['discord']
if discovered_handles.get('telegram'):
profile['telegram'] = discovered_handles['telegram']
# merge discovered emails
for email in discovered_emails:
if email not in profile['emails']:
profile['emails'].append(email)
print(f" handles found: {list(discovered_handles.keys())}")
# determine best contact method
contact_method, contact_details = determine_contact_method(profile)
profile['contact_method'] = contact_method
profile['contact_details'] = contact_details
print(f" contact method: {contact_method}")
# analyze all text for signals
all_profile_text = ' '.join([
profile.get('bio') or '',
profile.get('company') or '',
profile.get('location') or '',
' '.join(profile.get('topics', [])),
])
for linked in profile.get('linked_profiles', {}).values():
if linked.get('bio'):
all_profile_text += ' ' + linked['bio']
text_score, signals, negative = analyze_text(all_profile_text)
profile['signals'] = signals
profile['negative_signals'] = negative
profile['score'] = text_score
# add builder score
if len(repos) > 20:
profile['score'] += 15
elif len(repos) > 10:
profile['score'] += 10
# add topic alignment
from .signals import TARGET_TOPICS
aligned_topics = set(profile['topics']) & set(TARGET_TOPICS)
profile['score'] += len(aligned_topics) * 10
profile['aligned_topics'] = list(aligned_topics)
profile['scraped_at'] = datetime.now().isoformat()
return profile
def check_mutual_github_follows(user_a, user_b):
"""check if two github users follow each other"""
# check if a follows b
url = f"https://api.github.com/users/{user_a}/following/{user_b}"
try:
resp = requests.get(url, timeout=10, headers={'Accept': 'application/vnd.github.v3+json'})
if resp.status_code == 204: # 204 = follows
return True
except:
pass
return False
def check_shared_repo_contributions(user_a, user_b):
"""
check if two users have contributed to the same repos
returns (bool, list of shared repos)
"""
# this would require checking contribution history
# for now, we check via the orgs and top_repos stored in extra
# the full implementation would query:
# GET /repos/{owner}/{repo}/contributors for their top repos
return False, []
def check_github_interactions(user_a, user_b):
"""
check if users have had public interactions
(comments on each other's issues/PRs)
this is expensive - only do for high-score matches
"""
# would need to search:
# GET /search/issues?q=author:{user_a}+commenter:{user_b}
# GET /search/issues?q=author:{user_b}+commenter:{user_a}
return False
def check_already_connected(human_a, human_b, deep_check=False):
"""
check if two humans are likely already connected
(same org, co-contributors, mutual follows, interactions)
connectd's job is connecting ISOLATED builders, not re-introducing coworkers
"""
# parse extra data if stored as json string
extra_a = human_a.get('extra', {})
extra_b = human_b.get('extra', {})
if isinstance(extra_a, str):
extra_a = json.loads(extra_a) if extra_a else {}
if isinstance(extra_b, str):
extra_b = json.loads(extra_b) if extra_b else {}
# 1. same github org - check cache first, then stored data
orgs_a = set(extra_a.get('orgs', []))
orgs_b = set(extra_b.get('orgs', []))
# also check org cache for fresher data
if human_a.get('platform') == 'github':
cached_a = get_cached_orgs(human_a.get('username', ''))
if cached_a:
orgs_a.update(cached_a)
if human_b.get('platform') == 'github':
cached_b = get_cached_orgs(human_b.get('username', ''))
if cached_b:
orgs_b.update(cached_b)
shared_orgs = orgs_a & orgs_b
if shared_orgs:
return True, f"same org: {', '.join(list(shared_orgs)[:3])}"
# 2. same company
company_a = (extra_a.get('company') or '').lower().strip('@').strip()
company_b = (extra_b.get('company') or '').lower().strip('@').strip()
if company_a and company_b and len(company_a) > 2:
if company_a == company_b or company_a in company_b or company_b in company_a:
return True, f"same company: {company_a or company_b}"
# 3. co-contributors to same major repos (from stored top_repos)
repos_a = set()
repos_b = set()
for r in extra_a.get('top_repos', []):
if r.get('stars', 0) > 50: # only significant repos
repos_a.add(r.get('name', '').lower())
for r in extra_b.get('top_repos', []):
if r.get('stars', 0) > 50:
repos_b.add(r.get('name', '').lower())
shared_repos = repos_a & repos_b
if len(shared_repos) >= 2:
return True, f"co-contributors: {', '.join(list(shared_repos)[:3])}"
# 4. deep checks (more API calls - only if requested)
if deep_check:
user_a = human_a.get('username', '')
user_b = human_b.get('username', '')
# check mutual follows
if human_a.get('platform') == 'github' and human_b.get('platform') == 'github':
if check_mutual_github_follows(user_a, user_b):
return True, "mutual github follows"
if check_mutual_github_follows(user_b, user_a):
return True, "mutual github follows"
return False, None
def save_deep_profile(db, profile):
"""save a deep-scraped profile to the database"""
# convert to standard human format
# IMPORTANT: extra field contains ALL data for activity-based contact selection
human_data = {
'platform': profile['source'],
'username': profile['username'],
'url': profile['url'],
'name': profile.get('real_name'),
'bio': profile.get('bio'),
'location': profile.get('location'),
'score': profile.get('score', 0),
'confidence': 0.8 if profile.get('linked_profiles') else 0.5,
'signals': profile.get('signals', []),
'negative_signals': profile.get('negative_signals', []),
'reasons': [],
'contact': {
'email': profile.get('email'),
'emails': profile.get('emails', []),
'twitter': profile.get('twitter'),
'mastodon': profile.get('mastodon'),
'matrix': profile.get('matrix'),
'websites': profile.get('websites'),
'contact_method': profile.get('contact_method'),
'contact_details': profile.get('contact_details'),
},
'extra': {
# identity
'real_name': profile.get('real_name'),
'company': profile.get('company'),
'hireable': profile.get('hireable'),
'orgs': profile.get('orgs'),
# github activity (for activity-based contact)
'top_repos': profile.get('top_repos'),
'languages': profile.get('languages'),
'topics': profile.get('topics'),
'aligned_topics': profile.get('aligned_topics'),
'followers': profile.get('followers'),
'public_repos': profile.get('public_repos'),
'commit_count': len(profile.get('emails', [])), # rough proxy
# cross-platform links (for activity-based contact)
'email': profile.get('email'),
'emails': profile.get('emails', []),
'twitter': profile.get('twitter'),
'mastodon': profile.get('mastodon'),
'matrix': profile.get('matrix'),
'bluesky': profile.get('bluesky'),
'reddit': profile.get('reddit'),
'lobsters': profile.get('lobsters'),
'linkedin': profile.get('linkedin'),
'youtube': profile.get('youtube'),
'discord': profile.get('discord'),
'telegram': profile.get('telegram'),
'linked_profiles': profile.get('linked_profiles'),
# ALL discovered handles (comprehensive)
'handles': profile.get('handles', {}),
# activity counts (populated by platform scrapers)
'mastodon_statuses': profile.get('mastodon_statuses', 0),
'twitter_tweets': profile.get('twitter_tweets', 0),
'reddit_activity': profile.get('reddit_activity', 0),
'reddit_karma': profile.get('reddit_karma', 0),
'lobsters_karma': profile.get('lobsters_karma', 0),
'bluesky_posts': profile.get('bluesky_posts', 0),
},
'scraped_at': profile.get('scraped_at'),
}
# build reasons
if profile.get('signals'):
human_data['reasons'].append(f"signals: {', '.join(profile['signals'][:5])}")
if profile.get('aligned_topics'):
human_data['reasons'].append(f"topics: {', '.join(profile['aligned_topics'][:5])}")
if profile.get('linked_profiles'):
platforms = list(profile['linked_profiles'].keys())
human_data['reasons'].append(f"also on: {', '.join(platforms)}")
if profile.get('location'):
human_data['reasons'].append(f"location: {profile['location']}")
if profile.get('contact_method'):
human_data['reasons'].append(f"contact: {profile['contact_method']}")
db.save_human(human_data)
return human_data