# connectd/matchd/fingerprint.py
# (file-viewer metadata: 211 lines, 6 KiB, Python)

"""
matchd/fingerprint.py - generate values profiles for humans
"""
import json
from collections import defaultdict
# Values dimensions tracked in a fingerprint's values vector.
# The trailing comment on each entry lists the kind of evidence it stands for.
VALUES_DIMENSIONS = [
    "privacy",           # surveillance concern, degoogle, self-hosted
    "decentralization",  # p2p, fediverse, local-first
    "cooperation",       # coops, mutual aid, community
    "queer_friendly",    # lgbtq+, pronouns
    "environmental",     # solarpunk, degrowth, sustainability
    "anticapitalist",    # post-capitalism, worker ownership
    "builder",           # creates vs consumes
    "pnw_oriented",      # pacific northwest connection
]
# Skill categories a human can be scored on.
# The trailing comment on each entry gives examples of what falls under it.
SKILL_CATEGORIES = [
    "backend",    # python, go, rust, databases
    "frontend",   # js, react, css
    "devops",     # docker, k8s, linux admin
    "hardware",   # electronics, embedded, iot
    "design",     # ui/ux, graphics
    "community",  # organizing, facilitation
    "writing",    # documentation, content
]
# Signal name -> the values dimension that signal counts toward.
# Written grouped by dimension so each dimension's signals read as one row;
# the comprehension flattens the groups into a plain {signal: dimension} dict
# (insertion order follows the rows top to bottom, left to right).
SIGNAL_TO_DIMENSION = {
    signal: dimension
    for dimension, signal_names in (
        ("privacy", ("privacy", "selfhosted", "degoogle")),
        ("decentralization", ("decentralized", "local_first", "p2p", "federated_chat", "foss")),
        ("cooperation", ("cooperative", "community", "mutual_aid", "intentional_community")),
        ("queer_friendly", ("queer", "pronouns", "blm", "acab")),
        ("environmental", ("solarpunk",)),
        ("anticapitalist", ("anticapitalist",)),
        ("pnw_oriented", ("pnw", "pnw_state", "remote")),
        ("builder", ("home_automation", "modern_lang", "unix", "containers")),
    )
    for signal in signal_names
}
# Programming language (lowercase) -> the skill category it is evidence of.
# Written grouped by skill; the comprehension flattens the groups into a
# plain {language: skill} dict, preserving row order.
LANGUAGE_TO_SKILL = {
    language: skill
    for skill, language_names in (
        ("backend", ("python", "go", "rust", "java", "ruby", "php")),
        ("frontend", ("javascript", "typescript", "html", "css", "vue")),
        ("devops", ("shell", "dockerfile", "nix", "hcl")),
        ("hardware", ("c", "c++", "arduino", "verilog")),
    )
    for language in language_names
}
def generate_fingerprint(human_data):
    """
    generate a values fingerprint for a human

    input: human dict from database (has signals, languages, etc).
           'signals' and 'extra' may be python objects or json strings;
           missing, None, or empty values are treated as "no data".
    output: fingerprint dict with human_id, values_vector, skills,
            interests, location_pref, availability
    raises: json.JSONDecodeError if a string field is not valid json
    """
    # parse stored json fields; a NULL column arrives as None and json 'null'
    # decodes to None, so fall back to an empty container in both cases
    signals = human_data.get('signals') or []
    if isinstance(signals, str):
        signals = json.loads(signals) or []
    signals = list(signals)

    extra = human_data.get('extra') or {}
    if isinstance(extra, str):
        extra = json.loads(extra) or {}
    languages = extra.get('languages') or {}
    topics = list(extra.get('topics') or [])

    # build values vector: one point per signal that maps to a dimension
    signal_counts = defaultdict(float)
    for signal in signals:
        dimension = SIGNAL_TO_DIMENSION.get(signal)
        if dimension:
            signal_counts[dimension] += 1.0

    # normalize values vector (0-1 scale): the strongest dimension becomes 1.0
    max_val = max(signal_counts.values(), default=1.0)
    values_vector = {dim: count / max_val for dim, count in signal_counts.items()}

    # fill in missing dimensions with 0
    for dim in VALUES_DIMENSIONS:
        values_vector.setdefault(dim, 0.0)

    # determine skills from languages. dividing each count by the repo total
    # first would cancel out in the max-normalization below, so raw counts
    # are accumulated directly; this also avoids dividing by a zero total.
    skill_weights = defaultdict(float)
    for lang, count in languages.items():
        skill = LANGUAGE_TO_SKILL.get(lang.lower())
        if skill:
            skill_weights[skill] += count or 0

    # normalize skills (0-1 scale); if every count is zero there is nothing
    # to scale by, so the matched skills are reported as 0.0
    max_skill = max(skill_weights.values(), default=0)
    if max_skill > 0:
        skills = {skill: weight / max_skill for skill, weight in skill_weights.items()}
    else:
        skills = {skill: 0.0 for skill in skill_weights}

    # interests from topics and signals, de-duplicated in first-seen order
    # so the same input always yields the same list
    interests = list(dict.fromkeys(topics + signals))

    # location preference: explicit signals win over the free-text location
    location_pref = None
    if 'pnw' in signals or 'pnw_state' in signals:
        location_pref = 'pnw'
    elif 'remote' in signals:
        location_pref = 'remote'
    else:
        location = human_data.get('location')
        if location and isinstance(location, str):
            loc = location.lower()
            # substring match on well-known pnw place names
            pnw_hints = ('seattle', 'portland', 'washington', 'oregon', 'pnw', 'cascadia')
            if any(hint in loc for hint in pnw_hints):
                location_pref = 'pnw'

    # availability (based on hireable flag if present)
    availability = 'open' if extra.get('hireable') else None

    return {
        'human_id': human_data.get('id'),
        'values_vector': values_vector,
        'skills': skills,
        'interests': interests,
        'location_pref': location_pref,
        'availability': availability,
    }
def fingerprint_similarity(fp_a, fp_b):
    """
    calculate similarity between two fingerprints

    the score is a weighted sum of three components:
      - values:    cosine similarity of the two 'values_vector' dicts (weight 0.5)
      - interests: jaccard overlap of the two 'interests' lists (weight 0.3)
      - location:  compatibility of the two 'location_pref' values (weight 0.2)

    missing or None fields are treated as empty. if neither fingerprint has
    any values dimensions at all, the result is 0.0 regardless of the other
    two components.

    returns 0-1 score
    """
    # values similarity (cosine)
    va = fp_a.get('values_vector') or {}
    vb = fp_b.get('values_vector') or {}
    if not va and not vb:
        return 0.0

    # a dimension missing from one side counts as 0 and adds nothing
    dot_product = sum(v * vb.get(d, 0) for d, v in va.items())
    mag_a = sum(v ** 2 for v in va.values()) ** 0.5
    mag_b = sum(v ** 2 for v in vb.values()) ** 0.5
    if mag_a == 0 or mag_b == 0:
        values_sim = 0.0
    else:
        # float rounding can push the cosine a hair past 1.0, and negative
        # components would make it negative; clamp so the score stays in 0-1
        values_sim = max(0.0, min(1.0, dot_product / (mag_a * mag_b)))

    # interest overlap (jaccard)
    ia = set(fp_a.get('interests') or [])
    ib = set(fp_b.get('interests') or [])
    union = ia | ib
    interest_sim = len(ia & ib) / len(union) if union else 0.0

    # location compatibility
    loc_a = fp_a.get('location_pref')
    loc_b = fp_b.get('location_pref')
    if loc_a is not None and loc_a == loc_b:
        loc_sim = 1.0  # same known preference
    elif loc_a == 'remote' or loc_b == 'remote':
        loc_sim = 0.5  # at least one side is remote
    elif loc_a == 'pnw' or loc_b == 'pnw':
        loc_sim = 0.3  # one side is pnw, the other has a different or unknown preference
    else:
        loc_sim = 0.0

    # weighted combination (weights sum to 1.0)
    similarity = (values_sim * 0.5) + (interest_sim * 0.3) + (loc_sim * 0.2)
    return min(similarity, 1.0)