mirror of
https://github.com/sudoxnym/connectd.git
synced 2026-04-14 19:46:30 +00:00
138 lines
3.7 KiB
Python
138 lines
3.7 KiB
Python
|
|
"""
|
||
|
|
matchd/rank.py - score and rank match quality
|
||
|
|
"""
|
||
|
|
|
||
|
|
from itertools import combinations
|
||
|
|
from .fingerprint import generate_fingerprint
|
||
|
|
from .overlap import find_overlap, is_same_person
|
||
|
|
from scoutd.deep import check_already_connected
|
||
|
|
|
||
|
|
|
||
|
|
def rank_matches(matches):
    """
    rank a list of matches by quality

    every match dict is annotated in place with a 'quality_score' key:
    its 'overlap_score' multiplied by each bonus that applies

      x1.2  'geographic_match' is truthy
      x1.3  'fingerprint_similarity' is above 0.7
      x1.1  'complementary_skills' holds at least 3 entries

    a missing or None 'overlap_score' counts as 0, and a missing or None
    'complementary_skills' counts as an empty list

    returns a new list holding the same dicts, highest quality score first;
    matches with equal scores keep their input order
    """
    ranked = []

    for match in matches:
        # base score from overlap; the key may be present but hold None,
        # which .get()'s default does not cover
        score = match.get('overlap_score', 0)
        if score is None:
            score = 0

        # bonus for geographic match
        if match.get('geographic_match'):
            score *= 1.2

        # bonus for high fingerprint similarity (None / 0 means "no data")
        fp_sim = match.get('fingerprint_similarity')
        if fp_sim and fp_sim > 0.7:
            score *= 1.3

        # bonus for complementary skills; guard against an explicit None
        # so len() cannot raise TypeError
        comp_skills = match.get('complementary_skills', [])
        if comp_skills is None:
            comp_skills = []
        if len(comp_skills) >= 3:
            score *= 1.1

        match['quality_score'] = score
        ranked.append(match)

    # list.sort is stable, and reverse=True keeps the input order of ties
    ranked.sort(key=lambda x: x['quality_score'], reverse=True)

    return ranked
|
||
|
|
|
||
|
|
|
||
|
|
def find_all_matches(db, min_score=30, min_overlap=20):
    """
    scan every pair of humans from the database for potential matches

    db          - storage object; get_all_humans, save_fingerprint and
                  save_match are called on it
    min_score   - forwarded to db.get_all_humans as its min_score filter
    min_overlap - a pair is kept only if its overlap_score is at least this

    each kept pair is saved through db.save_match and collected as a dict
    holding 'human_a', 'human_b' and every key of the overlap result

    returns the collected matches after ranking with rank_matches
    """
    print("matchd: finding all potential matches...")

    # candidates that pass the score threshold
    humans = db.get_all_humans(min_score=min_score)
    print(f" {len(humans)} humans to match")

    # fingerprint each candidate once, persisting it as we go
    fingerprints = {}
    for person in humans:
        fingerprint = generate_fingerprint(person)
        fingerprints[person['id']] = fingerprint
        db.save_fingerprint(person['id'], fingerprint)

    print(f" generated {len(fingerprints)} fingerprints")

    matches = []
    checked = 0  # stays 0 when there are fewer than two humans
    skipped_same = 0
    skipped_connected = 0

    # walk all unordered pairs; `checked` counts pairs seen so far
    for checked, (first, second) in enumerate(combinations(humans, 2), start=1):
        # two profiles that look like one person are not a match
        if is_same_person(first, second):
            skipped_same += 1
            continue

        # already connected (same org, company, co-contributors)
        already_connected, _reason = check_already_connected(first, second)
        if already_connected:
            skipped_connected += 1
            continue

        overlap = find_overlap(
            first,
            second,
            fingerprints.get(first['id']),
            fingerprints.get(second['id']),
        )

        if overlap['overlap_score'] >= min_overlap:
            matches.append({'human_a': first, 'human_b': second, **overlap})
            db.save_match(first['id'], second['id'], overlap)

        # progress report; only reached for pairs that were not skipped
        if checked % 1000 == 0:
            print(f" checked {checked} pairs, {len(matches)} matches so far...")

    print(f" checked {checked} pairs")
    print(f" skipped {skipped_same} (same person), {skipped_connected} (already connected)")
    print(f" found {len(matches)} potential matches")

    return rank_matches(matches)
|
||
|
|
|
||
|
|
|
||
|
|
def get_top_matches(db, limit=50):
    """
    load stored matches from the database with both human records attached

    db    - storage object; get_matches(limit=...) and get_human_by_id(id)
            are called on it
    limit - forwarded to db.get_matches

    rows whose human_a_id or human_b_id does not resolve to a record are
    left out, so the result can be shorter than what get_matches returned

    returns a list of match dicts
    """
    # columns copied from the row as-is, in output order
    copied_fields = ('overlap_score', 'overlap_reasons', 'geographic_match', 'status')

    results = []
    for row in db.get_matches(limit=limit):
        first = db.get_human_by_id(row['human_a_id'])
        second = db.get_human_by_id(row['human_b_id'])

        # drop rows that point at a missing human
        if not (first and second):
            continue

        entry = {'id': row['id'], 'human_a': first, 'human_b': second}
        for field in copied_fields:
            entry[field] = row[field]
        results.append(entry)

    return results
|