connectd/haos-addon/scoutd/signals.py

159 lines
4.9 KiB
Python
Raw Normal View History

"""
shared signal patterns, curated source lists, and text scoring used by all scrapers
"""
import re
# positive signals - what we're looking for
# every entry is a (regex, signal_name, points) tuple; analyze_text adds
# `points` to the score once for each pattern that matches the text
POSITIVE_PATTERNS = [
    # --- values ---
    (r'\b(solarpunk|cyberpunk)\b', 'solarpunk', 10),
    (r'\b(anarchis[tm]|mutual.?aid)\b', 'mutual_aid', 10),
    (r'\b(cooperative|collective|worker.?owned?|coop|co.?op)\b', 'cooperative', 15),
    (r'\b(community|commons)\b', 'community', 5),
    (r'\b(intentional.?community|cohousing|commune)\b', 'intentional_community', 20),

    # --- queer-friendly ---
    (r'\b(queer|lgbtq?|trans|nonbinary|enby|genderqueer)\b', 'queer', 15),
    (r'\b(they/them|she/her|he/him|xe/xem|any.?pronouns)\b', 'pronouns', 10),
    (r'\bblm\b', 'blm', 5),
    (r'\b(acab|1312)\b', 'acab', 5),

    # --- tech values ---
    (r'\b(privacy|surveillance|anti.?surveillance)\b', 'privacy', 10),
    (r'\b(self.?host(?:ed|ing)?|homelab|home.?server)\b', 'selfhosted', 15),
    (r'\b(local.?first|offline.?first)\b', 'local_first', 15),
    (r'\b(decentralized?|federation|federated|fediverse)\b', 'decentralized', 10),
    (r'\b(foss|libre|open.?source|copyleft)\b', 'foss', 10),
    (r'\b(home.?assistant|home.?automation)\b', 'home_automation', 10),
    (r'\b(mesh|p2p|peer.?to.?peer)\b', 'p2p', 10),
    (r'\b(matrix|xmpp|irc)\b', 'federated_chat', 5),
    (r'\b(degoogle|de.?google)\b', 'degoogle', 10),

    # --- location / availability ---
    (r'\b(seattle|portland|pnw|cascadia|pacific.?northwest)\b', 'pnw', 20),
    (r'\b(washington|oregon)\b', 'pnw_state', 10),
    (r'\b(remote|anywhere|relocate|looking.?to.?move)\b', 'remote', 10),

    # --- anti-capitalism ---
    (r'\b(anti.?capitalis[tm]|post.?capitalis[tm]|degrowth)\b', 'anticapitalist', 10),

    # --- neurodivergent (often overlaps with our values) ---
    (r'\b(neurodivergent|adhd|autistic|autism)\b', 'neurodivergent', 5),

    # --- technical skills (small bonus for builders) ---
    (r'\b(rust|go|python|typescript)\b', 'modern_lang', 3),
    (r'\b(linux|bsd|nixos)\b', 'unix', 3),
    (r'\b(kubernetes|docker|podman)\b', 'containers', 3),
]
# negative signals - red flags
# same (regex, signal_name, points) layout as the positive table; the points
# are stored as negative numbers so a match lowers the score
NEGATIVE_PATTERNS = [
    (r'\b(qanon|maga|trump|wwg1wga)\b', 'maga', -50),
    # two conspiracy tiers share one signal name but carry different weights
    (r'\b(covid.?hoax|plandemic|5g.?conspiracy)\b', 'conspiracy', -50),
    (r'\b(nwo|illuminati|deep.?state)\b', 'conspiracy', -30),
    (r'\b(anti.?vax|antivax)\b', 'antivax', -30),
    (r'\b(sovereign.?citizen)\b', 'sovcit', -40),
    (r'\b(crypto.?bro|web3|nft|blockchain|bitcoin|ethereum)\b', 'crypto', -15),
    (r'\b(conservative|republican)\b', 'conservative', -20),
    (r'\b(free.?speech.?absolutist)\b', 'freeze_peach', -20),
]
# target topics for repo discovery
TARGET_TOPICS = [
    'local-first',
    'self-hosted',
    'privacy',
    'mesh-network',
    'cooperative',
    'solarpunk',
    'decentralized',
    'p2p',
    'fediverse',
    'activitypub',
    'matrix-org',
    'homeassistant',
    'esphome',
    'open-source-hardware',
    'right-to-repair',
    'mutual-aid',
    'commons',
    'degoogle',
    'privacy-tools',
]
# ecosystem repos - high signal contributors
# entries are 'owner/name' slugs
ECOSYSTEM_REPOS = [
    'home-assistant/core',
    'esphome/esphome',
    'matrix-org/synapse',
    'LemmyNet/lemmy',
    'mastodon/mastodon',
    'owncast/owncast',
    'nextcloud/server',
    'immich-app/immich',
    'jellyfin/jellyfin',
    'navidrome/navidrome',
    'paperless-ngx/paperless-ngx',
    'actualbudget/actual',
    'firefly-iii/firefly-iii',
    'logseq/logseq',
    'AppFlowy-IO/AppFlowy',
    'siyuan-note/siyuan',
    'anytype/anytype-ts',
    'calcom/cal.com',
    'plausible/analytics',
    'umami-software/umami',
]
# aligned subreddits
# maps subreddit name (case as written here) -> weight
ALIGNED_SUBREDDITS = {
    'intentionalcommunity': 25,
    'cohousing': 25,
    'cooperatives': 20,
    'solarpunk': 20,
    'selfhosted': 15,
    'homeassistant': 15,
    'homelab': 10,
    'privacy': 15,
    'PrivacyGuides': 15,
    'degoogle': 15,
    'anticonsumption': 10,
    'Frugal': 5,
    'simpleliving': 5,
    'Seattle': 10,
    'Portland': 10,
    'cascadia': 15,
    'linux': 5,
    'opensource': 10,
    'FOSS': 10,
}
# negative subreddits
# plain list of names - unlike the aligned table, no per-entry weight is stored
NEGATIVE_SUBREDDITS = [
    'conspiracy',
    'conservative',
    'walkaway',
    'louderwithcrowder',
    'JordanPeterson',
    'TimPool',
    'NoNewNormal',
    'LockdownSkepticism',
]
# high-signal mastodon instances
# maps instance domain -> weight
ALIGNED_INSTANCES = {
    'tech.lgbt': 20,
    'social.coop': 25,
    'fosstodon.org': 10,
    'hackers.town': 15,
    'hachyderm.io': 10,
    'infosec.exchange': 5,
}
def analyze_text(text, *, positive_patterns=None, negative_patterns=None):
    """
    analyze text for signals

    every pattern is searched case-insensitively and contributes its points
    at most once, however many times it occurs in the text. patterns that
    share a signal name each add their own points, but the name is reported
    only once.

    args:
        text: string to scan; falsy input (None, '') short-circuits.
        positive_patterns: optional iterable of (regex, signal_name, points)
            tuples. defaults to the module-level POSITIVE_PATTERNS.
        negative_patterns: optional iterable of the same shape whose points
            are already negative. defaults to the module-level
            NEGATIVE_PATTERNS.

    returns: (score, signals_found, negative_signals)
        both lists hold unique signal names in the order their patterns
        first matched, so the result is identical from run to run.
    """
    if not text:
        return 0, [], []

    if positive_patterns is None:
        positive_patterns = POSITIVE_PATTERNS
    if negative_patterns is None:
        negative_patterns = NEGATIVE_PATTERNS

    # the built-in tables are written in lowercase; IGNORECASE below also
    # covers caller-supplied patterns that contain uppercase letters
    text = text.lower()
    score = 0

    # dict keys keep insertion order, so they de-duplicate names without the
    # run-to-run reordering that list(set(...)) shows under hash randomization
    signals = {}
    negatives = {}

    for patterns, found in (
        (positive_patterns, signals),
        (negative_patterns, negatives),
    ):
        for pattern, signal_name, points in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                score += points  # negative tables carry negative points
                found[signal_name] = None

    return score, list(signals), list(negatives)