# Friends_forever / src / extract_features.py
# Hosting-page header captured with the file: uploaded by ak0601
# ("Upload 6 files", commit e386167, verified); raw size 2.26 kB.
import re
import random
from collections import Counter, defaultdict
# A message line looks like "<d/m/y>, <h:mm>[ AM|PM] - <sender>: <text>".
# The sender may not contain a colon and must be followed by ": ".
_MESSAGE_RE = re.compile(
    r"(\d{1,2}/\d{1,2}/\d{2,4}), "
    r"(\d{1,2}:\d{2}(?:\s?[AaPp][Mm])?) - "
    r"([^:]+): (.*)"
)
# Any line that opens with a timestamp, whether or not a "sender: " follows.
_TIMESTAMP_RE = re.compile(
    r"\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}(?:\s?[AaPp][Mm])? - "
)
# Raw sender names as they appear in the export -> display names.
_SENDER_ALIASES = {"ak": "Aman", "Sarah con H": "Sarah"}


def parse_chat(file_path):
    """Parse an exported chat log into a list of message dicts.

    Each message starts on a line of the form
    ``<d/m/y>, <h:mm> - <sender>: <text>`` (an optional AM/PM suffix on
    the time is accepted).  Lines that follow a message and do not start
    with a timestamp are treated as the continuation of that message and
    are appended to its text, separated by a newline; blank continuation
    lines are not preserved.  Timestamped lines without a ``sender: ``
    part (e.g. system notices) are skipped, and so is any text before the
    first message.

    Args:
        file_path: Path of the UTF-8 encoded chat export.

    Returns:
        A list of dicts with the string keys ``"date"``, ``"time"``,
        ``"sender"`` (after alias normalisation) and ``"text"``
        (stripped of surrounding whitespace), in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    messages = []
    current = None  # message that continuation lines should be attached to
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            match = _MESSAGE_RE.match(line)
            if match:
                date, time, sender, text = match.groups()
                current = {
                    "date": date,
                    "time": time,
                    "sender": _SENDER_ALIASES.get(sender, sender),
                    "text": text.strip(),
                }
                messages.append(current)
            elif _TIMESTAMP_RE.match(line):
                # Timestamped line without "sender: " -- not a user message.
                # Reset so later stray lines are not glued to an older message.
                current = None
            elif current is not None:
                # Continuation of a multi-line message.
                current["text"] = (current["text"] + "\n" + line.strip()).strip()
    return messages
def extract_inside_jokes(messages, top_n=40):
    """Bucket message texts into funny / cute / memorable and rank words.

    Keyword checks are case-insensitive substring tests against the
    lower-cased message text, so a keyword also matches inside a longer
    word.  A message may land in several buckets.

    Args:
        messages: Iterable of dicts that each have a ``"text"`` string.
        top_n: How many of the most frequent whitespace-separated,
            lower-cased tokens to return.  Defaults to 40.

    Returns:
        A dict with four lists:
        ``"funny"``    - original texts containing a laughter keyword/emoji,
        ``"cute"``     - original texts containing an affectionate keyword,
        ``"memories"`` - original texts with more than four words,
        ``"top_words"``- the ``top_n`` most common tokens, most frequent first.
    """
    # The two emoji are written as escapes so the literals cannot be damaged
    # by a re-encoding of this file: U+1F602 (face with tears of joy) and
    # U+1F923 (rolling on the floor laughing).
    funny_keywords = (
        "lol", "\U0001F602", "\U0001F923", "lmao",
        "funny", "haha", "hehe", "hahaha",
    )
    cute_keywords = (
        "miss", "thank", "sweet", "cute", "proud", "happy", "aww", "glad",
    )

    funny_candidates = []
    cute_candidates = []
    memory_candidates = []
    word_counter = Counter()

    for msg in messages:
        original = msg["text"]
        lowered = original.lower()

        # Funny moments
        if any(k in lowered for k in funny_keywords):
            funny_candidates.append(original)

        # Cute/emotional moments
        if any(k in lowered for k in cute_keywords):
            cute_candidates.append(original)

        # Memorable moments: skip anything of four words or fewer
        if len(original.split()) > 4:
            memory_candidates.append(original)

        # Tally individual lower-cased tokens
        word_counter.update(lowered.split())

    return {
        "funny": funny_candidates,
        "cute": cute_candidates,
        "memories": memory_candidates,
        "top_words": [word for word, _ in word_counter.most_common(top_n)],
    }
def random_memory(messages):
    """Pick one message text at random from the longer messages.

    Only texts with more than 10 characters are eligible.  When no
    message qualifies, a fixed placeholder sentence is returned instead.
    """
    eligible = []
    for entry in messages:
        body = entry["text"]
        if len(body) > 10:
            eligible.append(body)

    if eligible:
        return random.choice(eligible)
    return "One of your old conversations ❀️"