Spaces:
Runtime error
Runtime error
Commit
·
52cf520
1
Parent(s):
819d651
Add application files
Browse files- app.py +17 -14
- services/__pycache__/enhanced_contact_finder.cpython-310.pyc +0 -0
- services/enhanced_contact_finder.py +225 -351
app.py
CHANGED
|
@@ -590,19 +590,21 @@ def get_contacts_html() -> str:
|
|
| 590 |
"""
|
| 591 |
|
| 592 |
html = """
|
| 593 |
-
<div style="background: var(--
|
| 594 |
-
<div style="font-size: 13px; color: var(--
|
| 595 |
-
<strong
|
| 596 |
-
|
| 597 |
</div>
|
| 598 |
</div>
|
| 599 |
"""
|
| 600 |
for c in reversed(knowledge_base["contacts"]):
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
|
|
|
|
|
|
| 606 |
html += f"""
|
| 607 |
<div class="prospect-card" style="padding: 16px 20px;">
|
| 608 |
<div style="display: flex; justify-content: space-between; align-items: center;">
|
|
@@ -610,10 +612,11 @@ def get_contacts_html() -> str:
|
|
| 610 |
<div style="font-weight: 600; color: var(--text-primary);">👤 {c.get("name", "Unknown")}</div>
|
| 611 |
<div style="font-size: 13px; color: var(--text-secondary); margin-top: 4px;">{c.get("title", "Unknown title")}</div>
|
| 612 |
<div style="font-size: 13px; color: var(--text-secondary);">🏢 {c.get("company", "Unknown company")}</div>
|
| 613 |
-
{f'<div style="font-size: 13px; color: var(--primary-blue); margin-top: 4px;">📧 {c.get("email")}
|
| 614 |
</div>
|
| 615 |
-
<span class="prospect-card-badge
|
| 616 |
</div>
|
|
|
|
| 617 |
</div>
|
| 618 |
"""
|
| 619 |
return html
|
|
@@ -1138,12 +1141,12 @@ After processing {num_prospects} prospects, provide summary:
|
|
| 1138 |
output += f" - {p.get('summary')[:150]}...\n" if len(p.get('summary', '')) > 150 else f" - {p.get('summary')}\n"
|
| 1139 |
|
| 1140 |
if contacts_found:
|
| 1141 |
-
output += "\n### 👥 Decision Makers Found\n\n"
|
| 1142 |
-
output += ">
|
| 1143 |
for c in contacts_found:
|
| 1144 |
output += f"- **{c.get('name', 'Unknown')}** - {c.get('title', 'Unknown')} at {c.get('company', 'Unknown')}\n"
|
| 1145 |
if c.get('email'):
|
| 1146 |
-
output += f" - Email: {c.get('email')} *(
|
| 1147 |
|
| 1148 |
if emails_drafted:
|
| 1149 |
output += "\n### ✉️ Emails Drafted\n\n"
|
|
|
|
| 590 |
"""
|
| 591 |
|
| 592 |
html = """
|
| 593 |
+
<div style="background: var(--success-bg, #d4edda); border: 1px solid var(--success-border, #c3e6cb); border-radius: 8px; padding: 12px 16px; margin-bottom: 16px;">
|
| 594 |
+
<div style="font-size: 13px; color: var(--success-text, #155724);">
|
| 595 |
+
<strong>✅ Verified Contacts:</strong> All contacts shown here were found through web searches of LinkedIn profiles,
|
| 596 |
+
company team pages, and public directories. Only contacts with <strong>verified email addresses</strong> found on the web are displayed.
|
| 597 |
</div>
|
| 598 |
</div>
|
| 599 |
"""
|
| 600 |
for c in reversed(knowledge_base["contacts"]):
|
| 601 |
+
source = c.get("source", "web_search")
|
| 602 |
+
source_label = {
|
| 603 |
+
"web_search": "Found via web search",
|
| 604 |
+
"linkedin": "Found via LinkedIn",
|
| 605 |
+
"team_page": "Found on company page",
|
| 606 |
+
"web_search_and_scraping": "Verified from web"
|
| 607 |
+
}.get(source, "Verified")
|
| 608 |
html += f"""
|
| 609 |
<div class="prospect-card" style="padding: 16px 20px;">
|
| 610 |
<div style="display: flex; justify-content: space-between; align-items: center;">
|
|
|
|
| 612 |
<div style="font-weight: 600; color: var(--text-primary);">👤 {c.get("name", "Unknown")}</div>
|
| 613 |
<div style="font-size: 13px; color: var(--text-secondary); margin-top: 4px;">{c.get("title", "Unknown title")}</div>
|
| 614 |
<div style="font-size: 13px; color: var(--text-secondary);">🏢 {c.get("company", "Unknown company")}</div>
|
| 615 |
+
{f'<div style="font-size: 13px; color: var(--primary-blue); margin-top: 4px;">📧 {c.get("email")}</div>' if c.get("email") else ''}
|
| 616 |
</div>
|
| 617 |
+
<span class="prospect-card-badge badge-engaged">VERIFIED</span>
|
| 618 |
</div>
|
| 619 |
+
<div style="font-size: 11px; color: var(--text-secondary); margin-top: 8px;">{source_label}</div>
|
| 620 |
</div>
|
| 621 |
"""
|
| 622 |
return html
|
|
|
|
| 1141 |
output += f" - {p.get('summary')[:150]}...\n" if len(p.get('summary', '')) > 150 else f" - {p.get('summary')}\n"
|
| 1142 |
|
| 1143 |
if contacts_found:
|
| 1144 |
+
output += "\n### 👥 Verified Decision Makers Found\n\n"
|
| 1145 |
+
output += "> ✅ **Verified:** These contacts were found through web searches of LinkedIn, company pages, and public sources.\n\n"
|
| 1146 |
for c in contacts_found:
|
| 1147 |
output += f"- **{c.get('name', 'Unknown')}** - {c.get('title', 'Unknown')} at {c.get('company', 'Unknown')}\n"
|
| 1148 |
if c.get('email'):
|
| 1149 |
+
output += f" - Email: {c.get('email')} *(verified)*\n"
|
| 1150 |
|
| 1151 |
if emails_drafted:
|
| 1152 |
output += "\n### ✉️ Emails Drafted\n\n"
|
services/__pycache__/enhanced_contact_finder.cpython-310.pyc
CHANGED
|
Binary files a/services/__pycache__/enhanced_contact_finder.cpython-310.pyc and b/services/__pycache__/enhanced_contact_finder.cpython-310.pyc differ
|
|
|
services/enhanced_contact_finder.py
CHANGED
|
@@ -72,15 +72,11 @@ class EnhancedContactFinder:
|
|
| 72 |
r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*\n\s*([A-Z][^,\n]+)',
|
| 73 |
]
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
|
|
|
|
| 77 |
'{first}.{last}', # [email protected]
|
| 78 |
'{first}{last}', # [email protected]
|
| 79 |
-
'{first}_{last}', # [email protected]
|
| 80 |
-
'{first}-{last}', # [email protected]
|
| 81 |
-
'{first}', # [email protected]
|
| 82 |
-
'{f}{last}', # [email protected]
|
| 83 |
-
'{first}{l}', # [email protected]
|
| 84 |
]
|
| 85 |
|
| 86 |
async def find_real_contacts(
|
|
@@ -91,123 +87,74 @@ class EnhancedContactFinder:
|
|
| 91 |
max_contacts: int = 3
|
| 92 |
) -> List[Contact]:
|
| 93 |
"""
|
| 94 |
-
Find real decision-makers
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
Returns:
|
| 97 |
-
List of Contact objects with
|
| 98 |
"""
|
| 99 |
-
logger.info(f"EnhancedFinder: Finding
|
| 100 |
print(f"\n[CONTACT FINDER] Starting search for {company_name}")
|
| 101 |
print(f"[CONTACT FINDER] Domain: {domain}")
|
| 102 |
print(f"[CONTACT FINDER] Target titles: {target_titles}")
|
| 103 |
print(f"[CONTACT FINDER] Max contacts: {max_contacts}")
|
|
|
|
| 104 |
|
| 105 |
contacts = []
|
| 106 |
seen_emails: Set[str] = set()
|
| 107 |
|
| 108 |
-
# Strategy 1: Search
|
| 109 |
-
print(f"[CONTACT FINDER] Strategy 1: Searching
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
logger.info(f"EnhancedFinder: Found {title} via LinkedIn search")
|
| 122 |
-
print(f"[CONTACT FINDER] ✓ FOUND: {linkedin_contact.name} ({linkedin_contact.title}) - {linkedin_contact.email}")
|
| 123 |
-
|
| 124 |
-
if len(contacts) >= max_contacts:
|
| 125 |
-
print(f"[CONTACT FINDER] Found enough contacts ({len(contacts)}), returning early")
|
| 126 |
-
return contacts
|
| 127 |
-
else:
|
| 128 |
-
print(f"[CONTACT FINDER] ✗ Not found via LinkedIn")
|
| 129 |
-
|
| 130 |
-
# Strategy 2: Scrape company team pages
|
| 131 |
if len(contacts) < max_contacts:
|
| 132 |
-
|
|
|
|
| 133 |
company_name,
|
| 134 |
domain,
|
| 135 |
target_titles,
|
| 136 |
seen_emails,
|
| 137 |
max_contacts - len(contacts)
|
| 138 |
)
|
| 139 |
-
contacts.extend(
|
| 140 |
-
|
| 141 |
|
| 142 |
-
# Strategy 3:
|
| 143 |
if len(contacts) < max_contacts:
|
| 144 |
-
|
|
|
|
| 145 |
company_name,
|
| 146 |
domain,
|
| 147 |
target_titles,
|
| 148 |
seen_emails,
|
| 149 |
max_contacts - len(contacts)
|
| 150 |
)
|
| 151 |
-
contacts.extend(
|
| 152 |
-
|
| 153 |
|
| 154 |
-
logger.info(f"EnhancedFinder: Total {len(contacts)}
|
| 155 |
print(f"[CONTACT FINDER] === FINAL RESULT ===")
|
| 156 |
-
print(f"[CONTACT FINDER] Total contacts found: {len(contacts)}")
|
| 157 |
for i, contact in enumerate(contacts[:max_contacts], 1):
|
| 158 |
-
print(f"[CONTACT FINDER] {i}. {contact.name} ({contact.title}) - {contact.email}")
|
|
|
|
|
|
|
|
|
|
| 159 |
print(f"[CONTACT FINDER] ====================\n")
|
| 160 |
return contacts[:max_contacts]
|
| 161 |
|
| 162 |
-
async def
|
| 163 |
-
self,
|
| 164 |
-
company_name: str,
|
| 165 |
-
title: str,
|
| 166 |
-
domain: str,
|
| 167 |
-
seen_emails: Set[str]
|
| 168 |
-
) -> Optional[Contact]:
|
| 169 |
-
"""Search LinkedIn specifically for decision-makers"""
|
| 170 |
-
|
| 171 |
-
# LinkedIn-specific search queries
|
| 172 |
-
queries = [
|
| 173 |
-
f'site:linkedin.com/in {title} at {company_name}',
|
| 174 |
-
f'linkedin {company_name} {title}',
|
| 175 |
-
f'"{title}" "{company_name}" linkedin.com',
|
| 176 |
-
]
|
| 177 |
-
|
| 178 |
-
for query in queries:
|
| 179 |
-
try:
|
| 180 |
-
print(f"[CONTACT FINDER] Query: '{query}'")
|
| 181 |
-
results = await self.search.search(query, max_results=5)
|
| 182 |
-
print(f"[CONTACT FINDER] Results: {len(results)} found")
|
| 183 |
-
|
| 184 |
-
for result in results:
|
| 185 |
-
# Extract name and title from LinkedIn result
|
| 186 |
-
contact_info = self._extract_linkedin_info(result, title, company_name)
|
| 187 |
-
|
| 188 |
-
if contact_info and contact_info.get('name'):
|
| 189 |
-
name = contact_info['name']
|
| 190 |
-
detected_title = contact_info.get('title', title)
|
| 191 |
-
|
| 192 |
-
# Generate email from name
|
| 193 |
-
email = await self._generate_verified_email(name, domain, seen_emails)
|
| 194 |
-
|
| 195 |
-
if email:
|
| 196 |
-
return Contact(
|
| 197 |
-
id=str(uuid.uuid4()),
|
| 198 |
-
name=name,
|
| 199 |
-
email=email,
|
| 200 |
-
title=detected_title,
|
| 201 |
-
prospect_id=""
|
| 202 |
-
)
|
| 203 |
-
|
| 204 |
-
except Exception as e:
|
| 205 |
-
logger.debug(f"EnhancedFinder: LinkedIn search error for '{query}': {str(e)}")
|
| 206 |
-
continue
|
| 207 |
-
|
| 208 |
-
return None
|
| 209 |
-
|
| 210 |
-
async def _scrape_team_pages(
|
| 211 |
self,
|
| 212 |
company_name: str,
|
| 213 |
domain: str,
|
|
@@ -215,148 +162,54 @@ class EnhancedContactFinder:
|
|
| 215 |
seen_emails: Set[str],
|
| 216 |
max_needed: int
|
| 217 |
) -> List[Contact]:
|
| 218 |
-
"""
|
| 219 |
-
|
| 220 |
contacts = []
|
| 221 |
|
| 222 |
-
#
|
| 223 |
-
|
| 224 |
-
f'
|
| 225 |
-
f'
|
| 226 |
-
f'site:{domain}
|
| 227 |
-
f'{company_name}
|
| 228 |
]
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
for query in team_page_queries:
|
| 233 |
try:
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
url = result.get('url', '')
|
| 237 |
-
if domain in url and any(pattern in url.lower() for pattern in self.team_page_patterns):
|
| 238 |
-
team_page_urls.add(url)
|
| 239 |
-
except Exception as e:
|
| 240 |
-
logger.debug(f"EnhancedFinder: Team page search error: {str(e)}")
|
| 241 |
-
continue
|
| 242 |
-
|
| 243 |
-
# Also try common team page URLs directly
|
| 244 |
-
for pattern in self.team_page_patterns[:3]: # Try top 3 patterns
|
| 245 |
-
team_page_urls.add(f"https://{domain}{pattern}")
|
| 246 |
-
team_page_urls.add(f"https://www.{domain}{pattern}")
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
except Exception as e:
|
| 268 |
-
logger.debug(f"
|
| 269 |
continue
|
| 270 |
|
| 271 |
return contacts
|
| 272 |
|
| 273 |
-
async def
|
| 274 |
-
self,
|
| 275 |
-
url: str,
|
| 276 |
-
company_name: str,
|
| 277 |
-
domain: str,
|
| 278 |
-
target_titles: List[str],
|
| 279 |
-
seen_emails: Set[str]
|
| 280 |
-
) -> List[Contact]:
|
| 281 |
-
"""Extract contact information from a webpage"""
|
| 282 |
-
|
| 283 |
-
contacts = []
|
| 284 |
-
|
| 285 |
-
try:
|
| 286 |
-
# Scrape the page
|
| 287 |
-
page_content = await self.scraper.scrape_page(url)
|
| 288 |
-
|
| 289 |
-
if not page_content:
|
| 290 |
-
return contacts
|
| 291 |
-
|
| 292 |
-
# Extract all text
|
| 293 |
-
text = page_content.get('text', '')
|
| 294 |
-
|
| 295 |
-
# Find all potential contacts using regex patterns
|
| 296 |
-
potential_contacts = []
|
| 297 |
-
|
| 298 |
-
for pattern in self.name_patterns:
|
| 299 |
-
matches = re.finditer(pattern, text, re.MULTILINE)
|
| 300 |
-
for match in matches:
|
| 301 |
-
name = match.group(1).strip()
|
| 302 |
-
title = match.group(2).strip() if len(match.groups()) > 1 else ""
|
| 303 |
-
|
| 304 |
-
# Validate name
|
| 305 |
-
if self._is_valid_name(name):
|
| 306 |
-
potential_contacts.append({
|
| 307 |
-
'name': name,
|
| 308 |
-
'title': title
|
| 309 |
-
})
|
| 310 |
-
|
| 311 |
-
# Also look for email addresses directly on the page
|
| 312 |
-
email_pattern = r'\b[A-Za-z0-9._%+-]+@' + re.escape(domain) + r'\b'
|
| 313 |
-
found_emails = re.findall(email_pattern, text, re.IGNORECASE)
|
| 314 |
-
|
| 315 |
-
# Match contacts with titles we're looking for
|
| 316 |
-
for pc in potential_contacts:
|
| 317 |
-
name = pc['name']
|
| 318 |
-
title = pc['title']
|
| 319 |
-
|
| 320 |
-
# Check if title matches any of our target titles
|
| 321 |
-
title_match = any(
|
| 322 |
-
target.lower() in title.lower() or title.lower() in target.lower()
|
| 323 |
-
for target in target_titles
|
| 324 |
-
)
|
| 325 |
-
|
| 326 |
-
if title_match or not pc['title']: # Include if title matches or no title found
|
| 327 |
-
# Try to find email for this person
|
| 328 |
-
email = None
|
| 329 |
-
|
| 330 |
-
# First, check if we found a direct email for this person on the page
|
| 331 |
-
name_parts = name.lower().split()
|
| 332 |
-
for found_email in found_emails:
|
| 333 |
-
if any(part in found_email.lower() for part in name_parts):
|
| 334 |
-
email = found_email
|
| 335 |
-
break
|
| 336 |
-
|
| 337 |
-
# If no direct email, generate one
|
| 338 |
-
if not email:
|
| 339 |
-
email = await self._generate_verified_email(name, domain, seen_emails)
|
| 340 |
-
|
| 341 |
-
if email and email.lower() not in seen_emails:
|
| 342 |
-
# Use matched title or best guess from target titles
|
| 343 |
-
final_title = title if title else target_titles[0]
|
| 344 |
-
|
| 345 |
-
contacts.append(Contact(
|
| 346 |
-
id=str(uuid.uuid4()),
|
| 347 |
-
name=name,
|
| 348 |
-
email=email,
|
| 349 |
-
title=final_title,
|
| 350 |
-
prospect_id=""
|
| 351 |
-
))
|
| 352 |
-
seen_emails.add(email.lower())
|
| 353 |
-
|
| 354 |
-
except Exception as e:
|
| 355 |
-
logger.error(f"EnhancedFinder: Error extracting contacts from {url}: {str(e)}")
|
| 356 |
-
|
| 357 |
-
return contacts
|
| 358 |
-
|
| 359 |
-
async def _search_company_contacts(
|
| 360 |
self,
|
| 361 |
company_name: str,
|
| 362 |
domain: str,
|
|
@@ -364,122 +217,188 @@ class EnhancedContactFinder:
|
|
| 364 |
seen_emails: Set[str],
|
| 365 |
max_needed: int
|
| 366 |
) -> List[Contact]:
|
| 367 |
-
"""
|
| 368 |
-
|
| 369 |
contacts = []
|
| 370 |
|
| 371 |
-
#
|
| 372 |
-
|
| 373 |
-
f
|
| 374 |
-
f
|
| 375 |
-
f
|
| 376 |
-
f
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
]
|
| 378 |
|
| 379 |
-
for
|
| 380 |
try:
|
| 381 |
-
|
|
|
|
|
|
|
| 382 |
|
| 383 |
-
|
| 384 |
-
# Try to extract contact info
|
| 385 |
-
extracted = self._extract_contact_from_text(
|
| 386 |
-
result.get('title', '') + ' ' + result.get('body', ''),
|
| 387 |
-
target_titles,
|
| 388 |
-
company_name
|
| 389 |
-
)
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
title = extracted['title']
|
| 394 |
|
| 395 |
-
|
|
|
|
|
|
|
|
|
|
| 396 |
|
| 397 |
-
if
|
| 398 |
contacts.append(Contact(
|
| 399 |
id=str(uuid.uuid4()),
|
| 400 |
name=name,
|
| 401 |
email=email,
|
| 402 |
-
title=title,
|
| 403 |
prospect_id=""
|
| 404 |
))
|
| 405 |
seen_emails.add(email.lower())
|
|
|
|
| 406 |
|
| 407 |
if len(contacts) >= max_needed:
|
| 408 |
return contacts
|
| 409 |
|
| 410 |
except Exception as e:
|
| 411 |
-
logger.debug(f"
|
| 412 |
continue
|
| 413 |
|
| 414 |
return contacts
|
| 415 |
|
| 416 |
-
def
|
| 417 |
self,
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
# LinkedIn title format: "Name - Title at Company | LinkedIn"
|
| 427 |
-
linkedin_pattern = r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*[-–—]\s*([^|]+?)\s*(?:at|@)\s*([^|]+)'
|
| 428 |
-
match = re.search(linkedin_pattern, text)
|
| 429 |
-
|
| 430 |
-
if match:
|
| 431 |
-
name = match.group(1).strip()
|
| 432 |
-
title = match.group(2).strip()
|
| 433 |
-
company = match.group(3).strip()
|
| 434 |
-
|
| 435 |
-
# Validate that it's the right company
|
| 436 |
-
if company_name.lower() in company.lower() and self._is_valid_name(name):
|
| 437 |
-
return {
|
| 438 |
-
'name': name,
|
| 439 |
-
'title': title
|
| 440 |
-
}
|
| 441 |
-
|
| 442 |
-
# Try other patterns
|
| 443 |
-
for pattern in self.name_patterns:
|
| 444 |
-
match = re.search(pattern, text)
|
| 445 |
-
if match and len(match.groups()) >= 2:
|
| 446 |
-
name = match.group(1).strip()
|
| 447 |
-
title = match.group(2).strip()
|
| 448 |
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
-
return
|
| 456 |
|
| 457 |
-
def
|
| 458 |
-
|
| 459 |
-
text:
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
for pattern in self.name_patterns:
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
if
|
| 470 |
-
name =
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
return None
|
| 484 |
|
| 485 |
def _is_valid_name(self, name: str) -> bool:
|
|
@@ -515,51 +434,6 @@ class EnhancedContactFinder:
|
|
| 515 |
|
| 516 |
return True
|
| 517 |
|
| 518 |
-
async def _generate_verified_email(
|
| 519 |
-
self,
|
| 520 |
-
name: str,
|
| 521 |
-
domain: str,
|
| 522 |
-
seen_emails: Set[str]
|
| 523 |
-
) -> Optional[str]:
|
| 524 |
-
"""Generate and validate email address from name"""
|
| 525 |
-
|
| 526 |
-
# Clean name
|
| 527 |
-
name_clean = re.sub(r"[^a-zA-Z\s]", "", name).strip().lower()
|
| 528 |
-
parts = name_clean.split()
|
| 529 |
-
|
| 530 |
-
if len(parts) < 2:
|
| 531 |
-
return None
|
| 532 |
-
|
| 533 |
-
first = parts[0]
|
| 534 |
-
last = parts[-1]
|
| 535 |
-
|
| 536 |
-
# Try different email formats
|
| 537 |
-
for fmt in self.email_formats:
|
| 538 |
-
try:
|
| 539 |
-
email_prefix = fmt.format(
|
| 540 |
-
first=first,
|
| 541 |
-
last=last,
|
| 542 |
-
f=first[0] if first else '',
|
| 543 |
-
l=last[0] if last else ''
|
| 544 |
-
)
|
| 545 |
-
|
| 546 |
-
email = f"{email_prefix}@{domain}"
|
| 547 |
-
|
| 548 |
-
# Validate format
|
| 549 |
-
validated = validate_email(email, check_deliverability=False)
|
| 550 |
-
normalized = validated.normalized
|
| 551 |
-
|
| 552 |
-
# Check if not seen
|
| 553 |
-
if normalized.lower() not in seen_emails:
|
| 554 |
-
# Check if not a generic email
|
| 555 |
-
if not self._is_generic_email(email_prefix):
|
| 556 |
-
return normalized
|
| 557 |
-
|
| 558 |
-
except EmailNotValidError:
|
| 559 |
-
continue
|
| 560 |
-
|
| 561 |
-
return None
|
| 562 |
-
|
| 563 |
def _is_generic_email(self, prefix: str) -> bool:
|
| 564 |
"""Check if email prefix is generic (info, contact, etc.)"""
|
| 565 |
|
|
|
|
| 72 |
r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\s*\n\s*([A-Z][^,\n]+)',
|
| 73 |
]
|
| 74 |
|
| 75 |
+
# We do NOT estimate emails - only use verified emails found on web
|
| 76 |
+
# This list is kept for reference but not used for generation
|
| 77 |
+
self._common_email_patterns = [
|
| 78 |
'{first}.{last}', # [email protected]
|
| 79 |
'{first}{last}', # [email protected]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
]
|
| 81 |
|
| 82 |
async def find_real_contacts(
|
|
|
|
| 87 |
max_contacts: int = 3
|
| 88 |
) -> List[Contact]:
|
| 89 |
"""
|
| 90 |
+
Find real decision-makers with VERIFIED email addresses.
|
| 91 |
+
|
| 92 |
+
IMPORTANT: Only returns contacts where we found ACTUAL email addresses
|
| 93 |
+
from web sources. Does NOT generate or estimate emails.
|
| 94 |
|
| 95 |
Returns:
|
| 96 |
+
List of Contact objects with verified names and emails only
|
| 97 |
"""
|
| 98 |
+
logger.info(f"EnhancedFinder: Finding VERIFIED contacts at '{company_name}'")
|
| 99 |
print(f"\n[CONTACT FINDER] Starting search for {company_name}")
|
| 100 |
print(f"[CONTACT FINDER] Domain: {domain}")
|
| 101 |
print(f"[CONTACT FINDER] Target titles: {target_titles}")
|
| 102 |
print(f"[CONTACT FINDER] Max contacts: {max_contacts}")
|
| 103 |
+
print(f"[CONTACT FINDER] NOTE: Only returning contacts with VERIFIED emails found on web")
|
| 104 |
|
| 105 |
contacts = []
|
| 106 |
seen_emails: Set[str] = set()
|
| 107 |
|
| 108 |
+
# Strategy 1: Search for actual email addresses directly
|
| 109 |
+
print(f"[CONTACT FINDER] Strategy 1: Searching for actual email addresses...")
|
| 110 |
+
email_contacts = await self._search_for_emails(
|
| 111 |
+
company_name,
|
| 112 |
+
domain,
|
| 113 |
+
target_titles,
|
| 114 |
+
seen_emails,
|
| 115 |
+
max_contacts
|
| 116 |
+
)
|
| 117 |
+
contacts.extend(email_contacts)
|
| 118 |
+
print(f"[CONTACT FINDER] Found {len(email_contacts)} contacts with verified emails")
|
| 119 |
+
|
| 120 |
+
# Strategy 2: Scrape company team/contact pages for emails
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
if len(contacts) < max_contacts:
|
| 122 |
+
print(f"[CONTACT FINDER] Strategy 2: Scraping company pages for contact emails...")
|
| 123 |
+
scraped_contacts = await self._scrape_for_verified_emails(
|
| 124 |
company_name,
|
| 125 |
domain,
|
| 126 |
target_titles,
|
| 127 |
seen_emails,
|
| 128 |
max_contacts - len(contacts)
|
| 129 |
)
|
| 130 |
+
contacts.extend(scraped_contacts)
|
| 131 |
+
print(f"[CONTACT FINDER] Found {len(scraped_contacts)} contacts from page scraping")
|
| 132 |
|
| 133 |
+
# Strategy 3: Search LinkedIn + news for names WITH email mentions
|
| 134 |
if len(contacts) < max_contacts:
|
| 135 |
+
print(f"[CONTACT FINDER] Strategy 3: Searching for executives with public emails...")
|
| 136 |
+
linkedin_contacts = await self._find_contacts_with_emails(
|
| 137 |
company_name,
|
| 138 |
domain,
|
| 139 |
target_titles,
|
| 140 |
seen_emails,
|
| 141 |
max_contacts - len(contacts)
|
| 142 |
)
|
| 143 |
+
contacts.extend(linkedin_contacts)
|
| 144 |
+
print(f"[CONTACT FINDER] Found {len(linkedin_contacts)} contacts from web search")
|
| 145 |
|
| 146 |
+
logger.info(f"EnhancedFinder: Total {len(contacts)} VERIFIED contacts found for '{company_name}'")
|
| 147 |
print(f"[CONTACT FINDER] === FINAL RESULT ===")
|
| 148 |
+
print(f"[CONTACT FINDER] Total VERIFIED contacts found: {len(contacts)}")
|
| 149 |
for i, contact in enumerate(contacts[:max_contacts], 1):
|
| 150 |
+
print(f"[CONTACT FINDER] {i}. {contact.name} ({contact.title}) - {contact.email} [VERIFIED]")
|
| 151 |
+
if len(contacts) == 0:
|
| 152 |
+
print(f"[CONTACT FINDER] No contacts with verified emails found.")
|
| 153 |
+
print(f"[CONTACT FINDER] This is normal - many companies don't publish executive emails.")
|
| 154 |
print(f"[CONTACT FINDER] ====================\n")
|
| 155 |
return contacts[:max_contacts]
|
| 156 |
|
| 157 |
+
async def _search_for_emails(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
self,
|
| 159 |
company_name: str,
|
| 160 |
domain: str,
|
|
|
|
| 162 |
seen_emails: Set[str],
|
| 163 |
max_needed: int
|
| 164 |
) -> List[Contact]:
|
| 165 |
+
"""Search specifically for email addresses associated with company executives"""
|
|
|
|
| 166 |
contacts = []
|
| 167 |
|
| 168 |
+
# Direct email search queries
|
| 169 |
+
email_queries = [
|
| 170 |
+
f'"{domain}" email CEO OR founder OR director',
|
| 171 |
+
f'"{company_name}" contact email executive',
|
| 172 |
+
f'site:{domain} email contact',
|
| 173 |
+
f'"{company_name}" "@{domain}" CEO OR VP OR director',
|
| 174 |
]
|
| 175 |
|
| 176 |
+
for query in email_queries:
|
|
|
|
|
|
|
| 177 |
try:
|
| 178 |
+
print(f"[CONTACT FINDER] Query: '{query}'")
|
| 179 |
+
results = await self.search.search(query, max_results=10)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
+
for result in results:
|
| 182 |
+
text = result.get('title', '') + ' ' + result.get('body', '')
|
| 183 |
+
|
| 184 |
+
# Extract emails from text
|
| 185 |
+
found_emails = self._extract_emails_from_text(text, domain)
|
| 186 |
+
|
| 187 |
+
for email in found_emails:
|
| 188 |
+
if email.lower() not in seen_emails and not self._is_generic_email(email.split('@')[0]):
|
| 189 |
+
# Try to find associated name and title
|
| 190 |
+
name, title = self._extract_name_near_email(text, email, target_titles)
|
| 191 |
+
|
| 192 |
+
if name:
|
| 193 |
+
contacts.append(Contact(
|
| 194 |
+
id=str(uuid.uuid4()),
|
| 195 |
+
name=name,
|
| 196 |
+
email=email,
|
| 197 |
+
title=title or "Executive",
|
| 198 |
+
prospect_id=""
|
| 199 |
+
))
|
| 200 |
+
seen_emails.add(email.lower())
|
| 201 |
+
print(f"[CONTACT FINDER] ✓ FOUND: {name} - {email}")
|
| 202 |
+
|
| 203 |
+
if len(contacts) >= max_needed:
|
| 204 |
+
return contacts
|
| 205 |
|
| 206 |
except Exception as e:
|
| 207 |
+
logger.debug(f"Email search error: {str(e)}")
|
| 208 |
continue
|
| 209 |
|
| 210 |
return contacts
|
| 211 |
|
| 212 |
+
async def _scrape_for_verified_emails(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
self,
|
| 214 |
company_name: str,
|
| 215 |
domain: str,
|
|
|
|
| 217 |
seen_emails: Set[str],
|
| 218 |
max_needed: int
|
| 219 |
) -> List[Contact]:
|
| 220 |
+
"""Scrape company pages to find actual email addresses"""
|
|
|
|
| 221 |
contacts = []
|
| 222 |
|
| 223 |
+
# Pages likely to have contact info
|
| 224 |
+
pages_to_check = [
|
| 225 |
+
f"https://{domain}/contact",
|
| 226 |
+
f"https://{domain}/contact-us",
|
| 227 |
+
f"https://{domain}/about",
|
| 228 |
+
f"https://{domain}/about-us",
|
| 229 |
+
f"https://{domain}/team",
|
| 230 |
+
f"https://{domain}/leadership",
|
| 231 |
+
f"https://{domain}/our-team",
|
| 232 |
+
f"https://www.{domain}/contact",
|
| 233 |
+
f"https://www.{domain}/about",
|
| 234 |
+
f"https://www.{domain}/team",
|
| 235 |
]
|
| 236 |
|
| 237 |
+
for url in pages_to_check:
|
| 238 |
try:
|
| 239 |
+
page_content = await self.scraper.scrape_page(url)
|
| 240 |
+
if not page_content:
|
| 241 |
+
continue
|
| 242 |
|
| 243 |
+
text = page_content.get('text', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
+
# Find all emails on page
|
| 246 |
+
found_emails = self._extract_emails_from_text(text, domain)
|
|
|
|
| 247 |
|
| 248 |
+
for email in found_emails:
|
| 249 |
+
if email.lower() not in seen_emails and not self._is_generic_email(email.split('@')[0]):
|
| 250 |
+
# Try to find associated name
|
| 251 |
+
name, title = self._extract_name_near_email(text, email, target_titles)
|
| 252 |
|
| 253 |
+
if name:
|
| 254 |
contacts.append(Contact(
|
| 255 |
id=str(uuid.uuid4()),
|
| 256 |
name=name,
|
| 257 |
email=email,
|
| 258 |
+
title=title or "Contact",
|
| 259 |
prospect_id=""
|
| 260 |
))
|
| 261 |
seen_emails.add(email.lower())
|
| 262 |
+
print(f"[CONTACT FINDER] ✓ SCRAPED: {name} - {email} from {url}")
|
| 263 |
|
| 264 |
if len(contacts) >= max_needed:
|
| 265 |
return contacts
|
| 266 |
|
| 267 |
except Exception as e:
|
| 268 |
+
logger.debug(f"Scrape error for {url}: {str(e)}")
|
| 269 |
continue
|
| 270 |
|
| 271 |
return contacts
|
| 272 |
|
| 273 |
+
async def _find_contacts_with_emails(
    self,
    company_name: str,
    domain: str,
    target_titles: List[str],
    seen_emails: Set[str],
    max_needed: int
) -> List[Contact]:
    """Run per-title web searches and keep only hits that expose an email.

    For every entry in ``target_titles`` three queries are sent through
    ``self.search.search``. The title and body of each result are joined
    and scanned with ``_extract_emails_from_text``. An address becomes a
    contact only when it is not already in ``seen_emails``, its local part
    is not flagged by ``_is_generic_email``, and
    ``_extract_name_from_text`` finds a name in the same result text.

    Args:
        company_name: Company name embedded in the search queries and
            passed to the name extractor.
        domain: Company domain embedded in the queries and passed to the
            email extractor.
        target_titles: Job titles to search for; the title being searched
            is stored on each contact created for it.
        seen_emails: Lower-cased addresses already used. This set is
            updated in place with every address accepted here.
        max_needed: Stop and return as soon as this many contacts exist.

    Returns:
        The contacts created by this call (possibly empty).
    """
    collected = []

    for title in target_titles:
        # Every query asks for the person together with an email mention.
        search_queries = (
            f'"{company_name}" {title} email "@{domain}"',
            f'"{company_name}" {title} contact email',
            f'site:linkedin.com "{company_name}" {title} email',
        )

        for query in search_queries:
            try:
                hits = await self.search.search(query, max_results=5)

                for hit in hits:
                    snippet = hit.get('title', '') + ' ' + hit.get('body', '')

                    # No address in the snippet means nothing to record.
                    for email in self._extract_emails_from_text(snippet, domain):
                        if email.lower() in seen_emails:
                            continue
                        if self._is_generic_email(email.split('@')[0]):
                            continue

                        name = self._extract_name_from_text(snippet, company_name)
                        if not name:
                            continue

                        collected.append(Contact(
                            id=str(uuid.uuid4()),
                            name=name,
                            email=email,
                            title=title,
                            prospect_id=""
                        ))
                        seen_emails.add(email.lower())
                        print(f"[CONTACT FINDER] ✓ FOUND: {name} ({title}) - {email}")

                        if len(collected) >= max_needed:
                            return collected

            except Exception as e:
                # Best effort: a failing query is logged and skipped.
                logger.debug(f"Search error: {str(e)}")
                continue

    return collected
|
| 326 |
|
| 327 |
+
def _extract_emails_from_text(self, text: str, domain: str) -> List[str]:
|
| 328 |
+
"""Extract email addresses from text, prioritizing company domain"""
|
| 329 |
+
if not text:
|
| 330 |
+
return []
|
| 331 |
+
|
| 332 |
+
# Find all emails
|
| 333 |
+
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
|
| 334 |
+
all_emails = re.findall(email_pattern, text, re.IGNORECASE)
|
| 335 |
+
|
| 336 |
+
# Prioritize company domain emails
|
| 337 |
+
company_emails = [e for e in all_emails if domain.lower() in e.lower()]
|
| 338 |
+
|
| 339 |
+
# Filter out junk
|
| 340 |
+
filtered = []
|
| 341 |
+
ignore_patterns = ['example.com', 'domain.com', 'email.com', 'test.com', 'sample.com',
|
| 342 |
+
'noreply', 'no-reply', 'donotreply', 'unsubscribe', 'privacy',
|
| 343 |
+
'support@', 'info@', 'contact@', 'hello@', 'sales@', 'help@']
|
| 344 |
+
|
| 345 |
+
for email in company_emails:
|
| 346 |
+
if not any(pattern in email.lower() for pattern in ignore_patterns):
|
| 347 |
+
filtered.append(email.lower())
|
| 348 |
|
| 349 |
+
return list(set(filtered))
|
| 350 |
+
|
| 351 |
+
def _extract_name_near_email(self, text: str, email: str, target_titles: List[str]) -> tuple:
|
| 352 |
+
"""Extract name that appears near an email address"""
|
| 353 |
+
if not text or not email:
|
| 354 |
+
return None, None
|
| 355 |
+
|
| 356 |
+
# Find context around email (200 chars before and after)
|
| 357 |
+
email_pos = text.lower().find(email.lower())
|
| 358 |
+
if email_pos == -1:
|
| 359 |
+
return None, None
|
| 360 |
+
|
| 361 |
+
start = max(0, email_pos - 200)
|
| 362 |
+
end = min(len(text), email_pos + len(email) + 200)
|
| 363 |
+
context = text[start:end]
|
| 364 |
+
|
| 365 |
+
# Look for name patterns in context
|
| 366 |
+
name = None
|
| 367 |
+
title = None
|
| 368 |
+
|
| 369 |
+
# Try to find name-title patterns
|
| 370 |
for pattern in self.name_patterns:
|
| 371 |
+
match = re.search(pattern, context)
|
| 372 |
+
if match:
|
| 373 |
+
potential_name = match.group(1).strip()
|
| 374 |
+
if self._is_valid_name(potential_name):
|
| 375 |
+
name = potential_name
|
| 376 |
+
if len(match.groups()) > 1:
|
| 377 |
+
title = match.group(2).strip()
|
| 378 |
+
break
|
| 379 |
+
|
| 380 |
+
# If no name found, try simpler extraction
|
| 381 |
+
if not name:
|
| 382 |
+
# Look for capitalized name-like words near email
|
| 383 |
+
words = context.split()
|
| 384 |
+
for i, word in enumerate(words):
|
| 385 |
+
if word and word[0].isupper() and len(word) > 2:
|
| 386 |
+
if i + 1 < len(words) and words[i+1] and words[i+1][0].isupper():
|
| 387 |
+
potential_name = f"{word} {words[i+1]}"
|
| 388 |
+
if self._is_valid_name(potential_name):
|
| 389 |
+
name = potential_name
|
| 390 |
+
break
|
| 391 |
+
|
| 392 |
+
return name, title
|
| 393 |
|
| 394 |
+
def _extract_name_from_text(self, text: str, company_name: str) -> Optional[str]:
|
| 395 |
+
"""Extract a person's name from text"""
|
| 396 |
+
for pattern in self.name_patterns:
|
| 397 |
+
match = re.search(pattern, text)
|
| 398 |
+
if match:
|
| 399 |
+
name = match.group(1).strip()
|
| 400 |
+
if self._is_valid_name(name) and company_name.lower() not in name.lower():
|
| 401 |
+
return name
|
| 402 |
return None
|
| 403 |
|
| 404 |
def _is_valid_name(self, name: str) -> bool:
|
|
|
|
| 434 |
|
| 435 |
return True
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
def _is_generic_email(self, prefix: str) -> bool:
|
| 438 |
"""Check if email prefix is generic (info, contact, etc.)"""
|
| 439 |
|