Skip to content

Instantly share code, notes, and snippets.

@kakarukeys
Created September 9, 2021 01:56
Show Gist options
  • Save kakarukeys/51551cd1ad38bb77b0a849d929b7844c to your computer and use it in GitHub Desktop.
Save kakarukeys/51551cd1ad38bb77b0a849d929b7844c to your computer and use it in GitHub Desktop.
summary email harvesting
import re
from pymongo import MongoClient
from settings import DEST_DATABASE_URL
# Matches e-mail-like tokens: a local part made of word characters and . % + -,
# an "@", then two or more dot-separated domain labels. Labels may contain
# hyphens internally but must start and end with a letter or digit, so
# trailing sentence punctuation ("a@b.com.") is not swallowed and multi-label
# domains ("a@mail.example.co.uk") are captured in full.
# All groups are non-capturing so that findall() returns whole matches.
EMAIL_PATTERN = re.compile(
    r"[\w.%+-]+"
    r"@[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?"
    r"(?:\.[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?)+"
)

# How often (in records scanned) the running counts are printed.
PROGRESS_EVERY = 1000


def extract_emails(text):
    """Return every e-mail-like token found in *text*, in order of appearance.

    *text* may be any value; falsy values (``None``, ``""``) yield an empty
    list, anything else is coerced with ``str()`` before searching.
    """
    if not text:
        return []
    return EMAIL_PATTERN.findall(str(text))


def main():
    """Scan ``linkedin_data.contact_raw`` and count summaries containing e-mails.

    Reads the credentials from the ``CREDENTIALS`` file, substitutes them for
    the ``$(cat CREDENTIALS)`` placeholder in ``DEST_DATABASE_URL``, then walks
    the collection in reverse natural order. Each non-empty list of addresses
    is printed as it is found, the running counts are printed every
    ``PROGRESS_EVERY`` records, and the final counts are printed at the end.
    """
    with open("CREDENTIALS", encoding="utf-8") as f:
        credentials = f.read().strip()

    client = MongoClient(DEST_DATABASE_URL.replace("$(cat CREDENTIALS)", credentials))
    try:
        coll = client.linkedin_data.contact_raw
        counts = {"total": 0, "with_email": 0}

        # Only the "summary" field is projected; it is the only one inspected.
        for rec in coll.find({}, {"summary": 1}).sort("$natural", -1):
            counts["total"] += 1

            emails = extract_emails(rec.get("summary"))
            if emails:
                print(emails)
                counts["with_email"] += 1

            if counts["total"] % PROGRESS_EVERY == 0:
                print(counts)

        # Always report the final tally; without this the last partial batch
        # (fewer than PROGRESS_EVERY records) would never be shown.
        print(counts)
    finally:
        # Release the connection pool even if the scan is interrupted.
        client.close()


if __name__ == '__main__':
    main()
# Counts printed after 6,400,000 records: {'total': 6400000, 'with_email': 43571}
# Extrapolation: 43571 / 6400000 = .006807969; over 11,000,000 records that is ~75k records with emails
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment