Skip to content

Instantly share code, notes, and snippets.

@dchaplinsky
Created December 30, 2014 10:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dchaplinsky/66d7d48b9e3cc3a2abf3 to your computer and use it in GitHub Desktop.
Save dchaplinsky/66d7d48b9e3cc3a2abf3 to your computer and use it in GitHub Desktop.
import re
import os.path
import requests
from random import sample, random
from collections import Counter
from pymongo import MongoClient
from glob2 import glob
# MongoDB handles shared by the script: a client created with the driver's
# default connection settings, the "decl" database, and its "tasks"
# collection (one document is inserted there per user by the main script).
client = MongoClient()
db = client.decl
tasks = db.tasks
# Base URL substituted for the local input directory when get_files() turns
# file paths into download links.
URL = "http://unshred.it/static"
# Number of files sampled for each user, and also the allocation count at
# which a file is withdrawn from the pool.
REDUNDANCY = 3
def get_files(in_dir):
    """Collect download links for the documents found under *in_dir*.

    Every path matched by the ``"<in_dir>/**"`` glob pattern whose extension
    (compared case-insensitively) is ``.pdf``, ``.doc`` or ``.zip`` is kept,
    with each occurrence of *in_dir* in the path replaced by ``URL``.

    Returns a list of the rewritten paths, in the order the glob yields them.
    """
    wanted_extensions = (".pdf", ".doc", ".zip")
    return [
        path.replace(in_dir, URL)
        for path in glob("%s/**" % in_dir)
        if os.path.splitext(path)[1].lower() in wanted_extensions
    ]
# Pattern used to validate e-mail addresses (matched case-insensitively).
# Structure: a local part that is either a dot-atom or a quoted string,
# then "@", then one or more dot-separated domain labels and a final label.
# NOTE(review): the trailing [A-Z]{2,6} only accepts a 2-6 letter final
# label, so addresses under longer TLDs are reported as invalid -- confirm
# that this is intended.
email_re = re.compile(
    # Local part, form 1: dot-atom.
    r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*"
    # Local part, form 2: quoted string made of plain characters ...
    r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]'
    # ... or backslash-escaped characters.  The upper bound has to be spelled
    # \011: the earlier "\001-011" was parsed as the range \001-"0" followed
    # by a literal "1", which let an escaped LF or CR through.
    r'|\\[\001-\011\013\014\016-\177])*"'
    # Domain: labels of at most 63 characters, optional trailing dot.
    r")@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?$",
    re.IGNORECASE)
def unique(seq):
    """Lazily yield the items of *seq*, dropping repeated values.

    The first occurrence of every value is kept, so the relative order of
    the original sequence is preserved.  Items must be hashable.
    """
    already_yielded = set()
    for item in seq:
        if item in already_yielded:
            continue
        already_yielded.add(item)
        yield item
def get_users(lst):
    """Read the user list file *lst* and return the valid e-mail addresses.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are skipped, and repeated entries are dropped while
    keeping first-seen order.  Entries that do not match ``email_re`` are
    printed under a "List of invalid emails:" header.

    Returns a list (not a lazy ``filter`` object), so callers can index,
    slice and iterate the result more than once on Python 3 as well.
    """
    with open(lst, "r") as fp:
        stripped = (line.strip() for line in fp)
        # list() consumes the generators while the file is still open.
        candidates = list(unique(
            entry for entry in stripped
            if entry and not entry.startswith("#")))

    # Single pass over the entries: run the regex once per address.
    valid = []
    invalid = []
    for entry in candidates:
        if email_re.match(entry) is None:
            invalid.append(entry)
        else:
            valid.append(entry)

    print("List of invalid emails:")
    print("\n".join(invalid))
    return valid
if __name__ == '__main__':
    # Script entry point: hand out files from the pool to users, one task
    # document per user, until either the users or the files run out.

    # Chance that a user receives one file on top of REDUNDANCY.
    EXTRA_FILE_CHANCE = 0.14

    # How many times each file still in the pool has been handed out.
    allocated_files = Counter()

    # Refuse to run against a collection that already holds tasks.
    # NOTE(review): Collection.count()/insert() are the older PyMongo API;
    # confirm the installed driver still provides them before running.
    existing_tasks = tasks.count()
    if existing_tasks != 0:
        raise SystemExit(
            "Tasks collection is not empty, it has %s tasks" % existing_tasks)

    taskset = get_files("out2/pdfs")

    # Disabled check that every link resolves on the server.  Kept for
    # reference; the nesting below is reconstructed -- verify before enabling.
    # for i, t in enumerate(taskset):
    #     resp = requests.head(t)
    #     if resp.status_code == 404:
    #         print("%s is not found on server" % t)
    #     if i and (i % 50 == 0):
    #         print("%s links checked" % i)
    # exit("Oh shit")

    # Materialise the users so that the leftover slice below works even when
    # get_users() returns a lazy iterable (e.g. a Python 3 ``filter`` object).
    userset = list(get_users("users_full.txt"))

    for i, user in enumerate(userset):
        if not taskset:
            # Pool exhausted: show the users who did not get a task.
            print(userset[i:])
            break

        # REDUNDANCY files per user, occasionally one more, but never more
        # than what is left in the pool.
        batch_size = REDUNDANCY + (1 if random() < EXTRA_FILE_CHANCE else 0)
        files_to_send = sample(taskset, min(batch_size, len(taskset)))

        tasks.insert({
            "email": user,
            "sent": False,
            "files": files_to_send
        })
        allocated_files.update(files_to_send)

        # Withdraw files that reached REDUNDANCY allocations.  Only the files
        # just handed out changed their count, so checking those is enough;
        # everything else was already below the threshold.
        for f in files_to_send:
            if allocated_files[f] >= REDUNDANCY and f in taskset:
                taskset.remove(f)
                del allocated_files[f]

    # Summary: number of files still tracked in the counter with exactly
    # 1, 2 and 3 allocations (withdrawn files are no longer tracked).
    allocation_histogram = Counter(allocated_files.values())
    for times_allocated in (1, 2, 3):
        print(allocation_histogram[times_allocated])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment