Created
December 30, 2014 10:40
-
-
Save dchaplinsky/66d7d48b9e3cc3a2abf3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os.path
import re
from collections import Counter
from random import random, sample

import requests
from glob2 import glob
from pymongo import MongoClient
# MongoClient() is called without arguments, so PyMongo's default host/port
# are used. Tasks are stored in the "tasks" collection of the "decl" database.
client = MongoClient()
db = client.decl
tasks = db.tasks

# Public URL prefix that get_files() substitutes for the local input directory.
URL = "http://unshred.it/static"

# Base number of files handed to each user; the main script also retires a
# file from the pool once it has been handed out this many times.
REDUNDANCY = 3
def get_files(in_dir):
    """Return public URLs for the documents found under *in_dir*.

    Paths are collected with the file-level ``glob`` using the pattern
    ``<in_dir>/**`` and kept only if their extension (case-insensitive) is
    ``.pdf``, ``.doc`` or ``.zip``.

    :param in_dir: local directory to scan, without a trailing slash.
    :returns: list of strings in which the ``in_dir`` prefix of every
        matching path has been replaced by ``URL``.
    """
    wanted_extensions = (".pdf", ".doc", ".zip")
    return [
        # Replace only the first occurrence (the leading directory prefix);
        # an unbounded replace would also rewrite a nested directory that
        # happens to repeat the same name further down the path.
        path.replace(in_dir, URL, 1)
        for path in glob("%s/**" % in_dir)
        if os.path.splitext(path)[1].lower() in wanted_extensions
    ]
# Case-insensitive pattern used to validate e-mail addresses.
email_re = re.compile(
    # Local part, alternative 1: dot-separated atoms.
    r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*"
    # Local part, alternative 2: a double-quoted string ...
    r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]'
    # ... that may contain backslash-escaped characters. The range must be
    # written \001-\011: without the second backslash it reads as
    # "\001 through '0', then '1'", which wrongly admits escaped LF and CR.
    r'|\\[\001-\011\013\014\016-\177])*"'
    # Domain: dot-separated labels followed by a 2-6 letter TLD.
    r")@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?$",
    re.IGNORECASE)
def unique(seq):
    """Yield the items of *seq* without duplicates, preserving their order.

    Only the first occurrence of each item is yielded. Items must be
    hashable because they are tracked in a set.
    """
    already_seen = set()
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        yield item
def get_users(lst):
    """Read e-mail addresses from the text file *lst*, one per line.

    Lines are stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are ignored, and duplicates are dropped while keeping
    the first occurrence. Addresses rejected by ``email_re`` are printed
    under a "List of invalid emails:" heading.

    :param lst: path of the file to read.
    :returns: list of the valid addresses, in file order. A real list (not a
        lazy ``filter`` object) is returned so callers can slice and index
        it on Python 3 as well as Python 2.
    """
    with open(lst, "r") as fp:
        stripped = [line.strip() for line in fp]

    # Blank lines are not addresses; skipping them keeps them out of the
    # "invalid" report.
    candidates = [
        entry for entry in stripped
        if entry and not entry.startswith("#")
    ]

    # Classify every address with a single regex match.
    valid = []
    invalid = []
    for address in unique(candidates):
        if email_re.match(address) is None:
            invalid.append(address)
        else:
            valid.append(address)

    print("List of invalid emails:")
    print("\n".join(invalid))

    return valid
if __name__ == '__main__':
    # How many times each still-active file has been handed out so far.
    allocated_files = Counter()

    # NOTE(review): Collection.count() and Collection.insert() are the legacy
    # PyMongo API; recent PyMongo releases replace them with
    # count_documents()/insert_one(). Kept as-is because the installed
    # PyMongo version is not visible from this file -- confirm before
    # upgrading the driver.
    existing_tasks = tasks.count()
    if existing_tasks != 0:
        # Refuse to allocate twice; SystemExit with a message prints it to
        # stderr and exits with a non-zero status.
        raise SystemExit(
            "Tasks collection is not empty, it has %s tasks" % existing_tasks)

    taskset = get_files("out2/pdfs")

    # Optional pre-flight check, disabled by default: issue a HEAD request
    # for every URL and report the ones the server answers with 404.
    # for i, t in enumerate(taskset):
    #     resp = requests.head(t)
    #     if resp.status_code == 404:
    #         print("%s is not found on server" % t)
    #     if i and (i % 50 == 0):
    #         print("%s links checked" % i)
    # raise SystemExit("Link check finished")

    # Materialise the users so the list can be sliced below even when
    # get_users() hands back a lazy iterable.
    userset = list(get_users("users_full.txt"))

    for i, user in enumerate(userset):
        if not taskset:
            # The pool is exhausted: show the users that received nothing.
            print(userset[i:])
            break

        # Each user gets REDUNDANCY files, with a 14% chance of one extra,
        # capped by what is left in the pool.
        batch_size = REDUNDANCY + (1 if random() < 0.14 else 0)
        files_to_send = sample(taskset, min(batch_size, len(taskset)))

        tasks.insert({
            "email": user,
            "sent": False,
            "files": files_to_send
        })

        allocated_files.update(files_to_send)

        # Retire files that have now been handed out REDUNDANCY times. Only
        # the files in this batch changed their count, so only they can have
        # just reached the threshold.
        for f in files_to_send:
            if allocated_files[f] >= REDUNDANCY and f in taskset:
                taskset.remove(f)
                del allocated_files[f]

    # Summary: number of still-active files handed out once, twice and three
    # times respectively.
    handout_histogram = Counter(allocated_files.values())
    print(handout_histogram[1])
    print(handout_histogram[2])
    print(handout_histogram[3])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment