Created
May 1, 2015 00:07
-
-
Save bburky/9986caf43bf43835ae14 to your computer and use it in GitHub Desktop.
Find reddit whitehats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import asyncio | |
import aiohttp | |
import lxml.html | |
# Wayback Machine CDX index endpoint; queried with a ``url`` parameter to
# list the archived captures of a page.
CDX_API_URL = 'http://web.archive.org/cdx/search/cdx'

# Template for fetching a single capture. The (apparently undocumented)
# ``id_`` suffix after the timestamp forces the unmodified original file
# to be returned.
DOWNLOAD_URL_PATTERN = 'http://web.archive.org/web/{timestamp}id_/{original}'
@asyncio.coroutine
def get_trophy_user(timestamp, original):
    """Return the "White Hat" winner named in one archived snapshot.

    Downloads the capture identified by ``timestamp`` and ``original``
    through ``DOWNLOAD_URL_PATTERN``, parses it as HTML and looks for the
    award block whose name is exactly "White Hat".

    Args:
        timestamp: Capture timestamp, substituted into the download URL.
        original: Originally archived URL, substituted into the download URL.

    Returns:
        The text of the first winner link found under a "White Hat" award,
        or ``None`` when the snapshot has no body, no such award, or no
        winner link.
    """
    snapshot_url = DOWNLOAD_URL_PATTERN.format(timestamp=timestamp, original=original)

    # Hold the module-level semaphore for the whole transfer (request and
    # body read) so it really bounds the number of open connections.
    with (yield from sem):
        response = yield from aiohttp.request('GET', snapshot_url)
        body = yield from response.text()

    # document_fromstring() raises on an empty document; a blank capture
    # simply has no trophy, so it must not abort the whole crawl.
    if not body.strip():
        return None

    doc = lxml.html.document_fromstring(body)
    trophies = doc.xpath('//div[a[span[@class="award-name" and text() = "White Hat"]]]')

    # Tolerate zero, one or several matching award blocks, and award blocks
    # without a winner link, instead of failing on tuple unpacking or [0].
    for trophy in trophies:
        names = trophy.xpath('div[@class="winner-info"]/span[@class="winner-name"]/a/text()')
        if names:
            return names[0]
    return None
@asyncio.coroutine
def get_archived(url):
    """List the archived captures of ``url`` known to the CDX index.

    Sends a GET request to ``CDX_API_URL`` with ``url`` as the ``url``
    query parameter and reads the response as text, one capture per line.

    Returns:
        A list of ``(timestamp, original)`` tuples, in response order.

    Raises:
        ValueError: if a line does not split into exactly seven
            space-separated fields.
    """
    reply = yield from aiohttp.request('GET', CDX_API_URL, params={'url': url})
    listing = yield from reply.text()

    # Each row is: urlkey timestamp original mimetype statuscode digest length
    rows = (row.split(' ') for row in listing.splitlines())
    return [
        (stamp, source_url)
        for _key, stamp, source_url, _mime, _status, _digest, _size in rows
    ]
@asyncio.coroutine
def whitehats():
    """Collect every distinct "White Hat" winner across archived award pages.

    Looks up all captures of ``https://www.reddit.com/awards``, runs
    ``get_trophy_user`` on each of them concurrently, and merges the
    results.

    Returns:
        A set of winner names; snapshots that yielded ``None`` are dropped.
    """
    snapshots = yield from get_archived('https://www.reddit.com/awards')
    lookups = [get_trophy_user(stamp, source_url) for stamp, source_url in snapshots]
    winners = yield from asyncio.gather(*lookups)
    # A set removes duplicates; None marks snapshots without a winner.
    return {name for name in winners if name is not None}
# Script driver: run the crawl to completion and print one user per line.
loop = asyncio.get_event_loop()
# limit concurrent connections to archive.org
sem = asyncio.Semaphore(5)
found_users = loop.run_until_complete(whitehats())
for user in found_users:
    print(user)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment