@bburky
Created May 1, 2015 00:07
Find reddit whitehats

#!/usr/bin/env python3

import asyncio
import aiohttp
import lxml.html

CDX_API_URL = 'http://web.archive.org/cdx/search/cdx'
# This uses an (undocumented?) flag 'id_' to force getting unmodified original files
DOWNLOAD_URL_PATTERN = 'http://web.archive.org/web/{timestamp}id_/{original}'
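# For illustration (the timestamp below is made up): a capture of
# https://www.reddit.com/awards with timestamp 20150430000000 would be fetched from
#   http://web.archive.org/web/20150430000000id_/https://www.reddit.com/awards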


@asyncio.coroutine
def get_trophy_user(timestamp, original):
    # Download one archived copy of the awards page and return the "White Hat"
    # trophy winner's username, or None if this snapshot has no such trophy.
    with (yield from sem):
        response = yield from aiohttp.request('GET', DOWNLOAD_URL_PATTERN.format(timestamp=timestamp, original=original))
        body = yield from response.text()
    doc = lxml.html.document_fromstring(body)
    # The XPath expects markup roughly like:
    #   <div> <a><span class="award-name">White Hat</span></a> ...
    #         <div class="winner-info"><span class="winner-name"><a>username</a></span></div> </div>
    trophy, = doc.xpath('//div[a[span[@class="award-name" and text() = "White Hat"]]]') or [None]
    if trophy is not None:
        return trophy.xpath('div[@class="winner-info"]/span[@class="winner-name"]/a/text()')[0]
    return None


@asyncio.coroutine
def get_archived(url):
    # Query the Wayback Machine's CDX API for every archived capture of `url`.
    response = yield from aiohttp.request('GET', CDX_API_URL, params={'url': url})
    archives = []
    for line in (yield from response.text()).splitlines():
        # Each CDX result line is space-separated:
        # urlkey timestamp original mimetype statuscode digest length
        urlkey, timestamp, original, mimetype, statuscode, digest, length = line.split(' ')
        archives.append((timestamp, original))
    return archives


@asyncio.coroutine
def whitehats():
    # Fetch the list of archived /awards snapshots, scrape them concurrently,
    # and collect the distinct usernames that held the "White Hat" trophy.
    archives = yield from get_archived('https://www.reddit.com/awards')
    users_list = yield from asyncio.gather(*(get_trophy_user(timestamp, original) for (timestamp, original) in archives))
    users = set(users_list)
    users.discard(None)
    return users


loop = asyncio.get_event_loop()
sem = asyncio.Semaphore(5)  # limit concurrent connections to archive.org

for user in loop.run_until_complete(whitehats()):
    print(user)
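
The script uses the pre-Python-3.5 coroutine style (@asyncio.coroutine / yield from) that aiohttp supported in 2015. Below is a minimal sketch of the same download pattern in current async/await syntax, assuming Python 3.7+ and a recent aiohttp; the function names and the shared ClientSession are choices of this sketch, not part of the original gist.

import asyncio
import aiohttp

async def fetch_snapshot(session, sem, url):
    # Fetch one URL, holding the semaphore while the connection is in use.
    async with sem:
        async with session.get(url) as response:
            return await response.text()

async def fetch_all(urls, limit=5):
    sem = asyncio.Semaphore(limit)  # limit concurrent connections
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch_snapshot(session, sem, u) for u in urls))

# Example (hypothetical snapshot URL):
# pages = asyncio.run(fetch_all(['http://web.archive.org/web/20150430000000id_/https://www.reddit.com/awards']))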