Skip to content

Instantly share code, notes, and snippets.

@knowitnothing
Last active December 17, 2015 04:48
Show Gist options
  • Save knowitnothing/5552760 to your computer and use it in GitHub Desktop.
Save knowitnothing/5552760 to your computer and use it in GitHub Desktop.
import re
import sys
import bs4
import treq
from datetime import datetime
from twisted.internet import reactor
# Whether to use https when building URLs — TODO confirm: only read by
# extract_numbers' `protocol` variable, which is never used afterwards.
SECURE = True
# Seed page of the forum thread being scraped for guesses.
BASE_URL = 'https://bitcointalk.org/index.php?topic=198694.0'
# One-element list used as a mutable in-flight request counter shared by
# the callbacks (get_pages / get_content decrement, extract_numbers increments).
pending_calls = [0]
# Accumulates (number, post datetime, username, page url) tuples.
found = []
def get_pages(response, url):
    """Callback for the first topic page.

    Scrapes the page itself, then schedules a fetch of every other
    pagination link found on it, routing those to get_content.
    """
    print>>sys.stderr, "Got response"
    pending_calls[0] -= 1
    parsed = bs4.BeautifulSoup(response)
    # Pagination anchors; the set comprehension drops duplicate hrefs
    # (each page number usually appears in top and bottom nav bars).
    other_pages = {link['href']
                   for link in parsed.findAll('a', {'class': 'navPages'})}
    extract_content(parsed, url)
    for href in other_pages:
        extract_numbers(href, get_content)
def get_content(response, url):
    """Callback for follow-up topic pages.

    Scrapes the page and shuts the reactor down once no requests
    remain outstanding.
    """
    pending_calls[0] -= 1
    print>>sys.stderr, pending_calls
    extract_content(bs4.BeautifulSoup(response), url)
    # Last outstanding request answered -> stop the event loop so
    # the __main__ block can write the results out.
    if pending_calls[0] == 0:
        reactor.stop()
def parse_date(d):
    """Parse a forum post timestamp into a naive datetime.

    Accepts either the relative form ('Today at HH:MM:SS AM/PM', resolved
    against the current UTC date) or the absolute form
    ('Month DD, YYYY, HH:MM:SS AM/PM').

    Fix: the original never normalized the 12-hour clock's midnight —
    '12:xx:xx AM' came back as hour 12 instead of hour 0, in both branches.
    """
    pm = (d[-2:] == 'PM')
    if d.startswith('Today'):
        # 'Today at HH:MM:SS AM' -> third whitespace token is the time.
        hour, minute, sec = map(int, d.split()[2].split(':'))
        # 12-hour -> 24-hour: 1-11 PM gains 12; 12 AM is midnight (0).
        if pm and hour != 12:
            hour += 12
        elif not pm and hour == 12:
            hour = 0
        date = datetime.utcnow().replace(hour=hour, minute=minute, second=sec,
                                         microsecond=0)
    else:
        # Strip the trailing ' AM'/' PM' before strptime.
        date = datetime.strptime(d[:-3], '%B %d, %Y, %H:%M:%S')
        if pm and date.hour != 12:
            date = date.replace(hour=date.hour + 12)
        elif not pm and date.hour == 12:
            # Midnight: '12:xx:xx AM' is hour 0, not 12.
            date = date.replace(hour=0)
    return date
def extract_content(soup, url):
    """Scan one parsed topic page for numeric guesses and record them.

    Appends (number, post datetime, username, page url) tuples to the
    module-level `found` list.  The body is a stack of hand-tuned
    heuristics to skip quotes, code blocks, links, BTC addresses and
    other false positives; the first accepted fragment per post wins.
    """
    # Get posts
    users = [td.find('a').text for td in soup.findAll(
        'td', {'class':'poster_info'})]
    when = [h.find('div', {'class': 'smalltext'}).text for h in soup.findAll(
        'td', {'class': 'td_headerandpost'})]
    msgs = [post for post in soup.findAll('div', {'class': 'post'})]
    # Posters whose messages are never guesses — presumably the thread
    # starter; verify against the forum thread.
    ignore = set(['anyroll'])
    pat = re.compile('\d{1,4}')  # candidate guess: a run of 1-4 digits
    # Find out the guesses.
    for u, w, post in zip(users, when, msgs):
        if u in ignore:
            continue
        failed = []
        for elem in list(post.children):
            if isinstance(elem, bs4.element.Tag):
                # False positives.
                # Quoted text / code blocks, and tags carrying links or
                # images, are never the poster's own guess.
                if 'class' in elem.attrs and ('quote' in elem['class'][0] or
                        'code' in elem['class'][0]):
                    continue
                elif 'src' in elem.attrs or 'href' in elem.attrs:
                    continue
                elem = elem.text
            else:
                # Only plain text nodes are worth scanning.
                if not isinstance(elem, (unicode, bs4.element.NavigableString)):
                    continue
            s = elem.strip()
            # Discarding false positives.
            # XXX Complement with your findings.
            # Long prose (>18 spaces), PGP headers and forum links are noise.
            if s == 'Hash: SHA1' or s.count(' ') > 18 or 'bitcointalk' in s:
                continue
            elif s.startswith('Address:'):
                continue
            if 0 < len(s) < 5 and s.isdigit():
                # Whole fragment is a bare 1-4 digit number: accept it and
                # stop scanning this post.
                found.append((int(s), parse_date(w), u, url))
                break
            else:
                res = pat.search(s)
                if res is not None and len(s) != 34:  # BTC address
                    # More false positives XXX
                    # Inspect the characters flanking the digit run to weed
                    # out URLs, quoted figures, fractions, etc.
                    before = s[res.start()-1] if res.start() else None
                    after = s[res.end()] if res.end() < len(s) else None
                    if before == 'd' and after == '?':
                        continue
                    elif before == '"' and after == '"':
                        continue
                    elif before == ' ' and after == '/':
                        continue
                    elif 0 < len(res.group()) < 5:
                        found.append((int(res.group()), parse_date(w), u, url))
                        break
                failed.append(s)
        else:
            # Loop ended without a break: no guess found in this post.
            #print>>sys.stderr, 'Failed at', u, failed, url
            pass
def extract_numbers(topic_url=BASE_URL, cb=get_pages):
    """Asynchronously fetch topic_url and pass its decoded text to cb.

    cb is invoked as cb(text, topic_url) once the body is available.
    Increments pending_calls[0] so the callbacks can detect when the
    whole crawl has drained.

    Fix: dropped the dead local `protocol` — it was computed from SECURE
    but never used (treq is handed the full URL directly).
    """
    d = treq.get(topic_url)
    # First deferred fires with the response object; the chained deferred
    # decodes the body to text before handing it to the callback.
    d.addCallback(lambda response:
                  treq.text_content(response).addCallback(cb, topic_url))
    pending_calls[0] += 1
    print>>sys.stderr, "Sent request to %s" % topic_url
if __name__ == "__main__":
    # Destination CSV path is the first command-line argument.
    out = open(sys.argv[1], 'w')
    extract_numbers()
    # Blocks until get_content() calls reactor.stop(); by then `found`
    # holds every guess scraped from the thread.
    reactor.run()
    last_num = 0
    available = []  # numbers in range nobody has guessed yet
    out.write('num,when,user,where\n')
    for num, when, user, where in sorted(found):
        line = u'%d,%s,%s,%s\n' % (num, when, user, where)
        out.write(line.encode('utf8'))
        if num > 1000:
            # Guesses above 1000 are out of range; `found` is sorted by
            # number, so everything after this is too.
            break
        if num - last_num > 1:
            # Gap between consecutive guessed numbers -> still available.
            for i in range(last_num + 1, num):
                available.append(i)
        last_num = num
    out.close()
    print available
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment