Skip to content

Instantly share code, notes, and snippets.

@knowitnothing
Last active December 17, 2015 04:48
Show Gist options
  • Save knowitnothing/5552760 to your computer and use it in GitHub Desktop.
Save knowitnothing/5552760 to your computer and use it in GitHub Desktop.
import re
import sys
import bs4
import treq
from datetime import datetime
from twisted.internet import reactor
# Whether to use https when building URLs — TODO confirm: only read by
# extract_numbers' `protocol` variable, which is never used afterwards.
SECURE = True
# Seed page of the forum thread being scraped for guesses.
BASE_URL = 'https://bitcointalk.org/index.php?topic=198694.0'
# One-element list used as a mutable in-flight request counter shared by
# the callbacks (get_pages / get_content decrement, extract_numbers increments).
pending_calls = [0]
# Accumulates (number, post datetime, username, page url) tuples.
found = []
def get_pages(response, url):
    """Callback for the first topic page.

    Scrapes the page itself, then schedules a fetch of every other
    pagination link found on it, routing those to get_content.
    """
    print>>sys.stderr, "Got response"
    pending_calls[0] -= 1
    parsed = bs4.BeautifulSoup(response)
    # Pagination anchors; the set comprehension drops duplicate hrefs
    # (each page number usually appears in top and bottom nav bars).
    other_pages = {link['href']
                   for link in parsed.findAll('a', {'class': 'navPages'})}
    extract_content(parsed, url)
    for href in other_pages:
        extract_numbers(href, get_content)
def get_content(response, url):
    """Callback for follow-up topic pages.

    Scrapes the page and shuts the reactor down once no requests
    remain outstanding.
    """
    pending_calls[0] -= 1
    print>>sys.stderr, pending_calls
    extract_content(bs4.BeautifulSoup(response), url)
    # Last outstanding request answered -> stop the event loop so
    # the __main__ block can write the results out.
    if pending_calls[0] == 0:
        reactor.stop()
def parse_date(d):
    """Parse a forum post timestamp into a naive datetime.

    Accepts either the relative form ('Today at HH:MM:SS AM/PM', resolved
    against the current UTC date) or the absolute form
    ('Month DD, YYYY, HH:MM:SS AM/PM').

    Fix: the original never normalized the 12-hour clock's midnight —
    '12:xx:xx AM' came back as hour 12 instead of hour 0, in both branches.
    """
    pm = (d[-2:] == 'PM')
    if d.startswith('Today'):
        # 'Today at HH:MM:SS AM' -> third whitespace token is the time.
        hour, minute, sec = map(int, d.split()[2].split(':'))
        # 12-hour -> 24-hour: 1-11 PM gains 12; 12 AM is midnight (0).
        if pm and hour != 12:
            hour += 12
        elif not pm and hour == 12:
            hour = 0
        date = datetime.utcnow().replace(hour=hour, minute=minute, second=sec,
                                         microsecond=0)
    else:
        # Strip the trailing ' AM'/' PM' before strptime.
        date = datetime.strptime(d[:-3], '%B %d, %Y, %H:%M:%S')
        if pm and date.hour != 12:
            date = date.replace(hour=date.hour + 12)
        elif not pm and date.hour == 12:
            # Midnight: '12:xx:xx AM' is hour 0, not 12.
            date = date.replace(hour=0)
    return date
def extract_content(soup, url):
    """Scan one parsed topic page for numeric guesses and record them.

    Appends (number, post datetime, username, page url) tuples to the
    module-level `found` list.  The body is a stack of hand-tuned
    heuristics to skip quotes, code blocks, links, BTC addresses and
    other false positives; the first accepted fragment per post wins.
    """
    # Get posts
    users = [td.find('a').text for td in soup.findAll(
        'td', {'class':'poster_info'})]
    when = [h.find('div', {'class': 'smalltext'}).text for h in soup.findAll(
        'td', {'class': 'td_headerandpost'})]
    msgs = [post for post in soup.findAll('div', {'class': 'post'})]
    # Posters whose messages are never guesses — presumably the thread
    # starter; verify against the forum thread.
    ignore = set(['anyroll'])
    pat = re.compile('\d{1,4}')  # candidate guess: a run of 1-4 digits
    # Find out the guesses.
    for u, w, post in zip(users, when, msgs):
        if u in ignore:
            continue
        failed = []
        for elem in list(post.children):
            if isinstance(elem, bs4.element.Tag):
                # False positives.
                # Quoted text / code blocks, and tags carrying links or
                # images, are never the poster's own guess.
                if 'class' in elem.attrs and ('quote' in elem['class'][0] or
                        'code' in elem['class'][0]):
                    continue
                elif 'src' in elem.attrs or 'href' in elem.attrs:
                    continue
                elem = elem.text
            else:
                # Only plain text nodes are worth scanning.
                if not isinstance(elem, (unicode, bs4.element.NavigableString)):
                    continue
            s = elem.strip()
            # Discarding false positives.
            # XXX Complement with your findings.
            # Long prose (>18 spaces), PGP headers and forum links are noise.
            if s == 'Hash: SHA1' or s.count(' ') > 18 or 'bitcointalk' in s:
                continue
            elif s.startswith('Address:'):
                continue
            if 0 < len(s) < 5 and s.isdigit():
                # Whole fragment is a bare 1-4 digit number: accept it and
                # stop scanning this post.
                found.append((int(s), parse_date(w), u, url))
                break
            else:
                res = pat.search(s)
                if res is not None and len(s) != 34:  # BTC address
                    # More false positives XXX
                    # Inspect the characters flanking the digit run to weed
                    # out URLs, quoted figures, fractions, etc.
                    before = s[res.start()-1] if res.start() else None
                    after = s[res.end()] if res.end() < len(s) else None
                    if before == 'd' and after == '?':
                        continue
                    elif before == '"' and after == '"':
                        continue
                    elif before == ' ' and after == '/':
                        continue
                    elif 0 < len(res.group()) < 5:
                        found.append((int(res.group()), parse_date(w), u, url))
                        break
                failed.append(s)
        else:
            # Loop ended without a break: no guess found in this post.
            #print>>sys.stderr, 'Failed at', u, failed, url
            pass
def extract_numbers(topic_url=BASE_URL, cb=get_pages):
    """Asynchronously fetch topic_url and pass its decoded text to cb.

    cb is invoked as cb(text, topic_url) once the body is available.
    Increments pending_calls[0] so the callbacks can detect when the
    whole crawl has drained.

    Fix: dropped the dead local `protocol` — it was computed from SECURE
    but never used (treq is handed the full URL directly).
    """
    d = treq.get(topic_url)
    # First deferred fires with the response object; the chained deferred
    # decodes the body to text before handing it to the callback.
    d.addCallback(lambda response:
                  treq.text_content(response).addCallback(cb, topic_url))
    pending_calls[0] += 1
    print>>sys.stderr, "Sent request to %s" % topic_url
if __name__ == "__main__":
    # Destination CSV path is the first command-line argument.
    out = open(sys.argv[1], 'w')
    extract_numbers()
    # Blocks until get_content() calls reactor.stop(); by then `found`
    # holds every guess scraped from the thread.
    reactor.run()
    last_num = 0
    available = []  # numbers in range nobody has guessed yet
    out.write('num,when,user,where\n')
    for num, when, user, where in sorted(found):
        line = u'%d,%s,%s,%s\n' % (num, when, user, where)
        out.write(line.encode('utf8'))
        if num > 1000:
            # Guesses above 1000 are out of range; `found` is sorted by
            # number, so everything after this is too.
            break
        if num - last_num > 1:
            # Gap between consecutive guessed numbers -> still available.
            for i in range(last_num + 1, num):
                available.append(i)
        last_num = num
    out.close()
    print available
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment