Skip to content

Instantly share code, notes, and snippets.

@ollar
Created January 23, 2014 20:53
Show Gist options
  • Save ollar/8586550 to your computer and use it in GitHub Desktop.
Save ollar/8586550 to your computer and use it in GitHub Desktop.
import urllib2
import httplib2
from sgmllib import SGMLParser
# Download the bash.im front page with a desktop-browser User-agent header.
request = urllib2.Request('http://bash.im')
userAgent = 'Mozilla/5.0 (X11; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0'
opener = urllib2.build_opener()
request.add_header('User-agent', userAgent)
bashim_sock = opener.open(request)
try:
    # The response body is decoded as cp1251 (the encoding this script
    # assumes for the page) and re-encoded as UTF-8 before parsing.
    bashim_html = bashim_sock.read().decode('cp1251').encode('utf-8')
finally:
    # Release the connection even when reading or transcoding fails.
    bashim_sock.close()
class BashPostsParser(SGMLParser):
    """Grab the text of every post (``<div class="text">``) from bash.im HTML.

    Feed the page with ``feed()``, call ``close()``, then read the collected
    posts from ``output()``.  Each post that produced any text becomes one
    string; the data chunks reported inside a post are joined with newlines.
    """

    def reset(self):
        """Reset the underlying parser and clear all collected state."""
        SGMLParser.reset(self)
        self.pieces = []         # one string per post that produced text
        self.post = 0            # 1 while inside a post div, otherwise 0
        self.inner_div = 0       # nested <div>s currently open inside the post
        self.post_counter = 0    # number of post divs opened so far
        # True once the current post has an entry in ``pieces``.  Tracked
        # separately from ``post_counter`` so that a post without any text
        # cannot shift later posts onto the wrong entry.
        self._piece_started = False

    def start_div(self, attrs):
        """Handle ``<div>``: open a post or record a div nested inside one.

        attrs -- list of ``(name, value)`` attribute pairs for the tag.
        """
        if self.post > 0:
            # Already inside a post: remember the nesting so the matching
            # </div> does not end the post prematurely.
            self.inner_div += 1
            return
        class_values = [value for name, value in attrs if name == 'class']
        if 'text' in class_values:
            self.post = 1
            self.inner_div = 0
            self.post_counter += 1
            self._piece_started = False

    def end_div(self):
        """Handle ``</div>``: close a nested div, or the post itself."""
        if self.post <= 0:
            return
        if self.inner_div > 0:
            self.inner_div -= 1
        else:
            self.post -= 1

    def handle_data(self, text):
        """Collect a chunk of character data when it belongs to a post.

        The first chunk of a post starts a new entry in ``pieces``; every
        further chunk is appended to that entry on a new line.
        """
        if self.post <= 0:
            return
        # NOTE(review): every chunk after the first starts a new line, so
        # text the parser reports in several chunks (presumably also around
        # entity references) is split across lines - confirm this is wanted.
        if self._piece_started:
            self.pieces[-1] += '\n' + text
        else:
            self.pieces.append(text)
            self._piece_started = True

    def output(self):
        """Return the list of collected post strings."""
        return self.pieces
# Run the downloaded page through the post parser.
post_parser = BashPostsParser()
post_parser.feed(bashim_html)
post_parser.close()

# Horizontal rule written between posts in the output file.
sep = "\n" + "-" * 100 + "\n"

# Unicode copies of the posts.  The parser was fed UTF-8 encoded bytes, so
# each collected post is decoded from UTF-8.
post_li = [post.decode('utf-8') for post in post_parser.output()]

if __name__ == "__main__":
    # Write the raw (UTF-8 byte string) posts to disk; ``with`` guarantees
    # the file is closed even if the write fails.
    with open('d3_posts', 'w') as f:
        f.write(sep.join(post_parser.output()))
    print(post_li)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment