Skip to content

Instantly share code, notes, and snippets.

@blzzua
Created March 20, 2017 14:47
Show Gist options
  • Save blzzua/7acb035373e801d51ff320df1f2489b8 to your computer and use it in GitHub Desktop.
Save blzzua/7acb035373e801d51ff320df1f2489b8 to your computer and use it in GitHub Desktop.
HTML parsing of bash.im quotes ("citates")
#!/usr/bin/python3
# -*- encoding: utf8 -*-
from html.parser import HTMLParser
from html import unescape
import pycurl
from io import BytesIO
class Citate():
    """Plain record for one quote extracted from a page.

    Attributes:
        cit_id: Text of the quote's id element; ``''`` until it is parsed.
        cit_date: Text of the quote's date element; ``''`` until it is parsed.
        cit_str: Accumulated quote text; ``''`` until text is appended.
        convert_charrefs: Always ``True``; see the note in ``__init__``.
    """

    def __init__(self):
        # NOTE(review): this flag looks like it was meant for the HTMLParser
        # subclass rather than for this record -- confirm.  It is kept so
        # that any code reading the attribute keeps working.
        self.convert_charrefs = True
        self.cit_id = ''
        self.cit_date = ''
        self.cit_str = ''

    def __repr__(self):
        """Return a debugging representation listing the three quote fields."""
        return '{}(cit_id={!r}, cit_date={!r}, cit_str={!r})'.format(
            type(self).__name__, self.cit_id, self.cit_date, self.cit_str)
class MyLinksParser(HTMLParser):
    """Collect quotes from an HTML document into ``Citate`` records.

    Markup the parser reacts to:

    * ``<div class="text">`` -- text inside it is appended to the current
      quote's ``cit_str``.  A ``<br>`` inside it adds a newline, but only
      once a quote record already exists.
    * ``<span class="id">`` or ``<a class="id">`` -- the next text chunk is
      stored as ``cit_id``.
    * ``<span class="date">`` -- the next text chunk is stored as
      ``cit_date``.

    A ``</div>`` seen while inside the text block finishes the quote: the
    current record (if any) is appended to ``self.citates`` and a fresh one
    is started lazily by the next piece of data.

    The ``class`` attribute must match exactly; ``class="text other"`` is
    not recognised.
    """

    def __init__(self):
        super().__init__()
        # State flags toggled by start tags and consumed by handle_data().
        self.is_quote = False
        self.is_abysstop_date = False
        self.is_citid = False
        # Quote currently being assembled, or None between quotes.
        self.curcit = None
        # Finished quotes, in document order.
        self.citates = []

    def _current_citate(self):
        """Return the quote being assembled, creating it on first use."""
        if self.curcit is None:
            self.curcit = Citate()
        return self.curcit

    def handle_starttag(self, tag, attrs):
        """Update the state flags according to an opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        if tag == 'div':
            if ('class', 'text') in attrs:
                self.is_quote = True
        elif tag == 'span':
            if ('class', 'id') in attrs:
                self.is_citid = True
            if ('class', 'date') in attrs:
                self.is_abysstop_date = True
        elif tag == 'a':
            if ('class', 'id') in attrs:
                self.is_citid = True
        elif tag == 'br':
            # Line breaks only matter inside a quote that has already
            # started; a <br> never creates a new record by itself.
            if self.curcit is not None and self.is_quote:
                self.curcit.cit_str += '\n'

    def handle_endtag(self, tag):
        """Finish the current quote when its text ``<div>`` closes."""
        if self.is_quote and tag == 'div':
            self.is_quote = False
            if self.curcit is not None:
                self.citates.append(self.curcit)
                self.curcit = None

    def handle_data(self, data):
        """Route a text chunk to the quote field(s) selected by the flags.

        The id and date flags are one-shot: each is cleared after the first
        chunk it captures.  The three checks are independent, so one chunk
        may be written to more than one field.
        """
        if self.is_quote:
            self._current_citate().cit_str += data
        if self.is_citid:
            self._current_citate().cit_id = data
            self.is_citid = False
        if self.is_abysstop_date:
            self._current_citate().cit_date = data
            self.is_abysstop_date = False

    def handle_entityref(self, data):
        """Append the character for a named entity to the quote text.

        ``data`` is the entity name without ``&`` and ``;``.  Per the
        ``html.parser`` documentation this hook is only invoked when the
        parser's ``convert_charrefs`` option is false; otherwise entities
        arrive already decoded through ``handle_data``.
        """
        if self.is_quote:
            self._current_citate().cit_str += unescape('&' + data + ';')
# Parser that accumulates the quotes found in the downloaded page.
parser = MyLinksParser()

# Download http://bash.im/random into an in-memory buffer.
buffer = BytesIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, 'http://bash.im/random')
    c.setopt(c.WRITEDATA, buffer)
    # The request goes through an HTTP proxy at 127.0.0.1:8080.
    c.setopt(c.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
    c.setopt(c.PROXY, '127.0.0.1')
    c.setopt(c.PROXYPORT, 8080)
    c.perform()
finally:
    # Release the libcurl handle even if the transfer fails.
    c.close()

res = buffer.getvalue()
# The response bytes are decoded as Windows-1251 before parsing.
sitedata = res.decode('cp1251')
parser.feed(sitedata)
# Flush any text the parser is still buffering at end of input.
parser.close()

# Print "<id> :  <text length> :  <text>" for every quote, followed by a
# separator line.
for i in parser.citates:
    print(i.cit_id, ': ', len(i.cit_str), ': ', i.cit_str)
    print('--')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment