# blzzua/http-parse-bashim.py

## http-parse-bashim.py
#!/usr/bin/python3
# -*- encoding: utf8 -*-

from html.parser import HTMLParser
from html import unescape
import pycurl
from io import BytesIO

class Citate:
    """Plain record describing one quote: its id, its date and its text.

    Every text field starts as an empty string; callers fill them in
    by assigning to the attributes directly.
    """

    def __init__(self):
        # Not read by any code visible here; kept so that existing
        # attribute access on instances keeps working.
        self.convert_charrefs = True
        # Identifier, date and body of the quote -- all empty initially.
        self.cit_id = self.cit_date = self.cit_str = ''


class MyLinksParser(HTMLParser):
    """HTML parser that gathers quotes into ``self.citates``.

    Recognised markup (the ``class`` attribute must match exactly):

    * ``<span class="id">`` or ``<a class="id">`` -- the next text chunk
      becomes the id of the current quote;
    * ``<span class="date">`` -- the next text chunk becomes its date;
    * ``<div class="text">`` -- text up to the next ``</div>`` is the quote
      body, with every ``<br>`` inside it turned into a newline.

    The quote under construction lives in ``self.curcit``; it is appended
    to ``self.citates`` when the ``</div>`` ending its body is seen.
    """

    def __init__(self):
        # convert_charrefs=True is already the default on Python 3.5+;
        # it is spelled out because handle_data() relies on receiving
        # text with character references already decoded.
        super().__init__(convert_charrefs=True)
        self.is_quote = False            # inside <div class="text">
        self.is_abysstop_date = False    # next text chunk is the date
        self.is_citid = False            # next text chunk is the id
        self.curcit = None               # quote being assembled, if any
        self.citates = []                # finished quotes, in page order

    def _current(self):
        """Return the quote being assembled, creating it on first use."""
        if self.curcit is None:
            self.curcit = Citate()
        return self.curcit

    def handle_starttag(self, tag, attrs):
        """Update the state flags for an opening tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        if tag == 'div':
            if ('class', 'text') in attrs:
                self.is_quote = True
        elif tag == 'span':
            if ('class', 'id') in attrs:
                self.is_citid = True
            if ('class', 'date') in attrs:
                self.is_abysstop_date = True
        elif tag == 'a':
            if ('class', 'id') in attrs:
                self.is_citid = True
        elif tag == 'br':
            # A line break only matters inside an already started quote body.
            if self.curcit is not None and self.is_quote:
                self.curcit.cit_str += '\n'

    def handle_endtag(self, tag):
        """Finish the current quote when the ``</div>`` of its body closes."""
        if not (self.is_quote and tag == 'div'):
            return
        self.is_quote = False
        if self.curcit is not None:
            self.citates.append(self.curcit)
        self.curcit = None

    def handle_data(self, data):
        """Route a text chunk to the body, id and/or date of the quote."""
        if self.is_quote:
            self._current().cit_str += data
        if self.is_citid:
            self._current().cit_id = data
            # Only the first chunk after the marker tag is the id.
            self.is_citid = False
        if self.is_abysstop_date:
            self._current().cit_date = data
            # Only the first chunk after the marker tag is the date.
            self.is_abysstop_date = False

    def handle_entityref(self, data):
        """Append a named entity (e.g. ``amp``) to the quote body, decoded.

        ``HTMLParser`` never calls this while ``convert_charrefs`` is true;
        it is kept for instances on which that flag has been switched off.
        """
        if self.is_quote:
            self._current().cit_str += unescape('&' + data + ';')

# Script body: fetch one page, parse it and print every quote found.
parser = MyLinksParser()
c = pycurl.Curl()
buffer = BytesIO()

try:
    c.setopt(c.URL, 'http://bash.im/random')
    c.setopt(c.WRITEDATA, buffer)
    # The request is sent through an HTTP proxy on 127.0.0.1:8080.
    c.setopt(c.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
    c.setopt(c.PROXY, '127.0.0.1')
    c.setopt(c.PROXYPORT, 8080)
    c.perform()
finally:
    # Release the curl handle even when the transfer fails.
    c.close()

res = buffer.getvalue()
# The body is decoded as cp1251; bytes with no cp1251 mapping become
# U+FFFD instead of aborting the whole run with UnicodeDecodeError.
sitedata = res.decode('cp1251', errors='replace')

parser.feed(sitedata)
# Flush any text the parser is still buffering at end of input.
parser.close()

for i in parser.citates:
    print(i.cit_id, ': ', len(i.cit_str), ': ', i.cit_str)
    print('--')
	#!/usr/bin/python3
	# -*- encoding: utf8 -*-

	from html.parser import HTMLParser
	from html import unescape
	import pycurl
	from io import BytesIO

	class Citate:
	    """Plain record describing one quote: its id, its date and its text.

	    Every text field starts as an empty string; callers fill them in
	    by assigning to the attributes directly.
	    """

	    def __init__(self):
	        # Not read by any code visible here; kept so that existing
	        # attribute access on instances keeps working.
	        self.convert_charrefs = True
	        self.cit_id = ''      # identifier of the quote
	        self.cit_date = ''    # date string of the quote
	        self.cit_str = ''     # body text of the quote


	class MyLinksParser(HTMLParser):
	    """HTML parser that gathers quotes into ``self.citates``.

	    Recognised markup (the ``class`` attribute must match exactly):

	    * ``<span class="id">`` or ``<a class="id">`` -- the next text chunk
	      becomes the id of the current quote;
	    * ``<span class="date">`` -- the next text chunk becomes its date;
	    * ``<div class="text">`` -- text up to the next ``</div>`` is the
	      quote body, with every ``<br>`` inside it turned into a newline.

	    The quote under construction lives in ``self.curcit``; it is appended
	    to ``self.citates`` when the ``</div>`` ending its body is seen.
	    """

	    def __init__(self):
	        # convert_charrefs=True is already the default on Python 3.5+;
	        # it is spelled out because handle_data() relies on receiving
	        # text with character references already decoded.
	        super().__init__(convert_charrefs=True)
	        self.is_quote = False            # inside <div class="text">
	        self.is_abysstop_date = False    # next text chunk is the date
	        self.is_citid = False            # next text chunk is the id
	        self.curcit = None               # quote being assembled, if any
	        self.citates = []                # finished quotes, in page order

	    def _current(self):
	        """Return the quote being assembled, creating it on first use."""
	        if self.curcit is None:
	            self.curcit = Citate()
	        return self.curcit

	    def handle_starttag(self, tag, attrs):
	        """Update the state flags for an opening tag.

	        ``attrs`` is the list of ``(name, value)`` pairs supplied by
	        ``HTMLParser``.
	        """
	        if tag == 'div':
	            if ('class', 'text') in attrs:
	                self.is_quote = True
	        elif tag == 'span':
	            if ('class', 'id') in attrs:
	                self.is_citid = True
	            if ('class', 'date') in attrs:
	                self.is_abysstop_date = True
	        elif tag == 'a':
	            if ('class', 'id') in attrs:
	                self.is_citid = True
	        elif tag == 'br':
	            # A line break only matters inside an already started body.
	            if self.curcit is not None and self.is_quote:
	                self.curcit.cit_str += '\n'

	    def handle_endtag(self, tag):
	        """Finish the current quote when the ``</div>`` of its body closes."""
	        if not (self.is_quote and tag == 'div'):
	            return
	        self.is_quote = False
	        if self.curcit is not None:
	            self.citates.append(self.curcit)
	        self.curcit = None

	    def handle_data(self, data):
	        """Route a text chunk to the body, id and/or date of the quote."""
	        if self.is_quote:
	            self._current().cit_str += data
	        if self.is_citid:
	            self._current().cit_id = data
	            # Only the first chunk after the marker tag is the id.
	            self.is_citid = False
	        if self.is_abysstop_date:
	            self._current().cit_date = data
	            # Only the first chunk after the marker tag is the date.
	            self.is_abysstop_date = False

	    def handle_entityref(self, data):
	        """Append a named entity (e.g. ``amp``) to the quote body, decoded.

	        ``HTMLParser`` never calls this while ``convert_charrefs`` is true;
	        it is kept for instances on which that flag has been switched off.
	        """
	        if self.is_quote:
	            self._current().cit_str += unescape('&' + data + ';')

	# Script body: fetch one page, parse it and print every quote found.
	parser = MyLinksParser()
	c = pycurl.Curl()
	buffer = BytesIO()

	try:
	    c.setopt(c.URL, 'http://bash.im/random')
	    c.setopt(c.WRITEDATA, buffer)
	    # The request is sent through an HTTP proxy on 127.0.0.1:8080.
	    c.setopt(c.PROXYTYPE, pycurl.PROXYTYPE_HTTP)
	    c.setopt(c.PROXY, '127.0.0.1')
	    c.setopt(c.PROXYPORT, 8080)
	    c.perform()
	finally:
	    # Release the curl handle even when the transfer fails.
	    c.close()

	res = buffer.getvalue()
	# The body is decoded as cp1251; bytes with no cp1251 mapping become
	# U+FFFD instead of aborting the whole run with UnicodeDecodeError.
	sitedata = res.decode('cp1251', errors='replace')

	parser.feed(sitedata)
	# Flush any text the parser is still buffering at end of input.
	parser.close()

	for i in parser.citates:
	    print(i.cit_id, ': ', len(i.cit_str), ': ', i.cit_str)
	    print('--')