from html.parser import HTMLParser
from urllib.request import urlopen

# Parses an HTML page for data between <td></td> tags.
# Works for http://us-proxy.org/ and http://free-proxy-list.net/uk-proxy.html,
# whose proxy tables use eight columns per row: the IP address in column 0
# and the port in column 1.
class ProxyParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self._read = False   # True while inside a <td> element
        self._count = 0      # number of <td> cells closed so far
        self._list = []      # collected (ip, port) tuples

    def handle_starttag(self, tag, attrs):
        if tag == 'td':
            self._read = True

    def handle_endtag(self, tag):
        if tag == 'td':
            self._read = False
            self._count += 1

    def handle_data(self, data):
        # Only the first two cells of each eight-column row matter:
        # column 0 holds the IP address, column 1 holds the port.
        if self._read and (self._count % 8) < 2:
            if (self._count % 8) == 0:
                self._last = data
            else:
                self._list.append((self._last, data))

    def run(self, url):
        page = urlopen(url)
        src = page.read().decode('utf-8', errors='replace')
        page.close()
        self.feed(src)
        return self._list

# Simple function that uses the ProxyParser class to parse a URL.
def getProxyList(url):
    parser = ProxyParser()
    return parser.run(url)

print(len(getProxyList('http://us-proxy.org/')))
print(len(getProxyList('http://free-proxy-list.net/uk-proxy.html')))
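
For context, here is a minimal sketch of how the (ip, port) tuples returned by getProxyList might be routed through urllib's ProxyHandler. The check_proxy helper, the test URL, and the 5-second timeout are illustrative assumptions, not part of the original gist.

from urllib.request import ProxyHandler, build_opener

# Hypothetical helper: try fetching a test page through one scraped proxy.
# The test URL and timeout are assumptions chosen for illustration.
def check_proxy(ip, port, test_url='http://example.com/'):
    handler = ProxyHandler({'http': 'http://{}:{}'.format(ip, port)})
    opener = build_opener(handler)
    try:
        with opener.open(test_url, timeout=5) as response:
            return response.getcode() == 200
    except OSError:
        # Covers URLError, connection resets, and timeouts.
        return False

# Keep only proxies that responded to the test request.
working = [(ip, port) for ip, port in getProxyList('http://us-proxy.org/')
           if check_proxy(ip, port)]
print(working)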