from html.parser import HTMLParser
from urllib.request import urlopen

# Parses an HTML page for data between <td></td> tags.
# Works for http://us-proxy.org/ and http://free-proxy-list.net/uk-proxy.html,
# whose proxy tables use eight columns per row: the IP address in column 0
# and the port in column 1.
class ProxyParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self._read = False   # True while inside a <td> element
        self._count = 0      # number of <td> cells closed so far
        self._list = []      # collected (ip, port) tuples

    def handle_starttag(self, tag, attrs):
        if tag == 'td':
            self._read = True

    def handle_endtag(self, tag):
        if tag == 'td':
            self._read = False
            self._count += 1

    def handle_data(self, data):
        # Only the first two cells of each eight-column row matter:
        # column 0 holds the IP address, column 1 holds the port.
        if self._read and (self._count % 8) < 2:
            if (self._count % 8) == 0:
                self._last = data
            else:
                self._list.append((self._last, data))

    def run(self, url):
        page = urlopen(url)
        src = page.read().decode('utf-8', errors='replace')
        page.close()
        self.feed(src)
        return self._list

# Simple function that uses the ProxyParser class to parse a URL.
def getProxyList(url):
    parser = ProxyParser()
    return parser.run(url)

print(len(getProxyList('http://us-proxy.org/')))
print(len(getProxyList('http://free-proxy-list.net/uk-proxy.html')))
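
For context, here is a minimal sketch of how the (ip, port) tuples returned by getProxyList might be routed through urllib's ProxyHandler. The check_proxy helper, the test URL, and the 5-second timeout are illustrative assumptions, not part of the original gist.

from urllib.request import ProxyHandler, build_opener

# Hypothetical helper: try fetching a test page through one scraped proxy.
# The test URL and timeout are assumptions chosen for illustration.
def check_proxy(ip, port, test_url='http://example.com/'):
    handler = ProxyHandler({'http': 'http://{}:{}'.format(ip, port)})
    opener = build_opener(handler)
    try:
        with opener.open(test_url, timeout=5) as response:
            return response.getcode() == 200
    except OSError:
        # Covers URLError, connection resets, and timeouts.
        return False

# Keep only proxies that responded to the test request.
working = [(ip, port) for ip, port in getProxyList('http://us-proxy.org/')
           if check_proxy(ip, port)]
print(working)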