Skip to content

Instantly share code, notes, and snippets.

@jackyyf
Created July 4, 2015 01:11
Show Gist options
  • Save jackyyf/9ecdb3faabcd9e9bea83 to your computer and use it in GitHub Desktop.
Save jackyyf/9ecdb3faabcd9e9bea83 to your computer and use it in GitHub Desktop.
Gist by paste.py @ 2015-07-04 09:11:16.505897
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from tornado.ioloop import IOLoop, PeriodicCallback
from tornado.web import RequestHandler, Application, url
from tornado.httpserver import HTTPServer
from tornado.httpclient import AsyncHTTPClient, HTTPError
from tornado.gen import coroutine, Return, WaitIterator
from tornado.netutil import bind_sockets, bind_unix_socket
from tornado.process import fork_processes
from json import loads, dumps
from urllib import urlencode
from sys import exc_info
from datetime import datetime
from rfc822 import parsedate_tz, mktime_tz
from random import choice
# from xml.dom import minidom
from lxml import etree
from pytz import utc
import json
import geoip
import logging
import time
import re
import sys
# Python 2 idiom: reload(sys) is required before sys.setdefaultencoding can be
# called; the default str<->unicode encoding is switched to UTF-8 and the
# setter is then removed again so nothing else can change it.
reload(sys)
sys.setdefaultencoding('utf-8')
del sys.setdefaultencoding
# Use the curl-based implementation for every AsyncHTTPClient in this process.
AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
# GeoIP database used by Proxy to annotate each address; the file is opened
# relative to the current working directory.
geolite2 = geoip.open_database('geolite2.mmdb')
# Base URL of the site the proxy list is scraped from.
proxyru_base = 'http://proxy.com.ru'
app_log = logging.getLogger('tornado.application')
# Matches the links to the individual list pages ("/list_<N>.html").
pages_re = re.compile(r'href="(/list_\d+\.html)"')
# Matches one table row of a list page: serial, ip, port and proxy type are
# captured; the last cell of the row is ignored.
proxies_re = re.compile(r'<tr><b><td>(?P<serial>\d+)</td><td>(?P<ip>.+?)</td><td>(?P<port>\d+)</td><td>(?P<ptype>.+?)</td><td>.+?</td></b></tr>')
# Where the HTTP server listens. The startup code accepts "unix:<path>",
# "<host>:<port>" or a bare "<port>".
listen_addr = '9000'
# Some utilities for this project.
def benchmark(func):
    """Decorate *func* so that every successful call logs its runtime.

    The wall-clock time spent inside *func* is measured with ``time.time()``
    and reported through ``app_log`` at INFO level.  The return value of
    *func* is passed through unchanged; nothing is logged when *func* raises.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from functools import wraps

    @wraps(func)  # preserve __name__ / __doc__ of the decorated function
    def _wrapper(*args, **kwargs):
        now = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - now
        app_log.info('benchmark result for %s: runtime = %.3f second(s)', func.__name__, elapsed)
        return result
    return _wrapper
def jsonify(plist, _filter=None, single=False):
    """Serialize the proxies selected by *_filter* as a JSON string.

    :param plist: object providing ``filter(_filter=...)`` plus the
        ``last_updated`` / ``last_tried`` timestamps (anything with an
        ``isoformat()`` method).
    :param _filter: filter object handed to ``plist.filter``; ``None`` means
        no filtering.
    :param single: when true, one randomly chosen proxy is returned and its
        fields are placed at the top level of the document; otherwise all
        matching proxies are listed under the ``proxies`` key.
    :returns: a JSON document.  ``error`` is 0 on success, or 1 (together
        with ``reason``) when no proxy matches.
    """
    def describe(proxy):
        # Single place that defines which proxy attributes are exported.
        return {
            'id': proxy.serial,
            'ip': proxy.ip,
            'port': proxy.port,
            'type': proxy.ptype,
            'country': proxy.country,
            'latitude': proxy.lat,
            'longitude': proxy.lon,
            'location': proxy.full_name,
        }

    proxies = plist.filter(_filter=_filter)
    if not proxies:
        return json.dumps({
            'error': 1,
            'reason': 'No suitable proxy found.'
        })
    payload = {
        'error': 0,
        'last_modified': plist.last_updated.isoformat(),
        'last_check': plist.last_tried.isoformat(),
    }
    if single:
        payload.update(describe(choice(proxies)))
    else:
        # A real list (not map()) so the value stays JSON-serializable even
        # where map() returns a lazy iterator.
        payload['proxies'] = [describe(proxy) for proxy in proxies]
    return json.dumps(payload)
def add_sub_tag(parent, child_name, child_text=None):
    """Append a new *child_name* element to *parent* and return it.

    If *child_text* is not ``None`` it is converted with ``unicode()`` and
    stored as the text of the new element; otherwise the element is left
    without text.
    """
    node = etree.SubElement(parent, child_name)
    if child_text is None:
        return node
    node.text = unicode(child_text)
    return node
def _fill_proxy_node(proxy_node, proxy):
    """Add one child element per exported attribute of *proxy* to *proxy_node*."""
    add_sub_tag(proxy_node, 'ID', proxy.serial)
    add_sub_tag(proxy_node, 'IP', proxy.ip)
    add_sub_tag(proxy_node, 'Port', proxy.port)
    add_sub_tag(proxy_node, 'Type', proxy.ptype)
    add_sub_tag(proxy_node, 'Country', proxy.country)
    add_sub_tag(proxy_node, 'Latitude', proxy.lat)
    add_sub_tag(proxy_node, 'Longitude', proxy.lon)
    add_sub_tag(proxy_node, 'Location', proxy.full_name)


def xmlify(plist, _filter=None, single=False):
    """Serialize the proxies selected by *_filter* as an XML document.

    :param plist: object providing ``filter(_filter=...)`` plus the
        ``last_updated`` / ``last_tried`` timestamps.
    :param _filter: filter object handed to ``plist.filter``; ``None`` means
        no filtering.
    :param single: when true, one randomly chosen proxy is emitted as a
        ``<Proxy>`` element directly under ``<Result>``; otherwise every
        matching proxy is emitted inside a ``<Proxies>`` element.
    :returns: the serialized document.  ``<Error>`` is 0 on success, or 1
        (with a ``<Reason>``) when no proxy matches.
    """
    proxies = plist.filter(_filter=_filter)
    if not proxies:
        return '''\
<?xml version='1.0' encoding='utf-8'?>
<Result>
<Error>1</Error>
<Reason>No suitable proxy found.</Reason>
</Result>\n'''
    root_tag = etree.Element('Result')
    add_sub_tag(root_tag, 'Error', 0)
    add_sub_tag(root_tag, 'LastModified', plist.last_updated.isoformat())
    # LastCheck reports the last fetch attempt (last_tried), matching the
    # 'last_check' field of the JSON output.
    add_sub_tag(root_tag, 'LastCheck', plist.last_tried.isoformat())
    if single:
        _fill_proxy_node(add_sub_tag(root_tag, 'Proxy'), choice(proxies))
    else:
        proxies_node = add_sub_tag(root_tag, 'Proxies')
        for proxy in proxies:
            _fill_proxy_node(add_sub_tag(proxies_node, 'Proxy'), proxy)
    return etree.tostring(root_tag, pretty_print=True, encoding='utf-8', xml_declaration=True)
class Proxy(object):
    """One proxy entry from the scraped list, annotated with GeoIP data.

    Attributes set by ``__init__``: ``serial``, ``ip``, ``port``, ``ptype``
    (one of the ``PROXY_*`` constants) and the GeoIP-derived ``country``,
    ``lat``, ``lon``, ``tz``, ``city``, ``subdiv`` and ``country_name``.  All
    GeoIP-derived attributes are ``None`` when the information is missing.
    """

    PROXY_FULL_ANONYMOUS = 0
    PROXY_ANONYMOUS = 1
    PROXY_TRANSPARENT = 2
    # Maps the type label used on the scraped pages to a PROXY_* constant.
    proxy_type = {
        u'透明代理': PROXY_TRANSPARENT,       # "transparent proxy"
        u'匿名代理': PROXY_ANONYMOUS,         # "anonymous proxy"
        u'高度匿名': PROXY_FULL_ANONYMOUS,    # "highly anonymous"
    }
    # Human-readable names, indexed by PROXY_* constant.  The order must
    # follow the constant values above (0, 1, 2).
    proxy_name = [
        'full_anonymous',
        'anonymous',
        'transparent',
    ]

    def __init__(self, serial, ip, port, ptype):
        """Build a proxy from the raw strings captured from a table row.

        :param serial: serial number of the entry (converted with ``int``).
        :param ip: address of the proxy, also used for the GeoIP lookup.
        :param port: port number (converted with ``int``).
        :param ptype: GB2312-encoded type label; must be a key of
            ``proxy_type`` once decoded.
        :raises KeyError: if the decoded label is not in ``proxy_type``.
        """
        self.serial = int(serial)
        self.ip = ip
        self.port = int(port)
        # The label is decoded as GB2312 before it is looked up.
        self.ptype = self.proxy_type[ptype.decode('gb2312')]
        app_log.info('new proxy: #%d @ %s:%d type: %s', self.serial, ip, self.port, self.proxy_name[self.ptype])

        # Start from "unknown" so that every attribute exists no matter how
        # much the GeoIP database knows; full_name relies on country_name
        # being present.
        self.country = None
        self.lat = None
        self.lon = None
        self.tz = None
        self.city = None
        self.subdiv = None
        self.country_name = None

        geoinfo = geolite2.lookup(ip)
        if geoinfo is None:
            return
        self.country = geoinfo.country
        # NOTE(review): guards against records without coordinates, assuming
        # geoip reports those as None -- confirm against the geoip package.
        location = geoinfo.location
        if location is not None:
            self.lat, self.lon = location
        self.tz = geoinfo.timezone
        info_dict = geoinfo.get_info_dict()
        if 'city' in info_dict:
            self.city = info_dict['city']['names']['en']
        if 'subdivisions' in info_dict:
            self.subdiv = ' '.join(map(lambda x : x['names']['en'], info_dict['subdivisions']))
        if 'country' in info_dict:
            self.country_name = info_dict['country']['names']['en']

    @property
    def full_name(self):
        """Return "<country> <subdivisions> <city>", skipping unknown parts.

        The result is an empty string when the country name is unknown.
        """
        if self.country_name is None:
            return ''
        ret = self.country_name
        if self.subdiv is not None:
            ret += ' ' + self.subdiv
        if self.city is not None:
            ret += ' ' + self.city
        return ret
class ProxyFilter(object):
    """Predicate over proxies; a criterion left as ``None`` is not checked."""

    def __init__(self, country=None, serial=None, ptype=None):
        """Store the criteria.

        :param country: value compared against ``proxy.country``.
        :param serial: value compared against ``proxy.serial``.  Numeric
            strings are converted to ``int``, because the HTTP handlers pass
            the raw query-string value while ``Proxy`` stores the serial as
            an integer.
        :param ptype: value compared against ``proxy.ptype``.
        """
        self.country = country
        self.serial = self._normalize_serial(serial)
        self.ptype = ptype

    @staticmethod
    def _normalize_serial(serial):
        """Return *serial* as an int where possible, otherwise unchanged."""
        if serial is None:
            return None
        try:
            return int(serial)
        except (TypeError, ValueError):
            # Keep the unparsable value: it simply never matches a proxy,
            # instead of turning a bad query string into an exception.
            return serial

    def filter(self, proxy):
        """Return True when *proxy* satisfies every criterion that is set."""
        if self.country is not None and self.country != proxy.country:
            return False
        if self.serial is not None and self.serial != proxy.serial:
            return False
        if self.ptype is not None and self.ptype != proxy.ptype:
            return False
        return True

    @property
    def always_true(self):
        """True when no criterion is set, i.e. ``filter`` accepts everything."""
        return self.country is None and self.serial is None and self.ptype is None
# Background task: call ProxyList.fetch periodically to keep the data up to date :)
class ProxyList(object):
    """In-memory copy of the proxy list published at ``proxyru_base``.

    ``proxies`` holds the current ``Proxy`` objects sorted by serial,
    ``last_updated`` is the modification time of the data currently held and
    ``last_tried`` the time of the most recent fetch attempt.  Both timestamps
    are timezone-aware (UTC) and start at the Unix epoch.
    """

    def __init__(self):
        self.proxies = []
        self.client = AsyncHTTPClient(max_clients=32)
        self.last_updated = utc.localize(datetime.utcfromtimestamp(0))
        self.last_tried = utc.localize(datetime.utcfromtimestamp(0))
        # True while a fetch() is in flight; prevents overlapping updates.
        self.running = False

    @coroutine
    def fetch(self):
        """Refresh ``proxies`` from the remote site.

        Two steps: the front page is requested first (conditionally, using
        ``last_updated``) to learn whether anything changed and which list
        pages exist; then all list pages are fetched concurrently and parsed.
        ``proxies`` and ``last_updated`` are replaced only if every page was
        fetched successfully; on any HTTP error the update is abandoned and
        the previous data is kept.
        """
        if self.running:
            app_log.warning('okay, another network lag. i\'ll quit right now.')
            return
        self.running = True
        try:
            app_log.info('update triggered at %s', utc.localize(datetime.utcnow()).isoformat())
            try:
                result = yield self.client.fetch(proxyru_base + '/', validate_cert=False, connect_timeout=5, request_timeout=20, if_modified_since=self.last_updated, follow_redirects=False)
            except HTTPError as e:
                if e.code == 304:
                    app_log.info('proxy list not updated.')
                    return
                if e.code == 599:
                    app_log.warning('just another timeout, hang tight!')
                    return
                app_log.critical('server responsed with unknown code %d, check your code please!', e.code)
                return
            finally:
                # Recorded whether or not the request succeeded.
                self.last_tried = utc.localize(datetime.utcnow())
            if 'Last-Modified' in result.headers:
                last_updated = utc.localize(datetime.utcfromtimestamp(mktime_tz(parsedate_tz(result.headers['Last-Modified']))))
            else:
                # No Last-Modified header, assume just modified.
                app_log.warning('server responsed with no last-modified header, be aware of any other changes.')
                last_updated = utc.localize(datetime.utcnow())
            pages = pages_re.findall(result.body)
            # Each captured link already starts with '/', so it is appended
            # to the base URL directly (no doubled slash in the path).
            futures = [
                self.client.fetch(proxyru_base + page, validate_cert=False, connect_timeout=5, request_timeout=20, if_modified_since=self.last_updated, follow_redirects=False)
                for page in pages
            ]
            result_iterator = WaitIterator(*futures)
            new_proxies = []
            while not result_iterator.done():
                try:
                    result = yield result_iterator.next()
                except HTTPError as e:
                    if e.code == 599:
                        app_log.error('damn! proxy.com.ru breaks down again during my fetch.')
                        for it in futures:
                            it.cancel()
                        return
                    app_log.critical('hey! check your code! error when fetching page %d, error code = %d', result_iterator.current_index, e.code)
                    return
                else:
                    for proxy in proxies_re.finditer(result.body):
                        new_proxies.append(Proxy(proxy.group('serial'), proxy.group('ip'), proxy.group('port'), proxy.group('ptype')))
            # Publish the new data only after every page has been parsed.
            self.proxies = sorted(new_proxies, key = lambda proxy : proxy.serial)
            self.last_updated = last_updated
        finally:
            app_log.info('bye!')
            self.running = False

    def filter(self, _filter=None):
        """Return the proxies accepted by *_filter* as a list.

        With no filter, or a filter that accepts everything, the internal
        list itself is returned (not a copy).
        """
        if _filter is None or _filter.always_true:
            return self.proxies
        # List comprehension rather than filter(): callers test the result
        # for emptiness and index into it, so it must be a real list.
        return [proxy for proxy in self.proxies if _filter.filter(proxy)]
# HTTP handlers go here
class SingleHandler(RequestHandler):
    """Respond with one randomly chosen proxy matching the request."""

    def get(self, ptype=None, ext='json'):
        """Handle GET.

        *ptype* is the proxy type name taken from the URL and *ext* the
        requested format; ``country`` and ``serial`` are read from the query
        string.  XML is produced for ``ext == 'xml'``, JSON for anything else.
        """
        # Names that are missing or unknown resolve to None (no type filter).
        wanted_type = {
            'transparent': Proxy.PROXY_TRANSPARENT,
            'anonymous': Proxy.PROXY_ANONYMOUS,
            'full-anonymous': Proxy.PROXY_FULL_ANONYMOUS,
        }.get(ptype)
        criteria = ProxyFilter(
            country=self.get_query_argument('country', None),
            serial=self.get_query_argument('serial', None),
            ptype=wanted_type,
        )
        if ext == 'xml':
            self.set_header('Content-Type', 'text/xml')
            self.write(xmlify(plist, criteria, True))
            return
        self.set_header('Content-Type', 'application/json')
        self.write(jsonify(plist, criteria, True))
class ListHandler(RequestHandler):
    """Respond with every proxy matching the request."""

    def get(self, ptype=None, ext='json'):
        """Handle GET.

        *ptype* is the proxy type name taken from the URL and *ext* the
        requested format; ``country`` and ``serial`` are read from the query
        string.  XML is produced for ``ext == 'xml'``, JSON for anything else.
        """
        # Names that are missing or unknown resolve to None (no type filter).
        wanted_type = {
            'transparent': Proxy.PROXY_TRANSPARENT,
            'anonymous': Proxy.PROXY_ANONYMOUS,
            'full-anonymous': Proxy.PROXY_FULL_ANONYMOUS,
        }.get(ptype)
        criteria = ProxyFilter(
            country=self.get_query_argument('country', None),
            serial=self.get_query_argument('serial', None),
            ptype=wanted_type,
        )
        if ext == 'xml':
            self.set_header('Content-Type', 'text/xml')
            self.write(xmlify(plist, criteria))
            return
        self.set_header('Content-Type', 'application/json')
        self.write(jsonify(plist, criteria))
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Module-level on purpose: the request handlers read ``plist`` as a global.
    plist = ProxyList()
    # Re-run the update every 10100 ms.
    PeriodicCallback(plist.fetch, 10100).start()

    app = Application([
        url(r'/single(?:/(?P<ptype>.*?))?(?:\.(?P<ext>.*))?', SingleHandler),
        url(r'/(?P<ptype>.*?)?(?:\.(?P<ext>.*))?', ListHandler),
    ])

    # listen_addr is "unix:<path>", "<host>:<port>" or a bare "<port>".
    if listen_addr.startswith('unix:'):
        sockets = [bind_unix_socket(listen_addr[len('unix:'):], mode=0o660)]
    else:
        host, colon, port_text = listen_addr.rpartition(':')
        # Without a colon there is no host part: bind on all interfaces.
        addr = host if colon else None
        port = int(port_text)
        sockets = bind_sockets(port, addr)

    server = HTTPServer(app)
    server.add_sockets(sockets)

    loop = IOLoop.instance()
    # Trigger the first update as soon as the loop starts instead of waiting
    # for the first periodic tick.
    loop.add_callback(plist.fetch)
    loop.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment