Skip to content

Instantly share code, notes, and snippets.

@initbrain
Last active March 8, 2018 14:16
Show Gist options
  • Save initbrain/6636847 to your computer and use it in GitHub Desktop.
urlquery.net parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from lxml.etree import tostring
from lxml.html import fromstring
import urllib2
def req(url, http_proxy=None, timeout=30, retry=2):
result = None
if not timeout or timeout is None:
timeout = 30
if retry is None or retry < 0:
retry = 2
if http_proxy:
# http://username:password@someproxyserver.com:1337
http_proxy_full_auth_string = "http://%s:%s@%s:%s" % (http_proxy["user"],
http_proxy["passwd"],
http_proxy["server"],
http_proxy["port"])
proxy_handler = urllib2.ProxyHandler({"http": http_proxy_full_auth_string,
"https": http_proxy_full_auth_string})
opener = urllib2.build_opener(proxy_handler)
#urllib2.install_opener(opener)
else:
proxy_handler = urllib2.ProxyHandler({})
opener = urllib2.build_opener(proxy_handler)
postDatas = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Cache-Control": "no-cache",
"Pragma": "no-cache"}
request = urllib2.Request(url, None, postDatas)
loop = 0
while not result and loop <= retry:
try:
# Permet de passer par le proxy sans installer d'opener
connection = opener.open(request, timeout=timeout)
except Exception as err:
# Si il y a une erreur de connexion (timeout etc.)
print "Error with %s: %s" % (url, err)
else:
retcode = connection.getcode()
# if retcode != 200:
# result.log("Mauvais code retourne par %s: %d" % (url, retcode), logging.ERROR)
# else:
# try:
source = connection.read()
# except Exception as err:
# # Si il y a une erreur de connexion (timeout etc.)
# result.add_error(err, "%s ne repond pas" % url)
# else:
connection.close()
# if not source:
# result.log("La page retournee par %s est vide" % url, logging.ERROR)
# else:
if source:
# result.add_data(source, display=False)
result = source
break
loop += 1
return result
if __name__ == '__main__':
url = "http://urlquery.net/"
sourceUrlQuery = req(url)
rootUrlQuery = fromstring(sourceUrlQuery)
resUrlQuery = []
for td in rootUrlQuery.xpath('//tr[contains(@class, "odd_highlight") or contains(@class, "even_highlight")]'):
date = td.xpath('td/nobr/center/text()')[0]
#print date
level = td.xpath('td[contains(@align, "center")]/b/text()')[0]
#print level
targeturl = td.xpath('td/a/@title')[0]
#print targeturl
country = td.xpath('td/img/@title')[0]
#print country
ipaddr = tostring(td.xpath('td/img')[0]).split('>')[1]
#print ipaddr
resUrlQuery.append([date, level, targeturl, country, ipaddr])
#print "#"*50
print '\n\n'.join("date: %s\nlevel: %s\ntargeturl: %s\ncountry: %s\nipaddr: %s" % (date, level, targeturl, country, ipaddr) for date, level, targeturl, country, ipaddr in resUrlQuery)
print "\n", "#"*50, "\n%d result%s" % (len(resUrlQuery), 's' if len(resUrlQuery) > 1 else '')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment