spider for files of a certain extension
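
The gist appears to contain four files, concatenated below. The names are inferred from "from common import compute_configfiles", "import utils", and the usage comment "python3 main.py --config config.yaml ..." in the spider source.

common.py: merges one or more YAML config files with command-line overrides.
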
import yaml
import argparse
from attrdict import AttrDict
#from check_config import check_config
def check_config(config,partial=False):
    #stub that stands in for the real validator imported above
    pass
def config2struct(item):
if isinstance(item,list):
result = []
for e in item:
result += [config2struct(e)]
return result
elif isinstance(item,(dict, AttrDict)):
result = {}
for key,value in item.items():
result[key] = config2struct(value)
return result
else:
return item
def config2structofconfig(config,item):
if isinstance(item,list):
result = []
for e in item:
result += [config2structofconfig(config,e)]
return result
elif isinstance(item,dict):
result = {}
for key,value in item.items():
result[key] = config2structofconfig(config,value)
return result
else:
return config
def effective_dict(config_dicts):
result = AttrDict()
for config_dict in config_dicts:
assert isinstance(config_dict, dict)
result += config_dict
return config2struct(result)
class Config:
def __init__(self, configs, oconfig={}):
#list of config dict-like struct
self.configs = list(configs)
#the overriding config from the cmd-line options
self.oconfig = oconfig
#equivalent dictionaries for all of the above
self.all_config_dicts = [config2struct(config) for config in configs]
self.all_config_dicts += [config2struct(oconfig)]
#and dictionaries of the same stucture, except they point to which
# config originated the configuration
self.all_config_dicts_of_configs = [config2structofconfig(config,config) for config in configs]
self.all_config_dicts_of_configs += [config2structofconfig(oconfig,oconfig)]
#a pure dictionary of all the values merged together
self.effective_dict = effective_dict(self.all_config_dicts)
#a pure dictionary of all the values merged together, except instead
# of each value, it holds the original config which produces that value
self.effective_dict_of_configs = effective_dict(self.all_config_dicts_of_configs)
def get(self,path):
cur = self.effective_dict
for d in path:
cur = cur[d]
return cur
def effective(self):
return self.effective_dict
def compute_configfiles(configfiles, arg_options):
configs = []
for configfile in configfiles:
        config_dict = yaml.safe_load(configfile)  #config files should not construct arbitrary objects
check_config(config_dict,partial=True)
configs += [config_dict]
options = {}
if arg_options is not None:
for option in arg_options:
key,_,value = option.partition('=')
keyparts = key.split('.')
options_pointer = options
for keypart in keyparts[:-1]:
if keypart in options_pointer:
child = options_pointer[keypart]
options_pointer = child
else:
child = {}
options_pointer[keypart] = child
options_pointer = child
options_pointer[keyparts[-1]] = value
config = Config(configs, options)
effective_config_dict = config.effective()
check_config(effective_config_dict)
return effective_config_dict
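# For example, "-o server.session.timeout=300" makes the loop above build the
# nested override {'server': {'session': {'timeout': '300'}}}. Note the value
# stays the string '300'; nothing here coerces types.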
if __name__ == '__main__':
config0 = {'server':
{ 'port': 8080
, 'session':
{
'key': 'hellow'
, 'timeout': 600
, 'test_list': [1,2]
}
}
, 'deep_list':
[{ 'item': 'item', 'wat': 'no'}, 5]
}
config1 = {'server': {'session': {'key': 'newkey','t': 't', 'test_list': [3,4]} } }
config3 = {'deep_list': [{'item': 'no'}] }
options = {'server': {'port': 18080}}
c = Config([config0,config1,config3], options)
print (c.effective())
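    # Assuming attrdict's `+` merges mappings recursively with the right-hand
    # operand taking precedence (and replacing lists wholesale), this should
    # print something like:
    #   {'server': {'port': 18080,
    #               'session': {'key': 'newkey', 't': 't', 'timeout': 600,
    #                           'test_list': [3, 4]}},
    #    'deep_list': [{'item': 'no'}]}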
#result = AttrDict()
#result += config0
#result += config1
#result += options
#print result
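
config.yaml: sample configuration consumed by main.py.
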
spider:
chunk_size: 10240
max_buffer_size: 30720
max_bytes_processed: 5242880
downloads:
chunk_size: 10240
max_bytes_processed: 1048576
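
main.py: the spider itself. It crawls pages, tracks progress in a SQLite state file, and downloads URLs whose extension matches the command line.
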
#python3 main.py --config config.yaml http://portableapps.com/ state.db .exe .7z .gz .zip
import requests
import argparse
import utils
import urllib
import json
import re
import logging
import time
import io
import sqlite3
import os
import tempfile
import hashlib
import shutil
import sys
from common import compute_configfiles
parser = argparse.ArgumentParser(description='Spider a site and download files of a certain extension.')
parser.add_argument('url', type=str,
help='site to spider')
parser.add_argument('state', type=str
, help='state file')
parser.add_argument('ext', type=str, nargs='+',
help='extensions to download')
group = parser.add_mutually_exclusive_group()
group.add_argument('--dump', dest='dump'
, action='store_const', const=True, default=False
, help='print download urls held by state to stdout')
group.add_argument('--download', dest='download'
, action='store_const', const=True, default=False
, help='download download urls held by state')
parser.add_argument('-c', '--config', dest='configs', type=argparse.FileType('r')
, action='append', default=[]
, help='yaml configuration file; specifying this multiple times overrides configuration'
+ ' in the order they are specified')
parser.add_argument('-o', '--option', dest='options', metavar='<option>', type=str
, action='append', default=[]
, help='option in the form of -oOPTION=VALUE; overrides config.')
args = parser.parse_args()
config = compute_configfiles(args.configs, args.options)
logging.basicConfig(level=logging.INFO, filename='log.log')
URL_RE = utils.URL_RE().url_RE
HREF_DQ_RE = b'href\=\\"([^\\"]*)\\"'
HREF_DQ_RE = re.compile(HREF_DQ_RE)
HREF_SQ_RE = b"href\=\\'([^\\']*)\\'"
HREF_SQ_RE = re.compile(HREF_SQ_RE)
SRC_SQ_RE = b"src\=\\'([^\\']*)\\'"
SRC_SQ_RE = re.compile(SRC_SQ_RE)
SRC_DQ_RE = b'src\=\\"([^\\"]*)\\"'
SRC_DQ_RE = re.compile(SRC_DQ_RE)
url_mask = urllib.parse.urlsplit(args.url)
conn = sqlite3.connect(args.state)
cur = conn.cursor()
sql = """
CREATE TABLE IF NOT EXISTS pages_seen(
url text PRIMARY KEY
, complete bool default 0
, errored bool default 0
, ignored bool default 0
);
CREATE INDEX IF NOT EXISTS pages_seen_complete ON pages_seen (complete,errored,ignored);
CREATE TABLE IF NOT EXISTS downloads_seen(
url text PRIMARY KEY
, complete bool default 0
, errored bool default 0
, ignored bool default 0
);
CREATE INDEX IF NOT EXISTS downloads_seen_complete ON downloads_seen (complete,errored,ignored);
"""
conn.executescript(sql)
def print_status(file):
print ( 'pages left:',count_pages_left(cur)
, 'pages complete:',count_pages_seen(cur)-count_pages_left(cur)-count_pages_errored(cur)
, 'pages errored:',count_pages_errored(cur)
, 'pages ignored:',count_pages_ignored(cur)
, 'downloads left:',count_downloads_left(cur)
, 'downloads complete:',count_downloads_seen(cur)-count_downloads_left(cur)-count_downloads_errored(cur)
, 'downloads errored:',count_downloads_errored(cur)
, 'downloads ignored:',count_downloads_ignored(cur)
, file=file)
def is_page_seen(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM pages_seen
WHERE url=?
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def is_page_complete(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM pages_seen
WHERE url=?
AND complete=1
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def is_download_seen(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM downloads_seen
WHERE url=?
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def is_download_complete(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM downloads_seen
WHERE url=?
AND complete=1
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def add_page(c,url):
assert isinstance(url,str)
sql = """
INSERT INTO
pages_seen
(url)
VALUES (?)
"""
c.execute(sql,(url,))
return c.rowcount
def add_download(c,url):
assert isinstance(url,str)
sql = """
INSERT INTO
downloads_seen
(url)
VALUES (?)
"""
c.execute(sql,(url,))
return c.rowcount
def set_page_complete(c,url):
assert isinstance(url,str)
sql = """
UPDATE pages_seen
SET complete=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_page_errored(c,url):
assert isinstance(url,str)
sql = """
UPDATE pages_seen
SET errored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_page_ignored(c,url):
assert isinstance(url,str)
sql = """
UPDATE pages_seen
SET ignored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_download_complete(c,url):
assert isinstance(url,str)
sql = """
UPDATE downloads_seen
SET complete=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_download_errored(c,url):
assert isinstance(url,str)
sql = """
UPDATE downloads_seen
SET errored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_download_ignored(c,url):
assert isinstance(url,str)
sql = """
UPDATE downloads_seen
SET ignored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def count_pages_seen(c):
sql = """
SELECT count(*)
FROM pages_seen
"""
c.execute(sql)
return c.fetchone()[0]
def count_pages_left(c):
sql = """
SELECT count(*)
FROM pages_seen
WHERE complete=0
AND errored=0
AND ignored=0
"""
c.execute(sql)
return c.fetchone()[0]
def count_pages_errored(c):
sql = """
SELECT count(*)
FROM pages_seen
WHERE errored=1
"""
c.execute(sql)
return c.fetchone()[0]
def count_pages_ignored(c):
sql = """
SELECT count(*)
FROM pages_seen
WHERE ignored=1
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_seen(c):
sql = """
SELECT count(*)
FROM downloads_seen
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_left(c):
sql = """
SELECT count(*)
FROM downloads_seen
WHERE complete=0
AND errored=0
AND ignored=0
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_errored(c):
sql = """
SELECT count(*)
FROM downloads_seen
WHERE errored=1
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_ignored(c):
sql = """
SELECT count(*)
FROM downloads_seen
WHERE ignored=1
"""
c.execute(sql)
return c.fetchone()[0]
def get_next_pages(c,count):
sql = """
SELECT url
FROM pages_seen
WHERE complete=0
AND errored=0
AND ignored=0
LIMIT ?
"""
c.execute(sql,(count,))
return [row[0] for row in c.fetchall()]
def get_next_downloads(c,count):
sql = """
SELECT url
FROM downloads_seen
WHERE complete=0
AND errored=0
AND ignored=0
LIMIT ?
"""
c.execute(sql,(count,))
return [row[0] for row in c.fetchall()]
if args.dump:
for url in get_next_downloads(cur,count_downloads_left(cur)):
print (url)
exit(0)
class SpiderErrorException(Exception):
pass
class SpiderIgnoreException(Exception):
pass
def spider_page(url0):
assert isinstance(url0,tuple)
#print ('url0:',url0.geturl())
spider_config = config.get('spider',{})
chunk_size = spider_config.get('chunk_size', 1024*10)
max_buffer_size = spider_config.get('max_buffer_size', chunk_size*3)
max_bytes_processed = spider_config.get('max_bytes_processed', 1024*1024*5)
urls = []
results = []
r = requests.get(url0.geturl(), stream=True)
"""
if r.headers:
if 'content-type' in r.headers:
content_type = r.headers['content-type'].strip()
content_type,_,_ = content_type.partition(';')
valid_mime_types = set(['application/atom+xml','application/dash+xml'])
if not content_type.startswith('text'):
raise SpiderIgnoreException('cannot spider resource with non-text-based mime-type, url: %s' % (url0.geturl(),))
"""
buf = bytearray()
bytes_processed = 0
    for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
buf += chunk
bytes_processed += len(chunk)
if bytes_processed > max_bytes_processed:
raise SpiderIgnoreException('request was too large to spider, url: %s' % (url0.geturl(),))
urls += [match.group(0).strip() for match in URL_RE.finditer(buf)]
urls += [match.group(1).strip() for match in HREF_DQ_RE.finditer(buf)]
urls += [match.group(1).strip() for match in HREF_SQ_RE.finditer(buf)]
urls += [match.group(1).strip() for match in SRC_DQ_RE.finditer(buf)]
urls += [match.group(1).strip() for match in SRC_SQ_RE.finditer(buf)]
if len(buf) > max_buffer_size:
buf = buf[-max_buffer_size:]
assert len(buf) == max_buffer_size
for url in urls:
url = urllib.parse.urlsplit(url.decode())
results += [url]
del r
#print ('results:',results)
return results
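# Note the sliding window in spider_page: only the last max_buffer_size bytes
# are kept, so a URL straddling the trim boundary can be missed, and bytes
# still in the window are re-scanned on later chunks. The resulting duplicate
# matches are harmless; process_url filters them through is_page_seen and
# is_download_seen before inserting.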
def process_download(url):
assert isinstance(url, str)
h = hashlib.new('sha256')
r = requests.get(url, stream=True)
size = 0
downloads_config = config.get('downloads',{})
chunk_size = downloads_config.get('chunk_size', 1024*10)
max_bytes_processed = downloads_config.get('max_bytes_processed', 1024*1024*5)
#print ('config:',config, 'max_bytes_processed', max_bytes_processed)
path = urllib.parse.urlsplit(url).path
root,ext = os.path.splitext(path)
if ext not in args.ext:
logging.error('url: %s ext: %s does not match any extensions specified on the commandline' % (url,ext))
return
with tempfile.TemporaryFile() as outfile:
        for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
outfile.write(chunk)
outfile.flush()
os.fsync(outfile.fileno())
h.update(chunk)
size += len(chunk)
if size > max_bytes_processed:
raise SpiderIgnoreException('request was too large to spider, url: %s' % (url,))
H = h.hexdigest()
outfile.seek(0)
        # ext from splitext() already includes the leading dot
        with open(H+ext,'w+b') as dstfile:
shutil.copyfileobj(outfile,dstfile)
set_download_complete(cur,url)
print ( 'downloaded: ', url, 'to: ', h.hexdigest(), 'size:', size, file=sys.stderr)
del r
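# Downloaded files land in the current working directory, named
# <sha256-of-content><ext>, so re-downloading identical content simply
# overwrites the same file.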
if args.download:
while count_downloads_left(cur) > 0:
urls = list(get_next_downloads(cur,20))
while len(urls):
url = urls.pop()
print ( 'downloading: ', url, file=sys.stderr)
try:
process_download(url)
except KeyboardInterrupt:
raise
except (requests.exceptions.RequestException,SpiderErrorException):
logging.exception('error downloading url, url: %s' % (url,))
set_download_errored(cur,url)
except (SpiderIgnoreException):
logging.exception('downloading url ignored, url: %s' % (url,))
set_download_ignored(cur,url)
conn.commit()
print_status(file=sys.stderr)
exit(0)
argurl = urllib.parse.urlsplit(args.url).geturl()
if not is_page_seen(cur,argurl):
add_page(cur,argurl)
conn.commit()
def process_url(cur,url):
global urls, url_mask
assert isinstance(url,tuple)
#print ('is_page_complete(cur,url.geturl()):',is_page_complete(cur,url.geturl()))
if is_page_complete(cur,url.geturl()):
return
new_urls = spider_page(url)
#print ('new_urls:',new_urls)
for new_url in new_urls:
a = url.geturl()
b = new_url.geturl()
#print ('a:',a, 'b:',b, 'type(a):',type(a), 'type(b):', type(b))
new_url = urllib.parse.urljoin(a,b)
new_url = urllib.parse.urlsplit(new_url)
        #TODO, take extension off the path, and compare it that way
        if any(new_url.path.lower().endswith(ext) for ext in args.ext):
            if not is_download_seen(cur,new_url.geturl()):
                add_download(cur,new_url.geturl())
            #a download link should not also be queued as a page
            continue
if new_url.netloc != url_mask.netloc:
continue
#TODO more mask, match path
if is_page_seen(cur,new_url.geturl()) or is_page_complete(cur,new_url.geturl()):
continue
add_page(cur,new_url.geturl())
set_page_complete(cur,url.geturl())
#print ('pages_left:',count_pages_left(cur)
# , 'pages_seen:',count_pages_seen(cur)
# , 'downloads_left:',count_downloads_left(cur))
while len(get_next_pages(cur,1)) > 0:
urls = list(get_next_pages(cur,20))
#print ('urls:',urls)
while len(urls):
url = urls.pop()
#print ('url:',url)
try:
process_url(cur,urllib.parse.urlsplit(url))
except KeyboardInterrupt:
raise
except (requests.exceptions.RequestException,SpiderErrorException):
logging.exception('error processing url, url: %s' % (url,))
set_page_errored(cur,url)
except (SpiderIgnoreException):
logging.warning('processing url, ignored, url: %s' % (url,))
set_page_ignored(cur,url)
#except:
# logging.exception('error processing url, url: %s' % (url,))
conn.commit()
time.sleep(.01)
print_status(file=sys.stderr)
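
utils.py: assorted helpers, including the URL_RE class used by main.py, IP geolocation lookups, a generic CSV filter tool, and a proxy tester.
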
# -*- coding: utf-8 -*-
import traceback,sys
from functools import reduce  #reduce is not a builtin in Python 3
import socket
import random
import time
import os
import unittest
import re
"""
from https://gist.github.com/1595135
Written by Christian Stigen Larsen, http://csl.sublevel3.org
Placed in the public domain by the author, 2012-01-11
"""
def ip_int_from_string(s):
"Convert dotted IPv4 address to integer."
return reduce(lambda a,b: a<<8 | b, map(int, s.split(".")))
def ip_int_to_string(ip):
"Convert 32-bit integer to dotted IPv4 address."
return ".".join(map(lambda n: str(ip>>n & 0xFF), [24,16,8,0]))
def format_relative_time(t):
minute_seconds = 60
hour_seconds = minute_seconds * 60
day_seconds = hour_seconds * 24
week_seconds = day_seconds * 7
year_seconds = day_seconds * 365
if t < minute_seconds:
return '{seconds} seconds'.format(seconds=t)
elif t < hour_seconds:
return '{minutes} minutes'.format(minutes=int(t/minute_seconds))
elif t < day_seconds:
hours = int(t/hour_seconds)
minutes = int((t - (hours*hour_seconds)) / minute_seconds)
return '{hours} hours {minutes} minutes'.format(hours=hours,minutes=minutes)
elif t < year_seconds:
days = int(t/day_seconds)
hours = int((t - (days*day_seconds)) / hour_seconds)
return '{days} days {hours} hours'.format(days=days,hours=hours)
years = int(t/year_seconds)
weeks = int((t - (years*year_seconds)) / week_seconds)
return '{years} years {weeks} weeks'.format(years=years,weeks=weeks)
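# For example, format_relative_time(90) returns '1 minutes' and
# format_relative_time(3700) returns '1 hours 1 minutes'; the unit labels are
# never singularized.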
class IPLocator:
def __init__(self,config):
self.config = config
try:
import pygeoip
self.gic = pygeoip.GeoIP(config['geoipcityip4_path'])
except Exception as e:
            print('pygeoip setup error', e, file=sys.stderr)
def obtain_locations(self,ip):
pass
def obtain_address_info(host,config,iplocate=True,rdns=True):
results = {}
ip = None
#print >> sys.stderr, 'host:',host
    ip = socket.gethostbyname(host)  #let resolution errors propagate to the caller
results['ip'] = ip
if iplocate:
results['iplocations'] = {}
try:
            import urllib.request
            import json
            #FIXME: does this need to be cleaned up??
            rresponse = urllib.request.urlopen('http://api.hostip.info/get_json.php?ip={ip}&position=true'.format(ip=ip)).read()
            rresponse_json = json.loads(rresponse.decode('utf-8'))
#print 'rresponse_json:',rresponse_json
            country_name = rresponse_json['country_name']
            city = rresponse_json['city']
lng = rresponse_json['lng']
lat = rresponse_json['lat']
response = (
'|hostip| country: "{country}" city: "{city}" longitude: {longitude} latitude: {latitude}'.format(
country=country_name,
city=city,
longitude=lng,
latitude=lat))
results['iplocations']['hostip'] = response
except Exception as e:
            print('hostip error:', e, file=sys.stderr)
results['iplocations']['hostip'] = '|hostip| error'
try:
import pygeoip
gic = pygeoip.GeoIP(config['geoipcityip4_path'])
record = gic.record_by_addr(ip)
response = ('|geoipcityip4| ' + str(record))
results['iplocations']['geoipcityip4'] = response
except Exception as e:
            print('pygeoip error', e, file=sys.stderr)
results['iplocations']['geoipcityip4'] = '|geoipcityip4| error'
try:
ip_int = ip_int_from_string(ip)
#print >> sys.stderr, 'ip_int:',ip_int
with open(config['IpToCountry.csv']) as ip2country:
for line in ip2country:
line = line.strip()
line_data = line.split(',')
if len(line) == 0:
continue
if line[0] == '#':
continue
"""
print 'line:',line
print 'line_data:',line_data
print 'len(line_data):',len(line_data)
"""
start_str = line_data[0].strip()[1:-1]
end_str = line_data[1].strip()[1:-1]
"""
print 'start_str:',start_str
print 'end_str:',end_str
print
"""
ip_first_int = int(start_str)
ip_last_int = int(end_str)
if ip_first_int <= ip_int and ip_int <= ip_last_int:
#print line_data
registry = line_data[2].strip()[1:-1]
country = line_data[6].strip()[1:-1]
ip_first = ip_int_to_string(ip_first_int)
ip_last = ip_int_to_string(ip_last_int)
response = ('|IpToCountry| range:[{ip_first}-{ip_last}], registry: {registry}, country: {country}'.format(
ip_first=ip_first,ip_last=ip_last,registry=registry,country=country))
results['iplocations']['IpToCountry'] = response
break
results['iplocations']['IpToCountry'] = '|IpToCountry| error, no results'
except Exception as e:
            print('IpToCountry error:', e, file=sys.stderr)
results['iplocations']['IpToCountry'] = '|IpToCountry| error'
if rdns:
results['domains'] = []
try:
from dns import resolver,reversename
addr=reversename.from_address(ip)
#print >> sys.stderr, addr
for hmm in resolver.query(addr,"PTR"):
results['domains'] += [str(hmm)]
except Exception as e:
            print('Reverse DNS error:', e, file=sys.stderr)
return results
def get_standard_argparser():
import argparse
parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('--delim', metavar='<delimchar>', type=str, nargs='?', help='delimiter, defaults to space', default=' ')
parser.add_argument('--indelim', metavar='<delimchar>', type=str, nargs='?',
help='input delimiter, overrides --delim, defaults to --delim')
parser.add_argument('--outdelim', metavar='<delimchar>', type=str, nargs='?',
                        help='output delimiter, overrides --delim, defaults to --delim')
parser.add_argument('--quote', metavar='<quotechar>', type=str, nargs='?', help='quote, defaults to \'"\'', default='"')
parser.add_argument('--inquote', metavar='<quotechar>', type=str, nargs='?',
help='input quote, overrides --quote, defaults to --quote')
parser.add_argument('--outquote', metavar='<quotechar>', type=str, nargs='?',
help='output quote, overrides --quote, defaults to --quote')
    parser.add_argument('--infile', '-i', metavar='<path>', type=argparse.FileType('r'), nargs='?',
                        help='input file path, defaults to stdin', default=sys.stdin)
    parser.add_argument('--outfile', '-o', metavar='<path>', type=argparse.FileType('w'), nargs='?',
                        help='output file path, defaults to stdout', default=sys.stdout)
return parser
def default_headers_cb(tool,headers):
tool.csvwriter.writerow(headers)
tool.outfile.flush()
def default_row_cb(tool,row,row_data):
tool.csvwriter.writerow(row)
tool.outfile.flush()
class GenericTool:
def __init__(self):
self.headers_cb = default_headers_cb
self.row_cbs = [default_row_cb]
self.csvwriter = None
self.infile = None
self.outfile = None
self.parser = get_standard_argparser()
self.parsed_args = None
def parse_args(self):
self.parsed_args = self.parser.parse_args()
def run(self):
import csv
parsed_args = self.parsed_args
        indelim = parsed_args.delim if parsed_args.indelim is None else parsed_args.indelim
        outdelim = parsed_args.delim if parsed_args.outdelim is None else parsed_args.outdelim
        inquote = parsed_args.quote if parsed_args.inquote is None else parsed_args.inquote
        outquote = parsed_args.quote if parsed_args.outquote is None else parsed_args.outquote
with parsed_args.infile as infile:
outfile = parsed_args.outfile
self.infile = infile
self.outfile = outfile
csvreader = self.csvreader = csv.reader(infile, delimiter=indelim, quotechar=inquote)
csvwriter = self.csvwriter = csv.writer(outfile, delimiter=outdelim, quotechar=outquote)
headers = []
for row in csvreader:
headers = row
break
if self.headers_cb is not None:
self.headers_cb(self,headers)
for row in csvreader:
try:
row_data = {}
for idx in range(len(headers)):
row_data[headers[idx]] = row[idx]
for row_cb in self.row_cbs:
try:
row_cb(self,row,row_data)
except IOError as e:
raise
except Exception as e:
                            print('Exception while row_cb:', e, file=sys.stderr)
                            print('row_cb:', row_cb, file=sys.stderr)
traceback.print_exc(file=sys.stderr)
except IOError as e:
raise
except Exception as e:
                    print('Exception while parsing line from stdin:', e, file=sys.stderr)
                    print('line:', row, file=sys.stderr)
traceback.print_exc(file=sys.stderr)
def generate_random_alphanumerics(length):
abcs = 'abcdefghijklmnopqrstuvwxyz'
result = [abcs[random.randint(0,len(abcs)-1)] for _ in range(length)]
result = ''.join(result)
return result
def generate_FUZZY_URLP_RE_STR():
valid_scheme_chars = 'a-zA-Z'
valid_domain_chars = '\w\\.'
def _valid_path_chars():
safe = '\\$\\-_\\.\\+'
extra = '\!\*\\(\)\,' #removed \\'
unreserved = '\w'+ safe+extra
reserved = '\\;/\\?\:\\@\\&\\='
escape = '\\%'
xchar = unreserved + reserved + escape
return xchar
valid_path_chars = _valid_path_chars()
tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
'net', 'org', 'pro', 'tel', 'travel']
result = r'((([' + valid_scheme_chars + ']*\:)?//)?' \
+ '[' + valid_domain_chars + ']*' \
+ '\.([a-zA-Z]{2}|' + '|'.join(tlds) + ')(\\:[\d]*)?/' \
+ '[' + valid_path_chars + ']*' \
+ '(\\#[' + valid_path_chars + ']*)?' + ')'
return result
def generate_FUZZY_URL_RE_STR():
valid_scheme_chars = 'a-zA-Z'
valid_domain_chars = '\w\\.'
def _valid_path_chars():
safe = '\\$\\-_\\.\\+'
extra = '\!\*\\(\)\,' #removed \\'
unreserved = '\w'+ safe+extra
reserved = '\\;/\\?\:\\@\\&\\='
escape = '\\%'
xchar = unreserved + reserved + escape
return xchar
valid_path_chars = _valid_path_chars()
tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
'net', 'org', 'pro', 'tel', 'travel']
domain_RE_STR = r'(\w+(\.\w+)*)'
path_RE_STR = r'(/[' + valid_path_chars + '])'
query_RE_STR = r'(\?[' + valid_path_chars + '])'
fragment_RE_STR = r'(\#[' + valid_path_chars + '])'
"""
TODO:
* "...so" matches, make sure no two "." in a domain
* "something.sol" matches something.so
"""
result = r'((([' + valid_scheme_chars + ']*\:)?//)?' \
+ '[' + valid_domain_chars + ']*' \
+ '\.([a-zA-Z]{2}|' + '|'.join(tlds) + ')(\\:[\d]*)?/?' \
+ '[' + valid_path_chars + ']*' \
+ '(\\#[' + valid_path_chars + ']*)?' + ')'
return result
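# The only difference from generate_FUZZY_URLP_RE_STR is the '/?', which makes
# the slash after the domain/port optional, so this variant also matches bare
# domains. The domain_RE_STR through fragment_RE_STR locals above are built
# but never used in the returned pattern.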
"""
timeout
Socket timeout for each test.
tests
Proxies to test for.
log
File object to log to.
"""
def test_proxy(ip,port,tests=['HTTP','SOCKS4','SOCKS5'], timeout=20,log=None):
    #the docstring above says log is a file object; os.devnull is only a path,
    # so open it lazily when no log is supplied
    if log is None:
        log = open(os.devnull,'w')
import socksocket
str2proxytype = {'SOCKS4':socksocket.PROXY_TYPE_SOCKS4,
'SOCKS5':socksocket.PROXY_TYPE_SOCKS5,
'HTTP':socksocket.PROXY_TYPE_HTTP}
proxytype2str = {socksocket.PROXY_TYPE_SOCKS4:'SOCKS4',
socksocket.PROXY_TYPE_SOCKS5:'SOCKS5',
socksocket.PROXY_TYPE_HTTP:'HTTP'}
for test in tests:
if test not in str2proxytype:
#print >> log, 'test:',test,'is unknown, next!'
continue
proxytype = str2proxytype[test]
#print >> log, ' trying', test
s = socksocket.socksocket()
        s.settimeout(timeout)
try:
s.setproxy(proxytype=proxytype,addr=ip,port=port,rdns=True)
s.connect(('google.com',80))
            s.sendall( b'''GET / HTTP/1.1\r\nHost: google.com\r\n\r\n''' )
start = time.time()
timeout = s.gettimeout()
while True:
b = s.recv(1)
if len(b):
break
time.sleep(.001)
if timeout is not None and time.time() - start >= timeout:
raise socket.timeout
#print >> log, 'found one:',proxy,proxytype2str[proxytype]
return proxytype2str[proxytype]
except socket.error as e:
#print >> log, ' socket.error'
continue
except socksocket.ProxyError:
#print >> log, ' socksocket.ProxyError'
continue
except Exception as e:
            print(' UNKNOWN ERROR:', e, file=log)
traceback.print_exc(file=sys.stderr)
finally:
s.close()
return None
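# A minimal usage sketch; the address and port are hypothetical and the
# socksocket module must be importable:
#
#   kind = test_proxy('203.0.113.5', 8080, tests=['HTTP'], timeout=10)
#   if kind is not None:
#       print('working', kind, 'proxy')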
class URL_RE:
def __init__(self):
valid_scheme_chars = 'a-zA-Z'
valid_domain_chars = '\w\\.'
def _valid_path_chars():
safe = '\\$\\-_\\.\\+'
extra = '\!\*\\(\)\,' #removed \\'
unreserved = '\w'+ safe+extra
reserved = '\\;/\\?\:\\@\\&\\='
escape = '\\%'
xchar = unreserved + reserved + escape
return xchar
valid_path_chars = _valid_path_chars()
tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
'net', 'org', 'pro', 'tel', 'travel',
'travel', 'xxx', 'post',
'arpa',
u'бг', u'ελ', u'ישראל', u'мкд', u'日本', u'日本国', u'ລາວ', #u'ليبيا'‎,
]
self.scheme_RE_STR = r'([' + valid_scheme_chars + ']*\:)'
self.domain_RE_STR = r'(\w+(\.\w+)*)\.([a-zA-Z]{2}|' + '|'.join(tlds) + ')(\\:[\d]*)?'
self.path_RE_STR = r'(/[' + valid_path_chars + ']*)'
self.query_RE_STR = r'(\?[' + valid_path_chars + ']*)'
self.fragment_RE_STR = r'(#[' + valid_path_chars + ']*)'
self.domain_RE = re.compile(self.domain_RE_STR)
self.url_RE_STR = u'(^|\s)(({scheme}?//)?({domain})({path})?({query})?({fragment})?)($|\s)'
self.url_RE_STR = self.url_RE_STR.format(scheme=self.scheme_RE_STR,
domain=self.domain_RE_STR,
path=self.path_RE_STR,
query=self.query_RE_STR,
fragment=self.fragment_RE_STR).encode('utf-8')
self.url_RE = re.compile(self.url_RE_STR)
class TestURL_RE(unittest.TestCase):
def setUp(self):
self.url_re = URL_RE()
def test_domain_two_dots(self):
        #url_RE is compiled from a UTF-8-encoded pattern, so search bytes
        self.assertIsNotNone(self.url_re.url_RE.search(b'a.com'))
        self.assertIsNone(self.url_re.url_RE.search(b'a..com'))
def test_invalid_tld(self):
        self.assertIsNotNone(self.url_re.url_RE.search(b'a.co'))
        self.assertIsNone(self.url_re.url_RE.search(b'a.cod'))
def test_urls(self):
urls = [
('google.com', True),
('google..com', False),
('google.comp', False),
('google.cop', False),
('//google.com', True),
('mmm://google.com', True),
('google.com/', True),
('google.com/?', True),
('google.com/?#', True),
('google.com/#', True),
('google.com?', True),
('google.com?#', True),
('google.com#', True),
('mmm..........chummus.com', False),
]
        for url, expectation in urls:
            print ('url:',url)
            #the pattern is bytes, so encode before searching
            if expectation:
                self.assertIsNotNone(self.url_re.url_RE.search(url.encode('utf-8')))
            else:
                self.assertIsNone(self.url_re.url_RE.search(url.encode('utf-8')))
def main():
unittest.main()
if __name__ == "__main__":
main()
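    #running "python3 utils.py" executes the TestURL_RE suite above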