A spider that crawls a site and downloads files with given extensions.
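Three Python modules plus a sample YAML config: common.py layers configuration from YAML files and -o command-line overrides, main.py crawls a site and records/downloads matching urls in a sqlite state file, and utils.py provides the URL regex and assorted helpers. The invocation from the comment at the top of main.py:

    python3 main.py --config config.yaml http://portableapps.com/ state.db .exe .7z .gz .zip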
common.py:
import yaml
import argparse
from attrdict import AttrDict

#from check_config import check_config
def check_config(config, partial=False):
    pass
def config2struct(item):
    if isinstance(item, list):
        result = []
        for e in item:
            result += [config2struct(e)]
        return result
    elif isinstance(item, (dict, AttrDict)):
        result = {}
        for key, value in item.items():
            result[key] = config2struct(value)
        return result
    else:
        return item
def config2structofconfig(config, item):
    if isinstance(item, list):
        result = []
        for e in item:
            result += [config2structofconfig(config, e)]
        return result
    elif isinstance(item, dict):
        result = {}
        for key, value in item.items():
            result[key] = config2structofconfig(config, value)
        return result
    else:
        return config
def effective_dict(config_dicts):
    #AttrDict's + performs a recursive merge of mappings; on conflicts the
    # right-hand side (the later config) wins
    result = AttrDict()
    for config_dict in config_dicts:
        assert isinstance(config_dict, dict)
        result += config_dict
    return config2struct(result)
class Config:
    def __init__(self, configs, oconfig=None):
        if oconfig is None:
            oconfig = {}
        #list of config dict-like structs
        self.configs = list(configs)
        #the overriding config from the cmd-line options
        self.oconfig = oconfig
        #equivalent dictionaries for all of the above
        self.all_config_dicts = [config2struct(config) for config in configs]
        self.all_config_dicts += [config2struct(oconfig)]
        #and dictionaries of the same structure, except they point to which
        # config originated the configuration
        self.all_config_dicts_of_configs = [config2structofconfig(config, config) for config in configs]
        self.all_config_dicts_of_configs += [config2structofconfig(oconfig, oconfig)]
        #a pure dictionary of all the values merged together
        self.effective_dict = effective_dict(self.all_config_dicts)
        #a pure dictionary of the same shape, except instead of each value,
        # it holds the original config which produced that value
        self.effective_dict_of_configs = effective_dict(self.all_config_dicts_of_configs)
    def get(self, path):
        #path is a sequence of keys, e.g. ('server', 'port')
        cur = self.effective_dict
        for d in path:
            cur = cur[d]
        return cur
    def effective(self):
        return self.effective_dict
def compute_configfiles(configfiles, arg_options):
    configs = []
    for configfile in configfiles:
        #safe_load avoids executing arbitrary YAML tags
        config_dict = yaml.safe_load(configfile)
        check_config(config_dict, partial=True)
        configs += [config_dict]
    #parse -o KEY.SUBKEY=VALUE options into a nested dict; note that the
    # values are kept as strings, they are not type-coerced
    options = {}
    if arg_options is not None:
        for option in arg_options:
            key, _, value = option.partition('=')
            keyparts = key.split('.')
            options_pointer = options
            for keypart in keyparts[:-1]:
                if keypart in options_pointer:
                    child = options_pointer[keypart]
                    options_pointer = child
                else:
                    child = {}
                    options_pointer[keypart] = child
                    options_pointer = child
            options_pointer[keyparts[-1]] = value
    config = Config(configs, options)
    effective_config_dict = config.effective()
    check_config(effective_config_dict)
    return effective_config_dict
if __name__ == '__main__':
    config0 = {'server':
                { 'port': 8080
                , 'session':
                    {
                      'key': 'hellow'
                    , 'timeout': 600
                    , 'test_list': [1,2]
                    }
                }
              , 'deep_list':
                [{ 'item': 'item', 'wat': 'no'}, 5]
              }
    config1 = {'server': {'session': {'key': 'newkey', 't': 't', 'test_list': [3,4]} } }
    config3 = {'deep_list': [{'item': 'no'}] }
    options = {'server': {'port': 18080}}
    c = Config([config0, config1, config3], options)
    print(c.effective())
    #result = AttrDict()
    #result += config0
    #result += config1
    #result += options
    #print(result)
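    #Expected output, assuming attrdict's recursive right-biased merge
    # (mappings merge key-by-key, lists and scalars are replaced wholesale
    # by the later config; key order may differ):
    #{'server': {'port': 18080,
    #            'session': {'key': 'newkey', 'timeout': 600,
    #                        'test_list': [3, 4], 't': 't'}},
    # 'deep_list': [{'item': 'no'}]}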
config.yaml:
spider:
  chunk_size: 10240
  max_buffer_size: 30720
  max_bytes_processed: 5242880
downloads:
  chunk_size: 10240
  max_bytes_processed: 1048576
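A minimal sketch of how a YAML layer combines with a -o override, assuming common.py is on the import path and attrdict is installed; io.StringIO stands in for a real config file here, and note that override values stay strings:

    import io
    from common import compute_configfiles

    cfg = compute_configfiles(
        [io.StringIO('spider:\n  chunk_size: 10240\n')],
        ['spider.max_buffer_size=30720'])
    print(cfg)
    #-> {'spider': {'chunk_size': 10240, 'max_buffer_size': '30720'}}
    # (the override stays the string '30720')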
main.py:
#python3 main.py --config config.yaml http://portableapps.com/ state.db .exe .7z .gz .zip
import requests
import argparse
import utils
import urllib.parse
import json
import re
import logging
import time
import io
import sqlite3
import os
import tempfile
import hashlib
import shutil
import sys
from common import compute_configfiles
parser = argparse.ArgumentParser(description='Spider a site and download files of a certain extension.')
parser.add_argument('url', type=str,
                    help='site to spider')
parser.add_argument('state', type=str
                    , help='state file')
parser.add_argument('ext', type=str, nargs='+',
                    help='extensions to download')
group = parser.add_mutually_exclusive_group()
group.add_argument('--dump', dest='dump'
                   , action='store_const', const=True, default=False
                   , help='print download urls held by state to stdout')
group.add_argument('--download', dest='download'
                   , action='store_const', const=True, default=False
                   , help='download download urls held by state')
parser.add_argument('-c', '--config', dest='configs', type=argparse.FileType('r')
                    , action='append', default=[]
                    , help='yaml configuration file; specifying this multiple times overrides configuration'
                         + ' in the order they are specified')
parser.add_argument('-o', '--option', dest='options', metavar='<option>', type=str
                    , action='append', default=[]
                    , help='option in the form of -oOPTION=VALUE; overrides config.')
args = parser.parse_args()
config = compute_configfiles(args.configs, args.options)
logging.basicConfig(level=logging.INFO, filename='log.log')

URL_RE = utils.URL_RE().url_RE

#byte-string patterns for href/src attributes in double or single quotes;
# each pattern is compiled from its own source string
HREF_DQ_RE = re.compile(rb'href="([^"]*)"')
HREF_SQ_RE = re.compile(rb"href='([^']*)'")
SRC_SQ_RE = re.compile(rb"src='([^']*)'")
SRC_DQ_RE = re.compile(rb'src="([^"]*)"')

url_mask = urllib.parse.urlsplit(args.url)

conn = sqlite3.connect(args.state)
cur = conn.cursor()
sql = """
CREATE TABLE IF NOT EXISTS pages_seen(
    url text PRIMARY KEY
    , complete bool default 0
    , errored bool default 0
    , ignored bool default 0
);
CREATE INDEX IF NOT EXISTS pages_seen_complete ON pages_seen (complete,errored,ignored);
CREATE TABLE IF NOT EXISTS downloads_seen(
    url text PRIMARY KEY
    , complete bool default 0
    , errored bool default 0
    , ignored bool default 0
);
CREATE INDEX IF NOT EXISTS downloads_seen_complete ON downloads_seen (complete,errored,ignored);
"""
conn.executescript(sql)
def print_status(file):
    print( 'pages left:', count_pages_left(cur)
         , 'pages complete:', count_pages_seen(cur)-count_pages_left(cur)-count_pages_errored(cur)
         , 'pages errored:', count_pages_errored(cur)
         , 'pages ignored:', count_pages_ignored(cur)
         , 'downloads left:', count_downloads_left(cur)
         , 'downloads complete:', count_downloads_seen(cur)-count_downloads_left(cur)-count_downloads_errored(cur)
         , 'downloads errored:', count_downloads_errored(cur)
         , 'downloads ignored:', count_downloads_ignored(cur)
         , file=file)
def is_page_seen(c, url):
    assert isinstance(url, str)
    sql = """
        SELECT count(*)
        FROM pages_seen
        WHERE url=?
        """
    c.execute(sql, (url,))
    row = c.fetchone()
    return row[0] != 0
def is_page_complete(c, url):
    assert isinstance(url, str)
    sql = """
        SELECT count(*)
        FROM pages_seen
        WHERE url=?
          AND complete=1
        """
    c.execute(sql, (url,))
    row = c.fetchone()
    return row[0] != 0
def is_download_seen(c, url):
    assert isinstance(url, str)
    sql = """
        SELECT count(*)
        FROM downloads_seen
        WHERE url=?
        """
    c.execute(sql, (url,))
    row = c.fetchone()
    return row[0] != 0
def is_download_complete(c, url):
    assert isinstance(url, str)
    sql = """
        SELECT count(*)
        FROM downloads_seen
        WHERE url=?
          AND complete=1
        """
    c.execute(sql, (url,))
    row = c.fetchone()
    return row[0] != 0
def add_page(c, url):
    assert isinstance(url, str)
    sql = """
        INSERT INTO
            pages_seen
            (url)
        VALUES (?)
        """
    c.execute(sql, (url,))
    return c.rowcount
def add_download(c, url):
    assert isinstance(url, str)
    sql = """
        INSERT INTO
            downloads_seen
            (url)
        VALUES (?)
        """
    c.execute(sql, (url,))
    return c.rowcount
def set_page_complete(c, url):
    assert isinstance(url, str)
    sql = """
        UPDATE pages_seen
        SET complete=1
        WHERE url=?
        """
    c.execute(sql, (url,))
    return c.rowcount
def set_page_errored(c, url):
    assert isinstance(url, str)
    sql = """
        UPDATE pages_seen
        SET errored=1
        WHERE url=?
        """
    c.execute(sql, (url,))
    return c.rowcount
def set_page_ignored(c, url):
    assert isinstance(url, str)
    sql = """
        UPDATE pages_seen
        SET ignored=1
        WHERE url=?
        """
    c.execute(sql, (url,))
    return c.rowcount
def set_download_complete(c, url):
    assert isinstance(url, str)
    sql = """
        UPDATE downloads_seen
        SET complete=1
        WHERE url=?
        """
    c.execute(sql, (url,))
    return c.rowcount
def set_download_errored(c, url):
    assert isinstance(url, str)
    sql = """
        UPDATE downloads_seen
        SET errored=1
        WHERE url=?
        """
    c.execute(sql, (url,))
    return c.rowcount
def set_download_ignored(c, url):
    assert isinstance(url, str)
    sql = """
        UPDATE downloads_seen
        SET ignored=1
        WHERE url=?
        """
    c.execute(sql, (url,))
    return c.rowcount
def count_pages_seen(c):
    sql = """
        SELECT count(*)
        FROM pages_seen
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_pages_left(c):
    sql = """
        SELECT count(*)
        FROM pages_seen
        WHERE complete=0
          AND errored=0
          AND ignored=0
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_pages_errored(c):
    sql = """
        SELECT count(*)
        FROM pages_seen
        WHERE errored=1
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_pages_ignored(c):
    sql = """
        SELECT count(*)
        FROM pages_seen
        WHERE ignored=1
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_downloads_seen(c):
    sql = """
        SELECT count(*)
        FROM downloads_seen
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_downloads_left(c):
    sql = """
        SELECT count(*)
        FROM downloads_seen
        WHERE complete=0
          AND errored=0
          AND ignored=0
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_downloads_errored(c):
    sql = """
        SELECT count(*)
        FROM downloads_seen
        WHERE errored=1
        """
    c.execute(sql)
    return c.fetchone()[0]
def count_downloads_ignored(c):
    sql = """
        SELECT count(*)
        FROM downloads_seen
        WHERE ignored=1
        """
    c.execute(sql)
    return c.fetchone()[0]
def get_next_pages(c, count):
    sql = """
        SELECT url
        FROM pages_seen
        WHERE complete=0
          AND errored=0
          AND ignored=0
        LIMIT ?
        """
    c.execute(sql, (count,))
    return [row[0] for row in c.fetchall()]
def get_next_downloads(c, count):
    sql = """
        SELECT url
        FROM downloads_seen
        WHERE complete=0
          AND errored=0
          AND ignored=0
        LIMIT ?
        """
    c.execute(sql, (count,))
    return [row[0] for row in c.fetchall()]
if args.dump:
    for url in get_next_downloads(cur, count_downloads_left(cur)):
        print(url)
    sys.exit(0)

class SpiderErrorException(Exception):
    pass
class SpiderIgnoreException(Exception):
    pass
def spider_page(url0):
    assert isinstance(url0, tuple)
    #print('url0:', url0.geturl())
    spider_config = config.get('spider', {})
    chunk_size = spider_config.get('chunk_size', 1024*10)
    max_buffer_size = spider_config.get('max_buffer_size', chunk_size*3)
    max_bytes_processed = spider_config.get('max_bytes_processed', 1024*1024*5)
    urls = []
    results = []
    r = requests.get(url0.geturl(), stream=True)
    """
    if r.headers:
        if 'content-type' in r.headers:
            content_type = r.headers['content-type'].strip()
            content_type, _, _ = content_type.partition(';')
            valid_mime_types = set(['application/atom+xml', 'application/dash+xml'])
            if not content_type.startswith('text'):
                raise SpiderIgnoreException('cannot spider resource with non-text-based mime-type, url: %s' % (url0.geturl(),))
    """
    buf = bytearray()
    bytes_processed = 0
    for chunk in r.iter_content(chunk_size=chunk_size):
        if chunk:
            buf += chunk
            bytes_processed += len(chunk)
            if bytes_processed > max_bytes_processed:
                raise SpiderIgnoreException('request was too large to spider, url: %s' % (url0.geturl(),))
            urls += [match.group(0).strip() for match in URL_RE.finditer(buf)]
            urls += [match.group(1).strip() for match in HREF_DQ_RE.finditer(buf)]
            urls += [match.group(1).strip() for match in HREF_SQ_RE.finditer(buf)]
            urls += [match.group(1).strip() for match in SRC_DQ_RE.finditer(buf)]
            urls += [match.group(1).strip() for match in SRC_SQ_RE.finditer(buf)]
            if len(buf) > max_buffer_size:
                #keep only a tail of the buffer, so urls spanning chunk
                # boundaries can still match; duplicate matches are
                # deduplicated later via the state db
                buf = buf[-max_buffer_size:]
                assert len(buf) == max_buffer_size
    for url in urls:
        url = urllib.parse.urlsplit(url.decode())
        results += [url]
    del r
    #print('results:', results)
    return results
def process_download(url):
    assert isinstance(url, str)
    h = hashlib.new('sha256')
    r = requests.get(url, stream=True)
    size = 0
    downloads_config = config.get('downloads', {})
    chunk_size = downloads_config.get('chunk_size', 1024*10)
    max_bytes_processed = downloads_config.get('max_bytes_processed', 1024*1024*5)
    #print('config:', config, 'max_bytes_processed', max_bytes_processed)
    path = urllib.parse.urlsplit(url).path
    root, ext = os.path.splitext(path)
    if ext not in args.ext:
        logging.error('url: %s ext: %s does not match any extensions specified on the commandline' % (url, ext))
        return
    with tempfile.TemporaryFile() as outfile:
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:
                outfile.write(chunk)
                outfile.flush()
                os.fsync(outfile.fileno())
                h.update(chunk)
                size += len(chunk)
                if size > max_bytes_processed:
                    raise SpiderIgnoreException('request was too large to download, url: %s' % (url,))
        H = h.hexdigest()
        outfile.seek(0)
        #ext already includes the leading dot (os.path.splitext)
        with open(H + ext, 'w+b') as dstfile:
            shutil.copyfileobj(outfile, dstfile)
    set_download_complete(cur, url)
    print('downloaded: ', url, 'to: ', H, 'size:', size, file=sys.stderr)
    del r
if args.download:
    while count_downloads_left(cur) > 0:
        urls = list(get_next_downloads(cur, 20))
        while len(urls):
            url = urls.pop()
            print('downloading: ', url, file=sys.stderr)
            try:
                process_download(url)
            except KeyboardInterrupt:
                raise
            except (requests.exceptions.RequestException, SpiderErrorException):
                logging.exception('error downloading url, url: %s' % (url,))
                set_download_errored(cur, url)
            except (SpiderIgnoreException):
                logging.exception('downloading url ignored, url: %s' % (url,))
                set_download_ignored(cur, url)
            conn.commit()
            print_status(file=sys.stderr)
    sys.exit(0)
argurl = urllib.parse.urlsplit(args.url).geturl()
if not is_page_seen(cur, argurl):
    add_page(cur, argurl)
    conn.commit()
def process_url(cur, url):
    global urls, url_mask
    assert isinstance(url, tuple)
    #print('is_page_complete(cur,url.geturl()):', is_page_complete(cur, url.geturl()))
    if is_page_complete(cur, url.geturl()):
        return
    new_urls = spider_page(url)
    #print('new_urls:', new_urls)
    for new_url in new_urls:
        a = url.geturl()
        b = new_url.geturl()
        #print('a:', a, 'b:', b, 'type(a):', type(a), 'type(b):', type(b))
        new_url = urllib.parse.urljoin(a, b)
        new_url = urllib.parse.urlsplit(new_url)
        #TODO, take extension off the path, and compare it that way
        is_download = False
        for ext in args.ext:
            if new_url.path.lower().endswith(ext):
                if not is_download_seen(cur, new_url.geturl()):
                    add_download(cur, new_url.geturl())
                is_download = True
                break
        if is_download:
            #a download url is not also queued as a page
            continue
        if new_url.netloc != url_mask.netloc:
            continue
        #TODO more mask, match path
        if is_page_seen(cur, new_url.geturl()) or is_page_complete(cur, new_url.geturl()):
            continue
        add_page(cur, new_url.geturl())
    set_page_complete(cur, url.geturl())
    #print('pages_left:', count_pages_left(cur)
    #     , 'pages_seen:', count_pages_seen(cur)
    #     , 'downloads_left:', count_downloads_left(cur))
while len(get_next_pages(cur, 1)) > 0:
    urls = list(get_next_pages(cur, 20))
    #print('urls:', urls)
    while len(urls):
        url = urls.pop()
        #print('url:', url)
        try:
            process_url(cur, urllib.parse.urlsplit(url))
        except KeyboardInterrupt:
            raise
        except (requests.exceptions.RequestException, SpiderErrorException):
            logging.exception('error processing url, url: %s' % (url,))
            set_page_errored(cur, url)
        except (SpiderIgnoreException):
            logging.warning('processing url, ignored, url: %s' % (url,))
            set_page_ignored(cur, url)
        #except:
        #    logging.exception('error processing url, url: %s' % (url,))
        conn.commit()
        time.sleep(.01)
        print_status(file=sys.stderr)
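The state database makes runs resumable: a plain run crawls pages and records candidate download urls, and a later pass with --dump or --download consumes them. For example, reusing the invocation from the header comment:

    python3 main.py --config config.yaml http://portableapps.com/ state.db .exe .7z --dump
    python3 main.py --config config.yaml http://portableapps.com/ state.db .exe .7z --download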
utils.py:
# -*- coding: utf-8 -*-
import traceback, sys
import socket
import random
import time
import os
import unittest
import re
from functools import reduce

"""
from https://gist.github.com/1595135
Written by Christian Stigen Larsen, http://csl.sublevel3.org
Placed in the public domain by the author, 2012-01-11
"""
def ip_int_from_string(s):
    "Convert dotted IPv4 address to integer."
    return reduce(lambda a, b: a << 8 | b, map(int, s.split(".")))

def ip_int_to_string(ip):
    "Convert 32-bit integer to dotted IPv4 address."
    return ".".join(map(lambda n: str(ip >> n & 0xFF), [24, 16, 8, 0]))
def format_relative_time(t):
    minute_seconds = 60
    hour_seconds = minute_seconds * 60
    day_seconds = hour_seconds * 24
    week_seconds = day_seconds * 7
    year_seconds = day_seconds * 365
    if t < minute_seconds:
        return '{seconds} seconds'.format(seconds=t)
    elif t < hour_seconds:
        return '{minutes} minutes'.format(minutes=int(t/minute_seconds))
    elif t < day_seconds:
        hours = int(t/hour_seconds)
        minutes = int((t - (hours*hour_seconds)) / minute_seconds)
        return '{hours} hours {minutes} minutes'.format(hours=hours, minutes=minutes)
    elif t < year_seconds:
        days = int(t/day_seconds)
        hours = int((t - (days*day_seconds)) / hour_seconds)
        return '{days} days {hours} hours'.format(days=days, hours=hours)
    years = int(t/year_seconds)
    weeks = int((t - (years*year_seconds)) / week_seconds)
    return '{years} years {weeks} weeks'.format(years=years, weeks=weeks)
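#For example:
# format_relative_time(45)    -> '45 seconds'
# format_relative_time(3700)  -> '1 hours 1 minutes'
# format_relative_time(90000) -> '1 days 1 hours'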
class IPLocator:
    def __init__(self, config):
        self.config = config
        try:
            import pygeoip
            self.gic = pygeoip.GeoIP(config['geoipcityip4_path'])
        except Exception as e:
            print('pygeoip setup error', e, file=sys.stderr)
    def obtain_locations(self, ip):
        pass
def obtain_address_info(host, config, iplocate=True, rdns=True):
    results = {}
    ip = None
    #print('host:', host, file=sys.stderr)
    try:
        data = socket.gethostbyname(host)
        #ip = repr(data)
        ip = data
        #print('ip:', ip)
    except Exception:
        raise
    results['ip'] = ip
    if iplocate:
        results['iplocations'] = {}
        try:
            import urllib.request
            import json
            #FIXME: does this need to be cleaned up??
            rresponse = urllib.request.urlopen('http://api.hostip.info/get_json.php?ip={ip}&position=true'.format(ip=ip)).read()
            rresponse_json = json.loads(rresponse)
            #print('rresponse_json:', rresponse_json)
            country_name = rresponse_json['country_name']
            city = rresponse_json['city']
            lng = rresponse_json['lng']
            lat = rresponse_json['lat']
            response = (
                '|hostip| country: "{country}" city: "{city}" longitude: {longitude} latitude: {latitude}'.format(
                    country=country_name,
                    city=city,
                    longitude=lng,
                    latitude=lat))
            results['iplocations']['hostip'] = response
        except Exception as e:
            print('hostip error:', e, file=sys.stderr)
            results['iplocations']['hostip'] = '|hostip| error'
        try:
            import pygeoip
            gic = pygeoip.GeoIP(config['geoipcityip4_path'])
            record = gic.record_by_addr(ip)
            response = ('|geoipcityip4| ' + str(record))
            results['iplocations']['geoipcityip4'] = response
        except Exception as e:
            print('pygeoip error', e, file=sys.stderr)
            results['iplocations']['geoipcityip4'] = '|geoipcityip4| error'
        try:
            ip_int = ip_int_from_string(ip)
            #print('ip_int:', ip_int, file=sys.stderr)
            with open(config['IpToCountry.csv']) as ip2country:
                for line in ip2country:
                    line = line.strip()
                    line_data = line.split(',')
                    if len(line) == 0:
                        continue
                    if line[0] == '#':
                        continue
                    #fields arrive quoted, e.g. "16777216","16777471","apnic",...
                    # so [1:-1] strips the surrounding quotes
                    start_str = line_data[0].strip()[1:-1]
                    end_str = line_data[1].strip()[1:-1]
                    ip_first_int = int(start_str)
                    ip_last_int = int(end_str)
                    if ip_first_int <= ip_int and ip_int <= ip_last_int:
                        #print(line_data)
                        registry = line_data[2].strip()[1:-1]
                        country = line_data[6].strip()[1:-1]
                        ip_first = ip_int_to_string(ip_first_int)
                        ip_last = ip_int_to_string(ip_last_int)
                        response = ('|IpToCountry| range:[{ip_first}-{ip_last}], registry: {registry}, country: {country}'.format(
                            ip_first=ip_first, ip_last=ip_last, registry=registry, country=country))
                        results['iplocations']['IpToCountry'] = response
                        break
                else:
                    #for/else: no range matched
                    results['iplocations']['IpToCountry'] = '|IpToCountry| error, no results'
        except Exception as e:
            print('IpToCountry error:', e, file=sys.stderr)
            results['iplocations']['IpToCountry'] = '|IpToCountry| error'
    if rdns:
        results['domains'] = []
        try:
            from dns import resolver, reversename
            addr = reversename.from_address(ip)
            #print(addr, file=sys.stderr)
            for hmm in resolver.query(addr, "PTR"):
                results['domains'] += [str(hmm)]
        except Exception as e:
            print('Reverse DNS error:', e, file=sys.stderr)
    return results
def get_standard_argparser():
    import argparse
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('--delim', metavar='<delimchar>', type=str, nargs='?', help='delimiter, defaults to space', default=' ')
    parser.add_argument('--indelim', metavar='<delimchar>', type=str, nargs='?',
                        help='input delimiter, overrides --delim, defaults to --delim')
    parser.add_argument('--outdelim', metavar='<delimchar>', type=str, nargs='?',
                        help='output delimiter, overrides --delim, defaults to --delim')
    parser.add_argument('--quote', metavar='<quotechar>', type=str, nargs='?', help='quote, defaults to \'"\'', default='"')
    parser.add_argument('--inquote', metavar='<quotechar>', type=str, nargs='?',
                        help='input quote, overrides --quote, defaults to --quote')
    parser.add_argument('--outquote', metavar='<quotechar>', type=str, nargs='?',
                        help='output quote, overrides --quote, defaults to --quote')
    #text mode, since the csv module wants text-mode files in python 3
    parser.add_argument('--infile', '-i', metavar='<path>', type=argparse.FileType('r'), nargs='?',
                        help='input file path, defaults to stdin', default=sys.stdin)
    parser.add_argument('--outfile', '-o', metavar='<path>', type=argparse.FileType('w'), nargs='?',
                        help='output file path, defaults to stdout', default=sys.stdout)
    return parser
def default_headers_cb(tool, headers):
    tool.csvwriter.writerow(headers)
    tool.outfile.flush()

def default_row_cb(tool, row, row_data):
    tool.csvwriter.writerow(row)
    tool.outfile.flush()
class GenericTool:
    def __init__(self):
        self.headers_cb = default_headers_cb
        self.row_cbs = [default_row_cb]
        self.csvwriter = None
        self.infile = None
        self.outfile = None
        self.parser = get_standard_argparser()
        self.parsed_args = None
    def parse_args(self):
        self.parsed_args = self.parser.parse_args()
    def run(self):
        import csv
        parsed_args = self.parsed_args
        indelim = parsed_args.delim if parsed_args.indelim is None else parsed_args.indelim
        outdelim = parsed_args.delim if parsed_args.outdelim is None else parsed_args.outdelim
        inquote = parsed_args.quote if parsed_args.inquote is None else parsed_args.inquote
        outquote = parsed_args.quote if parsed_args.outquote is None else parsed_args.outquote
        with parsed_args.infile as infile:
            outfile = parsed_args.outfile
            self.infile = infile
            self.outfile = outfile
            csvreader = self.csvreader = csv.reader(infile, delimiter=indelim, quotechar=inquote)
            csvwriter = self.csvwriter = csv.writer(outfile, delimiter=outdelim, quotechar=outquote)
            headers = []
            for row in csvreader:
                headers = row
                break
            if self.headers_cb is not None:
                self.headers_cb(self, headers)
            for row in csvreader:
                try:
                    row_data = {}
                    for idx in range(len(headers)):
                        row_data[headers[idx]] = row[idx]
                    for row_cb in self.row_cbs:
                        try:
                            row_cb(self, row, row_data)
                        except IOError as e:
                            raise
                        except Exception as e:
                            print('Exception while row_cb:', e, file=sys.stderr)
                            print('row_cb:', row_cb, file=sys.stderr)
                            traceback.print_exc(file=sys.stderr)
                except IOError as e:
                    raise
                except Exception as e:
                    print('Exception while parsing line from stdin:', e, file=sys.stderr)
                    print('line:', row, file=sys.stderr)
                    traceback.print_exc(file=sys.stderr)
def generate_random_alphanumerics(length):
    abcs = 'abcdefghijklmnopqrstuvwxyz'
    result = [abcs[random.randint(0, len(abcs)-1)] for _ in range(length)]
    result = ''.join(result)
    return result
def generate_FUZZY_URLP_RE_STR():
    valid_scheme_chars = 'a-zA-Z'
    valid_domain_chars = r'\w\.'
    def _valid_path_chars():
        safe = r'\$\-_\.\+'
        extra = r'\!\*\(\)\,' #removed \'
        unreserved = r'\w' + safe + extra
        reserved = r'\;/\?\:\@\&\='
        escape = r'\%'
        xchar = unreserved + reserved + escape
        return xchar
    valid_path_chars = _valid_path_chars()
    tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
            'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
            'net', 'org', 'pro', 'tel', 'travel']
    result = r'((([' + valid_scheme_chars + r']*\:)?//)?' \
           + '[' + valid_domain_chars + ']*' \
           + r'\.([a-zA-Z]{2}|' + '|'.join(tlds) + r')(\:[\d]*)?/' \
           + '[' + valid_path_chars + ']*' \
           + r'(\#[' + valid_path_chars + ']*)?' + ')'
    return result
def generate_FUZZY_URL_RE_STR():
    valid_scheme_chars = 'a-zA-Z'
    valid_domain_chars = r'\w\.'
    def _valid_path_chars():
        safe = r'\$\-_\.\+'
        extra = r'\!\*\(\)\,' #removed \'
        unreserved = r'\w' + safe + extra
        reserved = r'\;/\?\:\@\&\='
        escape = r'\%'
        xchar = unreserved + reserved + escape
        return xchar
    valid_path_chars = _valid_path_chars()
    tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
            'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
            'net', 'org', 'pro', 'tel', 'travel']
    domain_RE_STR = r'(\w+(\.\w+)*)'
    path_RE_STR = r'(/[' + valid_path_chars + '])'
    query_RE_STR = r'(\?[' + valid_path_chars + '])'
    fragment_RE_STR = r'(\#[' + valid_path_chars + '])'
    """
    TODO:
    * "...so" matches, make sure no two "." in a domain
    * "something.sol" matches something.so
    """
    result = r'((([' + valid_scheme_chars + r']*\:)?//)?' \
           + '[' + valid_domain_chars + ']*' \
           + r'\.([a-zA-Z]{2}|' + '|'.join(tlds) + r')(\:[\d]*)?/?' \
           + '[' + valid_path_chars + ']*' \
           + r'(\#[' + valid_path_chars + ']*)?' + ')'
    return result
""" | |
timeout | |
Socket timeout for each test. | |
tests | |
Proxies to test for. | |
log | |
File object to log to. | |
""" | |
def test_proxy(ip,port,tests=['HTTP','SOCKS4','SOCKS5'], timeout=20,log=os.devnull): | |
import socksocket | |
str2proxytype = {'SOCKS4':socksocket.PROXY_TYPE_SOCKS4, | |
'SOCKS5':socksocket.PROXY_TYPE_SOCKS5, | |
'HTTP':socksocket.PROXY_TYPE_HTTP} | |
proxytype2str = {socksocket.PROXY_TYPE_SOCKS4:'SOCKS4', | |
socksocket.PROXY_TYPE_SOCKS5:'SOCKS5', | |
socksocket.PROXY_TYPE_HTTP:'HTTP'} | |
for test in tests: | |
if test not in str2proxytype: | |
#print >> log, 'test:',test,'is unknown, next!' | |
continue | |
proxytype = str2proxytype[test] | |
#print >> log, ' trying', test | |
s = socksocket.socksocket() | |
s.settimeout(20) | |
try: | |
s.setproxy(proxytype=proxytype,addr=ip,port=port,rdns=True) | |
s.connect(('google.com',80)) | |
s.sendall( '''GET / HTTP/1.1\r\nHost: google.com\r\n\r\n''' ) | |
start = time.time() | |
timeout = s.gettimeout() | |
while True: | |
b = s.recv(1) | |
if len(b): | |
break | |
time.sleep(.001) | |
if timeout is not None and time.time() - start >= timeout: | |
raise socket.timeout | |
#print >> log, 'found one:',proxy,proxytype2str[proxytype] | |
return proxytype2str[proxytype] | |
except socket.error as e: | |
#print >> log, ' socket.error' | |
continue | |
except socksocket.ProxyError: | |
#print >> log, ' socksocket.ProxyError' | |
continue | |
except Exception as e: | |
print >> log, ' UNKNOWN ERROR:',e | |
traceback.print_exc(file=sys.stderr) | |
finally: | |
s.close() | |
return None | |
class URL_RE:
    def __init__(self):
        valid_scheme_chars = 'a-zA-Z'
        valid_domain_chars = r'\w\.'
        def _valid_path_chars():
            safe = r'\$\-_\.\+'
            extra = r'\!\*\(\)\,' #removed \'
            unreserved = r'\w' + safe + extra
            reserved = r'\;/\?\:\@\&\='
            escape = r'\%'
            xchar = unreserved + reserved + escape
            return xchar
        valid_path_chars = _valid_path_chars()
        tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
                'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
                'net', 'org', 'pro', 'tel', 'travel',
                'travel', 'xxx', 'post',
                'arpa',
                u'бг', u'ελ', u'ישראל', u'мкд', u'日本', u'日本国', u'ລາວ', #u'ليبيا',
                ]
        self.scheme_RE_STR = r'([' + valid_scheme_chars + r']*\:)'
        self.domain_RE_STR = r'(\w+(\.\w+)*)\.([a-zA-Z]{2}|' + '|'.join(tlds) + r')(\:[\d]*)?'
        self.path_RE_STR = r'(/[' + valid_path_chars + ']*)'
        self.query_RE_STR = r'(\?[' + valid_path_chars + ']*)'
        self.fragment_RE_STR = r'(#[' + valid_path_chars + ']*)'
        self.domain_RE = re.compile(self.domain_RE_STR)
        #encoded to utf-8, so the compiled pattern is a byte pattern suitable
        # for scanning raw response bodies
        self.url_RE_STR = r'(^|\s)(({scheme}?//)?({domain})({path})?({query})?({fragment})?)($|\s)'
        self.url_RE_STR = self.url_RE_STR.format(scheme=self.scheme_RE_STR,
                                                 domain=self.domain_RE_STR,
                                                 path=self.path_RE_STR,
                                                 query=self.query_RE_STR,
                                                 fragment=self.fragment_RE_STR).encode('utf-8')
        self.url_RE = re.compile(self.url_RE_STR)
class TestURL_RE(unittest.TestCase):
    def setUp(self):
        self.url_re = URL_RE()
    def test_domain_two_dots(self):
        #url_RE is a byte pattern, so the probes must be bytes
        self.assertIsNotNone(self.url_re.url_RE.search(b'a.com'))
        self.assertIsNone(self.url_re.url_RE.search(b'a..com'))
    def test_invalid_tld(self):
        self.assertIsNotNone(self.url_re.url_RE.search(b'a.co'))
        self.assertIsNone(self.url_re.url_RE.search(b'a.cod'))
    def test_urls(self):
        urls = [
            ('google.com', True),
            ('google..com', False),
            ('google.comp', False),
            ('google.cop', False),
            ('//google.com', True),
            ('mmm://google.com', True),
            ('google.com/', True),
            ('google.com/?', True),
            ('google.com/?#', True),
            ('google.com/#', True),
            ('google.com?', True),
            ('google.com?#', True),
            ('google.com#', True),
            ('mmm..........chummus.com', False),
        ]
        for url, expectation in urls:
            print('url:', url)
            if expectation:
                self.assertIsNotNone(self.url_re.url_RE.search(url.encode()))
            else:
                self.assertIsNone(self.url_re.url_RE.search(url.encode()))
def main():
    unittest.main()

if __name__ == "__main__":
    main()
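The URL regex ships with its own unit tests; assuming the file is saved as utils.py, running it directly invokes unittest.main() via main():

    python3 utils.py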