spider for files of a certain extension
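
The gist appears to contain four files, concatenated below. The names are inferred from "from common import compute_configfiles", "import utils", and the usage comment "python3 main.py --config config.yaml ..." in the spider source.

common.py: merges one or more YAML config files with command-line overrides.
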
import yaml
import argparse
from attrdict import AttrDict
#from check_config import check_config
def check_config(config,partial=False):
    #stub that stands in for the real validator imported above
    pass
def config2struct(item):
if isinstance(item,list):
result = []
for e in item:
result += [config2struct(e)]
return result
elif isinstance(item,(dict, AttrDict)):
result = {}
for key,value in item.items():
result[key] = config2struct(value)
return result
else:
return item
def config2structofconfig(config,item):
if isinstance(item,list):
result = []
for e in item:
result += [config2structofconfig(config,e)]
return result
elif isinstance(item,dict):
result = {}
for key,value in item.items():
result[key] = config2structofconfig(config,value)
return result
else:
return config
def effective_dict(config_dicts):
result = AttrDict()
for config_dict in config_dicts:
assert isinstance(config_dict, dict)
result += config_dict
return config2struct(result)
class Config:
def __init__(self, configs, oconfig={}):
#list of config dict-like struct
self.configs = list(configs)
#the overriding config from the cmd-line options
self.oconfig = oconfig
#equivalent dictionaries for all of the above
self.all_config_dicts = [config2struct(config) for config in configs]
self.all_config_dicts += [config2struct(oconfig)]
#and dictionaries of the same stucture, except they point to which
# config originated the configuration
self.all_config_dicts_of_configs = [config2structofconfig(config,config) for config in configs]
self.all_config_dicts_of_configs += [config2structofconfig(oconfig,oconfig)]
#a pure dictionary of all the values merged together
self.effective_dict = effective_dict(self.all_config_dicts)
#a pure dictionary of all the values merged together, except instead
# of each value, it holds the original config which produces that value
self.effective_dict_of_configs = effective_dict(self.all_config_dicts_of_configs)
def get(self,path):
cur = self.effective_dict
for d in path:
cur = cur[d]
return cur
def effective(self):
return self.effective_dict
def compute_configfiles(configfiles, arg_options):
configs = []
for configfile in configfiles:
        config_dict = yaml.safe_load(configfile)  #config files should not construct arbitrary objects
check_config(config_dict,partial=True)
configs += [config_dict]
options = {}
if arg_options is not None:
for option in arg_options:
key,_,value = option.partition('=')
keyparts = key.split('.')
options_pointer = options
for keypart in keyparts[:-1]:
if keypart in options_pointer:
child = options_pointer[keypart]
options_pointer = child
else:
child = {}
options_pointer[keypart] = child
options_pointer = child
options_pointer[keyparts[-1]] = value
config = Config(configs, options)
effective_config_dict = config.effective()
check_config(effective_config_dict)
return effective_config_dict
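# For example, "-o server.session.timeout=300" makes the loop above build the
# nested override {'server': {'session': {'timeout': '300'}}}. Note the value
# stays the string '300'; nothing here coerces types.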
if __name__ == '__main__':
config0 = {'server':
{ 'port': 8080
, 'session':
{
'key': 'hellow'
, 'timeout': 600
, 'test_list': [1,2]
}
}
, 'deep_list':
[{ 'item': 'item', 'wat': 'no'}, 5]
}
config1 = {'server': {'session': {'key': 'newkey','t': 't', 'test_list': [3,4]} } }
config3 = {'deep_list': [{'item': 'no'}] }
options = {'server': {'port': 18080}}
c = Config([config0,config1,config3], options)
print (c.effective())
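    # Assuming attrdict's `+` merges mappings recursively with the right-hand
    # operand taking precedence (and replacing lists wholesale), this should
    # print something like:
    #   {'server': {'port': 18080,
    #               'session': {'key': 'newkey', 't': 't', 'timeout': 600,
    #                           'test_list': [3, 4]}},
    #    'deep_list': [{'item': 'no'}]}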
#result = AttrDict()
#result += config0
#result += config1
#result += options
#print result
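
config.yaml: sample configuration consumed by main.py.
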
spider:
chunk_size: 10240
max_buffer_size: 30720
max_bytes_processed: 5242880
downloads:
chunk_size: 10240
max_bytes_processed: 1048576
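
main.py: the spider itself. It crawls pages, tracks progress in a SQLite state file, and downloads URLs whose extension matches the command line.
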
#python3 main.py --config config.yaml http://portableapps.com/ state.db .exe .7z .gz .zip
import requests
import argparse
import utils
import urllib
import json
import re
import logging
import time
import io
import sqlite3
import os
import tempfile
import hashlib
import shutil
import sys
from common import compute_configfiles
parser = argparse.ArgumentParser(description='Spider a site and download files of a certain extension.')
parser.add_argument('url', type=str,
help='site to spider')
parser.add_argument('state', type=str
, help='state file')
parser.add_argument('ext', type=str, nargs='+',
help='extensions to download')
group = parser.add_mutually_exclusive_group()
group.add_argument('--dump', dest='dump'
, action='store_const', const=True, default=False
, help='print download urls held by state to stdout')
group.add_argument('--download', dest='download'
, action='store_const', const=True, default=False
, help='download download urls held by state')
parser.add_argument('-c', '--config', dest='configs', type=argparse.FileType('r')
, action='append', default=[]
, help='yaml configuration file; specifying this multiple times overrides configuration'
+ ' in the order they are specified')
parser.add_argument('-o', '--option', dest='options', metavar='<option>', type=str
, action='append', default=[]
, help='option in the form of -oOPTION=VALUE; overrides config.')
args = parser.parse_args()
config = compute_configfiles(args.configs, args.options)
logging.basicConfig(level=logging.INFO, filename='log.log')
URL_RE = utils.URL_RE().url_RE
HREF_DQ_RE = b'href\=\\"([^\\"]*)\\"'
HREF_DQ_RE = re.compile(HREF_DQ_RE)
HREF_SQ_RE = b"href\=\\'([^\\']*)\\'"
HREF_SQ_RE = re.compile(HREF_SQ_RE)
SRC_SQ_RE = b"src\=\\'([^\\']*)\\'"
SRC_SQ_RE = re.compile(SRC_SQ_RE)
SRC_DQ_RE = b'src\=\\"([^\\"]*)\\"'
SRC_DQ_RE = re.compile(SRC_DQ_RE)
url_mask = urllib.parse.urlsplit(args.url)
conn = sqlite3.connect(args.state)
cur = conn.cursor()
sql = """
CREATE TABLE IF NOT EXISTS pages_seen(
url text PRIMARY KEY
, complete bool default 0
, errored bool default 0
, ignored bool default 0
);
CREATE INDEX IF NOT EXISTS pages_seen_complete ON pages_seen (complete,errored,ignored);
CREATE TABLE IF NOT EXISTS downloads_seen(
url text PRIMARY KEY
, complete bool default 0
, errored bool default 0
, ignored bool default 0
);
CREATE INDEX IF NOT EXISTS downloads_seen_complete ON downloads_seen (complete,errored,ignored);
"""
conn.executescript(sql)
def print_status(file):
print ( 'pages left:',count_pages_left(cur)
, 'pages complete:',count_pages_seen(cur)-count_pages_left(cur)-count_pages_errored(cur)
, 'pages errored:',count_pages_errored(cur)
, 'pages ignored:',count_pages_ignored(cur)
, 'downloads left:',count_downloads_left(cur)
, 'downloads complete:',count_downloads_seen(cur)-count_downloads_left(cur)-count_downloads_errored(cur)
, 'downloads errored:',count_downloads_errored(cur)
, 'downloads ignored:',count_downloads_ignored(cur)
, file=file)
def is_page_seen(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM pages_seen
WHERE url=?
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def is_page_complete(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM pages_seen
WHERE url=?
AND complete=1
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def is_download_seen(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM downloads_seen
WHERE url=?
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def is_download_complete(c,url):
assert isinstance(url,str)
sql = """
SELECT count(*)
FROM downloads_seen
WHERE url=?
AND complete=1
"""
c.execute(sql,(url,))
    row = c.fetchone()
return row[0] != 0
def add_page(c,url):
assert isinstance(url,str)
sql = """
INSERT INTO
pages_seen
(url)
VALUES (?)
"""
c.execute(sql,(url,))
return c.rowcount
def add_download(c,url):
assert isinstance(url,str)
sql = """
INSERT INTO
downloads_seen
(url)
VALUES (?)
"""
c.execute(sql,(url,))
return c.rowcount
def set_page_complete(c,url):
assert isinstance(url,str)
sql = """
UPDATE pages_seen
SET complete=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_page_errored(c,url):
assert isinstance(url,str)
sql = """
UPDATE pages_seen
SET errored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_page_ignored(c,url):
assert isinstance(url,str)
sql = """
UPDATE pages_seen
SET ignored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_download_complete(c,url):
assert isinstance(url,str)
sql = """
UPDATE downloads_seen
SET complete=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_download_errored(c,url):
assert isinstance(url,str)
sql = """
UPDATE downloads_seen
SET errored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def set_download_ignored(c,url):
assert isinstance(url,str)
sql = """
UPDATE downloads_seen
SET ignored=1
WHERE url=?
"""
c.execute(sql,(url,))
return c.rowcount
def count_pages_seen(c):
sql = """
SELECT count(*)
FROM pages_seen
"""
c.execute(sql)
return c.fetchone()[0]
def count_pages_left(c):
sql = """
SELECT count(*)
FROM pages_seen
WHERE complete=0
AND errored=0
AND ignored=0
"""
c.execute(sql)
return c.fetchone()[0]
def count_pages_errored(c):
sql = """
SELECT count(*)
FROM pages_seen
WHERE errored=1
"""
c.execute(sql)
return c.fetchone()[0]
def count_pages_ignored(c):
sql = """
SELECT count(*)
FROM pages_seen
WHERE ignored=1
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_seen(c):
sql = """
SELECT count(*)
FROM downloads_seen
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_left(c):
sql = """
SELECT count(*)
FROM downloads_seen
WHERE complete=0
AND errored=0
AND ignored=0
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_errored(c):
sql = """
SELECT count(*)
FROM downloads_seen
WHERE errored=1
"""
c.execute(sql)
return c.fetchone()[0]
def count_downloads_ignored(c):
sql = """
SELECT count(*)
FROM downloads_seen
WHERE ignored=1
"""
c.execute(sql)
return c.fetchone()[0]
def get_next_pages(c,count):
sql = """
SELECT url
FROM pages_seen
WHERE complete=0
AND errored=0
AND ignored=0
LIMIT ?
"""
c.execute(sql,(count,))
return [row[0] for row in c.fetchall()]
def get_next_downloads(c,count):
sql = """
SELECT url
FROM downloads_seen
WHERE complete=0
AND errored=0
AND ignored=0
LIMIT ?
"""
c.execute(sql,(count,))
return [row[0] for row in c.fetchall()]
if args.dump:
for url in get_next_downloads(cur,count_downloads_left(cur)):
print (url)
exit(0)
class SpiderErrorException(Exception):
pass
class SpiderIgnoreException(Exception):
pass
def spider_page(url0):
assert isinstance(url0,tuple)
#print ('url0:',url0.geturl())
spider_config = config.get('spider',{})
chunk_size = spider_config.get('chunk_size', 1024*10)
max_buffer_size = spider_config.get('max_buffer_size', chunk_size*3)
max_bytes_processed = spider_config.get('max_bytes_processed', 1024*1024*5)
urls = []
results = []
r = requests.get(url0.geturl(), stream=True)
"""
if r.headers:
if 'content-type' in r.headers:
content_type = r.headers['content-type'].strip()
content_type,_,_ = content_type.partition(';')
valid_mime_types = set(['application/atom+xml','application/dash+xml'])
if not content_type.startswith('text'):
raise SpiderIgnoreException('cannot spider resource with non-text-based mime-type, url: %s' % (url0.geturl(),))
"""
buf = bytearray()
bytes_processed = 0
    for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
buf += chunk
bytes_processed += len(chunk)
if bytes_processed > max_bytes_processed:
raise SpiderIgnoreException('request was too large to spider, url: %s' % (url0.geturl(),))
urls += [match.group(0).strip() for match in URL_RE.finditer(buf)]
urls += [match.group(1).strip() for match in HREF_DQ_RE.finditer(buf)]
urls += [match.group(1).strip() for match in HREF_SQ_RE.finditer(buf)]
urls += [match.group(1).strip() for match in SRC_DQ_RE.finditer(buf)]
urls += [match.group(1).strip() for match in SRC_SQ_RE.finditer(buf)]
if len(buf) > max_buffer_size:
buf = buf[-max_buffer_size:]
assert len(buf) == max_buffer_size
for url in urls:
url = urllib.parse.urlsplit(url.decode())
results += [url]
del r
#print ('results:',results)
return results
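# Note the sliding window in spider_page: only the last max_buffer_size bytes
# are kept, so a URL straddling the trim boundary can be missed, and bytes
# still in the window are re-scanned on later chunks. The resulting duplicate
# matches are harmless; process_url filters them through is_page_seen and
# is_download_seen before inserting.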
def process_download(url):
assert isinstance(url, str)
h = hashlib.new('sha256')
r = requests.get(url, stream=True)
size = 0
downloads_config = config.get('downloads',{})
chunk_size = downloads_config.get('chunk_size', 1024*10)
max_bytes_processed = downloads_config.get('max_bytes_processed', 1024*1024*5)
#print ('config:',config, 'max_bytes_processed', max_bytes_processed)
path = urllib.parse.urlsplit(url).path
root,ext = os.path.splitext(path)
if ext not in args.ext:
logging.error('url: %s ext: %s does not match any extensions specified on the commandline' % (url,ext))
return
with tempfile.TemporaryFile() as outfile:
        for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
outfile.write(chunk)
outfile.flush()
os.fsync(outfile.fileno())
h.update(chunk)
size += len(chunk)
if size > max_bytes_processed:
raise SpiderIgnoreException('request was too large to spider, url: %s' % (url,))
H = h.hexdigest()
outfile.seek(0)
        # ext from splitext() already includes the leading dot
        with open(H+ext,'w+b') as dstfile:
shutil.copyfileobj(outfile,dstfile)
set_download_complete(cur,url)
print ( 'downloaded: ', url, 'to: ', h.hexdigest(), 'size:', size, file=sys.stderr)
del r
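# Downloaded files land in the current working directory, named
# <sha256-of-content><ext>, so re-downloading identical content simply
# overwrites the same file.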
if args.download:
while count_downloads_left(cur) > 0:
urls = list(get_next_downloads(cur,20))
while len(urls):
url = urls.pop()
print ( 'downloading: ', url, file=sys.stderr)
try:
process_download(url)
except KeyboardInterrupt:
raise
except (requests.exceptions.RequestException,SpiderErrorException):
logging.exception('error downloading url, url: %s' % (url,))
set_download_errored(cur,url)
except (SpiderIgnoreException):
logging.exception('downloading url ignored, url: %s' % (url,))
set_download_ignored(cur,url)
conn.commit()
print_status(file=sys.stderr)
exit(0)
argurl = urllib.parse.urlsplit(args.url).geturl()
if not is_page_seen(cur,argurl):
add_page(cur,argurl)
conn.commit()
def process_url(cur,url):
global urls, url_mask
assert isinstance(url,tuple)
#print ('is_page_complete(cur,url.geturl()):',is_page_complete(cur,url.geturl()))
if is_page_complete(cur,url.geturl()):
return
new_urls = spider_page(url)
#print ('new_urls:',new_urls)
for new_url in new_urls:
a = url.geturl()
b = new_url.geturl()
#print ('a:',a, 'b:',b, 'type(a):',type(a), 'type(b):', type(b))
new_url = urllib.parse.urljoin(a,b)
new_url = urllib.parse.urlsplit(new_url)
        #TODO, take extension off the path, and compare it that way
        if any(new_url.path.lower().endswith(ext) for ext in args.ext):
            if not is_download_seen(cur,new_url.geturl()):
                add_download(cur,new_url.geturl())
            #a download link should not also be queued as a page
            continue
if new_url.netloc != url_mask.netloc:
continue
#TODO more mask, match path
if is_page_seen(cur,new_url.geturl()) or is_page_complete(cur,new_url.geturl()):
continue
add_page(cur,new_url.geturl())
set_page_complete(cur,url.geturl())
#print ('pages_left:',count_pages_left(cur)
# , 'pages_seen:',count_pages_seen(cur)
# , 'downloads_left:',count_downloads_left(cur))
while len(get_next_pages(cur,1)) > 0:
urls = list(get_next_pages(cur,20))
#print ('urls:',urls)
while len(urls):
url = urls.pop()
#print ('url:',url)
try:
process_url(cur,urllib.parse.urlsplit(url))
except KeyboardInterrupt:
raise
except (requests.exceptions.RequestException,SpiderErrorException):
logging.exception('error processing url, url: %s' % (url,))
set_page_errored(cur,url)
except (SpiderIgnoreException):
logging.warning('processing url, ignored, url: %s' % (url,))
set_page_ignored(cur,url)
#except:
# logging.exception('error processing url, url: %s' % (url,))
conn.commit()
time.sleep(.01)
print_status(file=sys.stderr)
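
utils.py: assorted helpers, including the URL_RE class used by main.py, IP geolocation lookups, a generic CSV filter tool, and a proxy tester.
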
# -*- coding: utf-8 -*-
import traceback,sys
from functools import reduce  #reduce is not a builtin in Python 3
import socket
import random
import time
import os
import unittest
import re
"""
from https://gist.github.com/1595135
Written by Christian Stigen Larsen, http://csl.sublevel3.org
Placed in the public domain by the author, 2012-01-11
"""
def ip_int_from_string(s):
"Convert dotted IPv4 address to integer."
return reduce(lambda a,b: a<<8 | b, map(int, s.split(".")))
def ip_int_to_string(ip):
"Convert 32-bit integer to dotted IPv4 address."
return ".".join(map(lambda n: str(ip>>n & 0xFF), [24,16,8,0]))
def format_relative_time(t):
minute_seconds = 60
hour_seconds = minute_seconds * 60
day_seconds = hour_seconds * 24
week_seconds = day_seconds * 7
year_seconds = day_seconds * 365
if t < minute_seconds:
return '{seconds} seconds'.format(seconds=t)
elif t < hour_seconds:
return '{minutes} minutes'.format(minutes=int(t/minute_seconds))
elif t < day_seconds:
hours = int(t/hour_seconds)
minutes = int((t - (hours*hour_seconds)) / minute_seconds)
return '{hours} hours {minutes} minutes'.format(hours=hours,minutes=minutes)
elif t < year_seconds:
days = int(t/day_seconds)
hours = int((t - (days*day_seconds)) / hour_seconds)
return '{days} days {hours} hours'.format(days=days,hours=hours)
years = int(t/year_seconds)
weeks = int((t - (years*year_seconds)) / week_seconds)
return '{years} years {weeks} weeks'.format(years=years,weeks=weeks)
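# For example, format_relative_time(90) returns '1 minutes' and
# format_relative_time(3700) returns '1 hours 1 minutes'; the unit labels are
# never singularized.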
class IPLocator:
def __init__(self,config):
self.config = config
try:
import pygeoip
self.gic = pygeoip.GeoIP(config['geoipcityip4_path'])
except Exception as e:
            print('pygeoip setup error', e, file=sys.stderr)
def obtain_locations(self,ip):
pass
def obtain_address_info(host,config,iplocate=True,rdns=True):
results = {}
ip = None
#print >> sys.stderr, 'host:',host
    ip = socket.gethostbyname(host)  #let resolution errors propagate to the caller
results['ip'] = ip
if iplocate:
results['iplocations'] = {}
try:
            import urllib.request
            import json
            #FIXME: does this need to be cleaned up??
            rresponse = urllib.request.urlopen('http://api.hostip.info/get_json.php?ip={ip}&position=true'.format(ip=ip)).read()
            rresponse_json = json.loads(rresponse.decode('utf-8'))
#print 'rresponse_json:',rresponse_json
            country_name = rresponse_json['country_name']
            city = rresponse_json['city']
lng = rresponse_json['lng']
lat = rresponse_json['lat']
response = (
'|hostip| country: "{country}" city: "{city}" longitude: {longitude} latitude: {latitude}'.format(
country=country_name,
city=city,
longitude=lng,
latitude=lat))
results['iplocations']['hostip'] = response
except Exception as e:
            print('hostip error:', e, file=sys.stderr)
results['iplocations']['hostip'] = '|hostip| error'
try:
import pygeoip
gic = pygeoip.GeoIP(config['geoipcityip4_path'])
record = gic.record_by_addr(ip)
response = ('|geoipcityip4| ' + str(record))
results['iplocations']['geoipcityip4'] = response
except Exception as e:
            print('pygeoip error', e, file=sys.stderr)
results['iplocations']['geoipcityip4'] = '|geoipcityip4| error'
try:
ip_int = ip_int_from_string(ip)
#print >> sys.stderr, 'ip_int:',ip_int
with open(config['IpToCountry.csv']) as ip2country:
for line in ip2country:
line = line.strip()
line_data = line.split(',')
if len(line) == 0:
continue
if line[0] == '#':
continue
"""
print 'line:',line
print 'line_data:',line_data
print 'len(line_data):',len(line_data)
"""
start_str = line_data[0].strip()[1:-1]
end_str = line_data[1].strip()[1:-1]
"""
print 'start_str:',start_str
print 'end_str:',end_str
print
"""
ip_first_int = int(start_str)
ip_last_int = int(end_str)
if ip_first_int <= ip_int and ip_int <= ip_last_int:
#print line_data
registry = line_data[2].strip()[1:-1]
country = line_data[6].strip()[1:-1]
ip_first = ip_int_to_string(ip_first_int)
ip_last = ip_int_to_string(ip_last_int)
response = ('|IpToCountry| range:[{ip_first}-{ip_last}], registry: {registry}, country: {country}'.format(
ip_first=ip_first,ip_last=ip_last,registry=registry,country=country))
results['iplocations']['IpToCountry'] = response
break
results['iplocations']['IpToCountry'] = '|IpToCountry| error, no results'
except Exception as e:
            print('IpToCountry error:', e, file=sys.stderr)
results['iplocations']['IpToCountry'] = '|IpToCountry| error'
if rdns:
results['domains'] = []
try:
from dns import resolver,reversename
addr=reversename.from_address(ip)
#print >> sys.stderr, addr
for hmm in resolver.query(addr,"PTR"):
results['domains'] += [str(hmm)]
except Exception as e:
            print('Reverse DNS error:', e, file=sys.stderr)
return results
def get_standard_argparser():
import argparse
parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('--delim', metavar='<delimchar>', type=str, nargs='?', help='delimiter, defaults to space', default=' ')
parser.add_argument('--indelim', metavar='<delimchar>', type=str, nargs='?',
help='input delimiter, overrides --delim, defaults to --delim')
parser.add_argument('--outdelim', metavar='<delimchar>', type=str, nargs='?',
                        help='output delimiter, overrides --delim, defaults to --delim')
parser.add_argument('--quote', metavar='<quotechar>', type=str, nargs='?', help='quote, defaults to \'"\'', default='"')
parser.add_argument('--inquote', metavar='<quotechar>', type=str, nargs='?',
help='input quote, overrides --quote, defaults to --quote')
parser.add_argument('--outquote', metavar='<quotechar>', type=str, nargs='?',
help='output quote, overrides --quote, defaults to --quote')
    parser.add_argument('--infile', '-i', metavar='<path>', type=argparse.FileType('r'), nargs='?',
                        help='input file path, defaults to stdin', default=sys.stdin)
    parser.add_argument('--outfile', '-o', metavar='<path>', type=argparse.FileType('w'), nargs='?',
                        help='output file path, defaults to stdout', default=sys.stdout)
return parser
def default_headers_cb(tool,headers):
tool.csvwriter.writerow(headers)
tool.outfile.flush()
def default_row_cb(tool,row,row_data):
tool.csvwriter.writerow(row)
tool.outfile.flush()
class GenericTool:
def __init__(self):
self.headers_cb = default_headers_cb
self.row_cbs = [default_row_cb]
self.csvwriter = None
self.infile = None
self.outfile = None
self.parser = get_standard_argparser()
self.parsed_args = None
def parse_args(self):
self.parsed_args = self.parser.parse_args()
def run(self):
import csv
parsed_args = self.parsed_args
        indelim = parsed_args.delim if parsed_args.indelim is None else parsed_args.indelim
        outdelim = parsed_args.delim if parsed_args.outdelim is None else parsed_args.outdelim
        inquote = parsed_args.quote if parsed_args.inquote is None else parsed_args.inquote
        outquote = parsed_args.quote if parsed_args.outquote is None else parsed_args.outquote
with parsed_args.infile as infile:
outfile = parsed_args.outfile
self.infile = infile
self.outfile = outfile
csvreader = self.csvreader = csv.reader(infile, delimiter=indelim, quotechar=inquote)
csvwriter = self.csvwriter = csv.writer(outfile, delimiter=outdelim, quotechar=outquote)
headers = []
for row in csvreader:
headers = row
break
if self.headers_cb is not None:
self.headers_cb(self,headers)
for row in csvreader:
try:
row_data = {}
for idx in range(len(headers)):
row_data[headers[idx]] = row[idx]
for row_cb in self.row_cbs:
try:
row_cb(self,row,row_data)
except IOError as e:
raise
except Exception as e:
                            print('Exception while row_cb:', e, file=sys.stderr)
                            print('row_cb:', row_cb, file=sys.stderr)
traceback.print_exc(file=sys.stderr)
except IOError as e:
raise
except Exception as e:
                    print('Exception while parsing line from stdin:', e, file=sys.stderr)
                    print('line:', row, file=sys.stderr)
traceback.print_exc(file=sys.stderr)
def generate_random_alphanumerics(length):
abcs = 'abcdefghijklmnopqrstuvwxyz'
result = [abcs[random.randint(0,len(abcs)-1)] for _ in range(length)]
result = ''.join(result)
return result
def generate_FUZZY_URLP_RE_STR():
valid_scheme_chars = 'a-zA-Z'
valid_domain_chars = '\w\\.'
def _valid_path_chars():
safe = '\\$\\-_\\.\\+'
extra = '\!\*\\(\)\,' #removed \\'
unreserved = '\w'+ safe+extra
reserved = '\\;/\\?\:\\@\\&\\='
escape = '\\%'
xchar = unreserved + reserved + escape
return xchar
valid_path_chars = _valid_path_chars()
tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
'net', 'org', 'pro', 'tel', 'travel']
result = r'((([' + valid_scheme_chars + ']*\:)?//)?' \
+ '[' + valid_domain_chars + ']*' \
+ '\.([a-zA-Z]{2}|' + '|'.join(tlds) + ')(\\:[\d]*)?/' \
+ '[' + valid_path_chars + ']*' \
+ '(\\#[' + valid_path_chars + ']*)?' + ')'
return result
def generate_FUZZY_URL_RE_STR():
valid_scheme_chars = 'a-zA-Z'
valid_domain_chars = '\w\\.'
def _valid_path_chars():
safe = '\\$\\-_\\.\\+'
extra = '\!\*\\(\)\,' #removed \\'
unreserved = '\w'+ safe+extra
reserved = '\\;/\\?\:\\@\\&\\='
escape = '\\%'
xchar = unreserved + reserved + escape
return xchar
valid_path_chars = _valid_path_chars()
tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
'net', 'org', 'pro', 'tel', 'travel']
domain_RE_STR = r'(\w+(\.\w+)*)'
path_RE_STR = r'(/[' + valid_path_chars + '])'
query_RE_STR = r'(\?[' + valid_path_chars + '])'
fragment_RE_STR = r'(\#[' + valid_path_chars + '])'
"""
TODO:
* "...so" matches, make sure no two "." in a domain
* "something.sol" matches something.so
"""
result = r'((([' + valid_scheme_chars + ']*\:)?//)?' \
+ '[' + valid_domain_chars + ']*' \
+ '\.([a-zA-Z]{2}|' + '|'.join(tlds) + ')(\\:[\d]*)?/?' \
+ '[' + valid_path_chars + ']*' \
+ '(\\#[' + valid_path_chars + ']*)?' + ')'
return result
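# The only difference from generate_FUZZY_URLP_RE_STR is the '/?', which makes
# the slash after the domain/port optional, so this variant also matches bare
# domains. The domain_RE_STR through fragment_RE_STR locals above are built
# but never used in the returned pattern.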
"""
timeout
Socket timeout for each test.
tests
Proxies to test for.
log
File object to log to.
"""
def test_proxy(ip,port,tests=['HTTP','SOCKS4','SOCKS5'], timeout=20,log=None):
    #the docstring above says log is a file object; os.devnull is only a path,
    # so open it lazily when no log is supplied
    if log is None:
        log = open(os.devnull,'w')
import socksocket
str2proxytype = {'SOCKS4':socksocket.PROXY_TYPE_SOCKS4,
'SOCKS5':socksocket.PROXY_TYPE_SOCKS5,
'HTTP':socksocket.PROXY_TYPE_HTTP}
proxytype2str = {socksocket.PROXY_TYPE_SOCKS4:'SOCKS4',
socksocket.PROXY_TYPE_SOCKS5:'SOCKS5',
socksocket.PROXY_TYPE_HTTP:'HTTP'}
for test in tests:
if test not in str2proxytype:
#print >> log, 'test:',test,'is unknown, next!'
continue
proxytype = str2proxytype[test]
#print >> log, ' trying', test
s = socksocket.socksocket()
        s.settimeout(timeout)
try:
s.setproxy(proxytype=proxytype,addr=ip,port=port,rdns=True)
s.connect(('google.com',80))
            s.sendall( b'''GET / HTTP/1.1\r\nHost: google.com\r\n\r\n''' )
start = time.time()
timeout = s.gettimeout()
while True:
b = s.recv(1)
if len(b):
break
time.sleep(.001)
if timeout is not None and time.time() - start >= timeout:
raise socket.timeout
#print >> log, 'found one:',proxy,proxytype2str[proxytype]
return proxytype2str[proxytype]
except socket.error as e:
#print >> log, ' socket.error'
continue
except socksocket.ProxyError:
#print >> log, ' socksocket.ProxyError'
continue
except Exception as e:
            print(' UNKNOWN ERROR:', e, file=log)
traceback.print_exc(file=sys.stderr)
finally:
s.close()
return None
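# A minimal usage sketch; the address and port are hypothetical and the
# socksocket module must be importable:
#
#   kind = test_proxy('203.0.113.5', 8080, tests=['HTTP'], timeout=10)
#   if kind is not None:
#       print('working', kind, 'proxy')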
class URL_RE:
def __init__(self):
valid_scheme_chars = 'a-zA-Z'
valid_domain_chars = '\w\\.'
def _valid_path_chars():
safe = '\\$\\-_\\.\\+'
extra = '\!\*\\(\)\,' #removed \\'
unreserved = '\w'+ safe+extra
reserved = '\\;/\\?\:\\@\\&\\='
escape = '\\%'
xchar = unreserved + reserved + escape
return xchar
valid_path_chars = _valid_path_chars()
tlds = ['aero', 'asia', 'biz', 'cat', 'com', 'coop', 'edu', 'gov',
'info', 'int', 'jobs', 'mil', 'mobi', 'museum', 'name',
'net', 'org', 'pro', 'tel', 'travel',
'travel', 'xxx', 'post',
'arpa',
u'бг', u'ελ', u'ישראל', u'мкд', u'日本', u'日本国', u'ລາວ', #u'ليبيا'‎,
]
self.scheme_RE_STR = r'([' + valid_scheme_chars + ']*\:)'
self.domain_RE_STR = r'(\w+(\.\w+)*)\.([a-zA-Z]{2}|' + '|'.join(tlds) + ')(\\:[\d]*)?'
self.path_RE_STR = r'(/[' + valid_path_chars + ']*)'
self.query_RE_STR = r'(\?[' + valid_path_chars + ']*)'
self.fragment_RE_STR = r'(#[' + valid_path_chars + ']*)'
self.domain_RE = re.compile(self.domain_RE_STR)
self.url_RE_STR = u'(^|\s)(({scheme}?//)?({domain})({path})?({query})?({fragment})?)($|\s)'
self.url_RE_STR = self.url_RE_STR.format(scheme=self.scheme_RE_STR,
domain=self.domain_RE_STR,
path=self.path_RE_STR,
query=self.query_RE_STR,
fragment=self.fragment_RE_STR).encode('utf-8')
self.url_RE = re.compile(self.url_RE_STR)
class TestURL_RE(unittest.TestCase):
def setUp(self):
self.url_re = URL_RE()
def test_domain_two_dots(self):
        #url_RE is compiled from a UTF-8-encoded pattern, so search bytes
        self.assertIsNotNone(self.url_re.url_RE.search(b'a.com'))
        self.assertIsNone(self.url_re.url_RE.search(b'a..com'))
def test_invalid_tld(self):
        self.assertIsNotNone(self.url_re.url_RE.search(b'a.co'))
        self.assertIsNone(self.url_re.url_RE.search(b'a.cod'))
def test_urls(self):
urls = [
('google.com', True),
('google..com', False),
('google.comp', False),
('google.cop', False),
('//google.com', True),
('mmm://google.com', True),
('google.com/', True),
('google.com/?', True),
('google.com/?#', True),
('google.com/#', True),
('google.com?', True),
('google.com?#', True),
('google.com#', True),
('mmm..........chummus.com', False),
]
        for url, expectation in urls:
            print ('url:',url)
            #the pattern is bytes, so encode before searching
            if expectation:
                self.assertIsNotNone(self.url_re.url_RE.search(url.encode('utf-8')))
            else:
                self.assertIsNone(self.url_re.url_RE.search(url.encode('utf-8')))
def main():
unittest.main()
if __name__ == "__main__":
main()
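    #running "python3 utils.py" executes the TestURL_RE suite above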