Created
April 15, 2014 20:05
-
-
Save tecknoh19/10767105 to your computer and use it in GitHub Desktop.
Google URL Harvester. Google Dorks supported. Scrapes Google for specified search parameters and returns a filtered log file of domain names that has been cleansed of duplicate entries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# URL Harvester written by Andy Bricker | |
# http://andybricker.com | |
# andy at andybricker.com | |
# Requirements | |
# Python 2.7 (has not been tested on later versions)
# Beautiful Soup library for Python (http://www.crummy.com/software/BeautifulSoup/) | |
# Usage: | |
# python urlHarvest.py books stores -n 50 -l myLogFile.txt | |
# Google Dorks are supported | |
# python urlHarvest.py inurl:.com.eu/foobar.php intext:I like computers -n 50 -l /home/me/logs/myLogFile.txt | |
# The script crawls Google, collecting the specified number of results for a given search. It then
# builds a URL array while preventing duplicate entries. Finally, a line-by-line logfile is generated
# containing the results.
# Like the script? Donate | |
# LiteCoin: LcFU5upJyS7FsEeB5sb25vFTS69dH6fugr | |
# DogeCoin: D7SPH1LYJn9Co4GCZePH3JvzR5RkZEPi5M | |
from optparse import OptionParser | |
# Command-line interface: one positional search term (Google dorks allowed),
# plus options controlling the result count and the log file destination.
options = OptionParser(
    usage='%prog search [options]',
    description='Python URL Harvester by Andy Bricker. http://AndyBricker.Com',
)
options.add_option(
    '-n', '--number',
    type='int',
    default=5,
    help='Number of search results to parse (default: 5)',
)
options.add_option(
    '-l', '--log_file',
    type='string',
    default='urlHarvest.txt',
    help='Name of the output logfile. Paths accepted. (default: urlHarvest.txt)',
)
def addLog(target, opts):
    """Append one harvested domain to the log file.

    target -- domain string to record (written as one line)
    opts   -- parsed option object; only opts.log_file is read

    Uses a context manager so the file handle is closed even if the
    write raises (the original open/write/close leaked the handle on
    error).
    """
    with open(opts.log_file, "a") as log_file:
        log_file.write(target + '\n')
def main(): | |
print "" | |
print "=======================================================" | |
print "Checking arguments." | |
opts, args = options.parse_args() | |
z = 0 | |
if len(args) < 1: | |
options.print_help() | |
exit() | |
domainList = [] | |
print "Beginning Google Search of " + str(opts.number) + " records. Please be patient." | |
# Check Google against our search to build URL list | |
from google import search | |
for url in search(args[0], stop=opts.number): | |
from urlparse import urlparse | |
parsed_uri = urlparse( url ) | |
domain = '{uri.netloc}'.format(uri=parsed_uri) | |
domainList.append(domain); | |
print "Search Complete, filtering results." | |
domainList = list(set(domainList)) | |
print "Building log file." | |
for target in domainList: | |
addLog(target, opts) | |
print "Harvest complete. Log data written to " + opts.log_file | |
print "" | |
print "=======================================================" | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Checking arguments.
Beginning Google Search of 500 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in
main()
File "urlHarvest.py", line 51, in main
for url in search(args[0], stop=opts.number):
File "/usr/local/lib/python2.7/dist-packages/google/init.py", line 269, in search
html = get_page(url)
File "/usr/local/lib/python2.7/dist-packages/google/init.py", line 89, in get_page
response = urlopen(request)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 442, in error
result = self._call_chain(_args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(_args)
File "/usr/lib/python2.7/urllib2.py", line 629, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(_args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(_args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 503: Service Unavailable