Skip to content

Instantly share code, notes, and snippets.

@tecknoh19
Created April 15, 2014 20:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tecknoh19/10767105 to your computer and use it in GitHub Desktop.
Save tecknoh19/10767105 to your computer and use it in GitHub Desktop.
Google URL Harvester. Google Dorks supported. Scrapes Google for specified search parameters and returns a filtered log file of domain names that have been cleansed of duplicate entries.
# URL Harvester written by Andy Bricker
# http://andybricker.com
# andy at andybricker.com
# Requirements
# Python 2.7 (Has not been tested on later versions)
# Beautiful Soup library for Python (http://www.crummy.com/software/BeautifulSoup/)
# Usage:
# python urlHarvest.py books stores -n 50 -l myLogFile.txt
# Google Dorks are supported
# python urlHarvest.py inurl:.com.eu/foobar.php intext:I like computers -n 50 -l /home/me/logs/myLogFile.txt
# Script will crawl google, collecting the specified number of results for a given search. The script will then
# build a URL array while preventing duplicate entries. Finally, a line-by-line logfile is generated containing
# the results.
# Like the script? Donate
# LiteCoin: LcFU5upJyS7FsEeB5sb25vFTS69dH6fugr
# DogeCoin: D7SPH1LYJn9Co4GCZePH3JvzR5RkZEPi5M
from optparse import OptionParser

# Command-line interface: one positional search term plus two options.
# The parsed values are consumed by main() and addLog() below.
options = OptionParser(
    usage='%prog search [options]',
    description='Python URL Harvester by Andy Bricker. http://AndyBricker.Com',
)
options.add_option(
    '-n', '--number',
    type='int',
    default=5,
    help='Number of search results to parse (default: 5)',
)
options.add_option(
    '-l', '--log_file',
    type='string',
    default='urlHarvest.txt',
    help='Name of the output logfile. Paths accepted. (default: urlHarvest.txt)',
)
def addLog(target, opts):
    """Append one harvested domain to the log file.

    target -- domain string to record (written as a single line)
    opts   -- parsed option object; only opts.log_file (output path) is read
    """
    # 'with' guarantees the handle is closed even if the write raises,
    # unlike the manual open()/close() pair it replaces.
    with open(opts.log_file, "a") as log_file:
        log_file.write(target + '\n')
def main():
print ""
print "======================================================="
print "Checking arguments."
opts, args = options.parse_args()
z = 0
if len(args) < 1:
options.print_help()
exit()
domainList = []
print "Beginning Google Search of " + str(opts.number) + " records. Please be patient."
# Check Google against our search to build URL list
from google import search
for url in search(args[0], stop=opts.number):
from urlparse import urlparse
parsed_uri = urlparse( url )
domain = '{uri.netloc}'.format(uri=parsed_uri)
domainList.append(domain);
print "Search Complete, filtering results."
domainList = list(set(domainList))
print "Building log file."
for target in domainList:
addLog(target, opts)
print "Harvest complete. Log data written to " + opts.log_file
print ""
print "======================================================="
if __name__ == '__main__':
main()
@demogorgonz
Copy link

Checking arguments.
Beginning Google Search of 50 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in
main()
File "urlHarvest.py", line 50, in main
from google import search
ImportError: No module named google

@SeidBenseid
Copy link

@demogorgonz
pip install google

@DigitalCorrosion
Copy link

Checking arguments.
Beginning Google Search of 500 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in
main()
File "urlHarvest.py", line 51, in main
for url in search(args[0], stop=opts.number):
File "/usr/local/lib/python2.7/dist-packages/google/init.py", line 269, in search
html = get_page(url)
File "/usr/local/lib/python2.7/dist-packages/google/init.py", line 89, in get_page
response = urlopen(request)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 442, in error
result = self._call_chain(_args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(_args)
File "/usr/lib/python2.7/urllib2.py", line 629, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(_args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(_args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 503: Service Unavailable

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment