@melpomene
Created October 11, 2011 11:33
Wordlist-generating script that parses websites for words.
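Per the script's own usage message, it is invoked from the command line as 'python wordcollector.py [--verbose] [url]', e.g. 'python wordcollector.py --verbose http://example.com' (the example URL here is purely illustrative).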
#!/usr/bin/env python2.7
''' Creates wordlists by web scraping. Requires BeautifulSoup (pip install beautifulsoup). '''
import sys
import os
import robotparser
from BeautifulSoup import BeautifulSoup as bs
import urllib2
from urlparse import urlparse
PATH = '/wordlist.txt'  # output file for the harvested words
visited = []  # URLs that have already been scraped

def return_links(raw_page):
    ''' Returns all links found on page '''
    soup = bs(raw_page)
    links = []
    for link in soup.findAll('a'):
        links.append(link.get('href'))
    return links

def save_wordlist(raw_page):
    ''' Saves all words in the source code, separated by whitespace, to the file at PATH, one word per row '''
    soup = bs(raw_page)
    wordlist = str(soup).split()
    f = open(PATH, 'a')
    for word in wordlist:
        f.write(word + '\n')
    f.close()

def recheck_robot(rp, up):
    ''' Points the robot parser at the robots.txt of the host in the parsed URL and reads it '''
    rp.set_url(up.scheme + "://" + up.netloc + "/robots.txt")
    rp.read()
    if verbose:
        print "Checking robots.txt on: " + up.scheme + "://" + up.netloc + "/robots.txt"

def scrape(baseurl, page, rp):
    ''' Recursive function: asks the robot parser whether the page may be crawled and,
        if allowed, saves every word on the page and recurses into every URL found there. '''
    if page is None:
        return
    url = urlparse(page)
    if url.netloc == "":
        # Relative link: join it with the base URL, normalising the slashes.
        if baseurl[-1] != "/" and url.path != "" and url.path[0] != "/":
            baseurl = baseurl + "/"
        if url.path != "" and baseurl[-1] == "/" and url.path[0] == "/":
            baseurl = baseurl[:-1]
        newurl = baseurl + url.path
        if "http" not in newurl:
            newurl = "http://" + newurl
    else:
        # Absolute link: re-read robots.txt when the host changes.
        if baseurl != url.netloc:
            recheck_robot(rp, url)
        newurl = url.geturl()
        baseurl = url.netloc
    if newurl in visited:
        return
    visited.append(newurl)
    if rp.can_fetch("*", newurl):
        if verbose:
            print "Allowed to fetch page " + newurl + ". Initiating scrape."
        try:
            raw_page = urllib2.urlopen(newurl)
            raw_page = raw_page.read()
            # Scrape for words.
            save_wordlist(raw_page)
            # Scrape for links, then scrape each link in turn.
            links = return_links(raw_page)
            if not links:
                return
            for link in links:
                scrape(baseurl, link, rp)
        except (urllib2.URLError, urllib2.HTTPError, ValueError):
            return
    else:
        if verbose:
            print "Not allowed to fetch page " + baseurl + page + ". Shutting down operations."
        return

if __name__ == "__main__":
    if len(sys.argv) == 1:
        print "Call with 'python wordcollector.py [--verbose] [url]'"
        exit()
    if sys.argv[1] == '--verbose':
        if len(sys.argv) == 2:
            print "Call with 'python wordcollector.py [--verbose] [url]'"
            exit()
        verbose = True
        url = sys.argv[2]
    else:
        verbose = False
        url = sys.argv[1]
    if verbose:
        print "URL: " + url
    up = urlparse(url)
    if verbose:
        print "Reading " + up.scheme + "://" + up.netloc + "/robots.txt"
    rp = robotparser.RobotFileParser()
    recheck_robot(rp, urlparse(url))
    if rp.can_fetch("*", url):
        if verbose:
            print "Allowed to fetch root. Initiating recursive scrape."
        # INITIATE RECURSIVE SCRAPE.
        try:
            scrape(url, "", rp)
        except KeyboardInterrupt:
            pass
        if verbose:
            print ""
            print "---------------------"
            print "Scrape was completed."
            print "Number of words harvested:"
            os.system("wc -l " + PATH)
            print "---------------------"
    else:
        if verbose:
            print "Not allowed to fetch root. Shutting down operations."
        exit()
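As an optional follow-up (not part of the gist), a small post-processing sketch like the one below can sort and deduplicate the harvested wordlist. It assumes the same default PATH of /wordlist.txt used by the scraper; the helper itself is hypothetical.

#!/usr/bin/env python2.7
''' Hypothetical post-processing sketch: sort and deduplicate the wordlist produced by the scraper above. '''
PATH = '/wordlist.txt'  # assumption: matches the PATH used by the scraper
with open(PATH) as f:
    words = set(line.strip() for line in f if line.strip())
with open(PATH, 'w') as f:
    for word in sorted(words):
        f.write(word + '\n')
print "Kept %d unique words in %s" % (len(words), PATH)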