@barvinograd
Created September 17, 2013 09:23
A simple Python crawler. Produces a list of Wikipedia page-view-count file dumps, but is fairly generic.
#!/usr/bin/python
# Python 2: urllib2 and urlparse became urllib.request and urllib.parse
# in Python 3 (see the port sketched below).
import urllib2
import urlparse
import re
from bs4 import BeautifulSoup

downloadQ = []      # BFS queue of (url, follow, keep) tuples
downloaded = set()  # urls already fetched or printed

rootUrl = "http://dumps.wikimedia.org/other/pagecounts-raw/"
followRegex = r".*pagecounts[a-zA-Z0-9\-/]*$"  # directory pages to recurse into
dontFollowRegex = r".*projectcounts.*$"        # paths to skip entirely
keepRegex = r".*pagecounts-\d{8}-\d{6}\.gz"    # dump files to list on stdout
stayInDomain = True
followContentTypes = ["text/html"]  # only parse links out of HTML responses

keepRegexCompiled = re.compile(keepRegex)
followRegexCompiled = re.compile(followRegex)
dontFollowRegexCompiled = re.compile(dontFollowRegex)


def processUrl(url):
    # parse url
    parsedUrl = urlparse.urlparse(url)
    # check for domain restriction
    if stayInDomain and parsedUrl.netloc != rootUrlLocNetLoc:
        return
    if dontFollowRegexCompiled.match(parsedUrl.path):
        return
    # add to queue, flagging whether to recurse into the page and/or keep its url
    downloadQ.append((url,
                      followRegexCompiled.match(parsedUrl.path) is not None,
                      keepRegexCompiled.match(parsedUrl.path) is not None))


# init queue
rootUrlLocNetLoc = urlparse.urlparse(rootUrl).netloc
processUrl(rootUrl)

# crawl
while downloadQ:
    url, follow, keep = downloadQ.pop(0)
    actualUrl = ""
    if follow and url not in downloaded:
        # download
        response = urllib2.urlopen(url)
        actualUrl = response.geturl()  # after a redirect, the url the data was actually fetched from
        # extract urls; ignore any charset parameter on the Content-Type header
        contentType = response.info().getheader('Content-Type', '').split(';')[0].strip()
        if contentType in followContentTypes:
            data = response.read()
            soup = BeautifulSoup(data, "html.parser")
            for link in soup.find_all('a'):
                absolutePath = urlparse.urljoin(actualUrl, link.get('href'))
                processUrl(absolutePath)
        # mark both the requested and the redirected url as downloaded
        downloaded.add(url)
        downloaded.add(actualUrl)
    elif keep and url not in downloaded:
        downloaded.add(url)
        print url
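
The script above targets Python 2 (urllib2, urlparse, and the print statement). For reference, here is a minimal Python 3 sketch of the same crawl; it assumes only that beautifulsoup4 is installed and keeps the gist's names, regexes, and queue discipline:

#!/usr/bin/python3
# Minimal Python 3 port sketch of the crawler above (an assumption, not part of the gist).
import re
import urllib.request
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

rootUrl = "http://dumps.wikimedia.org/other/pagecounts-raw/"
followRegex = re.compile(r".*pagecounts[a-zA-Z0-9\-/]*$")
dontFollowRegex = re.compile(r".*projectcounts.*$")
keepRegex = re.compile(r".*pagecounts-\d{8}-\d{6}\.gz")
rootNetLoc = urlparse(rootUrl).netloc

downloadQ = []
downloaded = set()

def processUrl(url):
    parsed = urlparse(url)
    if parsed.netloc != rootNetLoc or dontFollowRegex.match(parsed.path):
        return
    downloadQ.append((url,
                      followRegex.match(parsed.path) is not None,
                      keepRegex.match(parsed.path) is not None))

processUrl(rootUrl)
while downloadQ:
    url, follow, keep = downloadQ.pop(0)
    if follow and url not in downloaded:
        with urllib.request.urlopen(url) as response:
            actualUrl = response.geturl()
            # get_content_type() already strips any charset parameter
            if response.headers.get_content_type() == "text/html":
                soup = BeautifulSoup(response.read(), "html.parser")
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if href:
                        processUrl(urljoin(actualUrl, href))
        downloaded.add(url)
        downloaded.add(actualUrl)
    elif keep and url not in downloaded:
        downloaded.add(url)
        print(url)

Either version prints one .gz URL per line rather than downloading the dumps itself, so the output can be redirected to a file and handed to a downloader of your choice (for example, wget -i on the saved list).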