Created September 17, 2013 09:23
A simple Python crawler. It produces a list of Wikipedia page view count dump files, but is fairly generic.
#!/usr/bin/python
import urllib2
import urlparse
import re
from bs4 import BeautifulSoup

# FIFO queue of (url, follow, keep) tuples, and the set of URLs already handled
downloadQ = []
downloaded = set()

# crawl configuration
rootUrl = "http://dumps.wikimedia.org/other/pagecounts-raw/"
followRegex = r".*pagecounts[a-zA-Z0-9\-/]*$"  # listing pages to descend into
dontFollowRegex = r".*projectcounts.*$"        # paths to skip entirely
keepRegex = r".*pagecounts-\d{8}-\d{6}\.gz"    # dump files to report
stayInDomain = True
followContentTypes = ["text/html"]
def processUrl(url):
    # parse url
    parsedUrl = urlparse.urlparse(url)
    # check for domain restriction
    if stayInDomain and parsedUrl.netloc != rootUrlNetLoc:
        return
    if dontFollowRegexCompiled.match(parsedUrl.path):
        return
    # add to queue, flagging whether the URL should be followed and/or kept
    downloadQ.append((url,
                      followRegexCompiled.match(parsedUrl.path) is not None,
                      keepRegexCompiled.match(parsedUrl.path) is not None))
keepRegexCompiled = re.compile(keepRegex)
followRegexCompiled = re.compile(followRegex)
dontFollowRegexCompiled = re.compile(dontFollowRegex)

# init queue
rootUrlNetLoc = urlparse.urlparse(rootUrl).netloc
processUrl(rootUrl)
# crawl: breadth-first over the queue
while len(downloadQ) > 0:
    url, follow, keep = downloadQ.pop(0)
    actualUrl = ""
    if follow and url not in downloaded:
        # download the page
        response = urllib2.urlopen(url)
        actualUrl = response.geturl()  # after a redirect, the URL the data was actually fetched from
        # extract links; ignore any charset parameter in the Content-Type header
        contentType = response.info().getheader('Content-Type', '').split(';')[0].strip()
        if contentType in followContentTypes:
            soup = BeautifulSoup(response.read(), "html.parser")
            for link in soup.find_all('a'):
                absoluteUrl = urlparse.urljoin(actualUrl, link.get('href'))
                processUrl(absoluteUrl)
        # mark as downloaded
        downloaded.add(url)
        downloaded.add(actualUrl)
    elif keep and url not in downloaded:
        downloaded.add(url)
        print url
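Note that the crawler only prints the matching .gz URLs to stdout; it does not fetch them. As a minimal sketch of the next step, assuming the crawler's output is piped in on stdin (the script name fetch-dumps.py and the 64 KB chunk size are illustrative, not part of the gist above):

#!/usr/bin/python
# fetch-dumps.py (hypothetical companion script): read one URL per line
# from stdin and save each dump under its basename in the current directory.
import sys
import os.path
import urlparse
import urllib2

for line in sys.stdin:
    url = line.strip()
    if not url:
        continue
    # derive a local filename from the last path segment of the URL
    filename = os.path.basename(urlparse.urlparse(url).path)
    response = urllib2.urlopen(url)
    with open(filename, "wb") as f:
        # stream in chunks so large dumps are not held in memory
        while True:
            chunk = response.read(64 * 1024)
            if not chunk:
                break
            f.write(chunk)
    print "saved %s" % filename

Usage would then be something like: python crawler.py | python fetch-dumps.py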