Created September 17, 2013 09:23
A simple Python crawler. It produces a list of Wikipedia page view count dump files, but is fairly generic.
#!/usr/bin/python
import urllib2
import urlparse
import re
from bs4 import BeautifulSoup

# FIFO queue of (url, follow, keep) tuples, and the set of URLs already handled
downloadQ = []
downloaded = set()

# crawl configuration
rootUrl = "http://dumps.wikimedia.org/other/pagecounts-raw/"
followRegex = r".*pagecounts[a-zA-Z0-9\-/]*$"  # listing pages to descend into
dontFollowRegex = r".*projectcounts.*$"        # paths to skip entirely
keepRegex = r".*pagecounts-\d{8}-\d{6}\.gz"    # dump files to report
stayInDomain = True
followContentTypes = ["text/html"]
def processUrl(url):
    # parse url
    parsedUrl = urlparse.urlparse(url)
    # check for domain restriction
    if stayInDomain and parsedUrl.netloc != rootUrlNetLoc:
        return
    if dontFollowRegexCompiled.match(parsedUrl.path):
        return
    # add to queue, flagging whether the URL should be followed and/or kept
    downloadQ.append((url,
                      followRegexCompiled.match(parsedUrl.path) is not None,
                      keepRegexCompiled.match(parsedUrl.path) is not None))
keepRegexCompiled = re.compile(keepRegex)
followRegexCompiled = re.compile(followRegex)
dontFollowRegexCompiled = re.compile(dontFollowRegex)

# init queue
rootUrlNetLoc = urlparse.urlparse(rootUrl).netloc
processUrl(rootUrl)
# crawl: breadth-first over the queue
while len(downloadQ) > 0:
    url, follow, keep = downloadQ.pop(0)
    actualUrl = ""
    if follow and url not in downloaded:
        # download the page
        response = urllib2.urlopen(url)
        actualUrl = response.geturl()  # after a redirect, the URL the data was actually fetched from
        # extract links; ignore any charset parameter in the Content-Type header
        contentType = response.info().getheader('Content-Type', '').split(';')[0].strip()
        if contentType in followContentTypes:
            soup = BeautifulSoup(response.read(), "html.parser")
            for link in soup.find_all('a'):
                absoluteUrl = urlparse.urljoin(actualUrl, link.get('href'))
                processUrl(absoluteUrl)
        # mark as downloaded
        downloaded.add(url)
        downloaded.add(actualUrl)
    elif keep and url not in downloaded:
        downloaded.add(url)
        print url
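Note that the crawler only prints the matching .gz URLs to stdout; it does not fetch them. As a minimal sketch of the next step, assuming the crawler's output is piped in on stdin (the script name fetch-dumps.py and the 64 KB chunk size are illustrative, not part of the gist above):

#!/usr/bin/python
# fetch-dumps.py (hypothetical companion script): read one URL per line
# from stdin and save each dump under its basename in the current directory.
import sys
import os.path
import urlparse
import urllib2

for line in sys.stdin:
    url = line.strip()
    if not url:
        continue
    # derive a local filename from the last path segment of the URL
    filename = os.path.basename(urlparse.urlparse(url).path)
    response = urllib2.urlopen(url)
    with open(filename, "wb") as f:
        # stream in chunks so large dumps are not held in memory
        while True:
            chunk = response.read(64 * 1024)
            if not chunk:
                break
            f.write(chunk)
    print "saved %s" % filename

Usage would then be something like: python crawler.py | python fetch-dumps.py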