@gr-a-m
Created April 17, 2015 09:03
kdcrawler.py
from bs4 import BeautifulSoup
import urllib2
import json
import time
# Month suffixes for the 2014 KDnuggets monthly index pages.
mm = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


def extract_tags(url):
    """Extract the text of any rel="tag" links from a page."""
    f = urllib2.urlopen(url)
    soup = BeautifulSoup(f.read(), "html.parser")
    tags = []
    for link in soup.find_all("a"):
        rel = link.get("rel")
        # Skip links with no rel attribute, or whose rel does not include "tag".
        if rel is None or "tag" not in rel:
            continue
        tags.append(link.get_text())
    return tags
def generate_tags(url):
    """Get tags for a URL from Algorithmia's AutoTagURL algorithm."""
    request = urllib2.Request('https://api.algorithmia.com/api/tags/AutoTagURL')
    request.add_header('Content-Type', 'application/json')
    request.add_header('Authorization', '###')  # Put your authorization key here
    request.add_header('Accept', 'application/json')
    # POST the URL as a JSON string; the suggested tags are the keys of the
    # "result" object in the JSON response.
    response = urllib2.urlopen(request, json.dumps(url))
    return json.loads(response.read())["result"].keys()
def extract_articles(index):
    """Extract KDnuggets article links from a monthly index page."""
    f = urllib2.urlopen(index)
    soup = BeautifulSoup(f.read(), "html.parser")
    links = []
    # Article links on the index page live in the first <ul class="three_ul"> list.
    ul = soup.find_all("ul", "three_ul")[0]
    for li in ul.find_all("li"):
        links.append(li.a.get("href"))
    return links
def main():
    for mo in mm:
        links = extract_articles("http://www.kdnuggets.com/2014/" + mo + "/index.html")
        for link in links:
            # Print one tab-separated record per article: the URL, the tags
            # scraped from the page, and the tags suggested by Algorithmia.
            print("{}\t{}\t{}".format(link, ",".join(extract_tags(link)), ",".join(generate_tags(link))))
            # Pause between articles so we don't hammer either service.
            time.sleep(1)


if __name__ == '__main__':
    main()
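# Usage sketch (assumes you replace '###' above with a valid Algorithmia API
# key; the 2014 index URLs and page structure are taken as given by the code):
#
#     $ python kdcrawler.py > kdnuggets_2014_tags.tsv
#
# Each output line is tab-separated:
#     <article URL>\t<tags scraped from the article>\t<tags from AutoTagURL>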