@gr-a-m
Created April 17, 2015 09:03
kdcrawler.py
from bs4 import BeautifulSoup
import urllib2
import json
import time
# Month suffixes for the 2014 KDnuggets monthly index pages.
mm = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


def extract_tags(url):
    """Extract the text of any rel="tag" links from a page."""
    f = urllib2.urlopen(url)
    soup = BeautifulSoup(f.read(), "html.parser")
    tags = []
    for link in soup.find_all("a"):
        rel = link.get("rel")
        # Skip links with no rel attribute, or whose rel does not include "tag".
        if rel is None or "tag" not in rel:
            continue
        tags.append(link.get_text())
    return tags
def generate_tags(url):
    """Get tags for a URL from Algorithmia's AutoTagURL algorithm."""
    request = urllib2.Request('https://api.algorithmia.com/api/tags/AutoTagURL')
    request.add_header('Content-Type', 'application/json')
    request.add_header('Authorization', '###')  # Put your authorization key here
    request.add_header('Accept', 'application/json')
    # POST the URL as a JSON string; the suggested tags are the keys of the
    # "result" object in the JSON response.
    response = urllib2.urlopen(request, json.dumps(url))
    return json.loads(response.read())["result"].keys()
def extract_articles(index):
    """Extract KDnuggets article links from a monthly index page."""
    f = urllib2.urlopen(index)
    soup = BeautifulSoup(f.read(), "html.parser")
    links = []
    # Article links on the index page live in the first <ul class="three_ul"> list.
    ul = soup.find_all("ul", "three_ul")[0]
    for li in ul.find_all("li"):
        links.append(li.a.get("href"))
    return links
def main():
    for mo in mm:
        links = extract_articles("http://www.kdnuggets.com/2014/" + mo + "/index.html")
        for link in links:
            # Print one tab-separated record per article: the URL, the tags
            # scraped from the page, and the tags suggested by Algorithmia.
            print("{}\t{}\t{}".format(link, ",".join(extract_tags(link)), ",".join(generate_tags(link))))
            # Pause between articles so we don't hammer either service.
            time.sleep(1)


if __name__ == '__main__':
    main()
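# Usage sketch (assumes you replace '###' above with a valid Algorithmia API
# key; the 2014 index URLs and page structure are taken as given by the code):
#
#     $ python kdcrawler.py > kdnuggets_2014_tags.tsv
#
# Each output line is tab-separated:
#     <article URL>\t<tags scraped from the article>\t<tags from AutoTagURL>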