@amitkaps
Created March 7, 2014 17:04
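# Crawl geniekids.com/pa with BeautifulSoup, saving each page's title,
# content, and taxonomy tags to a ScraperWiki sqlite table (Python 2).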
import re
import urllib2

import scraperwiki
from bs4 import BeautifulSoup

url = "http://geniekids.com/pa"
#url = "http://geniekids.com/pa/Respect-Fundamentals-of-Disciplining"

def get_page(url):
    # Fetch the raw HTML for a url, or None if the request fails.
    try:
        return urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
def extract_main(url):
    # Parse a page's main content block, returning its title, text,
    # outgoing /pa/ links (skipping comment links), and taxonomy tags.
    page = get_page(url)
    if page is None:
        # Fetch failed; return empty results so the crawl can continue.
        return '', '', [], []
    soup = BeautifulSoup(page)
    outlinks = []
    tags = []
    mainContent = soup.find('div', attrs={'id': 'mainContent'})
    title = mainContent.find('h1', attrs={'class': 'title'}).get_text()
    content = mainContent.find('div', attrs={'class': 'content'}).get_text()
    for tag_link in mainContent.findAll('li', attrs={'class': re.compile('tax.*')}):
        tags.append(str(tag_link.get_text()))
    for link in mainContent.find_all('a'):
        new_link = link.get('href')
        if new_link and new_link[0:4] == '/pa/' and new_link.find("comment") == -1:
            link_full = 'http://geniekids.com' + new_link
            outlinks.append(link_full)
    return title, content, outlinks, tags
def crawl_web(seed):
    # Crawl outward from seed, saving each page as it is visited and
    # returning the link graph: <url> -> [list of pages it links to].
    tocrawl = [seed]
    crawled = []
    graph = {}
    i = 0
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            title, content, outlinks, tags = extract_main(url)
            # Join tags into one string; a sqlite column holds a scalar,
            # not a Python list.
            scraperwiki.sqlite.save(unique_keys=["id"],
                data={"id": i, "title": title, "content": content,
                      "tags": ", ".join(tags), "url": url})
            print i
            i += 1
            graph[url] = outlinks
            union(tocrawl, outlinks)
            crawled.append(url)
    return graph
def union(a, b):
    # Append each element of b to a unless already present
    # (an in-place set union on lists).
    for e in b:
        if e not in a:
            a.append(e)
graph = crawl_web(url)

data = scraperwiki.sqlite.select(
    '''* FROM swdata ORDER BY title'''
)
print data
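# A minimal sketch of reading the saved rows back outside ScraperWiki,
# assuming the library's default local store (a scraperwiki.sqlite file
# in the working directory holding the swdata table); the file name is
# an assumption, so adjust it to your setup.
#
# import sqlite3
# conn = sqlite3.connect("scraperwiki.sqlite")
# for row in conn.execute("SELECT title, url FROM swdata ORDER BY title"):
#     print row
# conn.close()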