
@amitkaps
Created March 7, 2014 17:06
# Scrape the statement of each Project Euler problem (1-419) and store
# the raw page HTML in the ScraperWiki datastore, keyed by problem id.
import scraperwiki
import urllib2
from bs4 import BeautifulSoup

url = "https://projecteuler.net/problem="

def get_page(url, num):
    # Fetch the HTML for problem `num`; return None if the request fails.
    try:
        return urllib2.urlopen(url + str(num)).read()
    except urllib2.URLError:
        return None

def extract_main(page):
    # Pull out the <div id="content"> that holds the problem statement.
    soup = BeautifulSoup(page, "html.parser")
    return soup.find('div', attrs={'id': 'content'})

contentAll = []
for i in range(1, 420):
    # Fetch each page once, then reuse it for both extraction and storage.
    page = get_page(url, i)
    if page is None:
        continue  # skip problems that failed to download
    contentAll.append(extract_main(page))
    scraperwiki.sqlite.save(unique_keys=["id"], data={"id": i, "content": page})

data = scraperwiki.sqlite.select('* FROM swdata ORDER BY id')
print data
print contentAll
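
For reference, the same scrape can run outside the ScraperWiki platform by writing to a local SQLite file with the standard-library sqlite3 module. This is a minimal standalone sketch, kept in Python 2 to match the urllib2 and print usage above; the local.db filename and the table layout are arbitrary choices for illustration, not part of the original gist.

# Hypothetical standalone variant: same scrape, stored in a local SQLite file.
import sqlite3
import urllib2
from bs4 import BeautifulSoup

conn = sqlite3.connect("local.db")  # arbitrary filename, not from the gist
conn.execute("CREATE TABLE IF NOT EXISTS swdata (id INTEGER PRIMARY KEY, content TEXT)")

for i in range(1, 420):
    try:
        page = urllib2.urlopen("https://projecteuler.net/problem=" + str(i)).read()
    except urllib2.URLError:
        continue  # skip problems that failed to download
    div = BeautifulSoup(page, "html.parser").find('div', attrs={'id': 'content'})
    if div is not None:
        # Store the extracted statement text, keyed by problem id.
        conn.execute("INSERT OR REPLACE INTO swdata (id, content) VALUES (?, ?)",
                     (i, div.get_text()))

conn.commit()
print conn.execute("SELECT COUNT(*) FROM swdata").fetchone()[0]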