@iKlotho
Created April 1, 2015 15:24
Getting data from the web and storing it in a SQLite database
from bs4 import BeautifulSoup
from urllib2 import urlopen
import sqlite3
BASE_URL = "http://www.chicagoreader.com"
conn = sqlite3.connect('webscrap.db')  # create (or open) the SQLite database file
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS stocks
             (category_url text, category text, winner text, runners_up text)''')
def make_soup(url):  # fetch a page and parse its HTML with lxml
    html = urlopen(url).read()
    return BeautifulSoup(html, "lxml")
def get_category_links(section_url):
    # collect the absolute URL of every category listed on the section page
    soup = make_soup(section_url)
    boccat = soup.find("dl", "boccat")
    category_links = [BASE_URL + dd.a["href"] for dd in boccat.findAll("dd")]
    return category_links
def get_category_winner(category_url):
    # scrape the category name, winner, and first runner-up, then queue an insert
    soup = make_soup(category_url)
    category = soup.find("h1", "headline").string
    winner = [h2.string for h2 in soup.findAll("h2", "boc1")]
    runners_up = [h2.string for h2 in soup.findAll("h2", "boc2")]
    data = [(category_url, category, winner[0], runners_up[0])]
    c.executemany('INSERT INTO stocks VALUES (?,?,?,?)', data)
if __name__ == '__main__':
    food_n_drink = ("http://www.chicagoreader.com/chicago/"
                    "best-of-chicago-2011-food-drink/BestOf?oid=4106228")
    categories = get_category_links(food_n_drink)
    # scrape at most the first 50 categories, then save the results and close the db
    for a, category in enumerate(categories):
        if a == 50:
            break
        get_category_winner(category)
    conn.commit()
    conn.close()
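
For quick verification, here is a minimal sketch of reading the scraped rows back out of webscrap.db; it assumes the script above has already run and populated the stocks table, and the function name print_results is just an illustrative choice, not part of the original gist.

import sqlite3

def print_results(db_path='webscrap.db'):
    # dump every scraped category with its winner and first runner-up
    conn = sqlite3.connect(db_path)
    for category, winner, runners_up in conn.execute(
            'SELECT category, winner, runners_up FROM stocks'):
        print('%s: winner %s, runner-up %s' % (category, winner, runners_up))
    conn.close()

if __name__ == '__main__':
    print_results()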