Skip to content

Instantly share code, notes, and snippets.

@ddevault
Created March 22, 2015 16:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ddevault/bb097669c61f16e812f5 to your computer and use it in GitHub Desktop.
Save ddevault/bb097669c61f16e812f5 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import requests
import sys
import psycopg2
from bs4 import BeautifulSoup
# Open the PostgreSQL connection used for the whole run: database "bash",
# connecting as user "sircmpwn".
connection = psycopg2.connect('dbname=bash user=sircmpwn')
# Single shared cursor; handle_page() issues its INSERTs through it.
sql = connection.cursor()
def get_page(page, parser="html.parser", timeout=30):
    """Fetch one bash.org browse page and return it parsed.

    Args:
        page: Page number substituted into the ``p`` query parameter.
        parser: Parser name handed to BeautifulSoup. Naming one explicitly
            keeps the parse tree independent of which optional parsers
            happen to be installed and avoids bs4's "no parser specified"
            warning.
        timeout: Seconds to wait on the server before requests gives up;
            without it a stalled connection would block the scrape forever.

    Returns:
        A BeautifulSoup document built from the response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    browse = "http://www.bash.org/?browse&p=%s"
    response = requests.get(browse % page, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it
    # were a (quote-less) browse page.
    response.raise_for_status()
    return BeautifulSoup(response.text, parser)
def handle_page(page):
    """Scrape one browse page and insert its quotes into the ``quotes`` table.

    Each quote is read from a pair of ``<p>`` elements: one whose class list
    is exactly ``['quote']`` (the header, carrying the link and the score)
    and one whose class list is exactly ``['qt']`` (the quote text). Headers
    and texts are paired by their order of appearance in the document.

    Args:
        page: Page number passed through to ``get_page``.

    Raises:
        ValueError: If the page yields a different number of headers and
            quote bodies, in which case nothing is inserted for this page.

    Side effects:
        Executes one INSERT per quote on the module-level ``sql`` cursor and
        commits the module-level ``connection`` once the page is done.
    """
    print('Handling page %s' % page)
    html = get_page(page)

    # Walk the <p> elements once and filter that list twice.
    paragraphs = html.find_all('p')
    quote_headers = [p for p in paragraphs if p.get('class') == ['quote']]
    quote_data = [p for p in paragraphs if p.get('class') == ['qt']]

    # Pairing is positional, so unequal counts mean some text would be
    # stored under the wrong quote id. Refuse the page before inserting.
    if len(quote_headers) != len(quote_data):
        raise ValueError(
            'Page %s has %s quote headers but %s quote bodies'
            % (page, len(quote_headers), len(quote_data))
        )

    for header, body in zip(quote_headers, quote_data):
        text = body.get_text().replace('\r', '')
        # The id is the header link's href minus its first character
        # (presumably a leading "?" -- confirm against the live markup).
        number = int(header.a.get('href')[1:])
        # The score is whatever sits between the first "(" and the next ")".
        score = int(header.text.split('(')[1].split(')')[0])
        sql.execute(
            'INSERT INTO quotes (id, score, text) VALUES (%s, %s, %s)',
            (number, score, text),
        )

    print('Added %s quotes' % len(quote_headers))
    connection.commit()
# Driver: discover how many browse pages exist, scrape each in order, and
# always release the database cursor and connection, even when a request or
# an insert fails part-way through.
try:
    # Get total number of pages. The value of the last <option> element on
    # the browse page is taken as the page count (presumably the page
    # selector's final entry -- confirm against the live markup).
    response = requests.get('http://www.bash.org/?browse', timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    total_pages = int(soup.find_all('option')[-1]['value'])
    print('Scraping %s pages...' % total_pages)
    for page in range(1, total_pages + 1):
        handle_page(page)
finally:
    sql.close()
    connection.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment