Skip to content

Instantly share code, notes, and snippets.

@bbenne10
Created May 27, 2015 15:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bbenne10/857c4250cf2664dae66d to your computer and use it in GitHub Desktop.
Save bbenne10/857c4250cf2664dae66d to your computer and use it in GitHub Desktop.
#! /usr/bin/python
import re
from lxml.html import html5parser
from lxml.cssselect import CSSSelector
import pymongo
import requests
# Scrape the quote wall page and store every singly-attributed quote in
# MongoDB as a {"Quote", "Author", "TS"} document.

# Attributions that appear on the page as a first name only, mapped to the
# full name that should be stored.
FULL_NAMES = {
    "George": "George Macon",
    "Wilson": "Wilson Martin",
    "Tyler": "Tyler Durkota",
    "Tim": "Tim Swihart",
}

# pymongo exposes the client class as MongoClient (there is no
# pymongo.Client); with no arguments it uses the driver's default host/port.
# NOTE(review): "guotewall" looks like a typo for "quotewall"; it is left
# unchanged because renaming it would write to a different database - confirm.
coll = pymongo.MongoClient().guotewall.quotes

# Bound the request so a stalled server cannot hang the script forever, and
# fail loudly on an HTTP error instead of scraping an error page.
response = requests.get("http://quotes.ryyanj.com", timeout=30)
response.raise_for_status()
resp = response.text
tree = html5parser.fromstring(resp)

quote_selector = CSSSelector(".quote-container")
ts_selector = CSSSelector(".timestamp")
quotebody_selector = CSSSelector(".quotebody")
attribution_selector = CSSSelector(".attribution")

# "Firstname Lastname": two alphabetic words separated by one space,
# anchored at the start of the attribution only.
attr_reg = re.compile(r"[a-zA-Z]+ [a-zA-Z]+")

for elem in quote_selector(tree):
    ts_nodes = ts_selector(elem)
    body_nodes = quotebody_selector(elem)
    attr_nodes = attribution_selector(elem)
    # Skip containers missing any of the three expected children rather than
    # aborting the whole import with an IndexError.
    if not (ts_nodes and body_nodes and attr_nodes):
        continue

    raw_attr = attr_nodes[0].text
    if raw_attr is None:
        # No attribution text at all, so there is nothing to validate.
        continue

    ts = ts_nodes[0].text
    body = body_nodes[0].text
    # Drop surrounding whitespace, then any leading dashes/spaces ("- Name").
    attr = raw_attr.strip().lstrip("- ")
    attr = FULL_NAMES.get(attr, attr)

    # Only accept single attr'd quotes (that is - \w \w)
    if not attr_reg.match(attr):
        continue

    doc = {
        "Quote": body,
        "Author": attr,
        "TS": ts,
    }
    coll.insert_one(doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment