Skip to content

Instantly share code, notes, and snippets.

@dmil
Created March 28, 2015 20:28
Show Gist options
  • Save dmil/a76ea4087d71d3e12f03 to your computer and use it in GitHub Desktop.
Save dmil/a76ea4087d71d3e12f03 to your computer and use it in GitHub Desktop.
Scrape PDFS
"""
Stub for scraping-related jobs
CSS Selectors Reference: http://www.w3schools.com/cssref/css_selectors.asp
"""
import requests, lxml.html
import re
# Pull the PDF URL out of a result element (scrape_page picks the elements with a CSS selector)
def get_url_from_element(element):
    """Return the PDF URL embedded in a ``/url?q=...`` link, or "" if none.

    Args:
        element: An lxml HTML element to serialize and search, or markup
            that is already serialized (text or bytes).

    Returns:
        The text captured after ``/url?q=`` in the first matching
        ``<a href="...">``, running from ``http`` through the last ``.pdf``
        inside that ``q`` value. Returns an empty string when no such link
        is present. The value is returned exactly as it appears in the
        markup (no percent-decoding is applied).
    """
    # Already-serialized markup is used as-is; anything else is treated as an
    # lxml element. ``type(u"")`` is the text type on both Python 2 and 3.
    if isinstance(element, (bytes, type(u""))):
        markup = element
    else:
        markup = lxml.html.tostring(element)

    # tostring() yields bytes by default; decode so the text pattern below can
    # search it on Python 3 as well.
    if isinstance(markup, bytes):
        markup = markup.decode("utf-8", "replace")

    # [^"&]* keeps the capture inside the q= value. A greedy ".*" would run on
    # to any later ".pdf" in the element (e.g. link text such as "Report.pdf")
    # and drag attribute and tag junk into the result.
    pattern = r'<a href="/url\?q=(http[^"&]*\.pdf)'
    match = re.search(pattern, markup)
    if match is None:
        return ""
    return match.group(1)
def scrape_page(page_no):
    """Fetch one page of Google search results and print each result's PDF URL.

    Every ``<h3>`` element in the response is passed to
    ``get_url_from_element`` and the returned value is printed on its own line
    (an empty line when the heading holds no matching link). One extra empty
    line is printed after the page.

    Args:
        page_no: Zero-based result page index; the ``start`` offset sent to
            the search is ``page_no * 10``.
    """
    # The search terms, date filter and ``start`` offset used to sit after a
    # "#" in one hard-coded URL. A URL fragment is never sent to the server,
    # so every call fetched the same first page of a different query. Passing
    # them as real query parameters makes the search and the paging apply.
    # NOTE(review): the browser-only parameters from the old URL (biw, bih,
    # source, tbm) are dropped; they look like viewport/UI state - confirm.
    params = {
        "q": 'site:house.gov "Truth in Testimony" pdf',
        "tbs": "cdr:1,cd_min:1/1/2015",
        "start": page_no * 10,
    }
    response = requests.get("https://www.google.com/search", params=params)
    doc = lxml.html.fromstring(response.content)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for heading in doc.cssselect("h3"):
        print(get_url_from_element(heading))
    print("")
# Scrape result pages 0 through 19, one after another.
for page_index in range(20):
    scrape_page(page_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment