Skip to content

Instantly share code, notes, and snippets.

@ckcollab
Last active February 18, 2016 21:58
Show Gist options
  • Save ckcollab/d781f9a7f5bf77727dbe to your computer and use it in GitHub Desktop.
Save ckcollab/d781f9a7f5bf77727dbe to your computer and use it in GitHub Desktop.
scraper example
import json
import os
import requests
import grequests
from lxml import html
class Scraper(object):
    """Log in to some_website.net and dump its archive pages to JSON files.

    The list of archive URLs is cached in ``archive_urls.json`` and the
    scraped show notes are written to ``show_notes.json``; both paths are
    relative to the current working directory.
    """

    LOGIN_URL = "https://some_website.net/login"
    ARCHIVES_URL = "https://some_website.net/archives"
    # Only links whose href starts with this prefix are treated as archive pages.
    ARCHIVE_LINK_PREFIX = "https://some_website.net/archives/"
    ARCHIVE_URLS_FILE = "archive_urls.json"
    SHOW_NOTES_FILE = "show_notes.json"

    def __init__(self):
        """Create the scraper; this logs in immediately via ``get_session``."""
        self.session = self.get_session()

    def get_session(self):
        """Return a ``requests`` session that has posted the login form.

        Credentials are read from the ``RBR_USERNAME`` and ``RBR_PASSWORD``
        environment variables and sent as the ``log`` and ``pwd`` form fields.

        Raises:
            AssertionError: if the login request does not answer with HTTP 200.
        """
        session = requests.Session()
        result = session.post(self.LOGIN_URL, data={
            "log": os.environ.get("RBR_USERNAME"),
            "pwd": os.environ.get("RBR_PASSWORD"),
        })
        # Raised explicitly (instead of using an `assert` statement) so the
        # check still runs under `python -O`, while callers keep seeing the
        # same exception type and message as before.
        if result.status_code != 200:
            raise AssertionError("Unable to login?")
        return session

    def get_archive_urls(self):
        """Return the list of archive page URLs.

        If ``archive_urls.json`` already exists its content is returned
        without touching the network. Otherwise the archives index page is
        fetched, every ``<a>`` whose ``href`` starts with
        ``ARCHIVE_LINK_PREFIX`` is collected (in document order, duplicates
        kept), and the result is written to ``archive_urls.json``.
        """
        if os.path.exists(self.ARCHIVE_URLS_FILE):
            # Context manager so the cache file handle is closed promptly.
            with open(self.ARCHIVE_URLS_FILE, "r") as archive_file:
                return json.load(archive_file)

        archives_page = self.session.get(self.ARCHIVES_URL)
        doc = html.fromstring(archives_page.content)
        pages = [
            link.attrib["href"]
            for link in doc.cssselect("a")
            if link.attrib.get("href", "").startswith(self.ARCHIVE_LINK_PREFIX)
        ]
        with open(self.ARCHIVE_URLS_FILE, "w") as archive_file:
            json.dump(pages, archive_file)
        return pages

    @staticmethod
    def _slug_from_url(url):
        """Return the last non-empty path segment of ``url``.

        Trailing slashes are stripped first; without that, every URL ending
        in ``/`` would map to the empty string and the pages would overwrite
        each other in the result dict.
        """
        return url.rstrip("/").split("/")[-1]

    @staticmethod
    def _parse_show_notes(doc):
        """Extract title, notes and date from a parsed archive page.

        ``doc`` is the element returned by ``lxml.html.fromstring``.

        Raises:
            IndexError: if one of the CSS selectors matches too few elements.
        """
        return {
            "title": doc.cssselect("header h2 a")[0].text_content(),
            # The notes are taken from the second matching paragraph (index 1).
            "notes": doc.cssselect(".post-contents.cf p")[1].text_content().strip(),
            "date": "{} {}".format(
                doc.cssselect(".date .day")[0].text_content(),
                doc.cssselect(".date .month")[0].text_content(),
            ),
        }

    def scrape_archives(self):
        """Fetch every archive page and write the results to ``show_notes.json``.

        Pages are requested through ``grequests`` using the logged-in
        session. The output is a JSON object keyed by the last path segment
        of each page URL, with ``title``, ``notes`` and ``date`` fields.
        """
        pages = self.get_archive_urls()
        pending = (grequests.get(url, session=self.session) for url in pages)
        all_show_notes = {}
        for result in grequests.imap(pending):
            print("On URL ->", result.url)
            doc = html.fromstring(result.content)
            slug = self._slug_from_url(result.url)
            all_show_notes[slug] = self._parse_show_notes(doc)
        with open(self.SHOW_NOTES_FILE, "w") as show_notes_file:
            json.dump(all_show_notes, show_notes_file)
        print("Done and saved!")
# Script entry point: constructing the scraper logs in, then every archive
# page is scraped and the results are saved.
if __name__ == "__main__":
    Scraper().scrape_archives()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment