import os
import re
import sys
from datetime import datetime, timedelta

import boto
import boto.s3.connection
import pandas as pd
from boto.s3.key import Key
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait

TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
ARTICLE_PATTERN = r".*nytimes\.com/(.*/)?[0-9]+/[0-9]+/[0-9]+/.*"
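# e.g. this pattern is meant to catch dated article urls like
# "http://www.nytimes.com/2013/05/29/business/some-headline.html"
# (the year/month/day path) while skipping section fronts and nav links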
AWS_ACCESS_KEY_ID = 'zzzzzzzzzzzzzzzzzzzzzzzz'
AWS_SECRET_ACCESS_KEY = 'zzzzzzzzzzzzzzzzzzzzzzzz'
BUCKET_NAME = "nyt-homepage-history"

def round_date_time(dt, bucket, as_date=False):
    # floor a timestamp to the nearest `bucket`-minute boundary;
    # `dt` is a datetime when as_date=True, otherwise a TIME_FORMAT string
    if not as_date:
        dt = datetime.strptime(dt, TIME_FORMAT)
    dt = dt - timedelta(minutes=dt.minute % int(bucket),
                        seconds=dt.second,
                        microseconds=dt.microsecond)
    if as_date:
        return dt
    else:
        return dt.strftime(TIME_FORMAT)
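# a quick sketch of the rounding (illustrative values):
#   round_date_time("2013-05-29 16:57:42", bucket=10)
#   => "2013-05-29 16:50:00"  (minutes floored to the 10-minute bucket)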

def gen_inputs(url="http://www.nytimes.com/"):
    # round the current time down to its 10-minute bucket and build the
    # matching screenshot path
    start = datetime.now()
    d = round_date_time(start, bucket=10, as_date=True)
    ss = "screenshots/" + d.strftime("%Y-%m-%d-%H-%M-%S") + ".png"
    return {"img": ss, "date": d.strftime(TIME_FORMAT), "url": url}

def id_available_cb(tag_id):
    """Return a callback that checks whether an element with this id is present."""
    def callback(browser):
        try:
            browser.find_element_by_id(tag_id)
        except NoSuchElementException:
            return False
        else:
            return True
    return callback
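# usage sketch (this mirrors the call in parse_10_min_chunk below):
#   WebDriverWait(b, timeout=90).until(id_available_cb("home"))
# blocks until the element with id="home" appears, or raises on timeout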

def get_image_for_a_link(link):
    # if the anchor wraps an <img>, record its dimensions and source
    try:
        img = link.find_element_by_tag_name("img")
    except NoSuchElementException:
        img = None
    if img is not None:
        is_img = 1
        img_width = img.get_attribute("width")
        img_height = img.get_attribute("height")
        img_src = img.get_attribute("src")
    else:
        is_img = 0
        img_width = "NA"
        img_height = "NA"
        img_src = "NA"
    return (is_img, img_width, img_height, img_src)
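# returns a 4-tuple, e.g. (1, "190", "126", "<img src url>") when the link
# wraps an image, or (0, "NA", "NA", "NA") for a plain text link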

# this is nytimes specific
def clean_url(raw_url):
    # REGEXES
    https = re.compile(r"https://")
    rm_qry = re.compile(r"(http://[A-Za-z0-9\./\-_]+)(\?.*)?(#.*)?(&.*)?")
    sld_shw = re.compile(r".*(/slideshow/)([0-9/]+)?([a-zA-Z0-9]+/)?([a-zA-Z0-9]+/)?[A-Za-z0-9_-]*")
    sld_pg = re.compile(r"([-|_]?[0-9]{1,2})\Z")
    index = r"/index\.[a-zA-Z]+(/)?"
    # NORMALIZE HTTPS TO HTTP
    if https.search(raw_url):
        raw_url = https.sub("http://", raw_url)
    # REMOVE QUERY STRINGS
    if rm_qry.search(raw_url):
        story = rm_qry.search(raw_url).group(1)
        # CLEAN UP SLIDESHOW URLS
        if sld_shw.search(story):
            ss_url = sld_shw.search(story).group(0)
            if sld_pg.search(ss_url):
                # drop the trailing page number so every slide maps to one url
                story = sld_pg.sub("", ss_url) + ".html"
            else:
                story = ss_url + ".html"
        # CLEAN UP ENDINGS (strip up to two trailing slashes)
        if story.endswith("/"):
            story = story[:-1]
        if story.endswith("/"):
            story = story[:-1]
        # REMOVE UNNECESSARY /index.html's
        return re.sub(index, "", story)
    else:
        # clean up endings
        if raw_url.endswith("/"):
            raw_url = raw_url[:-1]
        if raw_url.endswith("/"):
            raw_url = raw_url[:-1]
        return re.sub(index, "", raw_url)
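# illustrative transformations (as the regexes above should behave):
#   clean_url("https://www.nytimes.com/2013/05/29/world/story.html?hp")
#     => "http://www.nytimes.com/2013/05/29/world/story.html"
#   clean_url("http://www.nytimes.com/slideshow/2013/05/29/arts/gallery-5")
#     => "http://www.nytimes.com/slideshow/2013/05/29/arts/gallery.html"
# so query strings vanish and every slide of a slideshow collapses to one url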

def get_links_positions_attributes_images():
    full_links = []
    story_links = []
    font_sizes = []
    headlines = []
    pos_x = []
    pos_y = []
    is_img = []
    img_width = []
    img_height = []
    img_src = []
    links = b.find_elements_by_tag_name("a")
    for l in links:
        link = l.get_attribute("href")
        if isinstance(link, basestring):
            if re.match(ARTICLE_PATTERN, link):
                # okay, we've got an article
                # simplify url
                story = clean_url(link)
                story_links.append(story.encode('utf-8'))
                # full url
                full_links.append(link.encode('utf-8'))
                # headline
                headline = l.text.encode('utf-8')
                if headline == "":
                    headline = "NA"
                headlines.append(headline)
                # font size
                font_sizes.append(l.value_of_css_property('font-size'))
                # positions
                pos_x.append(l.location['x'])
                pos_y.append(l.location['y'])
                # images
                img_array = get_image_for_a_link(l)
                is_img.append(img_array[0])
                img_width.append(img_array[1])
                img_height.append(img_array[2])
                img_src.append(img_array[3])
    output = {"link": full_links, "story": story_links, "font_size": font_sizes,
              "headline": headlines, "pos_x": pos_x, "pos_y": pos_y,
              "is_img": is_img, "img_width": img_width, "img_height": img_height,
              "img_src": img_src}
    return output
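# the result is column-oriented -- one list per field, one entry per article
# link -- which is exactly the shape pandas' DataFrame.from_dict expects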

def connect_to_s3():
    # connect to AWS
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    # get (or create) the bucket and hand back a key object to write with
    bucket = conn.create_bucket(BUCKET_NAME, location=boto.s3.connection.Location.DEFAULT)
    k = Key(bucket)
    return k
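# note: boto's create_bucket is effectively get-or-create here -- if the
# bucket already exists under these credentials it is simply returned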

def upload_file_to_s3(fp):
    print "uploading to s3"

    def percent_cb(complete, total):
        # crude progress indicator: one dot per callback
        sys.stdout.write('.')
        sys.stdout.flush()
    k.key = fp
    k.set_contents_from_filename(fp, cb=percent_cb, num_cb=10)
    # remove the local copy once it's on s3
    os.remove(fp)
    print "\n"

def parse_10_min_chunk(date, url, img):
    print "getting data for", date
    # open page safely
    try:
        b.get(url)
        WebDriverWait(b, timeout=90).until(id_available_cb("home"))  # nytimes specific
    except Exception:
        # the homepage never finished loading; skip this run
        return None
    # take screenshot and upload it under the same relative key,
    # so the recorded s3 url below actually resolves to the object
    print "taking screenshot as", img
    b.save_screenshot(img)
    upload_file_to_s3(img)
    s3_img_path = "http://" + BUCKET_NAME + ".s3.amazonaws.com/" + img
    # links and positions
    print "parsing links, positions, attributes, images"
    data = get_links_positions_attributes_images()
    # repeat the per-run fields once per extracted link
    n = len(data['story'])
    data['date'] = [date.encode('utf-8') for i in range(n)]
    data['hp_url'] = [url.encode('utf-8') for i in range(n)]
    data['screenshot'] = [s3_img_path.encode('utf-8') for i in range(n)]
    return pd.DataFrame.from_dict(data).set_index('date')

# runner
b = webdriver.Firefox()
b.set_window_size(1280, 800)
k = connect_to_s3()
inputs = gen_inputs()
# make sure the local output directories exist
for d in ("screenshots", "homepage_data2"):
    if not os.path.exists(d):
        os.makedirs(d)
# output shell
cols = ['headline', 'hp_url', 'screenshot', 'link',
        'pos_x', 'pos_y', 'story', 'font_size', 'is_img', 'img_width',
        'img_height', 'img_src']
df = pd.DataFrame(columns=cols)
chunk = parse_10_min_chunk(inputs['date'], inputs['url'], inputs['img'])
if chunk is not None:
    df = df.append(chunk)
output_fp = "homepage_data2/test_output%s.csv" % inputs['date']
df.to_csv(output_fp)
b.quit()
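# to accumulate the 10-minute history this script implies, one could run it
# from cron (path and filename hypothetical):
#   */10 * * * * cd /path/to/project && python nyt_homepage.py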