Created
May 29, 2013 16:55
-
-
Save abelsonlive/5671858 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import pandas as pd | |
import boto.s3 | |
from boto.s3.key import Key | |
import sys | |
import os | |
from selenium import webdriver | |
from contextlib import closing | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.common.exceptions import NoSuchElementException | |
from BeautifulSoup import BeautifulSoup | |
from datetime import datetime, timedelta | |
TIME_FORMAT = "%Y-%m-%d %H:%M:%S" | |
ARTICLE_PATTERN = ".*nytimes\\.com/(.*/)?[0-9]+/[0-9]+/[0-9]+/.*" | |
AWS_ACCESS_KEY_ID = 'zzzzzzzzzzzzzzzzzzzzzzzz' | |
AWS_SECRET_ACCESS_KEY = 'zzzzzzzzzzzzzzzzzzzzzzzz' | |
BUCKET_NAME = "nyt-homepage-history" | |
def round_date_time(dt, bucket, as_date=False):
    """Floor a timestamp down to the previous *bucket*-minute boundary.

    dt      -- a datetime (when as_date=True) or a TIME_FORMAT string
    bucket  -- bucket width in minutes (anything int() accepts)
    as_date -- when True, accept and return a datetime; otherwise
               parse/format using TIME_FORMAT strings.
    """
    parsed = dt if as_date else datetime.strptime(dt, TIME_FORMAT)
    # subtract the overshoot past the last bucket boundary, plus any
    # sub-minute precision, so the result lands exactly on the boundary
    floored = parsed - timedelta(
        minutes=parsed.minute % int(bucket),
        seconds=parsed.second,
        microseconds=parsed.microsecond,
    )
    return floored if as_date else floored.strftime(TIME_FORMAT)
def gen_inputs(url="http://www.nytimes.com/"):
    """Build the inputs for one scrape of *url*.

    Buckets "now" down to the nearest 10 minutes and returns a dict with:
      img  -- screenshot path derived from the bucketed timestamp
      date -- bucketed timestamp formatted "%Y-%m-%d %H:%M:%S"
      url  -- the page to scrape (defaults to the NYT homepage)
    """
    bucketed = round_date_time(datetime.now(), bucket=10, as_date=True)
    screenshot = "screenshots/" + bucketed.strftime("%Y-%m-%d-%H-%M-%S") + ".png"
    return {
        "img": screenshot,
        "date": bucketed.strftime("%Y-%m-%d %H:%M:%S"),
        "url": url,
    }
def id_available_cb(tag_id):
    """Return a WebDriverWait callback that checks for an element with id *tag_id*.

    The callback returns True once the element is present, False otherwise.
    BUG FIX: the inner callback previously ignored its ``browser`` argument
    and queried the module-level global ``b`` instead; it now uses the
    browser WebDriverWait actually hands it.
    """
    def callback(browser):
        try:
            browser.find_element_by_id(tag_id)
        except NoSuchElementException:
            return False
        else:
            return True
    return callback
def get_image_for_a_link(link):
    """Describe the first <img> nested inside the anchor element *link*.

    Returns a 4-tuple (is_img, width, height, src): the flag is 1 with the
    img's attributes when an <img> is found, otherwise 0 with "NA" fillers.
    """
    try:
        img = link.find_element_by_tag_name("img")
    except NoSuchElementException:
        img = None
    if img is None:
        return (0, "NA", "NA", "NA")
    return (
        1,
        img.get_attribute("width"),
        img.get_attribute("height"),
        img.get_attribute("src"),
    )
# this is nytimes specific
def clean_url(raw_url):
    """Canonicalize an nytimes.com URL so revisits map to the same story.

    Downgrades https to http, strips query strings/fragments, collapses
    slideshow page URLs to a single .html URL, trims trailing slashes
    (up to two), and removes any "/index.<ext>" suffix.
    """
    query_re = re.compile(r"(http://[A-Za-z0-9\./\-_]+)(\?.*)?(#.*)?(&.*)?")
    slideshow_re = re.compile(
        r".*(/slideshow/)([0-9/]+)?([a-zA-Z0-9]+/)?([a-zA-Z0-9]+/)?[A-Za-z0-9_-]*")
    page_suffix_re = re.compile(r"([-|_]?[0-9]{1,2})\Z")
    index_pattern = r"/index\.[a-zA-Z]+(/)?"

    # normalize scheme (re.sub is a no-op when "https://" is absent)
    url = re.sub(r"https://", "http://", raw_url)

    match = query_re.search(url)
    if match is None:
        # not a plain http story URL: just tidy trailing slashes / index pages
        for _ in range(2):
            if url.endswith("/"):
                url = url[:-1]
        return re.sub(index_pattern, "", url)

    story = match.group(1)  # URL with query string / fragment dropped

    # collapse slideshow URLs: drop the trailing page number, re-add .html
    slideshow = slideshow_re.search(story)
    if slideshow is not None:
        ss_url = slideshow.group(0)
        if page_suffix_re.search(ss_url):
            story = page_suffix_re.sub("", ss_url) + ".html"
        else:
            story = ss_url + ".html"

    # trim up to two trailing slashes, then any unnecessary /index.html
    for _ in range(2):
        if story.endswith("/"):
            story = story[:-1]
    return re.sub(index_pattern, "", story)
def get_links_positions_attributes_images():
    """Harvest every article link on the page currently loaded in the
    global browser ``b``.

    Returns a dict of parallel lists (one entry per article anchor):
    link, story (canonical URL), font_size, headline, pos_x, pos_y,
    is_img, img_width, img_height, img_src.

    BUG FIX: the empty-headline check previously used ``headline is ""``
    (identity, which only happens to work because of CPython string
    interning); it now uses equality.
    """
    full_links = []
    story_links = []
    font_sizes = []
    headlines = []
    pos_x = []
    pos_y = []
    is_img = []
    img_width = []
    img_height = []
    img_src = []
    links = b.find_elements_by_tag_name("a")
    for l in links:
        link = l.get_attribute("href")
        # href can be None; keep only dated article URLs
        if isinstance(link, basestring) and re.match(ARTICLE_PATTERN, link):
            # canonical story URL plus the raw link
            story = clean_url(link)
            story_links.append(story.encode('utf-8'))
            full_links.append(link.encode('utf-8'))
            # headline text ("NA" when the anchor has no visible text)
            headline = l.text.encode('utf-8')
            if headline == "":
                headline = "NA"
            headlines.append(headline)
            # styling and on-page position
            font_sizes.append(l.value_of_css_property('font-size'))
            pos_x.append(l.location['x'])
            pos_y.append(l.location['y'])
            # nested image metadata, if any
            has_img, width, height, src = get_image_for_a_link(l)
            is_img.append(has_img)
            img_width.append(width)
            img_height.append(height)
            img_src.append(src)
    return {"link": full_links, "story": story_links, "font_size": font_sizes,
            "headline": headlines, "pos_x": pos_x, "pos_y": pos_y,
            "is_img": is_img, "img_width": img_width, "img_height": img_height,
            "img_src": img_src}
def connect_to_s3():
    """Connect to AWS S3 and return a Key handle on the history bucket.

    create_bucket is idempotent when the bucket already exists and we
    own it, so this is safe to call on every run.
    """
    connection = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = connection.create_bucket(
        BUCKET_NAME, location=boto.s3.connection.Location.DEFAULT)
    return Key(bucket)
def upload_file_to_s3(fp): | |
print "uploading to s3" | |
def percent_cb(complete, total): | |
sys.stdout.write('.') | |
sys.stdout.flush() | |
k.key = fp | |
k.set_contents_from_filename(fp, cb=percent_cb, num_cb=10) | |
os.system("rm " + fp) | |
print "\n" | |
def parse_10_min_chunk(date, url, img, wd): | |
print "getting data for", date | |
# open page safely | |
try: | |
b.get(url) | |
WebDriverWait(b, timeout=90).until(id_available_cb("home")) #nytimes specific | |
except: | |
return None | |
# take screenshot | |
print "taking screenshot as", img | |
b.save_screenshot(img) | |
upload_file_to_s3(wd + "/" + img) | |
s3_img_path = 'http://nyt-homepage-history.s3.amazonaws.com/' + img | |
# links and positions | |
print "parsing links, positions, attributes, images" | |
data = get_links_positions_attributes_images() | |
# add other elements repetitively | |
n = len(data['story']) | |
times = range(0, n) | |
data['date'] = [date.encode('utf-8') for i in times] | |
data['hp_url'] = [url.encode('utf-8') for i in times] | |
data['screenshot'] = [s3_img_path.encode('utf-8') for i in times] | |
return pd.DataFrame().from_dict(data).set_index('date') | |
# runner — module-level script: one snapshot per invocation
# (presumably launched from cron every 10 minutes — TODO confirm)
wd = os.getcwd()
b = webdriver.Firefox()  # global browser; read by the helper functions above
b.set_window_size(1280, 800)
k = connect_to_s3()  # global S3 Key handle; read by upload_file_to_s3
inputs = gen_inputs()
# output shell
cols = ['headline', 'hp_url', 'screenshot', 'link',
'pos_x', 'pos_y', 'story', "font_size", "is_img","img_width",
"img_height", "img_src"]
df = pd.DataFrame(columns= cols)
chunk = parse_10_min_chunk(inputs['date'], inputs['url'], inputs['img'], wd)
# NOTE(review): chunk is None when the page load timed out; confirm that
# df.append(None) behaves acceptably on this pandas version
df = df.append(chunk)
# NOTE(review): inputs['date'] contains ':' characters, which are illegal
# in filenames on some filesystems — verify target platform
output_fp ="homepage_data2/test_output%s.csv" % inputs['date']
df.to_csv(output_fp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment