import os
import re
import sys
from datetime import datetime, timedelta

import boto
import boto.s3.connection
import pandas as pd
from boto.s3.key import Key
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait

TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
ARTICLE_PATTERN = r".*nytimes\.com/(.*/)?[0-9]+/[0-9]+/[0-9]+/.*"
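# e.g. this pattern is meant to catch dated article urls like
# "http://www.nytimes.com/2013/05/29/business/some-headline.html"
# (the year/month/day path) while skipping section fronts and nav links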
AWS_ACCESS_KEY_ID = 'zzzzzzzzzzzzzzzzzzzzzzzz'
AWS_SECRET_ACCESS_KEY = 'zzzzzzzzzzzzzzzzzzzzzzzz'
BUCKET_NAME = "nyt-homepage-history"

def round_date_time(dt, bucket, as_date=False):
    # floor a timestamp to the nearest `bucket`-minute boundary;
    # `dt` is a datetime when as_date=True, otherwise a TIME_FORMAT string
    if not as_date:
        dt = datetime.strptime(dt, TIME_FORMAT)
    dt = dt - timedelta(minutes=dt.minute % int(bucket),
                        seconds=dt.second,
                        microseconds=dt.microsecond)
    if as_date:
        return dt
    else:
        return dt.strftime(TIME_FORMAT)
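# a quick sketch of the rounding (illustrative values):
#   round_date_time("2013-05-29 16:57:42", bucket=10)
#   => "2013-05-29 16:50:00"  (minutes floored to the 10-minute bucket)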

def gen_inputs(url="http://www.nytimes.com/"):
    # round the current time down to its 10-minute bucket and build the
    # matching screenshot path
    start = datetime.now()
    d = round_date_time(start, bucket=10, as_date=True)
    ss = "screenshots/" + d.strftime("%Y-%m-%d-%H-%M-%S") + ".png"
    return {"img": ss, "date": d.strftime(TIME_FORMAT), "url": url}

def id_available_cb(tag_id):
    """Return a callback that checks whether an element with this id is present."""
    def callback(browser):
        try:
            browser.find_element_by_id(tag_id)
        except NoSuchElementException:
            return False
        else:
            return True
    return callback
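# usage sketch (this mirrors the call in parse_10_min_chunk below):
#   WebDriverWait(b, timeout=90).until(id_available_cb("home"))
# blocks until the element with id="home" appears, or raises on timeout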

def get_image_for_a_link(link):
    # if the anchor wraps an <img>, record its dimensions and source
    try:
        img = link.find_element_by_tag_name("img")
    except NoSuchElementException:
        img = None
    if img is not None:
        is_img = 1
        img_width = img.get_attribute("width")
        img_height = img.get_attribute("height")
        img_src = img.get_attribute("src")
    else:
        is_img = 0
        img_width = "NA"
        img_height = "NA"
        img_src = "NA"
    return (is_img, img_width, img_height, img_src)
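# returns a 4-tuple, e.g. (1, "190", "126", "<img src url>") when the link
# wraps an image, or (0, "NA", "NA", "NA") for a plain text link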

# this is nytimes specific
def clean_url(raw_url):
    # REGEXES
    https = re.compile(r"https://")
    rm_qry = re.compile(r"(http://[A-Za-z0-9\./\-_]+)(\?.*)?(#.*)?(&.*)?")
    sld_shw = re.compile(r".*(/slideshow/)([0-9/]+)?([a-zA-Z0-9]+/)?([a-zA-Z0-9]+/)?[A-Za-z0-9_-]*")
    sld_pg = re.compile(r"([-|_]?[0-9]{1,2})\Z")
    index = r"/index\.[a-zA-Z]+(/)?"
    # NORMALIZE HTTPS TO HTTP
    if https.search(raw_url):
        raw_url = https.sub("http://", raw_url)
    # REMOVE QUERY STRINGS
    if rm_qry.search(raw_url):
        story = rm_qry.search(raw_url).group(1)
        # CLEAN UP SLIDESHOW URLS
        if sld_shw.search(story):
            ss_url = sld_shw.search(story).group(0)
            if sld_pg.search(ss_url):
                # drop the trailing page number so every slide maps to one url
                story = sld_pg.sub("", ss_url) + ".html"
            else:
                story = ss_url + ".html"
        # CLEAN UP ENDINGS (strip up to two trailing slashes)
        if story.endswith("/"):
            story = story[:-1]
        if story.endswith("/"):
            story = story[:-1]
        # REMOVE UNNECESSARY /index.html's
        return re.sub(index, "", story)
    else:
        # clean up endings
        if raw_url.endswith("/"):
            raw_url = raw_url[:-1]
        if raw_url.endswith("/"):
            raw_url = raw_url[:-1]
        return re.sub(index, "", raw_url)
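# illustrative transformations (as the regexes above should behave):
#   clean_url("https://www.nytimes.com/2013/05/29/world/story.html?hp")
#     => "http://www.nytimes.com/2013/05/29/world/story.html"
#   clean_url("http://www.nytimes.com/slideshow/2013/05/29/arts/gallery-5")
#     => "http://www.nytimes.com/slideshow/2013/05/29/arts/gallery.html"
# so query strings vanish and every slide of a slideshow collapses to one url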

def get_links_positions_attributes_images():
    full_links = []
    story_links = []
    font_sizes = []
    headlines = []
    pos_x = []
    pos_y = []
    is_img = []
    img_width = []
    img_height = []
    img_src = []
    links = b.find_elements_by_tag_name("a")
    for l in links:
        link = l.get_attribute("href")
        if isinstance(link, basestring):
            if re.match(ARTICLE_PATTERN, link):
                # okay, we've got an article
                # simplify url
                story = clean_url(link)
                story_links.append(story.encode('utf-8'))
                # full url
                full_links.append(link.encode('utf-8'))
                # headline
                headline = l.text.encode('utf-8')
                if headline == "":
                    headline = "NA"
                headlines.append(headline)
                # font size
                font_sizes.append(l.value_of_css_property('font-size'))
                # positions
                pos_x.append(l.location['x'])
                pos_y.append(l.location['y'])
                # images
                img_array = get_image_for_a_link(l)
                is_img.append(img_array[0])
                img_width.append(img_array[1])
                img_height.append(img_array[2])
                img_src.append(img_array[3])
    output = {"link": full_links, "story": story_links, "font_size": font_sizes,
              "headline": headlines, "pos_x": pos_x, "pos_y": pos_y,
              "is_img": is_img, "img_width": img_width, "img_height": img_height,
              "img_src": img_src}
    return output
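# the result is column-oriented -- one list per field, one entry per article
# link -- which is exactly the shape pandas' DataFrame.from_dict expects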

def connect_to_s3():
    # connect to AWS
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    # get (or create) the bucket and hand back a key object to write with
    bucket = conn.create_bucket(BUCKET_NAME, location=boto.s3.connection.Location.DEFAULT)
    k = Key(bucket)
    return k
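# note: boto's create_bucket is effectively get-or-create here -- if the
# bucket already exists under these credentials it is simply returned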

def upload_file_to_s3(fp):
    print "uploading to s3"

    def percent_cb(complete, total):
        # crude progress indicator: one dot per callback
        sys.stdout.write('.')
        sys.stdout.flush()
    k.key = fp
    k.set_contents_from_filename(fp, cb=percent_cb, num_cb=10)
    # remove the local copy once it's on s3
    os.remove(fp)
    print "\n"

def parse_10_min_chunk(date, url, img):
    print "getting data for", date
    # open page safely
    try:
        b.get(url)
        WebDriverWait(b, timeout=90).until(id_available_cb("home"))  # nytimes specific
    except Exception:
        # the homepage never finished loading; skip this run
        return None
    # take screenshot and upload it under the same relative key,
    # so the recorded s3 url below actually resolves to the object
    print "taking screenshot as", img
    b.save_screenshot(img)
    upload_file_to_s3(img)
    s3_img_path = "http://" + BUCKET_NAME + ".s3.amazonaws.com/" + img
    # links and positions
    print "parsing links, positions, attributes, images"
    data = get_links_positions_attributes_images()
    # repeat the per-run fields once per extracted link
    n = len(data['story'])
    data['date'] = [date.encode('utf-8') for i in range(n)]
    data['hp_url'] = [url.encode('utf-8') for i in range(n)]
    data['screenshot'] = [s3_img_path.encode('utf-8') for i in range(n)]
    return pd.DataFrame.from_dict(data).set_index('date')

# runner
b = webdriver.Firefox()
b.set_window_size(1280, 800)
k = connect_to_s3()
inputs = gen_inputs()
# make sure the local output directories exist
for d in ("screenshots", "homepage_data2"):
    if not os.path.exists(d):
        os.makedirs(d)
# output shell
cols = ['headline', 'hp_url', 'screenshot', 'link',
        'pos_x', 'pos_y', 'story', 'font_size', 'is_img', 'img_width',
        'img_height', 'img_src']
df = pd.DataFrame(columns=cols)
chunk = parse_10_min_chunk(inputs['date'], inputs['url'], inputs['img'])
if chunk is not None:
    df = df.append(chunk)
output_fp = "homepage_data2/test_output%s.csv" % inputs['date']
df.to_csv(output_fp)
b.quit()
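# to accumulate the 10-minute history this script implies, one could run it
# from cron (path and filename hypothetical):
#   */10 * * * * cd /path/to/project && python nyt_homepage.py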