# abelsonlive/homepage.py

## homepage.py
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

def get_image_for_a_link(link):
    """Summarise the first ``<img>`` element found inside ``link``.

    ``link`` is an element object exposing ``find_element_by_tag_name``.

    Returns a four-item list ``[is_img, img_width, img_height, img_src]``.
    ``is_img`` is 1 when an image was found and 0 otherwise; the remaining
    entries are the image's ``width``, ``height`` and ``src`` values as
    returned by ``get_attribute``, or ``None`` when there is no image.
    """
    try:
        image = link.find_element_by_tag_name("img")
    except NoSuchElementException:
        # The lookup reports "no nested image" by raising.
        image = None

    if image is None:
        return [0, None, None, None]

    # Attributes are read in a fixed order: width, height, src.
    details = [image.get_attribute(name) for name in ("width", "height", "src")]
    return [1] + details

def _describe_anchor(anchor):
    """Build the result dict for one anchor, or ``None`` if its href is not a string."""
    href = anchor.get_attribute("href")
    if not isinstance(href, basestring):
        return None

    # Single-argument parenthesised form prints exactly like ``print href``
    # under Python 2's print statement.
    print(href)

    is_img, img_width, img_height, img_src = get_image_for_a_link(anchor)
    # Read the location once instead of once per coordinate.
    location = anchor.location

    return {
        'link': href.encode('utf-8'),
        'headline': anchor.text.encode('utf-8'),
        'font_size': anchor.value_of_css_property('font-size'),
        'pos_x': location['x'],
        'pos_y': location['y'],
        'is_img': is_img,
        'img_width': img_width,
        'img_height': img_height,
        'img_src': img_src,
    }


def scrape_links(driver=None):
    """Collect metadata for every ``<a>`` element on the loaded page.

    Parameters:
        driver: object exposing ``find_elements_by_tag_name``. When omitted,
            the module-level ``b`` (created by the ``__main__`` block) is
            used, so existing ``scrape_links()`` calls keep working.

    Returns:
        A list with one dict per anchor whose ``href`` is a string. Each dict
        has the keys ``link``, ``headline``, ``font_size``, ``pos_x``,
        ``pos_y``, ``is_img``, ``img_width``, ``img_height`` and ``img_src``.
        ``link`` and ``headline`` are UTF-8 encoded.

    Side effects:
        Prints each kept href to stdout.

    Anchors that raise ``StaleElementReferenceException`` at any point while
    they are being read are skipped rather than aborting the whole scrape.
    """
    if driver is None:
        # Fall back to the global driver set up by the script entry point.
        driver = b

    data = []
    for anchor in driver.find_elements_by_tag_name("a"):
        try:
            record = _describe_anchor(anchor)
        except StaleElementReferenceException:
            # Best effort: drop anchors that can no longer be read.
            continue
        if record is not None:
            data.append(record)

    return data

if __name__ == '__main__':
    # `b` must stay a module-level name: scrape_links() reads it as a global.
    b = webdriver.PhantomJS()
    try:
        b.get("http://www.guardian.co.uk/")
        # Single-argument parenthesised form prints exactly like the
        # Python 2 print statement.
        print(scrape_links())
    finally:
        # Always shut the browser down so no PhantomJS process is left behind,
        # even when loading or scraping fails.
        b.quit()