@abelsonlive
Last active December 22, 2015 11:29
homepage scraping: collect every link on a newspaper homepage (here, the Guardian) along with its headline text, position, font size, and any image it wraps, using Selenium with PhantomJS.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

def get_image_for_a_link(link):
    """Return [is_img, width, height, src] for the first <img> inside a link, if any."""
    try:
        img = link.find_element_by_tag_name("img")
    except NoSuchElementException:
        img = None
    if img is not None:
        is_img = 1
        img_width = img.get_attribute("width")
        img_height = img.get_attribute("height")
        img_src = img.get_attribute("src")
    else:
        is_img = 0
        img_width = None
        img_height = None
        img_src = None
    return [is_img, img_width, img_height, img_src]


def scrape_links():
    """Scrape every <a> on the current page, recording its href, headline text,
    font size, position, and any image it wraps."""
    data = []
    links = b.find_elements_by_tag_name("a")  # `b` is the module-level webdriver
    for l in links:
        try:
            link = l.get_attribute("href")
        except StaleElementReferenceException:
            # The element went stale (e.g. the page re-rendered); skip it.
            pass
        else:
            if isinstance(link, basestring):
                print link
                img_array = get_image_for_a_link(l)
                link_dict = {
                    'link': link.encode('utf-8'),
                    'headline': l.text.encode('utf-8'),
                    'font_size': l.value_of_css_property('font-size'),
                    'pos_x': l.location['x'],
                    'pos_y': l.location['y'],
                    'is_img': img_array[0],
                    'img_width': img_array[1],
                    'img_height': img_array[2],
                    'img_src': img_array[3]
                }
                data.append(link_dict)
    return data


if __name__ == '__main__':
    # PhantomJS must be installed and on the PATH.
    b = webdriver.PhantomJS()
    b.get("http://www.guardian.co.uk/")
    print scrape_links()
    b.quit()
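
# A possible extension (not part of the original gist): inside the __main__
# block above, the scraped records could be written to disk instead of
# printed. A minimal, hypothetical sketch -- the output filename is illustrative:
#
#     import json
#     with open('homepage_links.json', 'w') as f:
#         json.dump(scrape_links(), f, indent=2)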