Skip to content

Instantly share code, notes, and snippets.

@k1000
Created August 17, 2011 08:38
Show Gist options
  • Save k1000/1151103 to your computer and use it in GitHub Desktop.
Save k1000/1151103 to your computer and use it in GitHub Desktop.
selenium scrap
"""
Basic script to scrape pages with Selenium.

Requires:
- lxml
- selenium (http://seleniumhq.org/download/)

Usage:
1. Start the Selenium server:
   $ java -jar selenium-server-standalone-2.16.jar
2. Run this script.
"""
import urlparse, os
from selenium import selenium
from lxml.html import fromstring
# Base URL used to build absolute image URLs from the src attributes found
# in the scraped pages.
WEB_URL = "http://www.webtenerife.com"
# Language code substituted into the "Actualidad-<LANG>" page URLs and into
# the IMG_DIR path below.
LANG = "en"
# Directory where download() stores the fetched files.
# NOTE(review): hard-coded, machine-specific path that nothing in this file
# creates -- confirm it exists before running.
IMG_DIR = '/Users/kamil/Desktop/PROJECTOS/webtenerife/scrap_actualidad-%s/' % LANG
def download(url):
    """Copy the contents of the file at *url* into IMG_DIR.

    The local file is named after the last "/"-separated segment of the URL
    and is overwritten if it already exists.

    url: absolute URL of the resource to fetch.

    Returns the path of the local file that was written.
    """
    import urllib
    webFile = urllib.urlopen(url)
    try:
        localPath = os.path.join(IMG_DIR, url.split("/")[-1])
        # The payload is image data: write it in binary mode so no newline
        # translation can corrupt it.
        with open(localPath, 'wb') as localFile:
            localFile.write(webFile.read())
    finally:
        # Close explicitly so the connection is released even when reading
        # or writing fails.
        webFile.close()
    return localPath
def get_thumbs(thumbs):
    """Download the image referenced by every element in *thumbs*.

    thumbs: iterable of elements exposing ``get("src")`` (the caller passes
        the result of an lxml ``cssselect`` query). Elements without a
        ``src`` value are skipped.
    """
    for thumb in thumbs:
        thumb_name = thumb.get("src")
        if not thumb_name:
            continue
        # urljoin copes with site-relative and already-absolute src values;
        # plain string concatenation produced a broken URL for the latter.
        thumb_src = urlparse.urljoin(WEB_URL, thumb_name)
        print("downloading thumb %s" % thumb_src)
        download(thumb_src)
def get_image(img):
    """Download the image referenced by the first element of *img*.

    img: list of elements exposing ``get("src")`` (the caller passes the
        result of an lxml ``cssselect`` query); may be empty.

    Returns the local path reported by download(), or False when *img* is
    empty or its first element has no ``src`` value.
    """
    if not img:
        return False
    src = img[0].get("src")
    if not src:
        # Without this guard a missing src was formatted as the text "None"
        # and a bogus URL was requested.
        return False
    # urljoin copes with site-relative and already-absolute src values.
    img_href = urlparse.urljoin(WEB_URL, src)
    print("downloading %s" % img_href)
    return download(img_href)
# Drive a browser through the Selenium server listening on localhost:4444:
# run the date search, then walk every result page and every article linked
# from it, saving thumbnails and the main article image.
br = selenium("localhost", 4444, "*chrome", "http://www.webtenerife.com/Actualidad-es/?Lang=es")
br.start()
try:
    br.open("http://www.webtenerife.com/Actualidad-%s/?Lang=%s" % (LANG, LANG))
    br.type("id=BuscadorFechas1_DateDesde_txt_Date", "22/10/2010")
    # Optional upper bound for the date filter:
    #br.type("id=BuscadorFechas1_DateHasta_txt_Date", "29/11/10")
    br.click("id=BuscadorFechas1_lblBuscar")
    br.wait_for_page_to_load("100000")

    # The Selenium RC client hands the count back as text; range() needs an
    # integer, so convert before looping.
    pages_nr = int(br.get_xpath_count("//div[2]/ul/li/a"))
    print("%s pages" % pages_nr)

    for i in range(pages_nr):
        href = "http://www.webtenerife.com/Actualidad-%s/index.htm?Lang=%s&ind=%s" % (LANG, LANG, i)
        print("opening %s" % href)
        br.open(href)
        br.wait_for_page_to_load("80000")
        root = fromstring(br.get_html_source())

        thumbs = root.cssselect(".resultados li img")
        get_thumbs(thumbs)

        # Collect the article links before navigating away from the listing.
        pages = root.cssselect(".resultados h3 a")
        for page in pages:
            page_href = page.get("href")
            print("opening %s" % page_href)
            br.open(page_href)
            br.wait_for_page_to_load("50000")
            detail_root = fromstring(br.get_html_source())
            img = detail_root.cssselect("#imgCentral_PresentationModeControlsContainer_PresentationImage")
            get_image(img)
finally:
    # Always end the browser session, even when a page fails to load or a
    # download raises.
    br.stop()
@k1000
Copy link
Author

k1000 commented Aug 17, 2011

needs selenium-server-standalone-2.4.0.jar


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment