# k1000/selenium_scrap.py

## selenium_scrap.py
"""
Basic script to scrape pages with Selenium

requires:
- lxml
- selenium http://seleniumhq.org/download/

execution:
1. start selenium server
   $ java -jar selenium-server-standalone-2.16.jar
2. run this script

"""

import urlparse, os
from selenium import selenium
from lxml.html import fromstring


# Site root; it is prefixed to the image `src` values scraped from the pages.
WEB_URL = "http://www.webtenerife.com"
# Language code substituted into the site URLs and the output directory name.
LANG = "en"
# Directory that download() writes the fetched files into.
IMG_DIR = '/Users/kamil/Desktop/PROJECTOS/webtenerife/scrap_actualidad-%s/' % (LANG,)

def download(url):
	"""Fetch `url` and store its contents in a file under IMG_DIR.

	url -- address of the resource to fetch.

	The local file name is the last "/"-separated segment of `url`; an
	existing file with that name is overwritten.

	Returns the path of the local file.
	"""
	import urllib
	from contextlib import closing

	localPath = os.path.join(IMG_DIR, url.split("/")[-1])
	# closing() guarantees the remote handle is released even when the read
	# or the write below raises.
	with closing(urllib.urlopen(url)) as webFile:
		# Binary mode: the callers in this script fetch images, and a
		# text-mode handle corrupts such data on platforms that translate
		# line endings.
		with open(localPath, 'wb') as localFile:
			localFile.write(webFile.read())
	return localPath

def get_thumbs(thumbs):
	"""Download every thumbnail in `thumbs` that carries a `src` attribute.

	thumbs -- iterable of elements exposing .get("src"); the script passes
	          the elements matched by an lxml cssselect() call.

	Elements without a `src` are skipped. Nothing is returned.
	"""
	for thumb in thumbs:
		thumb_name = thumb.get("src")
		if not thumb_name:
			continue
		# urljoin rather than plain concatenation, so an absolute or
		# non-root-relative `src` still produces a valid URL.
		thumb_src = urlparse.urljoin(WEB_URL, thumb_name)
		print("downloading thumb %s" % thumb_src)
		download(thumb_src)

def get_image(img):
	"""Download the image referenced by the first element of `img`.

	img -- sequence of elements exposing .get("src"); the script passes the
	       (possibly empty) result of an lxml cssselect() call.

	Returns the local path reported by download(), or False when `img` is
	empty or its first element has no `src`.
	"""
	if not img:
		return False
	src = img[0].get("src")
	if not src:
		# Same rule as get_thumbs(): without a `src` there is nothing to
		# fetch, and formatting None into the URL would request garbage.
		return False
	# urljoin rather than plain concatenation, so an absolute or
	# non-root-relative `src` still produces a valid URL.
	img_href = urlparse.urljoin(WEB_URL, src)
	print("downloading %s" % img_href)
	return download(img_href)


# Drive a browser through the Selenium RC server listening on localhost:4444.
# NOTE(review): br.stop() is never called, so the browser session is left
# open when the script finishes.
br = selenium("localhost", 4444, "*chrome", "http://www.webtenerife.com/Actualidad-es/?Lang=es")
br.start()
br.open("%s/Actualidad-%s/?Lang=%s" % (WEB_URL, LANG, LANG))
# Fill in the search form's date field (presumably the "from" date, going by
# the "DateDesde" id) and submit the search.
br.type("id=BuscadorFechas1_DateDesde_txt_Date", "22/10/2010")
#br.type("id=BuscadorFechas1_DateHasta_txt_Date", "29/11/10")
br.click("id=BuscadorFechas1_lblBuscar")
br.wait_for_page_to_load("100000")

# The number of links matched by this XPath is used as the number of result
# pages. int(): the Selenium RC client may return the count as a string,
# which range() rejects; the call is harmless when it is already a number.
pages_nr = int(br.get_xpath_count("//div[2]/ul/li/a"))
print("%s pages" % pages_nr)
for i in range(pages_nr):
	# Open result page number i of the listing.
	href = "%s/Actualidad-%s/index.htm?Lang=%s&ind=%s" % (WEB_URL, LANG, LANG, i)
	print("opening %s" % href)
	br.open(href)
	br.wait_for_page_to_load("80000")
	root = fromstring(br.get_html_source())
	thumbs = root.cssselect(".resultados li img")
	get_thumbs(thumbs)
	# Collect the links first: `root` is rebound to each linked page below.
	pages = root.cssselect(".resultados h3 a")
	for page in pages:
		page_href = page.get("href")
		print("opening %s" % page_href)
		br.open(page_href)
		br.wait_for_page_to_load("50000")
		root = fromstring(br.get_html_source())
		img = root.cssselect("#imgCentral_PresentationModeControlsContainer_PresentationImage")
		get_image(img)
	"""
	Basic script to scrape pages with Selenium

	requires:
	- lxml
	- selenium http://seleniumhq.org/download/

	execution:
	1. start selenium server
	$ java -jar selenium-server-standalone-2.16.jar
	2. run this script

	"""

	import urlparse, os
	from selenium import selenium
	from lxml.html import fromstring


	# Site root; it is prefixed to the image `src` values scraped from the pages.
	WEB_URL = "http://www.webtenerife.com"
	# Language code substituted into the site URLs and the output directory name.
	LANG = "en"
	# Directory that download() writes the fetched files into.
	IMG_DIR = '/Users/kamil/Desktop/PROJECTOS/webtenerife/scrap_actualidad-%s/' % (LANG,)

	def download(url):
		"""Fetch `url` and store its contents in a file under IMG_DIR.

		url -- address of the resource to fetch.

		The local file name is the last "/"-separated segment of `url`; an
		existing file with that name is overwritten.

		Returns the path of the local file.
		"""
		import urllib
		from contextlib import closing

		localPath = os.path.join(IMG_DIR, url.split("/")[-1])
		# closing() guarantees the remote handle is released even when the
		# read or the write below raises.
		with closing(urllib.urlopen(url)) as webFile:
			# Binary mode: the callers in this script fetch images, and a
			# text-mode handle corrupts such data on platforms that
			# translate line endings.
			with open(localPath, 'wb') as localFile:
				localFile.write(webFile.read())
		return localPath

	def get_thumbs(thumbs):
		"""Download every thumbnail in `thumbs` that carries a `src` attribute.

		thumbs -- iterable of elements exposing .get("src"); the script
		          passes the elements matched by an lxml cssselect() call.

		Elements without a `src` are skipped. Nothing is returned.
		"""
		for thumb in thumbs:
			thumb_name = thumb.get("src")
			if not thumb_name:
				continue
			# urljoin rather than plain concatenation, so an absolute or
			# non-root-relative `src` still produces a valid URL.
			thumb_src = urlparse.urljoin(WEB_URL, thumb_name)
			print("downloading thumb %s" % thumb_src)
			download(thumb_src)

	def get_image(img):
		"""Download the image referenced by the first element of `img`.

		img -- sequence of elements exposing .get("src"); the script passes
		       the (possibly empty) result of an lxml cssselect() call.

		Returns the local path reported by download(), or False when `img`
		is empty or its first element has no `src`.
		"""
		if not img:
			return False
		src = img[0].get("src")
		if not src:
			# Same rule as get_thumbs(): without a `src` there is nothing
			# to fetch, and formatting None into the URL would request
			# garbage.
			return False
		# urljoin rather than plain concatenation, so an absolute or
		# non-root-relative `src` still produces a valid URL.
		img_href = urlparse.urljoin(WEB_URL, src)
		print("downloading %s" % img_href)
		return download(img_href)


	# Drive a browser through the Selenium RC server listening on localhost:4444.
	# NOTE(review): br.stop() is never called, so the browser session is left
	# open when the script finishes.
	br = selenium("localhost", 4444, "*chrome", "http://www.webtenerife.com/Actualidad-es/?Lang=es")
	br.start()
	br.open("%s/Actualidad-%s/?Lang=%s" % (WEB_URL, LANG, LANG))
	# Fill in the search form's date field (presumably the "from" date, going
	# by the "DateDesde" id) and submit the search.
	br.type("id=BuscadorFechas1_DateDesde_txt_Date", "22/10/2010")
	#br.type("id=BuscadorFechas1_DateHasta_txt_Date", "29/11/10")
	br.click("id=BuscadorFechas1_lblBuscar")
	br.wait_for_page_to_load("100000")

	# The number of links matched by this XPath is used as the number of
	# result pages. int(): the Selenium RC client may return the count as a
	# string, which range() rejects; the call is harmless when it is already
	# a number.
	pages_nr = int(br.get_xpath_count("//div[2]/ul/li/a"))
	print("%s pages" % pages_nr)
	for i in range(pages_nr):
		# Open result page number i of the listing.
		href = "%s/Actualidad-%s/index.htm?Lang=%s&ind=%s" % (WEB_URL, LANG, LANG, i)
		print("opening %s" % href)
		br.open(href)
		br.wait_for_page_to_load("80000")
		root = fromstring(br.get_html_source())
		thumbs = root.cssselect(".resultados li img")
		get_thumbs(thumbs)
		# Collect the links first: `root` is rebound to each linked page below.
		pages = root.cssselect(".resultados h3 a")
		for page in pages:
			page_href = page.get("href")
			print("opening %s" % page_href)
			br.open(page_href)
			br.wait_for_page_to_load("50000")
			root = fromstring(br.get_html_source())
			img = root.cssselect("#imgCentral_PresentationModeControlsContainer_PresentationImage")
			get_image(img)