# lineker/extract_url.py

## extract_url.py
from lxml import html,etree
import requests
import pprint
#This will create a list of buyers:
#buyers = tree.xpath('//div[@title="buyer-name"]/text()')
#This will create a list of prices
#prices = tree.xpath('//span[@class="item-price"]/text()')

#print 'Buyers: ', buyers
#print 'Prices: ', prices


# Candidate User-Agent strings; only the first entry is put into `headers`.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]

# Extra HTTP headers handed to the requests calls in this script.
headers = {}
headers['User-Agent'] = user_agents[0]
baseurl = "http://www.canardscanins.ca"

# Download the listing page. The timeout keeps the script from waiting
# forever on an unresponsive server, and raise_for_status() stops it from
# scraping an HTTP error page as if it were the listing.
response = requests.get(baseurl+'/canins/portail.php?action=liste',
                        headers=headers, timeout=30)
response.raise_for_status()
#print response.text

# Convert the HTML text to an lxml element tree.
tree = html.fromstring(response.text)

# Read the href and the text from the *same* <a> element. Collecting
# '//a/@href' and '//a/text()' as two separate lists and pairing them by
# index breaks as soon as one anchor has no href, no direct text, or more
# than one text node: names land on the wrong URL or the index lookup
# raises IndexError.
anchors = tree.xpath('//a[@href]')
links = [anchor.get('href') for anchor in anchors]
titles = [''.join(anchor.xpath('text()')) for anchor in anchors]
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(titles)

# Index the collected names by full URL (baseurl + href).
parks = {}
for href, name in zip(links, titles):
    parks[baseurl + href] = {"Name": name}
pprint.pprint(parks)
#for item in links:
#    print baseurl+item

#for each key,value in parks

# Nothing to inspect if the listing produced no links; stop with a clear
# message instead of the KeyError that popitem() raises on an empty dict.
if not parks:
    raise SystemExit('no links found on the listing page')

# Take one arbitrary (url, info) entry out of parks and fetch it as a sample.
key, value = parks.popitem()

# Same timeout / status handling as any other request: do not hang, and do
# not parse an HTTP error page.
page = requests.get(key, headers=headers, timeout=30)
page.raise_for_status()
ptree = html.fromstring(page.text)

# Print the text and tail of the element that follows the first <img>.
# The page may have no <img>, or the image may have no following sibling
# (getnext() then returns None), so both cases are checked. The comparison
# with None is explicit because an lxml element without children is falsy.
imgs = ptree.xpath('//img')
after_img = imgs[0].getnext() if imgs else None
if after_img is not None:
    print(after_img.text)
    print(after_img.tail)

# Extract <b> nodes that contain the text "Fondation".
founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
print(len(founded))

# Serialize the sibling that follows the first match, when there is one.
after_founded = founded[0].getnext() if founded else None
if after_founded is not None:
    print(etree.tostring(after_founded))

#for img in ptree.iterfind('.//img'):
#    print etree.tostring(img)
	# NOTE(review): this tab-indented section appears to be a second copy of
	# the script that starts at the top of the file. As written it cannot run:
	# it is indented without an enclosing block, and the `for` statement below
	# has no indented body. Presumably a paste artifact -- confirm and remove.
	from lxml import html,etree
	import requests
	import pprint
	#This will create a list of buyers:
	#buyers = tree.xpath('//div[@title="buyer-name"]/text()')
	#This will create a list of prices
	#prices = tree.xpath('//span[@class="item-price"]/text()')

	#print 'Buyers: ', buyers
	#print 'Prices: ', prices


	# Candidate User-Agent strings; only index 0 is used below.
	user_agents = [
	'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
	'Opera/9.25 (Windows NT 5.1; U; en)',
	'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
	'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
	'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
	'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
	]

	# HTTP headers passed to both requests.get calls below.
	headers = {
	'User-Agent': user_agents[0]
	}
	baseurl = "http://www.canardscanins.ca"
	#download page
	response = requests.get(baseurl+'/canins/portail.php?action=liste', headers=headers)
	#print response.text

	#convert to lxml etree
	tree = html.fromstring(response.text)
	#find all a tags and extract the href attribute
	links = tree.xpath('//a/@href')
	#find all a tags and extract the text between the opening and closing tag
	titles = tree.xpath('//a/text()')
	print titles

	# Maps baseurl + href to a dict holding the link text under "Name".
	parks = {}

	# NOTE(review): links and titles come from two separate XPath queries and
	# are paired by index here; nothing guarantees they have the same length.
	for (i, item) in enumerate(links):
	parks[baseurl+item] = {"Name":titles[i]}
	#print titles[i] + " - " + baseurl+item
	pprint.pprint(parks)
	#for item in links:
	# print baseurl+item

	#for each key,value in parks

	# Remove one arbitrary entry from parks; its URL is fetched next.
	key, value = parks.popitem()


	page = requests.get(key, headers=headers)
	ptree = html.fromstring(page.text)
	#imgs = ptree.find(".//img")
	imgs = ptree.xpath('//img')
	#print etree.tostring(tree)
	# Text and tail of the element following the first <img>.
	print imgs[0].getnext().text
	print imgs[0].getnext().tail

	#extract b nodes that contain text "Fondation"
	founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
	print len(founded)
	# Serialize the element following the first matching <b>.
	print etree.tostring(founded[0].getnext())

	#for img in ptree.iterfind('.//img'):
	# print etree.tostring(img)