Skip to content

Instantly share code, notes, and snippets.

@lineker
Created April 15, 2015 01:43
Show Gist options
  • Save lineker/2aaf41fc2672f19885a9 to your computer and use it in GitHub Desktop.
Save lineker/2aaf41fc2672f19885a9 to your computer and use it in GitHub Desktop.
Scraping the web with Python and lxml
from lxml import html,etree
import requests
import pprint
# Scrape the dog-park listing on canardscanins.ca: collect every link on the
# listing page into `parks` (absolute URL -> {"Name": link text}), then fetch
# one of those pages and print a few pieces of it as an exploration aid.

# urljoin lives in different modules on Python 2 and Python 3.
try:
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

# Candidate User-Agent strings; only the first one is sent at the moment.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]
headers = {
    'User-Agent': user_agents[0],
}
baseurl = "http://www.canardscanins.ca"
# Seconds to wait on the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# Download the listing page and fail loudly on an HTTP error status instead
# of silently parsing an error page.
list_url = baseurl + '/canins/portail.php?action=liste'
response = requests.get(list_url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
# Convert to an lxml element tree.
tree = html.fromstring(response.text)

# Read the href and the text from the SAME <a> element. Two independent
# XPath queries ('//a/@href' and '//a/text()') produce lists that drift out
# of step as soon as one anchor has no href, no text, or several text nodes.
anchors = tree.xpath('//a[@href]')
links = [anchor.get('href') for anchor in anchors]
titles = [anchor.text_content() for anchor in anchors]
print(titles)

# Map each absolute link URL to its display name. urljoin resolves relative
# hrefs against the page they were found on and leaves absolute ones intact,
# which plain string concatenation with baseurl does not.
parks = {}
for href, title in zip(links, titles):
    parks[urljoin(list_url, href)] = {"Name": title}
pprint.pprint(parks)

# Explore a single, arbitrary entry (popitem removes it from `parks`).
if not parks:
    raise SystemExit("No links found on " + list_url)
key, value = parks.popitem()
page = requests.get(key, headers=headers, timeout=REQUEST_TIMEOUT)
page.raise_for_status()
ptree = html.fromstring(page.text)

# Print the text and tail of the element that follows the first <img>.
imgs = ptree.xpath('//img')
after_img = imgs[0].getnext() if imgs else None
# Compare with None explicitly: an lxml element without children is falsy.
if after_img is not None:
    print(after_img.text)
    print(after_img.tail)
else:
    print("No element follows the first <img> on " + key)

# Extract <b> nodes whose text contains "Fondation" and print the sibling
# element that follows the first match, if there is one.
founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
print(len(founded))
if founded:
    after_founded = founded[0].getnext()
    if after_founded is not None:
        print(etree.tostring(after_founded))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment