Skip to content

Instantly share code, notes, and snippets.

@nguaman
Created February 24, 2016 19:09
Show Gist options
  • Save nguaman/1cf2d41db49172fdb3c2 to your computer and use it in GitHub Desktop.
Save nguaman/1cf2d41db49172fdb3c2 to your computer and use it in GitHub Desktop.
Parsing #2 [i.cantonfair.org.cn]
from sys import exit
from pprint import pprint
import lxml.html
import requests
import re
# Collect the exhibitor links from every result page of the "glassware"
# search on i.cantonfair.org.cn and pretty-print them.
#
# Page 1 is fetched first to read the pager text (matched below as
# "Pages <current>/<total>"); pages 2..<total> are then fetched in turn.
# The result, ``all_links``, holds one list of hrefs per page.

# requests waits indefinitely when no timeout is given, so always pass one.
request_timeout = 30  # seconds

url = 'http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page=1'
doc = requests.get(url, timeout=request_timeout)
# Fail loudly on an HTTP error instead of parsing an error page.
doc.raise_for_status()
root = lxml.html.fromstring(doc.text)

# Plain raw string: the ``ur`` prefix is a syntax error on Python 3.
# ``\d+`` for the current page so the pattern is not limited to one digit.
regexp_total_pages = re.compile(r"Pages\s\d+/(\d+)")
pager_text_nodes = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')
if not pager_text_nodes:
    exit("Pager element 'AspNetPager1' not found at {0}".format(url))
text_total_pages = pager_text_nodes[0].strip()

# search() instead of match() so text preceding "Pages" does not break parsing.
total_pages_match = regexp_total_pages.search(text_total_pages)
if total_pages_match is None:
    exit("Could not read the page count from pager text {0!r}".format(text_total_pages))
total_pages = int(total_pages_match.group(1))

all_links = list()
for i in range(1, total_pages + 1):
    if i > 1:
        # Page 1 was already downloaded and parsed above; reuse it rather
        # than requesting the same URL a second time.
        url = "http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page={page}".format(page=i)
        doc = requests.get(url, timeout=request_timeout)
        doc.raise_for_status()
        root = lxml.html.fromstring(doc.text)
    # Appended as a sub-list, so all_links stays grouped by page.
    all_links.append(root.xpath('//*[@id="form1"]/div[*]/div[*]/h3/a/@href'))
pprint(all_links)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment