Skip to content

Instantly share code, notes, and snippets.

@oleksandriegorov
Created April 15, 2020 08:13
Show Gist options
  • Save oleksandriegorov/3fa7056eed5e2aa5f37cc21212ecc3fa to your computer and use it in GitHub Desktop.
Save oleksandriegorov/3fa7056eed5e2aa5f37cc21212ecc3fa to your computer and use it in GitHub Desktop.
Scrape company search results from lei.info using lxml and requests.
from lxml import html
import requests
import argparse
SEARCH_URL = "https://lei.info/fullsearch"
# The original script pinned the last page to 2 (the parsed value was commented
# out). Keeping 2 as the default cap means a run without options issues the
# same requests as before; --max-pages lifts the cap.
DEFAULT_MAX_PAGES = 2
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def page_number_from_link(value, default=1):
    """Return the integer that follows the final '=' in *value*.

    Args:
        value: An attribute value taken from a pagination link,
            e.g. ``"/fullsearch?for=acme&page=17"``.
        default: Returned when the text after the last '=' is not an integer.

    Returns:
        The parsed page number, or *default* if it cannot be parsed.
    """
    try:
        return int(str(value).rsplit("=", 1)[-1])
    except ValueError:
        return default


def clamp_last_page(detected, max_pages=DEFAULT_MAX_PAGES):
    """Return the last page to fetch given the detected count and the cap.

    Args:
        detected: Last page number read from the pagination links.
        max_pages: Upper bound on the page number; a value below 1 disables
            the cap.

    Returns:
        A page number that is at least 1 and, when the cap is active, at most
        *max_pages*.
    """
    detected = max(detected, 1)
    if max_pages < 1:
        return detected
    return min(detected, max_pages)


def fetch_results_page(company, pagenum=None):
    """Download one search-results page and return it as a parsed lxml tree.

    The query is passed through ``params`` so that requests percent-encodes
    the company name; characters such as '&' or '#' no longer corrupt the URL.

    Args:
        company: Search term sent as the ``for`` query parameter.
        pagenum: Page to request; ``None`` omits the ``page`` parameter, which
            is how the first page is requested.
    """
    query = {"for": company}
    if pagenum is not None:
        query["page"] = pagenum
    response = requests.get(SEARCH_URL, params=query, timeout=REQUEST_TIMEOUT)
    return html.fromstring(response.content)


def print_results(tree):
    """Print the text and attribute values of every result link in *tree*."""
    for link in tree.xpath('//*[@class="results-list"]/li/a'):
        print(link.text, ' - ', link.values())


def detect_last_page(tree):
    """Read the last page number from the pagination links of *tree*.

    Returns 1 when the page has no pagination links (the original script
    raised IndexError in that case).
    """
    links = tree.xpath('//*[@class="pagination"]/li/a[@title]')
    if not links:
        return 1
    # Same rule as the original: use the last attribute value of the last
    # titled link. NOTE(review): this assumes that link targets the final
    # page and that its last attribute is the URL - confirm against the site.
    return page_number_from_link(links[-1].values()[-1])


def main(argv=None):
    """Print search results for a company, page by page.

    Args:
        argv: Command-line arguments; ``None`` makes argparse use ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('company', help='company name')
    parser.add_argument(
        '--max-pages',
        type=int,
        default=DEFAULT_MAX_PAGES,
        help='highest page number to fetch; 0 fetches every page '
             '(default: %(default)s)',
    )
    args = parser.parse_args(argv)

    first_page = fetch_results_page(args.company)
    print_results(first_page)

    lastpage = clamp_last_page(detect_last_page(first_page), args.max_pages)
    print(lastpage)

    # Page 1 was printed above, so continue from page 2.
    for pagenum in range(2, lastpage + 1):
        print_results(fetch_results_page(args.company, pagenum))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment