@datacorner
Last active September 20, 2018 06:51
This function scrapes one HTML page by gathering the XPath data (using the tags array) and returns a Pandas DataFrame.
import requests
import lxml.html as lh
import pandas as pd
# URL
url = '...'
# XPath content to collect
tags = ['//a[@class="XX"]',
        '//p[@class="XX"]',
        '//span[@class="XX"]',
        '//span[@class="XX"]',
        '//span[@class="XX"]']
# One column name per XPath expression
cols = ['col1',
        'col2',
        'col3',
        'col4',
        'col5']
# This function scrapes one HTML page by gathering the XPath data (using the tags array) and returns a Pandas DataFrame
def scrapHtmlPage(url):
    page = requests.get(url)
    doc = lh.fromstring(page.content)
    # Get the Web data via XPath
    content = []
    for i in range(len(tags)):
        content.append(doc.xpath(tags[i]))
    # Gather the data into one Pandas DataFrame per tag, keyed by the row index
    df_liste = []
    for j in range(len(tags)):
        tmp = pd.DataFrame([content[j][i].text_content().strip() for i in range(len(content[j]))],
                           columns=[cols[j]])
        tmp['key'] = tmp.index
        df_liste.append(tmp)
    # Build the final DataFrame with one tag (XPath) content per column
    liste = df_liste[0]
    for j in range(len(tags) - 1):
        liste = liste.join(df_liste[j + 1], on='key', how='left', lsuffix='_l', rsuffix='_r')
        liste['key'] = liste.index
        del liste['key_l']
        del liste['key_r']
    return liste
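
Minimal usage sketch: the url and the "XX" class names above are placeholders, so substitute real values before running. Note that the per-tag frames are joined on the row index, so the columns only line up when every XPath expression matches the same number of elements on the page.

# Example call (assumes the placeholders above have been replaced with real values)
if __name__ == '__main__':
    df = scrapHtmlPage(url)
    print(df.head())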