Last active
September 20, 2018 06:51
-
-
Save datacorner/47f99b6674fbe033a0f82e0405a94dc3 to your computer and use it in GitHub Desktop.
This function scrapes one HTML page by gathering the XPath data (using the tags array) and gives back a Pandas DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import lxml.html as lh | |
import pandas as pd | |
# Target URL to scrape (placeholder — set to the real page address)
url = '...'

# XPath expressions to collect from the page, one per output column
tags = [
    '//a[@class="XX"]',
    '//p[@class="XX"]',
    '//span[@class="XX"]',
    '//span[@class="XX"]',
    '//span[@class="XX"]',
]

# Column names for the resulting DataFrame (one per XPath expression).
# NOTE: the original listed 'col4' twice — a copy-paste typo that would
# produce colliding column labels in the later join; fixed to 'col5'.
cols = [
    'col1',
    'col2',
    'col3',
    'col4',
    'col5',
]
# This function scrapes one HTML page by gathering the XPath data (using the tags array) and gives back a Pandas DataFrame
def scrapHtmlPage(url):
    """Scrape one HTML page and return its XPath contents as a DataFrame.

    Fetches *url*, evaluates every XPath expression in the module-level
    ``tags`` list, and returns a pandas DataFrame with one column per
    expression, named after the module-level ``cols`` list. Rows are
    aligned positionally (i-th match of each tag goes on row i).

    BUG FIX: the original inner comprehension iterated over
    ``range(len(content[i]))`` where ``i`` was the stale index left over
    from the previous loop — i.e. always the match count of the LAST tag —
    so columns were built with the wrong number of elements. Iterating the
    match list directly removes the indexing mistake entirely.
    """
    page = requests.get(url)
    doc = lh.fromstring(page.content)
    # Evaluate each XPath expression; one list of matched elements per tag
    content = [doc.xpath(tag) for tag in tags]
    # Build one single-column DataFrame per tag, with a positional 'key'
    # column used to align the columns afterwards
    df_liste = []
    for col, elements in zip(cols, content):
        tmp = pd.DataFrame(
            [element.text_content().strip() for element in elements],
            columns=[col],
        )
        tmp['key'] = tmp.index
        df_liste.append(tmp)
    # Left-join every per-tag frame onto the first one via the positional
    # key; the suffixed duplicate key columns are dropped after each join
    liste = df_liste[0]
    for other in df_liste[1:]:
        liste = liste.join(other, on='key', how='left',
                           lsuffix='_l', rsuffix='_r')
        liste['key'] = liste.index
        del liste['key_l']
        del liste['key_r']
    return liste
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This function scrapes one HTML page by gathering the XPath data (using the tags array) and gives back a Pandas DataFrame