Skip to content

Instantly share code, notes, and snippets.

@KobaKhit
Last active July 18, 2022 07:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KobaKhit/c0efbe6c219c7cfc21bfa5ce2f1a3d01 to your computer and use it in GitHub Desktop.
Save KobaKhit/c0efbe6c219c7cfc21bfa5ce2f1a3d01 to your computer and use it in GitHub Desktop.
Parse all HTML tables on a page and return them as a list of (identifier, pandas DataFrame) pairs, where the identifier is the table's id or a CSS path to it. Modified from @srome
# http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
class HTMLTableParser:
    """Extract every ``<table>`` of a web page into a pandas DataFrame.

    Each table is identified by its ``id`` attribute when it has one and by
    a CSS selector path (relative to ``<body>``) otherwise.

    The methods rely on the module-level names ``requests``,
    ``BeautifulSoup`` and ``pd`` being available.
    """

    @staticmethod
    def get_element(node):
        """Return the CSS selector fragment for a single node.

        The fragment is the tag name, suffixed with ``:nth-child(k)`` when
        the node is not the first element among its siblings.
        """
        # ``:nth-child`` counts element siblings only, so siblings without a
        # tag name (text, whitespace, comments) must not be counted.
        position = 1 + sum(
            1
            for sibling in node.previous_siblings
            if getattr(sibling, 'name', None) is not None
        )
        if position > 1:
            return '%s:nth-child(%s)' % (node.name, position)
        return node.name

    @classmethod
    def get_css_path(cls, node):
        """Return the ``' > '``-joined selector path leading to ``node``.

        Ancestors are walked upwards until one named ``body`` is reached;
        that ancestor and everything above it are left out of the path.
        """
        path = [cls.get_element(node)]
        for parent in node.parents:
            if parent.name == 'body':
                break
            path.insert(0, cls.get_element(parent))
        return ' > '.join(path)

    def _table_identifier(self, table):
        """Return the table's ``id`` attribute, or its CSS path as fallback."""
        # ``'id' in table`` would test the tag's children rather than its
        # attributes, so the attribute is read through ``get`` instead.
        return table.get('id') or self.get_css_path(table)

    def parse_url(self, url, timeout=None):
        """Download ``url`` and parse every table it contains.

        Args:
            url: Address of the page to fetch.
            timeout: Optional timeout in seconds passed to ``requests.get``.
                The default ``None`` keeps the previous behaviour of
                waiting without a limit.

        Returns:
            A list of ``(identifier, DataFrame)`` tuples, one per
            ``<table>`` found; see ``parse_html_table`` for the frame layout.
        """
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        return [
            (self._table_identifier(table), self.parse_html_table(table))
            for table in soup.find_all('table')
        ]

    def parse_html_table(self, table):
        """Convert one ``<table>`` element into a DataFrame.

        Column names are taken from the first row that contains ``<th>``
        cells; without such a row the columns are numbered from 0. Every row
        holding at least one ``<td>`` becomes a data row, and rows shorter
        than the widest one are padded with NaN. Columns whose values all
        convert to float become float columns; the others keep their text
        with surrounding whitespace stripped.

        Raises:
            Exception: If header cells were found and their count differs
                from the width of the widest data row.
        """
        column_names = []
        data_rows = []
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                data_rows.append([cell.get_text() for cell in cells])
            # Only the first header row supplies the column names.
            if not column_names:
                column_names = [
                    th.get_text().strip() for th in row.find_all('th')
                ]

        if data_rows:
            # Size the frame by the widest row so that a short first row
            # cannot make later, longer rows overflow it.
            n_columns = max(len(values) for values in data_rows)
        else:
            # Header-only table: produce an empty frame with those headers.
            n_columns = len(column_names)

        # Safeguard on column titles
        if column_names and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if column_names else range(n_columns)
        df = pd.DataFrame(columns=columns, index=range(len(data_rows)))
        for row_idx, values in enumerate(data_rows):
            for col_idx, text in enumerate(values):
                df.iat[row_idx, col_idx] = text

        # Convert to float if possible, otherwise keep the stripped text
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                df[col] = df[col].str.strip()
        return df
def main():
    """Fetch the ESPN NBA standings page and print the tables parsed from it."""
    standings_url = 'http://www.espn.com/nba/standings'
    parsed_tables = HTMLTableParser().parse_url(standings_url)
    print(parsed_tables)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
@KobaKhit
Copy link
Author

@srome Hello. I modified your script slightly to handle tables that do not have an id: in that case a CSS/XPath-style pointer to the table is used instead. I also strip newline characters from the column names.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment