# KobaKhit/hmtl_table_parser.py

## hmtl_table_parser.py
# http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
class HTMLTableParser:
    """Turn the ``<table>`` elements of an HTML page into pandas DataFrames.

    The class relies on the module-level names ``requests``,
    ``BeautifulSoup`` and ``pd`` (pandas) being available.
    """

    @staticmethod
    def get_element(node):
        """Return the CSS selector step that identifies ``node`` under its parent.

        ``:nth-child()`` positions are counted over element siblings only, so
        siblings without a tag name (text nodes, comments, ...) are skipped.
        The first element child is returned as a bare tag name.
        """
        position = 1 + sum(
            1 for sibling in node.previous_siblings
            if getattr(sibling, 'name', None) is not None
        )
        if position > 1:
            return '%s:nth-child(%s)' % (node.name, position)
        return node.name

    @classmethod
    def get_css_path(cls, node):
        """Return a ``' > '``-joined CSS path from below ``<body>`` down to ``node``.

        Ancestors are walked upwards until a ``body`` element is met (it is
        not part of the path); without one, every ancestor is included.
        """
        steps = [cls.get_element(node)]
        for parent in node.parents:
            if parent.name == 'body':
                break
            steps.append(cls.get_element(parent))
        # Steps were gathered innermost-first; the selector reads outermost-first.
        return ' > '.join(reversed(steps))

    @classmethod
    def _table_label(cls, table):
        """Return the table's ``id`` attribute, or its CSS path when it has none."""
        # Attribute lookup must go through .get(): ``'id' in table`` would test
        # the tag's children, not its attributes.
        table_id = table.get('id')
        return table_id if table_id else cls.get_css_path(table)

    def parse_url(self, url, timeout=30):
        """Download ``url`` and parse every table found in the page.

        Args:
            url: Address of the HTML page to fetch.
            timeout: Seconds passed to ``requests.get`` so that an
                unresponsive server cannot block forever.

        Returns:
            A list of ``(label, DataFrame)`` tuples, one per ``<table>``, where
            ``label`` is the table's ``id`` or, failing that, its CSS path.
        """
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        return [(self._table_label(table), self.parse_html_table(table))
                for table in soup.find_all('table')]

    @staticmethod
    def _coerce_column(column):
        """Return ``column`` as floats if every value converts, else as stripped text."""
        try:
            return column.astype(float)
        except ValueError:
            return column.str.strip()

    def parse_html_table(self, table):
        """Convert one table element into a DataFrame.

        Each ``<tr>`` holding at least one ``<td>`` becomes a row. The first
        ``<tr>`` holding ``<th>`` cells supplies the column titles; without
        titles the columns are numbered from 0. Rows shorter than the widest
        row are padded with NaN. Columns whose values all convert to float
        are returned as floats; the others keep their text, stripped of
        surrounding whitespace.

        Args:
            table: Object exposing ``find_all('tr')`` whose rows expose
                ``find_all('td')`` / ``find_all('th')`` and whose cells expose
                ``get_text()`` (for instance a BeautifulSoup tag).

        Returns:
            pandas.DataFrame with one row per data row of the table.

        Raises:
            ValueError: If titles were found but their number differs from
                the number of columns.
        """
        column_names = []
        body_rows = []  # cell texts, one list per <tr> that has <td> cells

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                body_rows.append([cell.get_text() for cell in cells])

            # Only the first row that carries <th> cells defines the titles.
            if not column_names:
                column_names = [th.get_text().strip()
                                for th in row.find_all('th')]

        n_rows = len(body_rows)
        # Size the frame on the widest row so that a short leading row
        # (e.g. a caption cell) cannot make later rows overflow it.
        n_columns = max((len(cells) for cells in body_rows), default=0)

        # Safeguard on column titles
        if column_names and len(column_names) != n_columns:
            raise ValueError("Column titles do not match the number of columns")

        missing = float('nan')
        converted = {}
        for position in range(n_columns):
            values = [cells[position] if position < len(cells) else missing
                      for cells in body_rows]
            # Columns are converted by position, before labelling, so that
            # repeated titles cannot make a label select several columns.
            converted[position] = self._coerce_column(
                pd.Series(values, index=range(n_rows), dtype=object))

        df = pd.DataFrame(converted, index=range(n_rows))
        df.columns = column_names if column_names else range(n_columns)
        return df

def main():
    """Fetch the ESPN NBA standings page and print every table parsed from it."""
    standings_url = 'http://www.espn.com/nba/standings'
    parsed_tables = HTMLTableParser().parse_url(standings_url)
    print(parsed_tables)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()