Skip to content

Instantly share code, notes, and snippets.

@ruehowl
Created December 27, 2018 18:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ruehowl/5a53c6929918b02f0f6e1ed0a506747f to your computer and use it in GitHub Desktop.
Save ruehowl/5a53c6929918b02f0f6e1ed0a506747f to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
from tabulate import tabulate
# Scrape the second <table> of test.html into a DataFrame, dump it to
# test.csv, then print the rows whose column 4 mentions one of the hosting
# queues, sorted by column 8.

# Use a context manager so the HTML file handle is closed deterministically.
with open('test.html') as html_file:
    soup = BeautifulSoup(html_file, 'lxml')

# The data of interest lives in the second table of the document.
table = soup.find_all('table')[1]
table_rows = table.find_all('tr')

n_columns = 12
# NOTE(review): the "- 1" presumably accounts for a single header row that
# contains no <td> cells -- confirm against the real page layout.
n_rows = len(table_rows) - 1
df = pd.DataFrame(columns=range(0, n_columns), index=range(0, n_rows))

row_marker = 0
for row in table_rows:
    columns = row.find_all('td')
    for column_marker, column in enumerate(columns):
        df.iat[row_marker, column_marker] = column.get_text()
    # Rows without any <td> cell (e.g. a <th>-only header) do not consume a
    # DataFrame row.
    if len(columns) > 0:
        # Overwrite column 2 with a link built from the `id` attribute of the
        # LAST <td> in the row (`column` still refers to it after the loop).
        # NOTE(review): raises TypeError if that cell has no `id` attribute.
        df.iat[row_marker, 2] = "http://apac-helpdesk-newtickets/" + column.get('id')
        row_marker += 1

# Column 0 is dropped before exporting; remaining columns keep their labels.
del df[0]
df.to_csv('test.csv', encoding='utf-8')

# na=False: cells that were never filled are NaN, and a NaN in the boolean
# mask would make the indexing below raise instead of just excluding the row.
d = df[df[4].str.contains("Web Hosting L2|Email Hosting", na=False)]
d = d.sort_values([8], ascending=[True])

# Call form of print works on both Python 2 (single argument) and Python 3.
print(tabulate(d, headers='keys', tablefmt='psql'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment