Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jianghu52/c52c92ea56273799a77450a0ad0e0ff2 to your computer and use it in GitHub Desktop.
Save jianghu52/c52c92ea56273799a77450a0ad0e0ff2 to your computer and use it in GitHub Desktop.
#读取本地E:\python_pro\本地脚本\sample.html文件,并将其中的表格数据提取出来
import pandas as pd
from bs4 import BeautifulSoup
import json
# Plain record type for the data pulled out of one header cell.
class Struct:
    """Hold the link, image and paragraph pieces of one table header cell.

    The class only stores what it is given; it performs no validation.

    Attributes:
        link: link value for the cell (get_th_json passes the ``href`` of the
            cell's ``<a>``, or '' when there is no anchor).
        target: ``target`` attribute of that anchor, or '' when there is none.
        imgPath: ``src`` of the cell's ``<img>``, or '' when there is none.
        before_p: paragraph markup before the first ``<br/>``.
        after_p: paragraph markup after the first ``<br/>`` ('' if no break).
    """

    def __init__(self, link, target, imgPath, before_p, after_p):
        self.link = link
        self.target = target
        self.imgPath = imgPath
        self.before_p = before_p
        self.after_p = after_p

    def __repr__(self):
        # Readable representation to make debugging parsed cells easier.
        return (
            '{}(link={!r}, target={!r}, imgPath={!r}, '
            'before_p={!r}, after_p={!r})'
        ).format(
            type(self).__name__,
            self.link,
            self.target,
            self.imgPath,
            self.before_p,
            self.after_p,
        )
# Plain record type for one body row of the table.
class TdStruct:
    """Hold the cell values of one table row together with its row header.

    Attributes:
        tdvalues: list with one entry per ``<td>`` of the row.
        thValue: value of the row's ``<th>`` header cell.
    """

    def __init__(self, tdvalues, thValue):
        self.tdvalues = tdvalues
        self.thValue = thValue

    def __repr__(self):
        # Readable representation to make debugging parsed rows easier.
        return '{}(tdvalues={!r}, thValue={!r})'.format(
            type(self).__name__, self.tdvalues, self.thValue
        )
# Module-level counters, both starting at 0.
# NOTE(review): nothing visible in this file declares these names `global`,
# so function-local assignments to the same names do not change them.
max_td = max_tr = 0
def get_tds(htmlpath):
    """Extract the body rows of the first table in an HTML file.

    Args:
        htmlpath: path of the HTML file; it is read as UTF-8.

    Returns:
        A list with one ``TdStruct`` per ``<tr>`` found in the ``<tbody>`` of
        the first ``<table>``. ``thValue`` is the inner markup of the row's
        first ``<th>`` ('' when the row has none) and ``tdvalues`` holds the
        inner markup of each ``<td>`` in the row. An empty list is returned
        when the document has no ``<table>`` or the table has no ``<tbody>``.

    Side effects:
        Prints the number of rows and the largest number of ``<td>`` cells
        found in a single row.
    """
    # Read the local file.
    with open(htmlpath, 'r', encoding='utf-8') as f:
        html = f.read()

    # Parse the HTML and locate the table body.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        # Nothing to extract; avoid calling methods on None.
        return []

    # Walk every row and collect its header text and cell values.
    td_list = []
    for tr in tbody.find_all('tr'):
        th = tr.find('th')
        # A row without a header cell gets an empty header instead of failing.
        if th is not None:
            thvalue = ''.join(str(item) for item in th.contents)
        else:
            thvalue = ''
        tdvalues = [
            ''.join(str(item) for item in td.contents)
            for td in tr.find_all('td')
        ]
        td_list.append(TdStruct(tdvalues, thvalue))

    # Number of rows, and the widest row measured in <td> cells (per row,
    # not the total number of cells in the whole body).
    max_tr = len(td_list)
    max_td = max((len(row.tdvalues) for row in td_list), default=0)
    print('max_tr:', max_tr)
    print('max_td:', max_td)
    return td_list
def _inner_html(tag):
    """Return the children of *tag* serialized and joined into one string."""
    return ''.join(str(item) for item in tag.contents)


def _parse_header_cell(th):
    """Build a Struct from the anchor, image and paragraph inside one <th>."""
    anchor = th.find('a')
    if anchor is not None:
        link = anchor.get('href')
        target = anchor.get('target')
    else:
        link = ''
        target = ''

    image = th.find('img')
    img_path = image.get('src') if image is not None else ''

    before_p = ''
    after_p = ''
    paragraph = th.find('p')
    if paragraph is not None:
        content = _inner_html(paragraph)
        if '<br/>' in content:
            # Split once, at the first line break in the serialized markup.
            before_p, after_p = content.split('<br/>', 1)
        else:
            before_p = content

    return Struct(link, target, img_path, before_p, after_p)


def get_th_json(htmlpath):
    """Convert the first table of an HTML file into a JSON file.

    The ``<th>`` cells of the table's ``<thead>`` are combined with the body
    rows returned by ``get_tds`` and written next to the input file, using
    the same name with a ``.json`` extension.

    Each entry of the top-level ``'thvalues'`` list describes one header cell
    with the keys ``link``, ``target``, ``imgPath``, ``before_p``,
    ``after_p``, ``data_value``, ``thvalue`` and ``tdvalues``. ``tdvalues``
    has one ``{row header: cell value}`` dict per body row.

    Args:
        htmlpath: path of the HTML file; it is read as UTF-8.

    Returns:
        None. The result is written to disk. A document without a
        ``<table>`` or ``<thead>`` produces ``{"thvalues": []}``.
    """
    # Read the local file.
    with open(htmlpath, 'r', encoding='utf-8') as f:
        html = f.read()

    # Parse the HTML and collect the header cells of the first table.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    thead = table.find('thead') if table is not None else None
    header_cells = thead.find_all('th') if thead is not None else []

    rows = get_tds(htmlpath)

    # Build the JSON object, one entry per header cell.
    json_obj = {'thvalues': []}
    for col, th in enumerate(header_cells):
        cell = _parse_header_cell(th)

        # Header cell `col` is matched with <td> number `col - 1` of each
        # row, i.e. the first header cell has no <td> of its own.
        # NOTE(review): presumably the first header cell sits above the
        # row-header column - confirm against the real table layout.
        td_index = col - 1
        tdvalues = []
        for row in rows:
            # Out-of-range cells (including the -1 of the first header cell,
            # which would otherwise wrap around to the last column) become ''.
            if 0 <= td_index < len(row.tdvalues):
                value = row.tdvalues[td_index]
            else:
                value = ''
            tdvalues.append({row.thValue: value})

        json_obj['thvalues'].append({
            'link': cell.link,
            'target': cell.target,
            'imgPath': cell.imgPath,
            'before_p': cell.before_p,
            'after_p': cell.after_p,
            # str() turns a missing attribute into the string 'None'.
            'data_value': str(th.get('data-selection-value')),
            # NOTE(review): this pairs header cell `col` with the header text
            # of body row `col`; '' is used when there is no such row.
            'thvalue': rows[col].thValue if col < len(rows) else '',
            'tdvalues': tdvalues,
        })

    # Save the JSON object. Only the final extension is swapped, so the
    # input file is never overwritten and directory names containing
    # '.html' are left alone.
    output_path = os.path.splitext(htmlpath)[0] + '.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(json_obj, f, ensure_ascii=False)
if __name__ == '__main__':
    # Raw string: the Windows path contains backslashes that would otherwise
    # be read as (invalid) escape sequences such as "\p" and "\s".
    get_th_json(r'E:\python_pro\本地脚本\sample.html')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment