jianghu52/python读取html的table，转换成json的例子

## Python 读取 HTML 的 table，转换成 JSON 的例子
#读取本地E:\python_pro\本地脚本\sample.html文件，并将其中的表格数据提取出来
import pandas as pd
from bs4 import BeautifulSoup
import json

# Plain record type for the pieces read from one header cell.
class Struct:
    """Hold the link, image and caption values of a header cell.

    The constructor stores every argument unchanged under the attribute of
    the same name; no validation or conversion is performed.
    """

    def __init__(self, link, target, imgPath, before_p, after_p):
        # Link data.
        self.link, self.target = link, target
        # Image path.
        self.imgPath = imgPath
        # Caption halves (text before / after the separator).
        self.before_p, self.after_p = before_p, after_p

# Plain record type for one table row.
class TdStruct:
    """Pair a list of cell values with a single header value.

    Both arguments are stored as given (the list is not copied).
    """

    def __init__(self, tdvalues, thValue):
        self.tdvalues, self.thValue = tdvalues, thValue
# Module-level counters, both initialised to zero. Nothing visible in this
# file declares them ``global``, so they keep this value.
max_td = max_tr = 0
def get_tds(htmlpath):
    """Extract the body rows of the first table in an HTML file.

    Also prints the number of rows and the cell count of the widest row to
    stdout.

    Args:
        htmlpath: Path of a UTF-8 encoded HTML file.

    Returns:
        A list holding one ``TdStruct`` per ``<tr>`` found inside the
        table's ``<tbody>``. ``tdvalues`` is the list of inner-HTML strings
        of the row's ``<td>`` cells; ``thValue`` is the inner HTML of the
        row's first ``<th>``, or ``''`` when the row has no ``<th>``.

    Raises:
        ValueError: If the file contains no ``<table>``, or the first table
            has no ``<tbody>``.
    """
    # Read the local file.
    with open(htmlpath, 'r', encoding='utf-8') as f:
        html = f.read()

    # Parse the HTML.
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the <tbody> of the first table, failing with a clear message
    # instead of an AttributeError on None.
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> element found in {!r}'.format(htmlpath))
    tbody = table.find('tbody')
    if tbody is None:
        raise ValueError('first <table> in {!r} has no <tbody>'.format(htmlpath))

    # Walk every <tr> in the body and collect its header and data cells.
    td_list = []
    for tr in tbody.find_all('tr'):
        th = tr.find('th')
        if th is None:
            # Row without a header cell: keep the row, use an empty header.
            thvalue = ''
        else:
            thvalue = ''.join(str(item) for item in th.contents)
        tdvalues = [
            ''.join(str(item) for item in td.contents)
            for td in tr.find_all('td')
        ]
        td_list.append(TdStruct(tdvalues, thvalue))

    # Row count, and the largest number of <td> cells found in a single row
    # (not the total number of cells in the whole body).
    max_tr = len(td_list)
    max_td = max((len(row.tdvalues) for row in td_list), default=0)
    print('max_tr:', max_tr)
    print('max_td:', max_td)

    return td_list

def _parse_header_cell(th):
    """Build a ``Struct`` from the link, image and caption of one ``<th>``."""
    # Link: href/target of the first <a>, or empty strings when there is none.
    anchor = th.find('a')
    if anchor is not None:
        link = anchor.get('href')
        target = anchor.get('target')
    else:
        link = ''
        target = ''

    # Image: src of the first <img>, or an empty string.
    image = th.find('img')
    imgPath = image.get('src') if image is not None else ''

    # Caption: inner HTML of the first <p>, split at the first '<br/>'.
    # partition() leaves after_p empty when there is no '<br/>'.
    before_p = ''
    after_p = ''
    paragraph = th.find('p')
    if paragraph is not None:
        content = ''.join(str(item) for item in paragraph.contents)
        before_p, _, after_p = content.partition('<br/>')

    return Struct(link, target, imgPath, before_p, after_p)


def get_th_json(htmlpath):
    """Convert the first table of an HTML file into a JSON file.

    The result is written next to the input, with the input's extension
    replaced by ``.json``. It has the shape::

        {"thvalues": [{"link": ..., "target": ..., "imgPath": ...,
                       "before_p": ..., "after_p": ..., "data_value": ...,
                       "thvalue": ..., "tdvalues": [{row_header: cell}, ...]},
                      ...]}

    with one entry per ``<th>`` of the table's ``<thead>``.

    Args:
        htmlpath: Path of a UTF-8 encoded HTML file.

    Returns:
        None. The JSON document is written to disk.

    Raises:
        ValueError: If the file contains no ``<table>``, or the first table
            has no ``<thead>``.
    """
    # Local import: only needed to derive the output path.
    import os

    # Read the local file.
    with open(htmlpath, 'r', encoding='utf-8') as f:
        html = f.read()

    # Parse the HTML.
    soup = BeautifulSoup(html, 'html.parser')

    # Locate the <thead> of the first table.
    table = soup.find('table')
    if table is None:
        raise ValueError('no <table> element found in {!r}'.format(htmlpath))
    thead = table.find('thead')
    if thead is None:
        raise ValueError('first <table> in {!r} has no <thead>'.format(htmlpath))

    # Walk every <th> of the header and collect its parsed content together
    # with its 'data-selection-value' attribute (the string 'None' if absent).
    header_cells = thead.find_all('th')
    th_list = [_parse_header_cell(th) for th in header_cells]
    data_value_list = [
        str(th.get('data-selection-value')) for th in header_cells
    ]

    tdlist = get_tds(htmlpath)

    # Build the JSON object, one entry per header column.
    json_obj = {
        'thvalues': []
    }
    for i, (cell, data_value) in enumerate(zip(th_list, data_value_list)):
        # Header column i reads <td> index i - 1 of every body row.
        # Assumes the first header column sits above the rows' <th> cells
        # and therefore has no <td> of its own -- TODO confirm against the
        # real table. Column 0 gets no values instead of wrapping around to
        # the last column via index -1.
        tdvalues = []
        if i > 0:
            for row in tdlist:
                # Skip rows that are too short to have a cell in this column.
                if i - 1 < len(row.tdvalues):
                    tdvalues.append({row.thValue: row.tdvalues[i - 1]})

        json_obj['thvalues'].append({
            'link': cell.link,
            'target': cell.target,
            'imgPath': cell.imgPath,
            'before_p': cell.before_p,
            'after_p': cell.after_p,
            'data_value': data_value,
            # NOTE(review): pairs header column i with the header of body
            # row i, as the original code did -- confirm this is intended.
            # Falls back to '' when there are fewer rows than columns.
            'thvalue': tdlist[i].thValue if i < len(tdlist) else '',
            'tdvalues': tdvalues,
        })

    # Save the JSON object. splitext() swaps only the final extension, so a
    # path that does not end in '.html' can no longer resolve to the input
    # file itself and overwrite it.
    root, _ext = os.path.splitext(htmlpath)
    with open(root + '.json', 'w', encoding='utf-8') as f:
        json.dump(json_obj, f, ensure_ascii=False)

# Script entry point: convert the sample table only when run directly, so
# importing this module does not touch the file system.
if __name__ == '__main__':
    # Raw string: '\p', '\s' and '\本' are not valid escape sequences, so the
    # backslashes of this Windows path must not be interpreted.
    get_th_json(r'E:\python_pro\本地脚本\sample.html')
	#读取本地E:\python_pro\本地脚本\sample.html文件，并将其中的表格数据提取出来
	import pandas as pd
	from bs4 import BeautifulSoup
	import json

	# Plain record type for the pieces read from one header cell.
	class Struct:
		"""Hold the link, image and caption values of a header cell.

		The constructor stores every argument unchanged under the attribute
		of the same name; no validation or conversion is performed.
		"""

		def __init__(self, link, target, imgPath, before_p, after_p):
			self.link = link
			self.target = target
			self.imgPath = imgPath
			self.before_p = before_p
			self.after_p = after_p

	# Plain record type for one table row.
	class TdStruct:
		"""Pair a list of cell values with a single header value.

		Both arguments are stored as given (the list is not copied).
		"""

		def __init__(self, tdvalues, thValue):
			self.tdvalues = tdvalues
			self.thValue = thValue
	# Module-level counters, both initialised to zero. Nothing visible in
	# this file declares them ``global``, so they keep this value.
	max_td = max_tr = 0
	def get_tds(htmlpath):
		"""Extract the body rows of the first table in an HTML file.

		Also prints the number of rows and the cell count of the widest row
		to stdout.

		Args:
			htmlpath: Path of a UTF-8 encoded HTML file.

		Returns:
			A list holding one ``TdStruct`` per ``<tr>`` found inside the
			table's ``<tbody>``. ``tdvalues`` is the list of inner-HTML
			strings of the row's ``<td>`` cells; ``thValue`` is the inner
			HTML of the row's first ``<th>``, or ``''`` when the row has no
			``<th>``.

		Raises:
			ValueError: If the file contains no ``<table>``, or the first
				table has no ``<tbody>``.
		"""
		# Read the local file.
		with open(htmlpath, 'r', encoding='utf-8') as f:
			html = f.read()

		# Parse the HTML.
		soup = BeautifulSoup(html, 'html.parser')

		# Locate the <tbody> of the first table, failing with a clear
		# message instead of an AttributeError on None.
		table = soup.find('table')
		if table is None:
			raise ValueError('no <table> element found in {!r}'.format(htmlpath))
		tbody = table.find('tbody')
		if tbody is None:
			raise ValueError('first <table> in {!r} has no <tbody>'.format(htmlpath))

		# Walk every <tr> in the body and collect its header and data cells.
		td_list = []
		for tr in tbody.find_all('tr'):
			th = tr.find('th')
			if th is None:
				# Row without a header cell: keep the row, use an empty header.
				thvalue = ''
			else:
				thvalue = ''.join(str(item) for item in th.contents)
			tdvalues = [
				''.join(str(item) for item in td.contents)
				for td in tr.find_all('td')
			]
			td_list.append(TdStruct(tdvalues, thvalue))

		# Row count, and the largest number of <td> cells found in a single
		# row (not the total number of cells in the whole body).
		max_tr = len(td_list)
		max_td = max((len(row.tdvalues) for row in td_list), default=0)
		print('max_tr:', max_tr)
		print('max_td:', max_td)

		return td_list

	def _parse_header_cell(th):
		"""Build a ``Struct`` from the link, image and caption of one ``<th>``."""
		# Link: href/target of the first <a>, or empty strings when absent.
		anchor = th.find('a')
		if anchor is not None:
			link = anchor.get('href')
			target = anchor.get('target')
		else:
			link = ''
			target = ''

		# Image: src of the first <img>, or an empty string.
		image = th.find('img')
		imgPath = image.get('src') if image is not None else ''

		# Caption: inner HTML of the first <p>, split at the first '<br/>'.
		# partition() leaves after_p empty when there is no '<br/>'.
		before_p = ''
		after_p = ''
		paragraph = th.find('p')
		if paragraph is not None:
			content = ''.join(str(item) for item in paragraph.contents)
			before_p, _, after_p = content.partition('<br/>')

		return Struct(link, target, imgPath, before_p, after_p)


	def get_th_json(htmlpath):
		"""Convert the first table of an HTML file into a JSON file.

		The result is written next to the input, with the input's extension
		replaced by ``.json``. It has the shape::

			{"thvalues": [{"link": ..., "target": ..., "imgPath": ...,
			               "before_p": ..., "after_p": ..., "data_value": ...,
			               "thvalue": ..., "tdvalues": [{row_header: cell}, ...]},
			              ...]}

		with one entry per ``<th>`` of the table's ``<thead>``.

		Args:
			htmlpath: Path of a UTF-8 encoded HTML file.

		Returns:
			None. The JSON document is written to disk.

		Raises:
			ValueError: If the file contains no ``<table>``, or the first
				table has no ``<thead>``.
		"""
		# Local import: only needed to derive the output path.
		import os

		# Read the local file.
		with open(htmlpath, 'r', encoding='utf-8') as f:
			html = f.read()

		# Parse the HTML.
		soup = BeautifulSoup(html, 'html.parser')

		# Locate the <thead> of the first table.
		table = soup.find('table')
		if table is None:
			raise ValueError('no <table> element found in {!r}'.format(htmlpath))
		thead = table.find('thead')
		if thead is None:
			raise ValueError('first <table> in {!r} has no <thead>'.format(htmlpath))

		# Walk every <th> of the header and collect its parsed content with
		# its 'data-selection-value' attribute (the string 'None' if absent).
		header_cells = thead.find_all('th')
		th_list = [_parse_header_cell(th) for th in header_cells]
		data_value_list = [
			str(th.get('data-selection-value')) for th in header_cells
		]

		tdlist = get_tds(htmlpath)

		# Build the JSON object, one entry per header column.
		json_obj = {
			'thvalues': []
		}
		for i, (cell, data_value) in enumerate(zip(th_list, data_value_list)):
			# Header column i reads <td> index i - 1 of every body row.
			# Assumes the first header column sits above the rows' <th>
			# cells and therefore has no <td> of its own -- TODO confirm
			# against the real table. Column 0 gets no values instead of
			# wrapping around to the last column via index -1.
			tdvalues = []
			if i > 0:
				for row in tdlist:
					# Skip rows too short to have a cell in this column.
					if i - 1 < len(row.tdvalues):
						tdvalues.append({row.thValue: row.tdvalues[i - 1]})

			json_obj['thvalues'].append({
				'link': cell.link,
				'target': cell.target,
				'imgPath': cell.imgPath,
				'before_p': cell.before_p,
				'after_p': cell.after_p,
				'data_value': data_value,
				# NOTE(review): pairs header column i with the header of
				# body row i, as the original code did -- confirm this is
				# intended. Falls back to '' with fewer rows than columns.
				'thvalue': tdlist[i].thValue if i < len(tdlist) else '',
				'tdvalues': tdvalues,
			})

		# Save the JSON object. splitext() swaps only the final extension,
		# so a path that does not end in '.html' can no longer resolve to
		# the input file itself and overwrite it.
		root, _ext = os.path.splitext(htmlpath)
		with open(root + '.json', 'w', encoding='utf-8') as f:
			json.dump(json_obj, f, ensure_ascii=False)

	# Script entry point: convert the sample table only when run directly,
	# so importing this module does not touch the file system.
	if __name__ == '__main__':
		# Raw string: '\p', '\s' and '\本' are not valid escape sequences, so
		# the backslashes of this Windows path must not be interpreted.
		get_th_json(r'E:\python_pro\本地脚本\sample.html')