Last active
October 9, 2024 22:12
-
-
Save trylovetom/db3a1972bdf834727dd77d2cea89666e to your computer and use it in GitHub Desktop.
Parse HTML Table To JSON With Cheerio.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample input: an injury-report table whose first row holds the column titles.
const text = `<table cellspacing=1 bordercolordark=#666666 cellpadding=0 width=100% align=center bgcolor=#D9D9D9 bordercolorlight=#ffffff border=0><tr class='chinese blue_t2'><td height=20>球队</td><td>球员</td><td>位置</td><td>原因</td><td>日期</td><td>备注</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>波特兰拓荒者</td><td> 纽基</td><td>中锋</td><td>脚部</td><td>2019/03/26</td><td>赛季报销</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>波特兰拓荒者</td><td> 麦高林</td><td>后卫</td><td>膝部</td><td>2019/03/17</td><td>预计缺阵</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>芝加哥公牛</td><td> 赞达拿赫捷臣</td><td>前锋</td><td>膝部</td><td>2019/01/26</td><td>赛季报销</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>芝加哥公牛</td><td> 奥图波达</td><td>前锋</td><td>肩部</td><td>2019/03/26</td><td>预计缺阵</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>芝加哥公牛</td><td> 云度卡达</td><td>中锋</td><td>拇指</td><td>2019/01/16</td><td>预计缺阵</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>芝加哥公牛</td><td> D.华伦天尼</td><td>前锋</td><td>足踝</td><td>2018/10/19</td><td>赛季报销</td></tr><tr bgcolor=#FFF0DF align=center><td height=20>芝加哥公牛</td><td> 查克拉维利</td><td>后卫</td><td>脚部</td><td>2019/03/26</td><td>预计缺阵</td></tr></table>`

// Skip the first row as a heading row and key each column with an English name.
const demoOptions = {
  useFirstRowForHeadings: true,
  headings: ['team', 'player', 'position', 'cause', 'date', 'note']
}

// Print the parsed rows; tableToJSON is a hoisted function declaration below.
console.log(tableToJSON(text, demoOptions))
/**
 * Convert an HTML table into an array of row objects (Node.js, using cheerio).
 *
 * Every `<tr>` under a `<table>` becomes one object whose values are the text
 * of its `<td>`/`<th>` cells. Each value is stored under the heading for its
 * column position, or under the numeric column index when no (non-empty)
 * heading exists for that position.
 *
 * @param {String} html HTML text containing the table
 * @param {{ useFirstRowForHeadings: Boolean, headings: [String] }} options
 *   useFirstRowForHeadings: treat the first row as a heading row; it is left
 *   out of the output and its cell texts are appended after `headings`.
 *   headings: caller-supplied keys by column position; because first-row
 *   headings are appended after them, these take precedence.
 * @return {[{ String: String }]} one object per data row
 */
function tableToJSON (html, options = {}) {
  const cheerio = require('cheerio')
  const { useFirstRowForHeadings, headings = [] } = options
  // Work on a copy so the caller's array is not grown on every call.
  const keys = [...headings]
  const $ = cheerio.load(html)
  const output = []

  $('table tr').each((rowIndex, tr) => {
    const isHeadingRow = rowIndex === 0 && useFirstRowForHeadings
    // Select element cells only: raw `children` also contains whitespace text
    // nodes in formatted HTML, which would shift the column indexes.
    const cells = $(tr).children('td, th')

    if (isHeadingRow) {
      cells.each((colIndex, cell) => {
        keys.push($(cell).text())
      })
      return
    }

    const row = {}
    cells.each((colIndex, cell) => {
      // `||` is deliberate: an empty heading falls back to the column index.
      row[keys[colIndex] || colIndex] = $(cell).text()
    })
    output.push(row)
  })

  return output
}
Here are updated types:
import { type Cheerio, load } from "cheerio";
import type { AnyNode } from "domhandler";
/**
 * Convert a table, already wrapped as a cheerio selection, into row objects.
 *
 * Every `<tr>` found under `table` becomes one object whose values are the
 * text of its direct `<th>`/`<td>` cells. Each value is stored under the
 * heading for its column position, or under the column index when no
 * (non-empty) heading exists for that position.
 *
 * @param $ cheerio instance returned by `load`, used to wrap raw nodes
 * @param table selection whose descendant rows are converted
 * @param options.useFirstRowForHeadings treat the first row as a heading row:
 *   it is left out of the output and its cell texts are appended after
 *   `options.headings`
 * @param options.headings caller-supplied keys by column position; because
 *   first-row headings are appended after them, these take precedence
 * @returns one object per data row
 */
function tableToJSON(
  $: ReturnType<typeof load>,
  table: Cheerio<AnyNode>,
  options: {
    useFirstRowForHeadings?: boolean;
    headings?: string[];
  } = {},
): { [key: string]: string }[] {
  const { useFirstRowForHeadings = false, headings = [] } = options;
  // Work on a copy so the caller's array is not grown on every call.
  const keys = [...headings];
  const output: { [key: string]: string }[] = [];

  table.find("tr").each((rowIndex, tr) => {
    const isHeadingRow = rowIndex === 0 && useFirstRowForHeadings;
    // Accept both cell kinds in every row: heading rows are often written
    // with <td>, and data rows may start with a <th> row header. Direct
    // children only, so cells of nested tables are not pulled into this row.
    const cells = $(tr).children("th, td");

    if (isHeadingRow) {
      cells.each((_, cell) => {
        keys.push($(cell).text());
      });
      return;
    }

    const row: { [key: string]: string } = {};
    cells.each((colIndex, cell) => {
      // `||` is deliberate: an empty heading falls back to the column index.
      row[keys[colIndex] || colIndex] = $(cell).text();
    });
    output.push(row);
  });

  return output;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here is an improved TypeScript version: