Last active
June 12, 2023 08:23
-
-
Save akccakcctw/38d3689e72c74111a0056ff0fe4be056 to your computer and use it in GitHub Desktop.
convertHtmlToMarkdown_20230612
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Converts an HTML fragment to Markdown with a fixed sequence of regex rewrites.
 *
 * The order of the steps matters: tabs are stripped first (the nested list
 * converters emit tabs of their own afterwards), then links, tables and the
 * class-specific <li> items are rewritten, HTML comments are dropped, and
 * finally headings, line breaks, emphasis and horizontal rules are converted.
 *
 * @param {string} html - HTML source to convert.
 * @returns {string} The Markdown text. Markup that no rule recognises is left as-is.
 */
function convertHtmlToMarkdown(html) {
  let markdown = html;

  // Remove excess tab characters.
  markdown = markdown.replace(/\t+/g, '');

  // Convert links. `href` may sit anywhere in the attribute list, so
  // `<a href="…">`, `<a target="_blank" rel="noopener" href="…">` and
  // `<a href="…" class="…">` are all handled by the same rule.
  markdown = markdown.replace(/<a\s+(?:[^>]*?\s)?href="([^"]*)"[^>]*>(.*?)<\/a>/g, '[$2]($1)');

  // `replace` invokes its callback with (match, ...groups, offset, input).
  // The wrappers below make explicit which of those each converter consumes.

  // Convert tables (the converter receives the whole <table>…</table> match).
  markdown = markdown.replace(
    /<table>([\s\S]*?)<\/table>/g,
    (tableHtml) => convertHtmlToMarkdownTable(tableHtml),
  );

  // Class-specific list items whose converters re-parse the full <li> match.
  markdown = markdown.replace(
    /<li\s+class="textarea_data">(.*?)<\/li>/g,
    (item) => convertDataListItem(item),
  );
  markdown = markdown.replace(
    /<li\s+class="textarea_des">(.*?)<\/li>/g,
    (item) => convertDescriptionListItem(item),
  );
  markdown = markdown.replace(
    /<li\s+class="textarea_list">(.*?)<\/li>/g,
    (item) => convertNumericListItem(item),
  );

  // Class-specific list items whose converters read the captured inner content.
  markdown = markdown.replace(
    /<li\s+class="textarea_list02">(.*?)<\/li>/g,
    (item, inner) => convertAlphaListItem(item, inner),
  );
  markdown = markdown.replace(
    /<li\s+class="textarea_list03">(.*?)<\/li>/g,
    (item, inner) => convertBulletListItem(item, inner),
  );

  // Heading expressed as <li class="textarea_title">.
  markdown = markdown.replace(
    /<li\s+class="textarea_title">(.*?)<\/li>/g,
    (item) => convertHeading(item),
  );

  // Strip HTML comments, including ones without padding or spanning several lines.
  markdown = markdown.replace(/<!--[\s\S]*?-->/g, '');

  // Convert <h1>…<h6>, one level at a time in ascending order.
  for (let level = 1; level <= 6; level += 1) {
    const headingPattern = new RegExp(`<h${level}>(.*?)</h${level}>`, 'g');
    markdown = markdown.replace(headingPattern, `${'#'.repeat(level)} $1\n`);
  }

  // Replace HTML line breaks with Markdown line breaks.
  markdown = markdown.replace(/<br\s*\/?>/g, '\n');

  // Convert bold and italic formatting.
  markdown = markdown.replace(/<b>(.*?)<\/b>/g, '**$1**');
  markdown = markdown.replace(/<strong>(.*?)<\/strong>/g, '**$1**');
  markdown = markdown.replace(/<em>(.*?)<\/em>/g, '_$1_');
  markdown = markdown.replace(/<i>(.*?)<\/i>/g, '_$1_');

  // Replace <hr>, <hr/> and <hr /> with a Markdown rule.
  markdown = markdown.replace(/<hr\s*\/?>/g, '\n---\n');

  return markdown;
}
/**
 * Unwraps a `<li class="textarea_des">` element and returns its inner content.
 *
 * @param {string} listItemContent - The complete `<li class="textarea_des">…</li>` markup.
 * @returns {string} The trimmed inner content, or '' when the markup does not match.
 */
function convertDescriptionListItem(listItemContent) {
  // [\s\S] instead of `.` so that content spanning several lines is unwrapped too.
  const matches = /<li\s+class="textarea_des">([\s\S]*?)<\/li>/.exec(listItemContent);
  return matches ? matches[1].trim() : '';
}
/**
 * Converts a `<li class="textarea_data">` element into a `::: right` container block.
 *
 * @param {string} listItemContent - The complete `<li class="textarea_data">…</li>` markup.
 * @returns {string} The trimmed inner content wrapped in `::: right` / `:::` lines,
 *   or '' when the markup does not match.
 */
function convertDataListItem(listItemContent) {
  // [\s\S] instead of `.` so that content spanning several lines is converted too.
  const matches = /<li\s+class="textarea_data">([\s\S]*?)<\/li>/.exec(listItemContent);
  if (!matches) {
    return '';
  }
  return `\n::: right\n${matches[1].trim()}\n:::\n`;
}
/**
 * Converts a `<li class="textarea_title">` element into a level-2 Markdown heading.
 *
 * @param {string} headingContent - The complete `<li class="textarea_title">…</li>` markup.
 * @returns {string} `\n## <text>\n`, or '' when the markup does not match.
 */
function convertHeading(headingContent) {
  // [\s\S] instead of `.` so that a title wrapped over several lines still matches.
  const matches = /<li\s+class="textarea_title">([\s\S]*?)<\/li>/.exec(headingContent);
  if (!matches) {
    return '';
  }
  // A `##` heading occupies a single line, so fold internal line breaks into spaces.
  const headingText = matches[1].trim().replace(/\s*\n\s*/g, ' ');
  return `\n## ${headingText}\n`;
}
/**
 * Converts a `<li class="textarea_list">` element into an ordered-list line.
 *
 * The item is expected to start with a `<span>` holding its number; that label
 * is discarded and every item is emitted as `1.` regardless of its value.
 * Items without a `<span>` label keep their text instead of being dropped.
 *
 * @param {string} listItemContent - The complete `<li class="textarea_list">…</li>` markup.
 * @returns {string} `1. <text>`, or '' when the markup does not match or the
 *   item is empty.
 */
function convertNumericListItem(listItemContent) {
  const itemMatch = /<li\s+class="textarea_list">([\s\S]*?)<\/li>/.exec(listItemContent);
  if (!itemMatch) {
    return '';
  }

  const listItem = itemMatch[1].trim();
  const labelMatch = /<span>[\s\S]*?<\/span>([\s\S]*)/.exec(listItem);
  if (labelMatch) {
    return `1. ${labelMatch[1].trim()}`;
  }

  // No number label: keep the text rather than silently losing the item.
  return listItem ? `1. ${listItem}` : '';
}
/**
 * Converts the inner content of a `<li class="textarea_list02">` element into a
 * once-indented bullet carrying a lower-cased label, e.g. `\t- a. text`.
 *
 * Shaped as a `String.prototype.replace` callback: the first argument (the full
 * match) is ignored and the second is the captured inner content.
 *
 * @param {string} _ - Full match; unused.
 * @param {string} listItemContent - Inner content, expected to look like `<span>(A)</span> text`.
 * @returns {string} `\t- <label>. <text>`; `\t- <content>` when no `(label)` span
 *   is present; '' for empty or non-string content.
 */
function convertAlphaListItem(_, listItemContent) {
  if (typeof listItemContent !== 'string') {
    return '';
  }

  const matches = /<span>\((.*?)\)<\/span>([\s\S]*)/.exec(listItemContent);
  if (matches) {
    const alpha = matches[1].toLowerCase();
    const listText = matches[2].trim();
    return `\t- ${alpha}. ${listText}`;
  }

  // No "(x)" label: keep the content rather than silently losing the item.
  const plainText = listItemContent.trim();
  return plainText ? `\t- ${plainText}` : '';
}
/**
 * Converts the inner content of a `<li class="textarea_list03">` element into a
 * twice-indented Markdown bullet (`\t\t- text`).
 *
 * Shaped as a `String.prototype.replace` callback: the first argument (the full
 * match) is ignored and the second is the captured inner content.
 *
 * @param {string} _ - Full match; unused.
 * @param {string} listItemContent - Inner content, expected to look like `<span>•</span> text`.
 * @returns {string} `\t\t- <text>`; the whole content is used as the text when
 *   the `•` span is missing; '' for empty or non-string content.
 */
function convertBulletListItem(_, listItemContent) {
  if (typeof listItemContent !== 'string') {
    return '';
  }

  const matches = /<span>•<\/span>([\s\S]*)/.exec(listItemContent);
  // Without the bullet span, keep the content rather than silently losing the item.
  const listText = (matches ? matches[1] : listItemContent).trim();
  if (!matches && !listText) {
    return '';
  }
  return `\t\t- ${listText}`;
}
/**
 * Converts the first `<table>` found in an HTML string into a Markdown table.
 *
 * The header row is built from the cells inside `<thead>` (rendered bold and
 * followed by a left-aligned `:---` delimiter row); body rows come from the
 * `<tr>` elements inside the first `<tbody>`. Requires a DOM (`document`).
 *
 * NOTE(review): the markup is parsed by assigning `innerHTML` on a detached
 * element, so only pass HTML you trust.
 * NOTE(review): when the table has no `<thead>` cells, only body rows are
 * emitted; renderers that require a header row may not treat that as a table.
 *
 * @param {string} html - HTML containing a `<table>` element.
 * @returns {string} The Markdown table, one `| a | b |` line per row, or ''
 *   when no table element is found.
 */
function convertHtmlToMarkdownTable(html) {
  // Parse the HTML string through a temporary, detached <div>.
  const container = document.createElement('div');
  container.innerHTML = html;

  const tableElement = container.querySelector('table');
  if (!tableElement) {
    return ''; // No table element found.
  }

  // Cell text must stay on one line and must not contain a bare pipe,
  // otherwise it would break the row it is placed in.
  const formatCell = (cellElement) =>
    cellElement.textContent.trim().replace(/\s*\n\s*/g, ' ').replace(/\|/g, '\\|');
  const toRow = (cells) => `| ${cells.join(' | ')} |\n`;

  // Header cells.
  const theadElement = tableElement.querySelector('thead');
  const headerCells = theadElement ? Array.from(theadElement.querySelectorAll('th, td')) : [];

  // Body rows.
  const tbodyElement = tableElement.querySelector('tbody');
  const bodyRows = tbodyElement ? Array.from(tbodyElement.querySelectorAll('tr')) : [];

  let markdownTable = '';

  // Header row plus delimiter row.
  if (headerCells.length > 0) {
    markdownTable += toRow(
      headerCells.map((cellElement) => {
        const text = formatCell(cellElement);
        // Avoid emitting a literal "****" for an empty header cell.
        return text ? `**${text}**` : '';
      }),
    );
    markdownTable += toRow(headerCells.map(() => ':---'));
  }

  // Body rows; row-header <th> cells are kept alongside <td> cells.
  bodyRows.forEach((rowElement) => {
    const cells = Array.from(rowElement.querySelectorAll('th, td'));
    markdownTable += toRow(cells.map(formatCell));
  });

  return markdownTable;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment