Skip to content

Instantly share code, notes, and snippets.

@akccakcctw
Last active June 12, 2023 08:23
Show Gist options
  • Save akccakcctw/38d3689e72c74111a0056ff0fe4be056 to your computer and use it in GitHub Desktop.
Save akccakcctw/38d3689e72c74111a0056ff0fe4be056 to your computer and use it in GitHub Desktop.
convertHtmlToMarkdown_20230612
/**
 * Convert an HTML fragment to Markdown through a fixed sequence of regex rewrites.
 *
 * Rewrites run in this order: tab removal, links, tables, class-tagged `<li>`
 * items (delegated to the `convert*` callbacks), HTML comments, headings,
 * line breaks, bold/italic, and horizontal rules. Markup not covered by one of
 * these patterns is left in the output unchanged.
 *
 * @param {string} html - HTML source text.
 * @returns {string} The text after all rewrites have been applied.
 */
function convertHtmlToMarkdown(html) {
  let markdown = html;

  // Remove excess tab characters. This runs first so that the tabs emitted by
  // the list-item callbacks below (used for nesting) are not stripped again.
  markdown = markdown.replace(/\t+/g, '');

  // Convert links. The href may appear anywhere in the opening tag, so anchors
  // carrying extra attributes (target, rel, class, ...) in any order convert too.
  // `(?:[^>]*?\s)?` requires whitespace right before `href` so that attributes
  // merely ending in "href" (e.g. data-href) are not mistaken for it.
  markdown = markdown.replace(
    /<a\s+(?:[^>]*?\s)?href="([^"]*)"[^>]*>(.*?)<\/a>/g,
    '[$2]($1)',
  );

  // Convert tables (only a bare `<table>` opening tag is recognised).
  markdown = markdown.replace(/<table>([\s\S]*?)<\/table>/g, convertHtmlToMarkdownTable);

  // Class-tagged list items; each class has its own converter callback.
  markdown = markdown.replace(/<li\s+class="textarea_data">(.*?)<\/li>/g, convertDataListItem);
  markdown = markdown.replace(/<li\s+class="textarea_des">(.*?)<\/li>/g, convertDescriptionListItem);
  markdown = markdown.replace(/<li\s+class="textarea_list">(.*?)<\/li>/g, convertNumericListItem);
  markdown = markdown.replace(/<li\s+class="textarea_list02">(.*?)<\/li>/g, convertAlphaListItem);
  markdown = markdown.replace(/<li\s+class="textarea_list03">(.*?)<\/li>/g, convertBulletListItem);
  markdown = markdown.replace(/<li\s+class="textarea_title">(.*?)<\/li>/g, convertHeading);

  // Strip single-line HTML comments that have whitespace after `<!--`.
  markdown = markdown.replace(/<!--\s+(.*?)-->/g, '');

  // Headings <h1>..<h6> become 1..6 leading '#' characters, processed level by level.
  for (let level = 1; level <= 6; level += 1) {
    const headingPattern = new RegExp(`<h${level}>(.*?)</h${level}>`, 'g');
    markdown = markdown.replace(headingPattern, `${'#'.repeat(level)} $1\n`);
  }

  // Replace HTML line breaks with newlines.
  markdown = markdown.replace(/<br\s*\/?>/g, '\n');

  // Convert bold and italic formatting.
  markdown = markdown.replace(/<b>(.*?)<\/b>/g, '**$1**');
  markdown = markdown.replace(/<strong>(.*?)<\/strong>/g, '**$1**');
  markdown = markdown.replace(/<em>(.*?)<\/em>/g, '_$1_');
  markdown = markdown.replace(/<i>(.*?)<\/i>/g, '_$1_');

  // Replace horizontal rules with ---, accepting <hr>, <hr/> and <hr />
  // (same tolerance as the <br> pattern above).
  markdown = markdown.replace(/<hr\s*\/?>/g, '\n---\n');

  return markdown;
}
/**
 * Unwrap a `<li class="textarea_des">` element, keeping only its inner text.
 *
 * @param {string} listItemContent - Text expected to contain the `<li>` element.
 * @returns {string} The trimmed content between the tags, or '' when the
 *   element is not found (the pattern does not span line breaks).
 */
function convertDescriptionListItem(listItemContent) {
  const found = /<li\s+class="textarea_des">(.*?)<\/li>/.exec(listItemContent);
  if (found === null) {
    return '';
  }
  const [, inner] = found;
  return inner.trim();
}
/**
 * Turn a `<li class="textarea_data">` element into a `::: right` fenced block.
 *
 * @param {string} listItemContent - Text expected to contain the `<li>` element.
 * @returns {string} The trimmed inner content wrapped between `::: right` and
 *   `:::` lines, or '' when the element is not found.
 */
function convertDataListItem(listItemContent) {
  const found = /<li\s+class="textarea_data">(.*?)<\/li>/.exec(listItemContent);
  if (found === null) {
    return '';
  }
  const body = found[1].trim();
  return `\n::: right\n${body}\n:::\n`;
}
/**
 * Turn a `<li class="textarea_title">` element into a level-2 Markdown heading.
 *
 * @param {string} headingContent - Text expected to contain the `<li>` element.
 * @returns {string} `## <text>` surrounded by newlines, or '' when the element
 *   is not found.
 */
function convertHeading(headingContent) {
  const found = /<li\s+class="textarea_title">(.*?)<\/li>/.exec(headingContent);
  if (found === null) {
    return '';
  }
  const title = found[1].trim();
  return `\n## ${title}\n`;
}
/**
 * Turn a `<li class="textarea_list">` element into an ordered-list line.
 *
 * The item is expected to start with a `<span>` holding the original marker;
 * that marker is discarded and every item is emitted with the prefix `1. `.
 *
 * @param {string} listItemContent - Text expected to contain the `<li>` element.
 * @returns {string} `1. <text>` built from what follows the first `<span>`,
 *   or '' when the element or its `<span>` marker is not found.
 */
function convertNumericListItem(listItemContent) {
  const itemMatch = /<li\s+class="textarea_list">(.*?)<\/li>/.exec(listItemContent);
  if (itemMatch === null) {
    return '';
  }

  const inner = itemMatch[1].trim();
  const markerMatch = /<span>(.*?)<\/span>(.*)/.exec(inner);
  if (markerMatch === null) {
    return '';
  }

  const text = markerMatch[2].trim();
  return `1. ${text}`;
}
/**
 * Build a nested, letter-labelled list line from the inner content of a list item.
 *
 * The content must contain a `<span>(X)</span>` marker; the label inside the
 * parentheses is lower-cased and the text after the span becomes the item text.
 *
 * @param {string} _ - Ignored (first argument supplied by the caller).
 * @param {string} listItemContent - Inner content of the list item.
 * @returns {string} A tab, then `- <label>. <text>`, or '' when no marker is found.
 */
function convertAlphaListItem(_, listItemContent) {
  const found = /<span>\((.*?)\)<\/span>(.*)/.exec(listItemContent);
  if (found === null) {
    return '';
  }
  const [, label, rest] = found;
  return `\t- ${label.toLowerCase()}. ${rest.trim()}`;
}
/**
 * Build a doubly-nested bullet line from the inner content of a list item.
 *
 * The content must contain a `<span>•</span>` marker; the text after it
 * becomes the bullet text.
 *
 * @param {string} _ - Ignored (first argument supplied by the caller).
 * @param {string} listItemContent - Inner content of the list item.
 * @returns {string} Two tabs, then `- <text>`, or '' when no marker is found.
 */
function convertBulletListItem(_, listItemContent) {
  const found = /<span>•<\/span>(.*)/.exec(listItemContent);
  if (found === null) {
    return '';
  }
  const text = found[1].trim();
  return `\t\t- ${text}`;
}
/**
 * Convert an HTML `<table>` fragment into a pipe-delimited Markdown table.
 *
 * Header cells come from the table's `<thead>` and are rendered bold, followed
 * by a left-aligned (`:---`) separator row; body rows come from the first
 * `<tbody>`. Requires a DOM (`document`) to parse the markup.
 *
 * @param {string} html - Markup containing a `<table>` element.
 * @returns {string} One line per row, each terminated by a newline, or '' when
 *   no `<table>` element is found.
 */
function convertHtmlToMarkdownTable(html) {
  // Parse the HTML string by loading it into a detached <div>.
  // NOTE(review): assigning innerHTML, even on a detached element, can still
  // trigger resource loads and inline handlers (e.g. <img onerror>) — confirm
  // that callers only pass trusted or sanitised markup.
  const container = document.createElement('div');
  container.innerHTML = html;

  // Locate the table element; nothing to convert without one.
  const table = container.querySelector('table');
  if (!table) {
    return '';
  }

  // Normalise one cell's text so it cannot break the table layout: a cell must
  // stay on a single line, and a literal '|' would otherwise start a new column.
  const cellText = (cell) => cell.textContent
    .trim()
    .replace(/\s+/g, ' ')
    .replace(/\|/g, '\\|');

  const toRow = (cells) => `| ${cells.join(' | ')} |\n`;

  // Header cells (either th or td inside <thead>).
  const thead = table.querySelector('thead');
  const headerCells = thead ? Array.from(thead.querySelectorAll('th, td')) : [];

  // Body rows.
  const tbody = table.querySelector('tbody');
  const bodyRows = tbody ? Array.from(tbody.querySelectorAll('tr')) : [];

  let markdownTable = '';

  // Header row plus the alignment separator row.
  if (headerCells.length > 0) {
    const titles = headerCells.map((cell) => {
      const text = cellText(cell);
      // Leave empty header cells empty instead of emitting a literal '****'.
      return text ? `**${text}**` : '';
    });
    markdownTable += toRow(titles);
    markdownTable += toRow(headerCells.map(() => ':---'));
  }

  // Body rows; include <th> cells so row headers are not dropped.
  for (const row of bodyRows) {
    const cells = Array.from(row.querySelectorAll('th, td'), (cell) => cellText(cell));
    markdownTable += toRow(cells);
  }

  return markdownTable;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment