A Node.js script that converts HTML files generated by Hexo with the NexT theme back into Markdown files.
First install the dependencies with npm:
cheerio
to-markdown
walk
Then run the script with the node command.
#!/usr/bin/env node | |
"use strict"; | |
/**
 * Tell whether a Node.js version string meets a minimum major version.
 *
 * Versions are compared numerically; comparing the raw strings would sort
 * '10.0.0' before '8.0.0'.
 *
 * @param {string} version - version such as '8.11.3' or 'v18.17.1'
 * @param {number} minMajor - lowest acceptable major version
 * @returns {boolean} true when the major version is a number >= minMajor
 */
function isNodeVersionAtLeast(version, minMajor) {
    const major = Number.parseInt(String(version).replace(/^v/, ''), 10);
    return Number.isInteger(major) && major >= minMajor;
}

// Check the Node.js version of the interpreter that is actually running this script.
const nodeVer = process.versions.node;
if (!isNodeVersionAtLeast(nodeVer, 8)) {
    console.error('当前 node 版本:' + nodeVer + ',请更新至 8.0.0 或以上(建议使用 nvm 管理 node)');
    // Non-zero exit code so callers can tell the run did not happen.
    process.exit(1);
}
// Start-up banner. The first entry is empty so the output begins with a
// newline, and no newline follows the last row.
// NOTE(review): runs of spaces inside the art look collapsed in this copy of
// the file; restore the spacing from the original script if it is available.
const ascii = [
    "",
    " ('-. ('-. (`-. .-') _ ",
    " ( OO ).-. _( OO) _(OO )_ ( OO) ) ",
    " / . --. / (,------. ,--(_/ ,. \\ ,-.-') / '._ ",
    " | \\-. \\ | .---' \\ \\ /(__/ | |OO) |'--...__) ",
    ".-'-' | | | | \\ \\ / / | | \\ '--. .--' ",
    " \\| |_.' | (| '--. \\ ' /, | |(_/ | | ",
    " | .-. | | .--' \\ /__) ,| |_.' | | ",
    " | | | | | `---. \\ / (_| | | | ",
    " `--' `--' `------' `-' `--' `--' ",
].join("\n");
console.log(ascii);
var fs = require('fs'); | |
var walk = require ('walk'); | |
var toMarkdown = require('to-markdown'); | |
var cheerio = require('cheerio'); | |
var URL = require('url').URL; | |
// Name of the output directory; saveFile() writes the .md files to '../' + mdDirName
// and walkFiles() lists it in the walker's filters.
const mdDirName = '_posts_bk';

// Paths of the index.html files found by walkFiles(). Initialised before the
// walk starts so collection never depends on when the walker emits events.
const files = [];

walkFiles();
/**
 * Walk the parent directory ('..'), collect the path of every file named
 * `index.html` into `files`, and convert each collected file once the walk
 * ends.
 *
 * A failure while converting one file is logged and the remaining files are
 * still converted.
 */
function walkFiles() {
    // `filters` is passed straight to the walk package — presumably the
    // directory names it should skip; confirm against the walk docs.
    const walker = walk.walk('..', { filters: ['hexo_html_2_md', 'node_modules', mdDirName, 'about'] });

    walker.on('file', function (root, stat, next) {
        if (stat.name === 'index.html') {
            files.push(root + '/' + stat.name);
        }
        // Always hand control back so the walker continues with the next entry.
        next();
    });

    walker.on('end', function () {
        files.forEach(function (filePath, index) {
            // Catch per file: one malformed page must not stop the whole batch.
            try {
                startConvert(filePath);
            } catch (error) {
                console.error('error ' + index + ': ' + filePath + ': ', error.message);
            }
        });
    });
}
/**
 * Convert one generated HTML page into a Markdown file: extract its parts,
 * build the front-matter header, render the body to Markdown, and save it.
 *
 * @param {string} filePath - path of the HTML file to convert
 */
function startConvert(filePath) {
    const elements = getElememts(filePath);
    const header = getHeader(elements);
    const mdStr = getMarkdown(elements.articleBody, elements.articleCodeTags);
    saveFile(elements, header, mdStr);
}
// ------------ private method | |
/**
 * Read one generated HTML page and extract everything needed to rebuild its
 * Markdown source.
 *
 * @param {string} htmlFilePath - path of the HTML file (read as UTF-8)
 * @returns {{articleBody: string, articleCodeTags: Object, fileName: string,
 *            title: string, date: string, tags: string, category: string,
 *            layout: string}} the extracted parts
 * @throws {Error} when the og:url meta, the og:title meta or the
 *         dateCreated <time> element is missing from the page
 */
function getElememts(htmlFilePath) {
    const htmlStr = fs.readFileSync(htmlFilePath, 'utf8');
    const $ = cheerio.load(htmlStr);
    const elements = {};

    // Read a mandatory attribute, failing with a message that names the file
    // and selector instead of an opaque TypeError on undefined.
    function requiredAttr(selector, attrName) {
        const value = $(selector).attr(attrName);
        if (value === undefined || value === null) {
            throw new Error('Missing ' + attrName + ' attribute on ' + selector + ' in ' + htmlFilePath);
        }
        return value;
    }

    // article body (code figures and tables are rewritten in place first)
    const allBody = convertCodeTag($);
    elements.articleBody = allBody.body;
    elements.articleCodeTags = allBody.codeTags;

    // file name
    const url = requiredAttr('meta[property="og:url"]', 'content');
    if (url.includes('/about/index.html')) {
        elements.fileName = 'about';
    } else {
        // The fourth path segment is used as the file name — presumably a
        // /:year/:month/:day/:title/ permalink layout; adjust if the site differs.
        const pathnames = new URL(url).pathname.split('/');
        elements.fileName = decodeURI(pathnames[4]);
    }

    // title ('%CF%80' is the percent-encoded form of 'π')
    elements.title = requiredAttr('meta[property="og:title"]', 'content').replace('%CF%80', 'π');

    // date: 'YYYY-MM-DDTHH:mm:ss+08:00' -> 'YYYY-MM-DD HH:mm:ss'.
    // Besides '+hh:mm', also drop a trailing 'Z' or negative offset; the
    // offset is only removed directly after a time so plain dates stay intact.
    const tzSuffix = /(\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?)(?:Z|-\d{2}(?::?\d{2})?)$/;
    elements.date = requiredAttr('time[itemprop="dateCreated"]', 'datetime')
        .replace(/T/, ' ')
        .replace(/\+(.+)/, '')
        .replace(tzSuffix, '$1');

    // tags: a page without a keywords meta gets an empty list instead of crashing
    const keywords = $('meta[name="keywords"]').attr('content') || '';
    elements.tags = '[' + keywords.replace(/,$/gi, '') + ']';

    // category
    elements.category = $('a[itemprop="url"][rel="index"] span[itemprop="name"]').text();

    // layout
    elements.layout = 'post';

    return elements;
}
/**
 * Build the front-matter header placed at the top of the Markdown file.
 *
 * Each field is written as `name: value` followed by a space and a newline,
 * and the block is wrapped in `---` lines.
 * NOTE(review): values are inserted verbatim; a title containing ': ' or '#'
 * may need quoting to stay valid YAML — confirm against real posts.
 *
 * @param {{title: *, date: *, tags: *, category: *, layout: *}} elements
 * @returns {string} the header text, ending with a blank line
 */
function getHeader(elements) {
    const fieldNames = ['title', 'date', 'tags', 'category', 'layout'];
    const fieldLines = fieldNames.map(function (name) {
        return name + ': ' + elements[name] + ' \n';
    });
    return '---\n' + fieldLines.join('') + '\n---\n\n';
}
/**
 * Replace every highlighted-code <figure> in the document with a placeholder
 * key and return the article body together with the fenced code for each key.
 *
 * Figures whose `td[class="code"] pre` has no HTML are left untouched.
 * Tables are converted afterwards via convertTable($).
 *
 * @param {Function} $ - cheerio document; it is mutated in place
 * @returns {{body: string, codeTags: Object<string, string>}} the HTML of
 *          `span[itemprop="articleBody"]` and a map from placeholder key
 *          (`__code__tag__<figure index>`) to the fenced code block
 */
function convertCodeTag($) {
    // Plain object: the keys are strings, not array indexes.
    const codeTags = {};

    $('figure').each(function (figureIndex, figure) {
        const codeHtml = $(figure).find('td[class="code"] pre').html();
        if (!codeHtml) {
            return;
        }

        // Lines are separated by <br>; loading each piece strips the markup
        // and decodes entities.
        let code = codeHtml
            .split('<br>')
            .map(function (lineHtml) {
                return cheerio.load(lineHtml).text();
            })
            .join('\n');

        // The closing fence must start on its own line even when the HTML
        // does not end with a trailing <br>.
        if (code !== '' && !code.endsWith('\n')) {
            code += '\n';
        }

        const key = '__code__tag__' + figureIndex;
        codeTags[key] = '\n```\n' + code + '```\n';
        $(figure).replaceWith(key);
    });

    convertTable($);
    return { body: $('span[itemprop="articleBody"]').html(), codeTags: codeTags };
}
/* | |
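// Using <span class="line"> as the line separator does not work: some blocks have an outer <span class="line"> that contains further <span class="line"> elements.
// i.e. this input converts incorrectly: <span class="line"> test1 <span class="line"> test2 </span> </span>
// while this input converts fine: <span class="line"> test1 </span> <span class="line"> test2 </span>
// In practice <br> is the only reliable line separator, so the implementation above splits on <br> instead.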
function convertCodeTag_old($) { | |
var codeTags = []; | |
$('figure').each(function(i0, el0) { | |
$(el0).find('td[class="code"] pre').each(function (i1, el1) { | |
var code = ''; | |
$(el1).find('span[class="line"]').each(function (i2, el2) { | |
code = code + $(el2).text() + '\n'; | |
}); | |
code = '\n```\n' + code + '```\n'; | |
var key = '__code__tag__' + i0; | |
codeTags[key] = code; | |
$(el0).replaceWith(key); | |
}); | |
}) | |
convertTable($); | |
return { body: $('span[itemprop="articleBody"]').html(), codeTags: codeTags }; | |
} | |
*/ | |
/**
 * Rewrite every <table> that has a header row into pipe-table text, in place.
 *
 * The replacement is `<br>`-separated: a header line, a `|---|` separator
 * line, then one line per `tbody tr` built from its `td` cells. Tables
 * without a <thead>, or whose <thead> has no <th>, are left untouched.
 *
 * @param {Function} $ - cheerio document; it is mutated in place
 */
function convertTable($) {
    // Cell text is re-inserted as HTML by replaceWith(), so escape it;
    // otherwise a cell such as `a<b>` would be parsed as markup and lost.
    function escapeHtml(text) {
        return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    }

    // Collect the escaped text of every element in a cheerio selection.
    function cellTexts(selection) {
        const texts = [];
        selection.each(function (cellIndex, cell) {
            texts.push(escapeHtml($(cell).text()));
        });
        return texts;
    }

    $('table').each(function (tableIndex, table) {
        const $table = $(table);

        // .has() returns a selection, which is always truthy; test the match count.
        if ($table.find('thead').length === 0) {
            return;
        }

        // Only header cells inside <thead>, gathered once per table.
        const headers = cellTexts($table.find('thead th'));
        if (headers.length === 0) {
            return;
        }

        const bodyRows = [];
        $table.find('tbody tr').each(function (rowIndex, row) {
            bodyRows.push(cellTexts($(row).find('td')));
        });

        const headerStr = '|' + headers.map((text) => text + '|').join('');
        const sep = '|' + headers.map(() => '---|').join('');
        const rowStr = bodyRows
            .map((cells) => '|' + cells.map((text) => text + '|').join('') + '<br>')
            .join('');

        $table.replaceWith('<br>' + headerStr + '<br>' + sep + '<br>' + rowStr);
    });
}
/**
 * Convert the article HTML to Markdown and put the fenced code blocks back
 * in place of their placeholder keys.
 *
 * @param {string} articleBody - article HTML containing placeholder keys
 * @param {Object<string, string>} articleCodeTags - placeholder key -> fenced
 *        code block; may be omitted when the article has no code
 * @returns {string} the Markdown text
 */
function getMarkdown(articleBody, articleCodeTags) {
    let mdStr = toMarkdown(articleBody);

    Object.keys(articleCodeTags || {}).forEach(function (key) {
        const codeBlock = articleCodeTags[key];
        // Function replacer: the code is inserted verbatim. With a string
        // replacement, sequences such as `$$`, `$&` or `$1` inside the code
        // would be interpreted as replacement patterns and corrupt it.
        mdStr = mdStr.replace(key, function () {
            return codeBlock;
        });
    });

    return mdStr
        // read-more marker
        .replace('<a id="more"></a>', '<!--more-->')
        // empty-text links that point at in-page anchors
        .replace(/\[\]\(\#(.+)\)/g, '')
        // no space between the fence and what follows it
        .replace(/``` /g, '```')
        // leftover figure wrappers
        .replace(/<figure class="half">|<\/figure>/g, '');
}
/**
 * Write one converted post to `../<mdDirName>/<fileName>.md` as UTF-8,
 * creating the output directory first when it does not exist, and log the
 * saved file name.
 *
 * @param {{fileName: string}} elements - extracted page parts; only
 *        `fileName` is read here
 * @param {string} header - front-matter header text
 * @param {string} mdStr - Markdown body
 */
function saveFile(elements, header, mdStr) {
    const dir = '../' + mdDirName + '/';
    if (!fs.existsSync(dir)) {
        fs.mkdirSync(dir);
    }

    const targetPath = dir + elements.fileName + '.md';
    fs.writeFileSync(targetPath, header + mdStr, 'utf8');
    console.log('Saved ' + elements.fileName);
}