Skip to content

Instantly share code, notes, and snippets.

@Aevit
Created September 5, 2017 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Aevit/ccb018e1ac6de50e2f4d631ca97f2bc7 to your computer and use it in GitHub Desktop.
Save Aevit/ccb018e1ac6de50e2f4d631ca97f2bc7 to your computer and use it in GitHub Desktop.
A Node.js command that converts HTML files generated by Hexo with the "next" theme into Markdown files. Install the dependencies with npm first: [ cheerio to-markdown walk ].

A Node.js command that converts HTML files generated by Hexo with the next theme into Markdown files.

First use npm to install cheerio, to-markdown and walk, and then run the script with the node command.

#!/usr/bin/env node
"use strict";
// 检查 node 版本
const execSync = require('child_process').execSync;
/**
 * Reports whether a Node.js version string is older than a required major
 * version. The comparison is numeric, so '10.0.0' is correctly treated as
 * newer than '8.0.0' (a plain string comparison gets this wrong).
 *
 * @param {string} version - Version such as '8.11.3' or 'v8.11.3'.
 * @param {number} minMajor - Lowest acceptable major version.
 * @returns {boolean} true when the major version is below minMajor.
 */
function isNodeVersionBelow(version, minMajor) {
  const major = Number.parseInt(String(version).replace(/^v/, '').split('.')[0], 10);
  return major < minMajor;
}

// Version of the interpreter actually running this script (no leading 'v').
const nodeVer = process.versions.node;
if (isNodeVersionBelow(nodeVer, 8)) {
  // Print the message in red using ANSI escape codes, then exit with a
  // non-zero status so callers can detect the failure.
  console.error('\x1b[31m%s\x1b[0m', '当前 node 版本:' + nodeVer + ',请更新至 8.0.0 或以上(建议使用 nvm 管理 node)');
  process.exit(1);
}
// Startup banner. The literal is a single string: every source line ends with
// a backslash line continuation, and the explicit \n escapes supply the line
// breaks of the ASCII art. Doubled backslashes (\\) render as one backslash.
var ascii = "\
\n ('-. ('-. (`-. .-') _ \
\n ( OO ).-. _( OO) _(OO )_ ( OO) ) \
\n / . --. / (,------. ,--(_/ ,. \\ ,-.-') / '._ \
\n | \\-. \\ | .---' \\ \\ /(__/ | |OO) |'--...__) \
\n.-'-' | | | | \\ \\ / / | | \\ '--. .--' \
\n \\| |_.' | (| '--. \\ ' /, | |(_/ | | \
\n | .-. | | .--' \\ /__) ,| |_.' | | \
\n | | | | | `---. \\ / (_| | | | \
\n `--' `--' `------' `-' `--' `--' \
";
// Print the banner once, before any conversion work starts.
console.log(ascii);
var fs = require('fs');
var walk = require ('walk');
var toMarkdown = require('to-markdown');
var cheerio = require('cheerio');
var URL = require('url').URL;
// Name of the output directory (created next to the scanned tree) that
// receives the generated Markdown files; also excluded from the walk.
const mdDirName = '_posts_bk';
// Paths of every index.html found by the directory walk. Declared before the
// walk is started so the walker callbacks never depend on hoisting or on the
// timing of the first 'file' event.
const files = [];
walkFiles();
/**
 * Walks the parent directory ('..'), collects every file named index.html
 * into the module-level `files` array, and converts each collected file once
 * the walk has finished.
 *
 * The `filters` option is passed to the `walk` package; presumably these are
 * directory names to skip (the script's own folder, node_modules, the output
 * directory and the "about" page) - confirm against the walk documentation.
 *
 * A failure while converting one file is logged and does not stop the
 * remaining files from being converted.
 */
function walkFiles() {
  const walker = walk.walk('..', { filters: ['hexo_html_2_md', 'node_modules', mdDirName, 'about'] });

  walker.on('file', function (root, stat, next) {
    if (stat.name === 'index.html') {
      files.push(root + '/' + stat.name);
    }
    // The walker waits for next() before moving on to the following entry.
    next();
  });

  walker.on('end', function () {
    files.forEach(function (filePath, index) {
      // Guard each file separately: one malformed page must not abort the
      // conversion of every page that follows it.
      try {
        startConvert(filePath);
      } catch (error) {
        console.error('error ' + index + ': ' + filePath + ': ', error.message);
      }
    });
  });
}
/**
 * Converts a single generated HTML page into a Markdown post: extracts the
 * page data, builds the front matter, renders the body and writes the file.
 *
 * @param {string} filePath - Path of the HTML file to convert.
 */
function startConvert(filePath) {
  const parsed = getElememts(filePath);
  const frontMatter = getHeader(parsed);
  const { articleBody, articleCodeTags } = parsed;
  const markdown = getMarkdown(articleBody, articleCodeTags);
  saveFile(parsed, frontMatter, markdown);
}
// ------------ private method
/**
 * Reads a generated HTML page and extracts everything needed to rebuild the
 * Markdown post.
 *
 * @param {string} htmlFilePath - Path of the HTML file to read.
 * @returns {{articleBody: string, articleCodeTags: Object, fileName: string,
 *   title: string, date: string, tags: string, category: string,
 *   layout: string}} The extracted page data.
 * @throws {Error} When a required meta/time attribute is missing from the
 *   page, naming the selector and the file.
 */
function getElememts(htmlFilePath) {
  const htmlStr = fs.readFileSync(htmlFilePath, 'utf8');
  const $ = cheerio.load(htmlStr);
  const elements = {};

  // Reads a mandatory attribute and fails with a message that names the
  // selector and file, instead of a TypeError on `undefined` further down.
  function requireAttr(selector, attrName) {
    const value = $(selector).attr(attrName);
    if (value === undefined || value === null) {
      throw new Error('missing "' + attrName + '" on ' + selector + ' in ' + htmlFilePath);
    }
    return value;
  }

  // article body (code blocks are swapped for placeholders, tables flattened)
  const allBody = convertCodeTag($);
  elements.articleBody = allBody.body;
  elements.articleCodeTags = allBody.codeTags;

  // file name
  const pageUrl = requireAttr('meta[property="og:url"]', 'content');
  if (pageUrl.includes('/about/index.html')) {
    elements.fileName = 'about';
  } else {
    const pathnames = new URL(pageUrl).pathname.split('/');
    // Index 4 is the segment after three leading path parts; presumably the
    // post slug of a /year/month/day/slug/ permalink - verify for other
    // permalink formats.
    elements.fileName = decodeURI(pathnames[4]);
  }

  // title (every percent-encoded pi is restored, not only the first one)
  elements.title = requireAttr('meta[property="og:title"]', 'content').replace(/%CF%80/g, 'π');

  // date: turn the "T" separator into a space and drop a "+..." suffix
  elements.date = requireAttr('time[itemprop="dateCreated"]', 'datetime')
    .replace(/T/, ' ')
    .replace(/\+(.+)/, '');

  // tags: keywords list without a trailing comma, wrapped in brackets
  elements.tags = '[' + requireAttr('meta[name="keywords"]', 'content').replace(/,$/gi, '') + ']';

  // category
  elements.category = $('a[itemprop="url"][rel="index"] span[itemprop="name"]').text();

  // layout
  elements.layout = 'post';

  return elements;
}
/**
 * Builds the front-matter block placed at the top of the Markdown file.
 *
 * @param {Object} elements - Page data; reads title, date, tags, category
 *   and layout.
 * @returns {string} Front matter delimited by '---' lines, followed by a
 *   blank line.
 */
function getHeader(elements) {
  const fieldNames = ['title', 'date', 'tags', 'category', 'layout'];
  const fieldLines = fieldNames.map((name) => `${name}: ${elements[name]} \n`);
  return `---\n${fieldLines.join('')}\n---\n\n`;
}
/**
 * Replaces every highlighted code figure in the document with a unique
 * placeholder key and records the fenced Markdown code block for that key.
 * The placeholders are swapped back in after the HTML-to-Markdown pass.
 * Tables are converted afterwards via convertTable.
 *
 * @param {Function} $ - Loaded cheerio document; modified in place.
 * @returns {{body: string, codeTags: Object}} The article body HTML and a
 *   map from placeholder key to fenced code block.
 */
function convertCodeTag($) {
  // Plain object: the entries are keyed by placeholder strings, not indexes.
  const codeTags = {};

  $('figure').each(function (figureIndex, figure) {
    const codeTagHtml = $(figure).find('td[class="code"] pre').html();
    if (!codeTagHtml) {
      return; // not a code figure; leave it untouched
    }

    // Lines are separated by <br>; strip the markup of each line and keep
    // only its text.
    const code = codeTagHtml
      .split('<br>')
      .map(function (lineHtml) {
        return cheerio.load(lineHtml).text();
      })
      .join('\n');

    const key = '__code__tag__' + figureIndex;
    codeTags[key] = '\n```\n' + code + '```\n';
    $(figure).replaceWith(key);
  });

  convertTable($);

  return { body: $('span[itemprop="articleBody"]').html(), codeTags: codeTags };
}
/*
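// Uses <span class="line"> as the line separator, but some pages have an outer <span class="line"> that contains further <span class="line"> elements, so converting this way goes wrong
// i.e. this converts incorrectly: <span class="line"> test1 <span class="line"> test2 </span> </span>
// while this converts correctly: <span class="line"> test1 </span> <span class="line"> test2 </span>
// In practice only <br> works reliably as the line separator, so the method above is used instead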
function convertCodeTag_old($) {
var codeTags = [];
$('figure').each(function(i0, el0) {
$(el0).find('td[class="code"] pre').each(function (i1, el1) {
var code = '';
$(el1).find('span[class="line"]').each(function (i2, el2) {
code = code + $(el2).text() + '\n';
});
code = '\n```\n' + code + '```\n';
var key = '__code__tag__' + i0;
codeTags[key] = code;
$(el0).replaceWith(key);
});
})
convertTable($);
return { body: $('span[itemprop="articleBody"]').html(), codeTags: codeTags };
}
*/
/**
 * Rewrites every table that has a <thead> and at least one header cell as
 * pipe-delimited Markdown table text, with rows separated by <br>. Tables
 * without a <thead> or without header cells are left unchanged.
 *
 * @param {Function} $ - Loaded cheerio document; modified in place.
 */
function convertTable($) {
  $('table').each(function (tableIndex, table) {
    const $table = $(table);

    // .has() returns a selection (always truthy), so test the match count.
    if ($table.find('thead').length === 0) {
      return;
    }

    const headers = [];
    $table.find('tr th').each(function (thIndex, th) {
      headers.push($(th).text());
    });
    if (headers.length === 0) {
      return;
    }

    // Collected once per table, so rows are never duplicated.
    const bodies = [];
    $table.find('tbody tr').each(function (trIndex, tr) {
      const cells = [];
      $(tr).find('td').each(function (tdIndex, td) {
        cells.push($(td).text());
      });
      bodies.push(cells);
    });

    const headerStr = '|' + headers.map((text) => text + '|').join('');
    const sep = '|' + headers.map(() => '---|').join('');
    const rowStr = bodies
      .map((cells) => '|' + cells.map((text) => text + '|').join('') + '<br>')
      .join('');

    $table.replaceWith('<br>' + headerStr + '<br>' + sep + '<br>' + rowStr);
  });
}
/**
 * Converts the article HTML to Markdown, restores the code blocks that were
 * swapped out for placeholder keys, and tidies leftover markup.
 *
 * @param {string} articleBody - Article HTML containing placeholder keys.
 * @param {Object} articleCodeTags - Map from placeholder key to fenced code
 *   block.
 * @returns {string} The Markdown body.
 */
function getMarkdown(articleBody, articleCodeTags) {
  let mdStr = toMarkdown(articleBody);

  for (const key in articleCodeTags) {
    if (Object.prototype.hasOwnProperty.call(articleCodeTags, key)) {
      const codeBlock = articleCodeTags[key];
      // A replacer function inserts the code verbatim; a string argument
      // would interpret sequences such as $&, $' or $$ inside the code.
      mdStr = mdStr.replace(key, function () {
        return codeBlock;
      });
    }
  }

  return mdStr
    .replace('<a id="more"></a>', '<!--more-->') // read-more marker
    .replace(/\[\]\(\#(.+)\)/g, '') // empty links that point at an anchor
    .replace(/``` /g, '```')
    .replace(/<figure class="half">|<\/figure>/g, '');
}
/**
 * Writes one converted post to the output directory, creating that
 * directory on first use, and logs the saved file name.
 *
 * @param {Object} elements - Page data; `fileName` names the output file.
 * @param {string} header - Front-matter block.
 * @param {string} mdStr - Markdown body.
 */
function saveFile(elements, header, mdStr) {
  const outputDir = `../${mdDirName}/`;
  const dirExists = fs.existsSync(outputDir);
  if (!dirExists) {
    fs.mkdirSync(outputDir);
  }

  const targetPath = `${outputDir}${elements.fileName}.md`;
  const content = header + mdStr;
  fs.writeFileSync(targetPath, content, 'utf8');

  console.log('Saved ' + elements.fileName);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment