Skip to content

Instantly share code, notes, and snippets.

@iChris
Last active September 13, 2019 03:21
Show Gist options
  • Save iChris/06efd9479df054c84bab83dd3645dfb4 to your computer and use it in GitHub Desktop.
Save iChris/06efd9479df054c84bab83dd3645dfb4 to your computer and use it in GitHub Desktop.
Blog2Markdown Node Script Converter
'use strict';
/***
Usage: blog2md b|w <BLOGGER/WordPress BACKUP XML> <OUTPUT DIR>
*/
const fs = require('fs');
const os = require('os');
const path = require('path');
const xml2js = require('xml2js');
const TurndownService = require('turndown');
var moment = require('moment');
var tds = new TurndownService({ codeBlockStyle: 'fenced', fence: '```' })
tds.addRule('wppreblock', {
filter: ['pre'],
replacement: function(content) {
return '```\n' + content + '\n```'
}
})
// console.log(`No. of arguments passed: ${process.argv.length}`);
if (process.argv.length < 5){
// ${process.argv[1]}
console.log(`Usage: blog2md [b|w] <BACKUP XML> <OUTPUT DIR> m|s`)
console.log(`\t b for parsing Blogger(Blogspot) backup`);
console.log(`\t w for parsing WordPress backup`);
return 1;
}
var option = process.argv[2];
var inputFile = process.argv[3];
var outputDir = process.argv[4];
var mergeComments = (process.argv[5] == 'm')?'m':'s' ;
if (fs.existsSync(outputDir)) {
console.log(`WARNING: Given output directory "${outputDir}" already exists. Files will be overwritten.`)
}
else{
fs.mkdirSync(outputDir);
}
if (mergeComments == 'm'){
console.log(`INFO: Comments requested to be merged along with posts. (m)`);
}
else{
console.log(`INFO: Comments requested to be a separate .md file(m - default)`);
}
if( option.toLowerCase() == 'b'){
bloggerImport(inputFile, outputDir);
}
else if(option.toLowerCase() == 'w'){
wordpressImport(inputFile, outputDir);
}
else {
console.log('Only b (Blogger) and w (WordPress) are valid options');
return;
}
function wordpressImport(backupXmlFile, outputDir){
var parser = new xml2js.Parser();
fs.readFile(backupXmlFile, function(err, data) {
parser.parseString(data, function (err, result) {
if (err) {
console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
return 1;
}
// console.dir(result);
// console.log(JSON.stringify(result)); return;
var posts = [];
// try {
posts = result.rss.channel[0].item;
console.log(`Total Post count: ${posts.length}`);
posts = posts.filter(function(post){
var status = '';
if(post["wp:status"]){
status = post["wp:status"].join('');
}
// console.log(post["wp:status"].join(''));
return status != "private" && status != "inherit"
});
// console.log(posts)
console.log(`Post count: ${posts.length}`);
var title = '';
var content = '';
var tags = [];
var published = '';
var comments = [];
var fname = '';
var markdown = '';
var fileContent = '';
var fileHeader = '';
var postMaps = {};
posts.forEach(function(post){
var postMap = {};
title = post.title[0].trim();
// console.log(title);
// if (title && title.indexOf("'")!=-1){
title = title.replace(/'/g, "''");
// }
published = post.pubDate;
comments = post['wp:comment'];
fname = post["wp:post_name"][0] || post["wp:post_id"];
markdown = '';
// if (post.guid && post.guid[0] && post.guid[0]['_']){
// fname = path.basename(post.guid[0]['_']);
// }
// console.log(comments);
console.log(`\n\n\n\ntitle: '${title}'`);
console.log(`published: '${published}'`);
if (comments){
console.log(`comments: '${comments.length}'`);
}
tags = [];
var categories = post.category;
var tagString = '';
if (categories && categories.length){
categories.forEach(function (category){
// console.log(category['_']);
tags.push(category['_']);
});
// console.log(tags.join(", "));
// tags = tags.join(", ");
tagString = 'tags: [' + tags.join(", ") + "]\n";
// console.log(tagString);
}
var pmap = {fname:'', comments:[]};
pmap.fname = outputDir+'/'+fname+'-comments.md';
// temp date var
var d = new Date( post.pubDate );
var formattedDate = `${d.getFullYear()}-${('0'+(d.getMonth()+1)).slice(-2)}-${('0'+d.getDate()).slice(-2)}`;
fname = outputDir+'/'+fname+'.md';
pmap.postName = fname;
console.log(`fname: '${fname}'`);
if (post["content:encoded"]){
// console.log('content available');
content = '<div>'+post["content:encoded"]+'</div>'; //to resolve error if plain text returned
markdown = tds.turndown(content);
// console.log(markdown);
fileHeader = `---\ntitle: '${title}'\ndate: ${formattedDate}\ndraft: false\n${tagString}\n---\n`;
fileContent = `${fileHeader}\n${markdown}`;
pmap.header = `${fileHeader}\n`;
// fileContent = `---\ntitle: '${title}'\ndate: ${published}\ndraft: false\n${tagString}---\n\n${markdown}`;
writeToFile(fname, fileContent);
}
//comments:
/*
"wp:comment" [.each]
wp:comment_author[0]
wp:comment_author_email[0]
wp:comment_author_url[0]
wp:comment_date[0]
wp:comment_content[0]
wp:comment_approved[0] == 1
wp:post_id
*/
var comments = post["wp:comment"] || [];
// console.dir(comments);
var anyApprovedComments = 0;
var ccontent = '';
comments.forEach(function(comment){
// console.log('')
if(comment["wp:comment_approved"].pop()){
anyApprovedComments = 1;
var cmt = {title:'', published:'', content:'', author:{}};
cmt.published = (comment["wp:comment_date"]?comment["wp:comment_date"].pop():'');
var cont = '<div>'+comment["wp:comment_content"].pop()+'</div>';
cmt.content = (comment["wp:comment_content"]?tds.turndown(cont):'');
cmt.author.name = (comment["wp:comment_author"]?comment["wp:comment_author"].pop():'');
cmt.author.email = (comment["wp:comment_author_email"]?comment["wp:comment_author_email"].pop():'');
cmt.author.url = (comment["wp:comment_author_url"]?comment["wp:comment_author_url"].pop():'');
ccontent += `#### [${cmt.author.name}](${cmt.author.url} "${cmt.author.email}") - ${cmt.published}\n\n${cmt.content}\n<hr />\n`;
pmap.comments.push(cmt);
}
});
//just a hack to re-use blogger writecomments method
if (pmap && pmap.comments && pmap.comments.length){
writeComments({"0": pmap});
}
});
});
});
}
function bloggerImport(backupXmlFile, outputDir){
var parser = new xml2js.Parser();
// __dirname + '/foo.xml'
fs.readFile(backupXmlFile, function(err, data) {
parser.parseString(data, function (err, result) {
if (err){
console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`); return 1;
}
// console.dir(JSON.stringify(result)); return;
if(result.feed && result.feed.entry) {
var contents = result.feed.entry;
console.log(`Total no. of entries found : ${contents.length}`);
// var i=0
var posts = contents.filter(function(entry){
return entry.id[0].indexOf('.post-')!=-1 && !entry['thr:in-reply-to']
});
var comments = contents.filter(function(entry){
return entry.id[0].indexOf('.post-')!=-1 && entry['thr:in-reply-to']
});
// console.dir(posts);
console.log(`Content-posts ${posts.length}`);
console.log(`Content-Comments ${comments.length}`);
var content = '';
var markdown = '';
var fileContent = '';
var fileHeader = '';
var postMaps = {};
posts.forEach(function(entry){
var postMap = {};
var title = entry.title[0]['_'];
// title = tds.turndown(title);
if (title && title.indexOf("'")!=-1){
title = title.replace(/'/g, "''");
}
postMap.pid = entry.id[0].split('-').pop()
var published = entry.published;
var draft = 'false';
console.log(`title: "${title}"`);
console.log(`date: ${published}`);
console.log(`draft: false`);
var links = entry.link;
var urlLink = entry.link.filter(function(link){
return link["$"].type && link["$"].rel && link["$"].rel=='alternate' && link["$"].type=='text/html'
});
var url=''
// console.dir(urlLink[0]);
if (urlLink && urlLink[0] && urlLink[0]['$'] && urlLink[0]['$'].href){
url = urlLink[0]['$'].href;
var fname = outputDir + '/' + path.basename(url);
fname = fname.replace('.html', '.md')
console.log(fname);
postMap.postName = fname
postMap.fname = fname.replace('.md', '-comments.md');
postMap.comments = [];
if (entry.content && entry.content[0] && entry.content[0]['_']){
// console.log('content available');
content = entry.content[0]['_'];
markdown = tds.turndown(content);
// console.log(markdown);
}
var tagLabel = [];
var tags = [];
tagLabel = entry.category.filter(function (tag){
// console.log(`tagged against :${tag['$'].term}`);
return tag['$'].term && tag['$'].term.indexOf('http://schemas.google')==-1;
});
console.log(`No of category: ${entry.category.length}`);
tagLabel.forEach(function(tag){
// console.log(`tagged against :${tag['$'].term}`);
tags.push(tag['$'].term);
});
console.log(`tags : [${tags.join(', ')}]`);
var tagString='';
if(tags.length){
tagString=`tags : [${tags.join(', ')}]\n`;
}
console.dir(postMap);
console.log("\n\n\n\n\n");
fileHeader = `---\ntitle: '${title}'\ndate: ${published}\ndraft: false\n${tagString}---\n`;
fileContent = `${fileHeader}\n${markdown}`;
postMap.header = fileHeader;
postMaps[postMap.pid] = postMap;
writeToFile(fname, fileContent)
}
});
comments.forEach(function(entry){
// var commentMap = {};
var comment = {published:'', title:'', content:''};
var postId = entry['thr:in-reply-to'][0]["$"]["source"];
postId = path.basename(postId);
comment.published = entry['published'][0];
if(entry['title'][0] && entry['title'][0]["_"]){
comment.title = tds.turndown(entry['title'][0]["_"]);
}
if (entry['content'][0] && entry['content'][0]["_"]){
comment.content = tds.turndown(entry['content'][0]["_"]);
}
comment.author = {name: '', email: '', url: ''};
if(entry['author'][0]["name"] && entry['author'][0]["name"][0]){
comment.author.name = entry['author'][0]["name"][0];
}
if (entry['author'][0]["email"] && entry['author'][0]["email"][0]){
comment.author.email = entry['author'][0]["email"][0];
}
if (entry['author'][0]["uri"] && entry['author'][0]["uri"][0]){
comment.author.url = entry['author'][0]["uri"][0];
}
postMaps[postId].comments.push(comment);
});
// console.log(JSON.stringify(postMaps)); return;
writeComments(postMaps);
}
console.log('Done');
});
});
}
function writeComments(postMaps){
if (mergeComments == 'm'){
console.log('DEBUG: merge comments requested');
}else{
console.log('DEBUG: separate comments requested (defaulted)');
}
for (var pmap in postMaps){
var comments = postMaps[pmap].comments;
console.log(`post id: ${pmap} has ${comments.length} comments`);
// console.dir(comments);
if (comments.length){
var ccontent = '';
comments.forEach(function(comment){
var readableDate = '<time datetime="'+comment.published+'">' + moment(comment.published).format("MMM d, YYYY") + '</time>';
ccontent += `#### ${comment.title}\n[${comment.author.name}](${comment.author.url} "${comment.author.email}") - ${readableDate}\n\n${comment.content}\n<hr />\n`;
});
if (mergeComments == 'm'){
writeToFile(postMaps[pmap].postName, `\n---\n### Comments:\n${ccontent}`, true);
}else{
writeToFile(postMaps[pmap].fname, `${postMaps[pmap].header}\n${ccontent}`);
}
}
}
}
function writeToFile(filename, content, append=false){
if(append){
console.log(`DEBUG: going to append to ${filename}`);
try{
fs.appendFileSync(filename, content);
console.log(`Successfully appended to ${filename}`);
}
catch(err){
console.log(`Error while appending to ${filename} - ${JSON.stringify(err)}`);
console.dir(err);
}
}else{
console.log(`DEBUG: going to write to ${filename}`);
try{
fs.writeFileSync(filename, content);
console.log(`Successfully written to ${filename}`);
}
catch(err){
console.log(`Error while writing to ${filename} - ${JSON.stringify(err)}`);
console.dir(err);
}
}
}
@iChris
Copy link
Author

iChris commented Sep 12, 2019

Right now the date is output in a date: Fri, 10 Nov 2017 18:03:49 +0000 format but I'd like it to be in YYYY-MM-DD format instead. The time of day isn't important for my purposes. (Converting WordPress Export.XML to a .MD file for importing to Eleventy).

@davatron5000
Copy link

try something like this?

// temp date var
var d = new Date( post.pubDate );
var formattedDate = `${d.getFullYear()}-${d.getMonth()+1}-${d.getDate()}`;

@iChris
Copy link
Author

iChris commented Sep 12, 2019

That's close. I get this error on build:

> date front matter value (2016-2-24) is invalid for ./site/blog/%e2%96%ba-dailyish-162-shure-thing.md (Error):
    Error: date front matter value (2016-2-24) is invalid for ./site/blog/%e2%96%ba-dailyish-162-shure-thing.md
        at Template.getMappedDate (/Users/chrisenns/Sites/chrisenns-eleventy/node_modules/@11ty/eleventy/src/Template.js:611:19)

Which I think is due to the month being a single digit?

i.e. 6 instead of 06? date: 2012-6-13 is what's in the posts now?

I'm not sure if Eleventy requires it exactly as 2012-06-13 or not? Support docs examples use the full number.

@davatron5000
Copy link

Gotta do a
Little string formatting then...

// temp date var
var d = new Date( post.pubDate );
var formattedDate = `${d.getFullYear()}-${('0'+(d.getMonth()+1)).slice(-2)}-${('0'+d.getDate()).slice(-2)}`;

I did this on my phone. Might not be great.

@mattsteele
Copy link

mattsteele commented Sep 12, 2019

Q: I'm not sure if Eleventy requires it exactly as 2012-06-13 or not?
A: Yes, I get the same error when I use one digit instead of two

Error: date front matter value (2019-01-1) is invalid for ./src/work/aces-editors/aces-editors.md at Template.getMappedDate (/Users/mattsteele/Code/mattsteele/node_modules/@11ty/eleventy/src/Template.js:613:19)

@iChris
Copy link
Author

iChris commented Sep 13, 2019

I did this on my phone. Might not be great.

That worked! Thanks Dave & Matt!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment