Created
June 25, 2022 10:58
-
-
Save rfennell/0f2768e5e6da0c1eb384e62e2f632116 to your computer and use it in GitHub Desktop.
A quick edit to the blog2md (https://github.com/palaniraja/blog2md) tool to place exported posts from a multi-site WordPress server in folders based on the sub-site name
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
/***
Usage: blog2md b|w <BLOGGER/WordPress BACKUP XML> <OUTPUT DIR> [m|s] [paragraph-fix]
*/
const fs = require('fs'); | |
const os = require('os'); | |
const path = require('path'); | |
const xml2js = require('xml2js'); | |
const sanitize = require('sanitize-filename'); | |
const TurndownService = require('turndown'); | |
var moment = require('moment'); | |
// HTML -> Markdown converter shared by both importers (fenced code blocks).
var tds = new TurndownService({ codeBlockStyle: 'fenced', fence: '```' });

// Rule 'wppreblock': render every <pre> element as a fenced block.
tds.addRule('wppreblock', {
    filter: ['pre'],
    replacement: function (content) {
        return '```\n' + content + '\n```';
    }
});

// Positional CLI arguments: <b|w> <backup xml> <output dir> [m|s]
var option = process.argv[2];
var inputFile = process.argv[3];
var outputDir = process.argv[4];
/** 'm' merges comments into the post file; any other value means a separate comments file ('s'). */
var mergeComments = (process.argv[5] === 'm') ? 'm' : 's';
/** Apply a fix to WordPress posts to convert newlines to paragraphs. */
var applyParagraphFix = process.argv.includes('paragraph-fix');

if (process.argv.length < 5) {
    console.log(`Usage: blog2md [b|w] <BACKUP XML> <OUTPUT DIR> m|s`);
    console.log(`\t b for parsing Blogger(Blogspot) backup`);
    console.log(`\t w for parsing WordPress backup`);
    // Signal the usage error to the calling shell without aborting mid-write.
    process.exitCode = 1;
}
else {
    if (fs.existsSync(outputDir)) {
        console.log(`WARNING: Given output directory "${outputDir}" already exists. Files will be overwritten.`);
    }
    else {
        // recursive: also create missing parent directories instead of throwing ENOENT.
        fs.mkdirSync(outputDir, { recursive: true });
    }

    if (mergeComments === 'm') {
        console.log(`INFO: Comments requested to be merged along with posts. (m)`);
    }
    else {
        console.log(`INFO: Comments requested to be a separate .md file(s - default)`);
    }

    var importerKind = option.toLowerCase();
    if (importerKind === 'b') {
        bloggerImport(inputFile, outputDir);
    }
    else if (importerKind === 'w') {
        wordpressImport(inputFile, outputDir);
    }
    else {
        console.log('Only b (Blogger) and w (WordPress) are valid options');
        process.exitCode = 1;
    }
}
/**
 * Convert a WordPress export XML file into Markdown files, one per post.
 *
 * Each post is written to `<outputDir>/<site>/<post name>.md`, where <site> is
 * the fourth '/'-separated segment of the post's <link> (the path segment right
 * after the host), passed through `sanitize`. When that segment is missing the
 * post is written directly into `outputDir`. Posts whose status is "private"
 * or "inherit" are skipped. Approved comments are collected and handed to
 * `writeComments`.
 *
 * @param {string} backupXmlFile - path of the WordPress export XML to read.
 * @param {string} outputDir - directory that receives the per-site folders.
 */
function wordpressImport(backupXmlFile, outputDir){
    var parser = new xml2js.Parser();

    // First element of an xml2js value array, or `fallback` when the node is absent.
    function first(values, fallback){
        return (values && values.length) ? values[0] : fallback;
    }

    // Double apostrophes so the value is safe inside a single-quoted YAML scalar.
    function yamlQuote(text){
        return String(text).replace(/'/g, "''");
    }

    fs.readFile(backupXmlFile, function(readErr, data) {
        if (readErr) {
            console.log(`Error reading xml file (${backupXmlFile})\n${JSON.stringify(readErr)}`);
            return;
        }

        parser.parseString(data, function (err, result) {
            if (err) {
                console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
                return 1;
            }

            var channel = result && result.rss && first(result.rss.channel);
            var posts = (channel && channel.item) || [];
            console.log(`Total Post count: ${posts.length}`);

            posts = posts.filter(function(post){
                var status = post["wp:status"] ? post["wp:status"].join('') : '';
                return status !== "private" && status !== "inherit";
            });
            console.log(`Post count: ${posts.length}`);

            posts.forEach(function(post){
                var title = yamlQuote(String(first(post.title, '')).trim());
                var draft = first(post["wp:status"], '') === "draft";
                var published = first(post.pubDate, '');

                // Sub-site name: path segment following the host in the post link.
                var link = String(first(post.link, ''));
                var person = sanitize(link.split('/')[3] || '');
                console.log("Blog of "+person);

                // decodeURI throws on malformed percent-escapes; fall back to the raw slug.
                var rawName = String(first(post["wp:post_name"], ''));
                var decodedName;
                try {
                    decodedName = decodeURI(rawName);
                }
                catch (decodeErr) {
                    decodedName = rawName;
                }
                var baseName = sanitize(decodedName) || first(post["wp:post_id"], '');

                console.log(`\n\n\n\ntitle: '${title}'`);
                console.log(`published: '${published}'`);
                if (post['wp:comment']){
                    console.log(`comments: '${post['wp:comment'].length}'`);
                }

                // xml2js yields an object with the text under '_' when the element has attributes.
                var tags = (post.category || [])
                    .map(function (category){
                        return (typeof category === 'string') ? category : category['_'];
                    })
                    .filter(Boolean);
                var tagString = '';
                if (tags.length){
                    tagString = 'tags: [\'' + tags.map(yamlQuote).join("', '") + "']\n";
                }

                var postDir = person ? outputDir+'/'+person : outputDir;
                if (!fs.existsSync(postDir)) {
                    fs.mkdirSync(postDir, { recursive: true });
                }

                var fileHeader = `---\ntitle: '${title}'\ndate: ${published}\ndraft: ${draft}\n${tagString}---\n`;
                var pmap = {
                    fname: postDir+'/'+baseName+'-comments.md',
                    postName: postDir+'/'+baseName+'.md',
                    // Always set, so a separate comments file never starts with "undefined".
                    header: `${fileHeader}\n`,
                    comments: []
                };
                console.log(`fname: '${pmap.postName}'`);

                if (post["content:encoded"]){
                    var postContent = post["content:encoded"].toString();
                    if (applyParagraphFix && !/<p>/i.test(postContent)) {
                        postContent = '<p>' + postContent.replace(/(\r?\n){2}/g, '</p>\n\n<p>') + '</p>';
                    }
                    // Wrapped in a <div> to resolve an error if plain text is returned.
                    var markdown = tds.turndown('<div>'+postContent+'</div>');
                    writeToFile(pmap.postName, `${fileHeader}\n${markdown}`);
                }

                (post["wp:comment"] || []).forEach(function(comment){
                    // The value is a string, so "0", "spam" and "trash" are all truthy;
                    // only "1" marks an approved comment.
                    if (String(first(comment["wp:comment_approved"], '')).trim() !== '1'){
                        return;
                    }
                    var body = first(comment["wp:comment_content"], '');
                    pmap.comments.push({
                        title: '',
                        published: first(comment["wp:comment_date"], ''),
                        content: body ? tds.turndown('<div>'+body+'</div>') : '',
                        author: {
                            name: first(comment["wp:comment_author"], ''),
                            email: first(comment["wp:comment_author_email"], ''),
                            url: first(comment["wp:comment_author_url"], '')
                        }
                    });
                });

                // Re-use the Blogger writeComments method with a single-entry map.
                if (pmap.comments.length){
                    writeComments({"0": pmap});
                }
            });
        });
    });
}
/**
 * Build a lower-case, hyphen-separated file name from arbitrary text.
 *
 * @param {*} text - source text (typically a post title); null/undefined are
 *   treated as an empty string and other non-strings are stringified, because
 *   `sanitize` only accepts strings.
 * @returns {string} the slug; may be empty when `text` has no usable characters.
 */
function getFileName(text) {
    var source = (text == null) ? '' : String(text);
    var newFileName = sanitize(source)  // first remove any dodgy characters
        .replace(/[\.']/g, '')          // then remove some known characters
        .replace(/[^a-z0-9]/gi, '-')    // then turn anything that isn't a number or letter into a hyphen
        .replace(/[\-]{2,}/g, '-')      // then turn multiple hyphens into a single one
        .toLowerCase();                 // finally make it all lower case
    return newFileName;
}
/**
 * Convert a Blogger backup XML file into Markdown files, one per post.
 *
 * Feed entries whose id contains ".post-" are split into posts (no
 * 'thr:in-reply-to') and comments (with 'thr:in-reply-to'). Each post is
 * written to `<outputDir>/<slug of title>.md`; comments are attached to their
 * post by id and handed to `writeComments`.
 *
 * @param {string} backupXmlFile - path of the Blogger backup XML to read.
 * @param {string} outputDir - directory that receives the Markdown files.
 */
function bloggerImport(backupXmlFile, outputDir){
    var parser = new xml2js.Parser();

    // First element of an xml2js value array, or undefined when the node is absent.
    function first(values){
        return (values && values.length) ? values[0] : undefined;
    }

    // Text of an xml2js node: the '_' member when present, the node itself when it is a string.
    function textOf(node){
        if (typeof node === 'string'){
            return node;
        }
        return (node && typeof node['_'] === 'string') ? node['_'] : '';
    }

    function isPostEntry(entry){
        return entry.id[0].indexOf('.post-') !== -1;
    }

    fs.readFile(backupXmlFile, function(readErr, data) {
        if (readErr){
            console.log(`Error reading xml file (${backupXmlFile})\n${JSON.stringify(readErr)}`);
            return;
        }

        parser.parseString(data, function (err, result) {
            if (err){
                console.log(`Error parsing xml file (${backupXmlFile})\n${JSON.stringify(err)}`);
                return 1;
            }

            if (result && result.feed && result.feed.entry) {
                var contents = result.feed.entry;
                console.log(`Total no. of entries found : ${contents.length}`);

                var posts = contents.filter(function(entry){
                    return isPostEntry(entry) && !entry['thr:in-reply-to'];
                });
                var comments = contents.filter(function(entry){
                    return isPostEntry(entry) && entry['thr:in-reply-to'];
                });
                console.log(`Content-posts ${posts.length}`);
                console.log(`Content-Comments ${comments.length}`);

                var postMaps = {};

                posts.forEach(function(entry){
                    var postMap = { pid: entry.id[0].split('-').pop(), comments: [] };

                    // Apostrophes are doubled for the single-quoted YAML title.
                    var title = textOf(first(entry.title)).replace(/'/g, "''");
                    var published = first(entry.published) || '';

                    var control = first(entry['app:control']);
                    var draft = (control && first(control['app:draft']) == 'yes') ? 'true' : 'false';

                    console.log(`title: "${title}"`);
                    console.log(`date: ${published}`);
                    console.log(`draft: ${draft}`);

                    var urlLink = (entry.link || []).filter(function(link){
                        var attrs = link["$"];
                        return attrs && attrs.type && attrs.rel && attrs.rel=='alternate' && attrs.type=='text/html';
                    });
                    var url = (urlLink[0] && urlLink[0]['$'].href) || '';

                    // Fall back to the post id when the title yields an empty slug.
                    var basePath = outputDir + '/' + path.basename(getFileName(title) || postMap.pid);
                    postMap.postName = basePath + '.md';
                    postMap.fname = basePath + '-comments.md';
                    console.log(postMap.postName);

                    // Computed per post so an entry without content never inherits
                    // the Markdown of the previously processed post.
                    var content = textOf(first(entry.content));
                    var markdown = content ? tds.turndown(content) : '';

                    var categories = entry.category || [];
                    var tags = categories
                        .filter(function (tag){
                            return tag['$'] && tag['$'].term && tag['$'].term.indexOf('http://schemas.google')==-1;
                        })
                        .map(function (tag){
                            return tag['$'].term;
                        });
                    console.log(`No of category: ${categories.length}`);

                    var tagList = tags.map(a=> '- '+a).join('\n');
                    console.log(`tags: \n${tagList}\n`);
                    var tagString = tags.length ? `tags: \n${tagList}\n` : '';

                    console.dir(postMap);
                    console.log("\n\n\n\n\n");

                    // Keep only the path part of the original URL.
                    var alias = url.replace(/^.*\/\/[^\/]+/, '');
                    var fileHeader = `---\ntitle: '${title}'\ndate: ${published}\ndraft: ${draft}\nurl: ${alias}\n${tagString}---\n`;

                    postMap.header = fileHeader;
                    postMaps[postMap.pid] = postMap;
                    writeToFile(postMap.postName, `${fileHeader}\n${markdown}`);
                });

                comments.forEach(function(entry){
                    var replyTo = first(entry['thr:in-reply-to']);
                    var postId = path.basename((replyTo && replyTo["$"] && replyTo["$"]["source"]) || '');

                    // A comment may reference an entry that was not exported as a post.
                    if (!Object.prototype.hasOwnProperty.call(postMaps, postId)){
                        console.log(`WARNING: skipping comment for unknown post id "${postId}"`);
                        return;
                    }

                    var author = first(entry['author']) || {};
                    var comment = {
                        published: first(entry['published']) || '',
                        title: '',
                        content: '',
                        author: {
                            name: first(author["name"]) || '',
                            email: first(author["email"]) || '',
                            url: first(author["uri"]) || ''
                        }
                    };

                    var titleText = textOf(first(entry['title']));
                    if (titleText){
                        comment.title = tds.turndown(titleText);
                    }
                    var contentText = textOf(first(entry['content']));
                    if (contentText){
                        comment.content = tds.turndown(contentText);
                    }

                    postMaps[postId].comments.push(comment);
                });

                writeComments(postMaps);
            }
            console.log('Done');
        });
    });
}
/**
 * Write the collected comments of every post.
 *
 * Depending on the module-level `mergeComments` flag the rendered comments are
 * either appended to the post's own file ('m') or written, preceded by the
 * post header, to the post's separate comments file (any other value).
 * Posts without comments produce no output.
 *
 * @param {Object} postMaps - map of post id -> { postName, fname, header, comments };
 *   each comment is { title, published, content, author: { name, url, email } }.
 */
function writeComments(postMaps){
    var merge = (mergeComments === 'm');
    if (merge){
        console.log('DEBUG: merge comments requested');
    }else{
        console.log('DEBUG: separate comments requested (defaulted)');
    }

    Object.keys(postMaps).forEach(function(postId){
        var postMap = postMaps[postId];
        var comments = postMap.comments;
        console.log(`post id: ${postId} has ${comments.length} comments`);
        if (!comments.length){
            return;
        }

        var ccontent = '';
        comments.forEach(function(comment){
            // Moment token "D" is the day of the month; lower-case "d" is the day of the week (0-6).
            var readableDate = '<time datetime="'+comment.published+'">' + moment(comment.published).format("MMM D, YYYY") + '</time>';
            ccontent += `#### ${comment.title}\n[${comment.author.name}](${comment.author.url} "${comment.author.email}") - ${readableDate}\n\n${comment.content}\n<hr />\n`;
        });

        if (merge){
            writeToFile(postMap.postName, `\n---\n### Comments:\n${ccontent}`, true);
        }else{
            writeToFile(postMap.fname, `${postMap.header}\n${ccontent}`);
        }
    });
}
/**
 * Write or append text to a file synchronously, logging the outcome.
 *
 * Failures are logged rather than thrown so one bad file does not abort the
 * whole import.
 *
 * @param {string} filename - path of the file to write.
 * @param {string} content - text to write.
 * @param {boolean} [append=false] - append to the file instead of replacing it.
 * @returns {boolean} true when the file was written, false when writing failed.
 */
function writeToFile(filename, content, append=false){
    var wording = append
        ? { intent: 'append to', success: 'appended to', failure: 'appending to' }
        : { intent: 'write to', success: 'written to', failure: 'writing to' };

    console.log(`DEBUG: going to ${wording.intent} ${filename}`);
    try{
        if (append){
            fs.appendFileSync(filename, content);
        }else{
            fs.writeFileSync(filename, content);
        }
        console.log(`Successfully ${wording.success} ${filename}`);
        return true;
    }
    catch(err){
        // JSON.stringify drops the message of a plain Error, so prefer err.message.
        var reason = (err && err.message) ? err.message : JSON.stringify(err);
        console.log(`Error while ${wording.failure} ${filename} - ${reason}`);
        console.dir(err);
        return false;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment