// htmltomarkdown.js
// Created December 18, 2012 17:27
// Save markandey/4329992 to your computer and use it in GitHub Desktop.
// This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
// Learn more about bidirectional Unicode characters
/**
 * Converts an HTML string to Markdown using regular-expression substitution.
 *
 * Processing order: per-tag replacement (see ELEMENTS, applied in array
 * order), escaping of text that looks like an ordered-list marker, list
 * conversion (innermost lists first), blockquote conversion (innermost
 * first), and a final whitespace clean-up.
 *
 * This is a regex-based converter, not an HTML parser: tags that are not
 * listed below are left in the output untouched.
 *
 * @param {string} string HTML source. null/undefined is treated as ''; any
 *     other value is converted with String().
 * @returns {string} The Markdown text.
 */
var toMarkdown = function(string) {
  string = string == null ? '' : String(string);

  // Markdown only treats a line as a code block when it is indented by at
  // least four spaces (or a tab).
  var CODE_INDENT = '    ';

  // Each entry maps one tag name (or several) to its Markdown replacement.
  // `type: 'void'` marks elements without a closing tag. A function
  // replacement receives (wholeMatch, ...captureGroups).
  var ELEMENTS = [{
    patterns: 'p',
    replacement: function(str, attrs, innerHTML) {
      return innerHTML ? '\n\n' + innerHTML + '\n' : '';
    }
  }, {
    patterns: 'br',
    type: 'void',
    replacement: '\n'
  }, {
    // The tag pattern carries its own capture group, so the heading level
    // arrives as an extra argument before attrs.
    patterns: 'h([1-6])',
    replacement: function(str, hLevel, attrs, innerHTML) {
      var hPrefix = new Array(parseInt(hLevel, 10) + 1).join('#');
      return '\n\n' + hPrefix + ' ' + innerHTML + '\n';
    }
  }, {
    patterns: 'hr',
    type: 'void',
    replacement: '\n\n* * *\n'
  }, {
    patterns: 'a',
    replacement: function(str, attrs, innerHTML) {
      var href = attrs.match(attrRegExp('href')),
          title = attrs.match(attrRegExp('title'));
      // Anchors without an href are left as raw HTML.
      return href ? '[' + innerHTML + ']' + '(' + href[1] + (title && title[1] ? ' "' + title[1] + '"' : '') + ')' : str;
    }
  }, {
    patterns: ['b', 'strong'],
    replacement: function(str, attrs, innerHTML) {
      return innerHTML ? '**' + innerHTML + '**' : '';
    }
  }, {
    patterns: ['i', 'em'],
    replacement: function(str, attrs, innerHTML) {
      return innerHTML ? '_' + innerHTML + '_' : '';
    }
  }, {
    patterns: 'code',
    replacement: function(str, attrs, innerHTML) {
      return innerHTML ? '`' + innerHTML + '`' : '';
    }
  }, {
    // Runs after `code`, so <pre><code>x</code></pre> arrives here as
    // <pre>`x`</pre>. That shape becomes an indented code block; any other
    // <pre> content is wrapped in backticks.
    patterns: 'pre',
    replacement: function(str, attrs, innerHTML) {
      if (!innerHTML) {
        return '';
      }
      var wrappedCode = innerHTML.match(/^`([\s\S]*)`$/);
      return wrappedCode ? indentCodeBlock(wrappedCode[1]) : '`' + innerHTML + '`';
    }
  }, {
    patterns: 'img',
    type: 'void',
    replacement: function(str, attrs) {
      var src = attrs.match(attrRegExp('src')),
          alt = attrs.match(attrRegExp('alt')),
          title = attrs.match(attrRegExp('title'));
      // An image without a src cannot be expressed in Markdown; keep the
      // raw tag (same policy as anchors without href) instead of throwing.
      if (!src) {
        return str;
      }
      return '![' + (alt && alt[1] ? alt[1] : '') + ']' + '(' + src[1] + (title && title[1] ? ' "' + title[1] + '"' : '') + ')';
    }
  }, {
    // style and script elements are dropped together with their content.
    patterns: ['style', 'script'],
    replacement: function() {
      return '';
    }
  }, {
    // div is unwrapped: only its content is kept.
    patterns: 'div',
    replacement: function(str, attrs, innerHTML) {
      return innerHTML;
    }
  }];

  // Builds a case-insensitive regex whose first capture group is the value
  // of the given attribute (quotes optional).
  function attrRegExp(attr) {
    return new RegExp(attr + '\\s*=\\s*["\']?([^"\']*)["\']?', 'i');
  }

  // Replaces every occurrence of one element in `html`.
  // elProperties: { tag, replacement, type } as described for ELEMENTS.
  function replaceEls(html, elProperties) {
    var tag = elProperties.tag,
        replacement = elProperties.replacement,
        pattern = elProperties.type === 'void' ?
          '<' + tag + '\\b([^>]*)\\/?>' :
          '<' + tag + '\\b([^>]*)>([\\s\\S]*?)<\\/' + tag + '>',
        regex = new RegExp(pattern, 'gi');
    if (typeof replacement === 'string') {
      return html.replace(regex, replacement);
    }
    return html.replace(regex, function(str, p1, p2, p3) {
      return replacement(str, p1, p2, p3);
    });
  }

  // Formats the text of a <pre><code> element as an indented code block.
  function indentCodeBlock(code) {
    code = code.replace(/^\t+/g, ' '); // leading tabs at the very start become a space
    code = code.replace(/\n/g, '\n' + CODE_INDENT);
    return '\n\n' + CODE_INDENT + code + '\n';
  }

  // Converts the <ul>/<ol> elements in `html` (which must not contain child
  // lists that are still HTML) into Markdown list items.
  function replaceLists(html) {
    html = html.replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/gi, function(str, listType, innerHTML) {
      // Tag matching is case-insensitive everywhere else, so it is here too.
      var isOrdered = listType.toLowerCase() === 'ol',
          lis = innerHTML.split(/<\/li>/i);
      lis.splice(lis.length - 1, 1); // whatever follows the last </li> is not an item
      for (var k = 0, count = lis.length; k < count; k++) {
        if (lis[k]) {
          var prefix = isOrdered ? (k + 1) + ". " : "* ";
          lis[k] = lis[k].replace(/\s*<li\b[^>]*>([\s\S]*)/i, function(str, innerHTML) {
            innerHTML = innerHTML.replace(/^\s+/, '');
            innerHTML = innerHTML.replace(/\n\n/g, '\n\n ');
            // indent nested lists; a single quantifier avoids the exponential
            // backtracking of `([ ]*)+` on deeply indented lines
            innerHTML = innerHTML.replace(/\n([ ]*)(\*|\d+\.) /g, '\n$1 $2 ');
            return prefix + innerHTML;
          });
        }
      }
      return lis.join('\n');
    });
    return '\n\n' + html.replace(/[ \t]+\n|\s+$/g, '');
  }

  // Converts <blockquote> elements in `html` into "> "-prefixed lines.
  function replaceBlockquotes(html) {
    return html.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, function(str, inner) {
      inner = inner.replace(/^\s+|\s+$/g, '');
      inner = cleanUp(inner);
      inner = inner.replace(/^/gm, '> ');
      // collapse nested markers separated by two or more spaces/tabs
      inner = inner.replace(/^(>([ \t]{2,}>)+)/gm, '> >');
      return inner;
    });
  }

  // Normalises line breaks in the generated Markdown.
  function cleanUp(string) {
    string = string.replace(/^[\t\r\n]+|[\t\r\n]+$/g, ''); // trim leading/trailing tabs and line breaks
    string = string.replace(/\n\s+\n/g, '\n\n');           // whitespace-only lines become blank lines
    string = string.replace(/\n{3,}/g, '\n\n');            // limit consecutive linebreaks to 2
    return string;
  }

  // 1. Element-by-element replacement, in ELEMENTS order.
  for (var i = 0; i < ELEMENTS.length; i++) {
    var tags = [].concat(ELEMENTS[i].patterns);
    for (var j = 0; j < tags.length; j++) {
      string = replaceEls(string, {
        tag: tags[j],
        replacement: ELEMENTS[i].replacement,
        type: ELEMENTS[i].type
      });
    }
  }

  // 2. Escape numbers that could trigger an ol.
  // Text indented by more than three spaces is a code block, so only up to
  // three leading whitespace characters are considered. The `m` flag makes
  // `^` apply to every line rather than only the start of the document.
  string = string.replace(/^(\s{0,3}\d+)\. /gm, '$1\\. ');

  // 3. Lists: convert lists that contain no child lists first, then work
  // outwards until no list markup is left.
  var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi;
  while (string.match(noChildrenRegex)) {
    string = string.replace(noChildrenRegex, function(str) {
      return replaceLists(str);
    });
  }

  // 4. Blockquotes: same innermost-first strategy.
  var deepest = /<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)<\/blockquote>/gi;
  while (string.match(deepest)) {
    string = string.replace(deepest, function(str) {
      return replaceBlockquotes(str);
    });
  }

  return cleanUp(string);
};
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment