Skip to content

Instantly share code, notes, and snippets.

@SynCap
Last active June 30, 2021 05:14
Show Gist options
  • Save SynCap/eea9aecab6ac467f86798129d09f4fa0 to your computer and use it in GitHub Desktop.
Save SynCap/eea9aecab6ac467f86798129d09f4fa0 to your computer and use it in GitHub Desktop.
Parse content of popular sites and convert main article to MarkDown
/**!
* @author (c)2015-2020, CLosk, www.closk.design
*/
// Start from an empty console so that only the output of this run is shown.
console.clear();
/**
 * mdGen - converts the main article of the current page to Markdown and puts
 * the result on the clipboard.
 *
 * Written to be run as a browser DevTools snippet: it relies on the page's
 * `jQuery` and on the console utility `copy()`.
 *
 * Per-site options (see `genOpts`):
 *   selMain        - selector of the article container(s); required
 *   selTitle       - selector of the element whose text becomes the top heading
 *   toRemove       - selectors of descendants deleted from the cloned container
 *   preCorrect     - function(jQuerySet): DOM fix-ups on the clone before conversion
 *   finalClean     - function(markdown): string, last text fix-ups
 *   titleToHeader  - take the top heading from the document <title>
 *   noHeader       - skip the generated "<file name>.md" first line
 *
 * Kept as `var`: a `var` binding may be redeclared, so running the script
 * again in the same page does not fail on the declaration.
 */
var mdGen = {
  // Build the file-name line from the first heading: transliterated when
  // true, taken as is when false.
  transHdr: true,

  // Options of the site the script runs on (undefined for an unlisted host).
  // The table below is indexed by `window.location.host` right away.
  genOpts: {
    // #0 -- professionali.ru
    'professionali.ru': {
      selMain: '.b-box-part', // container
      toRemove: ['.h-overflow', '#inpage_VI-183754-0'],
    },
    // #1 -- adme.ru
    'adme.ru': {
      selMain: 'article.article', // container
      toRemove: ['#js-article-share-top'],
    },
    // #2 -- proglib.io
    'proglib.io': {
      selMain: '.td-post-title,.td-post-content',
      // NOTE(review): 'td-a-rec' has no leading dot, so it is a tag selector;
      // it looks like a class name ('.td-a-rec') - confirm against the page.
      toRemove: ['time', 'ins', '.crayon-table', '.td-module-meta-info', 'form', 'noscript', 'td-a-rec'],
      noHeader: true,
    },
    // #3 -- lifehacker.ru
    'lifehacker.ru': {
      selMain: '.post-content',
      // NOTE(review): 'social-and-date' is a tag selector as written;
      // presumably '.social-and-date' was meant - confirm against the page.
      toRemove: ['.meta-info', 'social-and-date'],
    },
    // #4 -- upworktestru.com
    'upworktestru.com': {
      selMain: 'header, div.entry-content',
      toRemove: ['div'],
      preCorrect(doc) {
        // Green-highlighted spans become checked task-list items.
        doc.find('span[style="background-color: #00ff00;"]').each(function() {
          this.innerText = this.innerText.replace('•', '- [x] $&');
        });
        // Bullets inside <strong> become unchecked task-list items.
        doc.find('strong').each(function() {
          this.innerText = this.innerText.replace('•', '- [] $&');
        });
        // Each <h3> is replaced by a <pre> holding the same markup.
        doc.find('h3').each(function() {
          const $t = jQuery(this);
          $t.replaceWith('<pre><br/>' + $t.html() + '<br/></pre>');
        });
      },
      finalClean(text) {
        return text
          .replace(/<!--.*?-->/g, '')
          .replace(/\*{2}/g, '')
          .replace(/ •/g, '')
          .replace(/### Answers:/g, '**$&**')
          .replace(/(\*{2})#{3} /g, '$1')
          .replace(/»/g, '"')
          .replace(/«/g, '"')
          .replace(/‘/g, "'")
          .replace(/’/g, "'")
          .replace(/&lt;/g, '<')
          .replace(/&gt;/g, '>')
          // `m` flag: `^` has to match at every line start, not only at the
          // beginning of the whole text.
          .replace(/^ +-/gm, '-')
          .replace(/^\n{3,}/, '\n\n')
          .replace(/^ *(## \d+\.) */gm, '$1\n\n')
          .replace(/<\/?span.*?> */g, '');
      },
    },
    // #5 digitalocean -- tutorial-content
    // https://www.digitalocean.com/community/tutorials/
    'www.digitalocean.com': {
      selMain: '.content-title, .tutorial-content',
      toRemove: [],
      preCorrect(doc) {
        const h1 = doc.filter('.content-title').get(0);
        if (h1) h1.innerHTML = `# ${h1.innerText}`;
        doc.find('.secondary-code-label ').each(function() {
          this.innerHTML = `### ${this.innerText}\n\n`;
        });
        doc.find('.code-label').each(function() {
          this.outerHTML = `\n\n<p><tt>${this.innerText}</tt></p>\n\n`;
        });
        doc.find('pre').each(function() {
          this.outerText = `\n\`\`\`\n${this.innerText.trim()}\n\`\`\`\n`;
        });
      },
      finalClean(text) {
        return text.replace(/&lt;/g, '<').replace(/&gt;/g, '>');
      },
    },
    // #6 vscale docs
    // https://community.vscale.io/hc/ru/community/topics/200563745-Руководства
    'community.vscale.io': {
      selMain: 'h1, .post-body',
      toRemove: [],
      preCorrect(doc) {
        doc.find('pre').each(function() {
          this.outerText = `\n\`\`\`\n${this.innerText.trim()}\n\`\`\`\n`;
        });
      },
      finalClean(text) {
        // Drop the "$ " prompt at the start of every line (`m` flag).
        return text.replace(/^\$ /gm, '');
      },
    },
    // #7 www.opennet.ru
    'www.opennet.ru': {
      selMain: '.NAVHEADER th, .SECT1',
      toRemove: ['a[name]'],
      preCorrect(doc) {},
    },
    // #8 medium.com
    'medium.com': {
      selMain: '.section-inner',
      // NOTE(review): 'span' deletes every <span> together with its text -
      // confirm that no article text lives inside spans.
      toRemove: ['span', '.followState', '.js-followState', 'canvas'],
      finalClean(text) {
        return text
          .replace(/<\/?(figure|noscript|time).*?>[ \t]*/gi, '')
          .replace(/``` `(.*?)` ```/gm, '\n```bash\n$1\n```\n')
          .replace(/&gt;/g, '>')
          .replace(/&lt;/g, '<')
          // Trailing blanks of every line (`m` flag), not only of the last one.
          .replace(/[ \t]+$/gm, '');
      },
    },
    // #9 dockercheatsheet.painlessdocker.com
    'dockercheatsheet.painlessdocker.com': {
      selMain: 'main',
      toRemove: [],
      titleToHeader: true,
      preCorrect(doc) {},
      finalClean(text) {
        return text
          .replace(/``` `(.*?)` ```/gm, '\n```bash\n$1\n```\n')
          .replace(/<\/?section.*?>[ \t]*/g, '')
          .replace(/<!.*?>/g, '')
          .replace(/###### /g, '');
      },
    },
    // #10 dev.to
    'dev.to': {
      selMain: '#article-body',
      toRemove: ['meta'],
      finalClean(text) {
        return text.replace(/<\/?(sub|span)[^>]*>/gi, '');
      },
    },
    // #11 habr.com
    'habr.com': {
      selMain: '.post__text',
      selTitle: '.post__title',
      toRemove: [],
      titleToHeader: true,
      noHeader: false,
      preCorrect() {},
      finalClean(text) {
        return text.replace(/<\/?(sub|span)[^>]*>/gi, '');
      },
    },
    // Guarded so that the object can also be loaded where `window` does not exist.
  }[typeof window !== 'undefined' ? window.location.host : ''],

  /**
   * Converts the article to Markdown, copies it to the clipboard with the
   * console utility `copy()` and logs the first 300 characters.
   *
   * @param {object} [settings] - options that override `genOpts` for this run.
   * @returns {string} a timestamped status line.
   * @throws {Error} when neither `genOpts` nor `settings` provides `selMain`.
   */
  start(settings = {}) {
    const o = { ...this.genOpts, ...settings };
    if (!o.selMain) {
      throw new Error('mdGen: no "selMain" for this site - add it to genOpts or pass it to start()');
    }
    // A fresh array: the configured list is shared with genOpts and must not
    // grow on every run.
    const toRemove = [...(o.toRemove || []), 'script', 'style', 'iframe'];
    const picLbl = 'pic';
    const lnkLbl = 'lnk';
    // Site-relative URLs are made absolute; protocol-relative ("//host/...")
    // ones are left alone.
    const toAbsolute = url => (url[0] === '/' && url[1] !== '/' ? location.origin + url : url);

    const $selMain = jQuery(o.selMain).clone();
    // `.remove(selector)` would only filter the top-level set itself; the
    // unwanted nodes are descendants of the cloned containers.
    $selMain.find(toRemove.join(',')).remove();
    if (typeof o.preCorrect === 'function') o.preCorrect($selMain);

    let html = '';
    $selMain.each(function() {
      html += jQuery(this).html();
    });

    const images = [];
    const links = [];
    const $textes = $selMain.find('textarea');
    let idxTextes = 0;

    let res = html
      .replace(/&nbsp;/g, ' ')
      // [\s\S] lets a textarea with line breaks match as well; fall back to
      // the markup content when the clone has no corresponding element.
      .replace(/<(textarea\b)[^>]*?>([\s\S]*?)<\/\1>/gi, function(m, tag, content) {
        const area = $textes[idxTextes++];
        return '\n```\n' + (area ? area.value : content) + '\n```\n';
      })
      .replace(/<[/]?(pre)\b[^>]*>/gi, '\n```\n')
      .replace(/<[/]?(code|tt)\b[^>]*>/gi, '`')
      .replace(/<[/]?(strong|b)\b[^>]*>/gi, '**')
      .replace(/<[/]?(em|i)\b[^>]*>/gi, '_')
      .replace(/\s+/gm, ' ')
      .replace(/\s*<br *\/?>\s*/gi, '\n')
      .replace(/<(h\d|p|blockquote|[ou]l|li)\b/gi, '\n\n$&')
      .replace(/<\/(h\d|p|blockquote|[ou]l|li)>/gi, '$&\n\n')
      .replace(/<h(\d)[^>]*>\s*/gi, function(w, level) {
        return '#'.repeat(Number.parseInt(level, 10)) + ' ';
      })
      // Links become reference-style: [label][lnkN].
      .replace(/<a([^>]*)>(.*?)<\/a>/gi, function(m0, attrs, label) {
        const m = attrs.match(/href=(['"])(.*?)\1/im);
        const href = (m && m[2]) || '';
        if (href === label) return `<${label}>`;
        if (!href || href[0] === '#') return label.trim();
        links.push(toAbsolute(href));
        return `[${label}][${lnkLbl}${links.length}]`;
      })
      // Images become reference-style: ![title or alt][picN].
      .replace(/<img[^>]*>/gi, function(w) {
        let m = w.match(/src=(['"])(.*?)\1/im);
        const src = (m && m[2]) || '';
        if (!src) return '';
        m = w.match(/alt=(['"])(.*?)\1/im);
        const alt = (m && m[2]) || '';
        m = w.match(/title=(['"])(.*?)\1/im);
        const title = (m && m[2]) || '';
        images.push(toAbsolute(src));
        return `![${title ? title : alt}][${picLbl}${images.length}]`;
      })
      .replace(/<(blockquote)([^>]*)>(.*?)<\/\1>/gim, function(match, tag, attrs, content) {
        return '> ' + content.trim();
      })
      .replace(/<(li)([^>]*)>(.*?)<\/\1>/gim, function(match, tag, attrs, content) {
        return '\n- ' + content.trim();
      })
      .replace(/<[/]?(nobr|div|p|h\d|[uo]l|yobject|section|main|nav)\b[^>]*>/gi, ' ')
      .replace(/<hr[^>]*>/gi, '\n---\n')
      .replace(/^[ \t]+/gm, '');

    // finalClean is optional: several sites do not define one.
    if (typeof o.finalClean === 'function') res = o.finalClean(res) || res;

    if (links.length) {
      res += '\n'.repeat(2);
      links.forEach((href, i) => {
        res += `[${lnkLbl}${i + 1}]: ${href}\n`;
      });
    }
    if (images.length) {
      res += '\n'.repeat(2);
      images.forEach((src, i) => {
        res += `[${picLbl}${i + 1}]: ${src}\n`;
      });
    }

    // Take the document heading from the title and demote all other headings.
    if (o.titleToHeader || o.selTitle) res = this.makeHeaderFromTitle(res, o.selTitle);
    // Put a file name derived from the first heading on the first line, so
    // that a text editor can offer it as the default name when saving.
    if (!o.noHeader) res = this.makeFileName(res);
    // Final tidy-up: squeeze runs of three or more (blank) line ends.
    res = res.replace(/( *\n){3,}/g, '\n\n');

    // `copy` is a console utility, not a page API, hence the check.
    if (typeof copy === 'function') copy(res);
    else console.warn('mdGen: copy() is not available - the result was not put on the clipboard');
    console.log(res.slice(0, 300));
    console.log('%c↔↔↔↔↔', 'color:blue');
    return `${this.fullTime()} _mdGen.JS has Finished`;
  },

  /**
   * Prepends "# <title>" and demotes every existing heading by one level.
   * Lines inside ``` fences are left untouched.
   *
   * @param {string} text - Markdown text.
   * @param {string} [selTitle] - selector of the title element; the document
   *   <title> is used when it is omitted or matches nothing.
   * @returns {string} text with the new top-level heading.
   */
  makeHeaderFromTitle(text, selTitle) {
    const node = (selTitle && document.querySelector(selTitle)) || document.getElementsByTagName('title')[0];
    const title = String((node && node.innerText) || document.title || '').trim();
    let inFence = false;
    const demoted = text
      .split('\n')
      .map(line => {
        const isHeading = !inFence && line[0] === '#';
        // An odd number of fence markers on a line switches the fence state.
        if ((line.split('```').length - 1) % 2 === 1) inFence = !inFence;
        return isHeading ? `#${line}` : line;
      })
      .join('\n');
    // Plain concatenation: the title must not be read as a replacement
    // pattern ("$&", "$'" and the like).
    return `# ${title}\n\n` + demoted;
  },

  /**
   * Inserts "<name>.md" plus a blank line in front of the first heading; the
   * name is that heading, transliterated when `transHdr` is set, with blanks
   * turned into "_", ":" and "." into "–", and quotes removed.
   *
   * @param {string} text - Markdown text.
   * @returns {string} the text, unchanged when no heading line is found at its start.
   */
  makeFileName(text) {
    return text.replace(
      /^\s*#+[ \t]+(.+)\n/,
      (w, fstHdr) =>
        (this.transHdr ? this.translit(fstHdr) : fstHdr)
          .replace(/[ _]+/g, '_')
          .replace(/[:.]/g, '–')
          .replace(/['"]/g, '')
          .replace(/(_+-|-_+)/g, '-')
          .replace(/[_+%,.–\-]+$/, '') +
        '.md\n\n' +
        w
    );
  },

  // Default Date#toLocaleString options used by fullTime().
  tmeOpts: {
    day: '2-digit',
    hour: '2-digit',
    minute: '2-digit',
    month: 'long',
    second: '2-digit',
    timeZoneName: 'short',
    weekday: 'short',
    year: 'numeric',
  },

  // Cyrillic -> Latin transliteration table used by translit().
  aTrans: {
    А: 'A', а: 'a', Б: 'B', б: 'b', В: 'V', в: 'v', Г: 'G', г: 'g',
    Д: 'D', д: 'd', Е: 'E', е: 'e', Ж: 'ZH', ж: 'zh', З: 'Z', з: 'z',
    И: 'I', и: 'i', Й: 'I', й: 'i', К: 'K', к: 'k', Л: 'L', л: 'l',
    М: 'M', м: 'm', Н: 'N', н: 'n', О: 'O', о: 'o', П: 'P', п: 'p',
    Р: 'R', р: 'r', С: 'S', с: 's', Т: 'T', т: 't', У: 'U', у: 'u',
    Ф: 'F', ф: 'f', Х: 'H', х: 'h', Ц: 'TS', ц: 'ts', Ч: 'CH', ч: 'ch',
    Ш: 'SH', ш: 'sh', Щ: 'SCH', щ: 'sch', Ъ: '”', ъ: '”', Ы: 'Y', ы: 'y',
    Ь: '’', ь: '’', Э: 'E', э: 'e', ю: 'yu', Ю: 'YU', Я: 'Ya', я: 'ya',
    Ё: 'YO', ё: 'yo',
  },

  /**
   * Replaces every character found in `aTrans`; all others are kept.
   *
   * @param {string} str
   * @returns {string}
   */
  translit(str) {
    return str
      .split('')
      .map(char => this.aTrans[char] || char)
      .join('');
  },

  /**
   * Current date and time formatted for the 'ru' locale.
   *
   * @param {object} [settings] - toLocaleString options overriding `tmeOpts`.
   * @returns {string}
   */
  fullTime(settings = {}) {
    return new Date().toLocaleString('ru', { ...this.tmeOpts, ...settings });
  },

  /**
   * Removes everything that looks like a tag ("<...>").
   *
   * @param {string} text
   * @returns {string}
   */
  stripMarkup(text) {
    return text.replace(/<[^>]*>/g, '');
  },

  /**
   * Removes the opening and closing tags of the given elements; their content
   * stays in place.
   *
   * @param {string|string[]} tags - tag names as an array, or as one string
   *   separated by comma, space, "|", ";" or ".".
   * @param {string} text
   * @returns {string} the stripped text; `text` itself when `tags` is neither
   *   a string nor an array, or names no tag.
   */
  stripTags(tags, text) {
    const list = typeof tags === 'string' ? tags.split(/[, |;\.]/) : tags;
    if (!Array.isArray(list)) return text;
    // Separators such as ", " leave empty names behind; an empty alternative
    // would make the pattern match every tag.
    const names = list.map(name => String(name).trim()).filter(Boolean);
    if (!names.length) return text;
    // The lookahead ends the name, so that "b" does not also match <br>.
    const re = new RegExp(`</?(?:${names.join('|')})(?=[\\s/>])[^>]*>`, 'gi');
    return text.replace(re, '');
  },
};
// Run the conversion right away, with no overrides of the per-site options.
mdGen.start();

mdGen - convert page article to MarkDown

I use this script within DevTools' Snippets of Chromium based browsers (Chrome, Yandex Browser, MS Edge, Opera)

Older versions of Firefox had a Scratchpad in which you could store and run this script. In modern Firefox, switch the console to multi-line mode, paste the script, then press Ctrl+Enter

This script is part of a toolbox for preparing data for, and working with, a personal Knowledge Base

How to use

  1. Open an article on any site that is listed in mdGen's options
  2. Open DevTools (F12 or Ctrl+Shift+I)
  3. Choose the Sources panel
  4. Choose Snippets panel
  5. Create a new snippet with your preferred name
  6. Paste code of mdGen.js and press Ctrl+S to save it
  7. Press Ctrl+Enter to launch

If everything went well, the content of the page is now in the clipboard in Markdown format. Just paste the text into your editor

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment