Skip to content

Instantly share code, notes, and snippets.

@thiagosanches
Last active February 5, 2023 12:35
Show Gist options
  • Save thiagosanches/dfaa0018d05f3f934f2c1539aa48aa2e to your computer and use it in GitHub Desktop.
Save thiagosanches/dfaa0018d05f3f934f2c1539aa48aa2e to your computer and use it in GitHub Desktop.
A small Node.js script that loads a chat exported from Telegram as JSON, extracts the links it contains, and writes them to a Markdown file so I can use them in Logseq.
const fs = require('fs')
const axios = require('axios')
const myData = require('./result.json');
// Captures the text of the first <title> element in a page.
// - The capture is lazy so it stops at the first closing tag instead of running
//   on to the last </title> on the same line (minified pages frequently carry
//   extra <title> elements inside inline SVG).
// - The opening tag may carry attributes, e.g. <title data-rh="true">.
// NOTE: the regex is global and therefore stateful (lastIndex); copy it with
// `new RegExp(regexTitleTag)` or reset lastIndex before reusing it.
const regexTitleTag = /<title(?:\s[^>]*)?>(.*?)<\/title>/g;

// Captures the value of the Open Graph title meta tag, without the surrounding
// quotes. Same statefulness caveat as above.
const regexRedditPost = /<meta property="og:title" content="([^"]*)"/gm;
// Upper bound for a single page download, in milliseconds. Without it a host
// that accepts the connection but never answers would block the caller forever.
const FETCH_TIMEOUT_MS = 15000;

/**
 * Download the body of a web page.
 *
 * Issues a GET request through axios with a desktop Firefox User-Agent header
 * and a request timeout. Failures are reported on the console and turned into
 * a `null` result rather than being thrown, so one bad link does not abort
 * the caller.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<*|null>} The `data` field of the axios response when it is
 *   truthy; `null` when the response has no usable data or the request fails.
 */
async function getContentFromURL(url) {
  try {
    const result = await axios.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'
      },
      timeout: FETCH_TIMEOUT_MS,
    });
    return result?.data || null;
  } catch (e) {
    // Best effort: report which URL failed and why, then carry on.
    console.log("I was not able to fetch: ", url, `(${e?.message ?? e})`);
    return null;
  }
}
/**
 * Script entry point.
 *
 * Walks every text entity of every message in the loaded export, picks out
 * the links (`link` entities carry the URL in `text`, `text_link` entities in
 * `href`), downloads each distinct https:// page once, extracts a title from
 * the HTML and appends a two-line entry for it to `markdown.md`.
 *
 * Any error escaping the loop is printed and ends the process with exit
 * code 1. The handler is attached to the promise returned by the async
 * function: a surrounding try/catch would not see errors raised inside it,
 * because they surface as promise rejections.
 */
(async () => {
  // for now, block some URLs, due to limiting calls from their servers :/.
  const blockedPrefixes = [
    "https://news.ycombinator.com",
    "https://api.whatsapp.com",
  ];

  let markdown = "";
  // A Set gives constant-time "already seen?" checks as the export grows.
  const processedUrls = new Set();

  for (const message of myData.messages ?? []) {
    for (const entity of message.text_entities ?? []) {
      let rawUrl = null;
      if (entity.type === 'link') rawUrl = entity.text;
      if (entity.type === 'text_link') rawUrl = entity.href;
      if (typeof rawUrl !== 'string') continue;

      // Normalize once so the duplicate check, the request and the generated
      // markdown all use the same value.
      const url = rawUrl.trim();

      // I don't need to process an URL twice...
      if (processedUrls.has(url)) {
        console.log("Already processed url: ", url);
        continue;
      }

      if (blockedPrefixes.some((prefix) => url.startsWith(prefix))) continue;
      if (!url.startsWith('https://')) continue;

      processedUrls.add(url);
      console.log("Fetching: ", url);
      const html = await getContentFromURL(url);
      if (!html) continue;

      // switch the regexes: Reddit pages are matched with the og:title
      // pattern, everything else with the <title> pattern.
      const currentRegex = url.startsWith("https://www.reddit.com")
        ? regexRedditPost
        : regexTitleTag;

      // Work on a copy: the shared patterns are global, so exec() on them
      // directly would carry lastIndex over from one page to the next.
      const matchRegex = new RegExp(currentRegex).exec(html);
      if (!matchRegex) continue;

      // Strip any leftover title tags and quotes from the capture.
      const text = matchRegex[1]
        .replaceAll("<title>", "")
        .replaceAll("</title>", "")
        .replaceAll('"', "")
        .trim();

      markdown += `- ${text} [[Bulk]]\n`;
      markdown += ` - ${url} #learning\n`;
      // Rewritten after every entry, so the file on disk always holds
      // everything collected so far even if a later step fails.
      fs.writeFileSync('markdown.md', markdown);
    }
  }
})().catch((e) => {
  console.error(e?.message ?? e);
  process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment