Skip to content

Instantly share code, notes, and snippets.

@emwadde
Last active July 14, 2022 08:37
Show Gist options
  • Save emwadde/1c26556f079f59511026d1b758dae250 to your computer and use it in GitHub Desktop.
Save emwadde/1c26556f079f59511026d1b758dae250 to your computer and use it in GitHub Desktop.
Using the browser console as a web scraping tool
/**
 * Downloads a page and extracts the text of every `.article-content` element.
 *
 * `var` is kept so the binding stays a redeclarable global, matching the
 * other top-level declarations of this snippet.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{url: string, text: string}>} The requested URL and the
 *   matched elements' text content joined with newlines.
 * @throws {Error} Rejects with an Error that also carries `url` (the requested
 *   address) and `error` (the failure message) as own properties, so callers
 *   can keep reading `reason.url` / `reason.error`.
 */
var fetchText = async (url) => {
  try {
    const response = await fetch(url);
    if (response.status !== 200) {
      // statusText can be an empty string (e.g. over HTTP/2), so fall back to
      // the numeric status to keep the failure message meaningful.
      throw new Error(response.statusText || `HTTP ${response.status}`);
    }

    const html = await response.text();
    const doc = new DOMParser().parseFromString(html, "text/html");
    const paras = doc.querySelectorAll(".article-content");
    console.log(`${url} => ${paras.length} paragraphs`);

    const text = [...paras].map((p) => p.textContent).join("\n");
    return { url, text };
  } catch (err) {
    // Reject with a real Error (stack + cause preserved) while exposing the
    // same `url` / `error` fields the previous plain-object rejection had.
    throw Object.assign(new Error(err.message, { cause: err }), {
      url,
      error: err.message,
    });
  }
};
/**
 * Triggers a browser download of `text` as a plain-text file.
 *
 * Builds a Blob, points a temporary `<a download>` element at its object URL
 * and clicks it programmatically. The anchor and the object URL are cleaned
 * up afterwards so repeated calls do not accumulate DOM nodes or blob URLs.
 *
 * @param {string} filename - Suggested name for the downloaded file.
 * @param {string} text - Content written to the file.
 * @returns {void}
 */
var downloadTextFile = (filename, text) => {
  const file = new Blob([text], {
    type: "text/plain",
  });
  const href = URL.createObjectURL(file);

  const element = document.createElement("a");
  element.href = href;
  element.download = filename;
  document.body.appendChild(element);
  element.click();
  element.remove();

  // Revoke on a later tick: releasing the URL synchronously can cancel the
  // download in browsers that start it asynchronously after click().
  setTimeout(() => URL.revokeObjectURL(href), 0);
};
// Pages to scrape. The "404-error" entry is presumably a deliberately bad
// address that exercises the failure path — confirm before relying on it.
var urls = [
  "https://archive.mv/en/articles/AdK5q",
  "https://archive.mv/en/articles/QMDK1",
  "https://archive.mv/en/articles/404-error",
  "https://archive.mv/en/articles/8OLk9",
];

// Start every request at once; allSettled waits for all of them and reports
// each outcome, so one failing page does not discard the others.
var fetchPromises = urls.map(fetchText);

Promise.allSettled(fetchPromises)
  .then((results) => {
    const fetchedText = results
      .filter((result) => result.status === "fulfilled")
      .map((result) => result.value.text)
      .join("\n");

    // Join explicitly: leaving this as an array would let the outer join()
    // stringify it with commas, putting every failure on a single line.
    const failed = results
      .filter((result) => result.status === "rejected")
      .map((result) => `FAILED: ${result.reason.url}`)
      .join("\n");

    const allText = [fetchedText, failed].join("\n------------------\n");
    downloadTextFile("all_text.txt", allText);
  })
  // allSettled itself never rejects, but the handler above can still throw
  // (e.g. while building the download); surface that instead of leaving an
  // unhandled rejection.
  .catch((err) => console.error(err));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment