Skip to content

Instantly share code, notes, and snippets.

@fredriccliver
Created January 18, 2022 00:31
Show Gist options
  • Save fredriccliver/4af6070bf66abdfa870b0adb7c408bd5 to your computer and use it in GitHub Desktop.
Save fredriccliver/4af6070bf66abdfa870b0adb7c408bd5 to your computer and use it in GitHub Desktop.
/**
*
* Usage
*
* node getRemote.js [target url]
*
*/
// import { readSync } from "to-vfile";
import { toString } from "nlcst-to-string";
import { retext } from "retext";
import retextPos from "retext-pos";
import retextKeywords from "retext-keywords";
import keyword_extractor from "keyword-extractor";
// import fetch from "node-fetch";
import { JSDOM } from "jsdom";
import readability from "node-readability";
import fs from "fs";
// URL of the page to analyse: the first command-line argument, or a sample
// article when no argument is given.
const targetUrl = process.argv[2] || "https://www.creatrip.com/en/blog/1491";
/*
Example output for the default URL:
---- Keywords ----
exchange (SCORE:1 WEIGHT:undefined)
rates (SCORE:0.75 WEIGHT:undefined)
currency (SCORE:0.29 WEIGHT:undefined)
Myeongdong (SCORE:0.29 WEIGHT:undefined)
bank (SCORE:0.25 WEIGHT:undefined)
money (SCORE:0.25 WEIGHT:undefined)
counters (SCORE:0.25 WEIGHT:undefined)
---- Key-phrases ----
exchange counters (SCORE:1 WEIGHT:29)
exchange rate (SCORE:0.62 WEIGHT:41)
commission rate (SCORE:0.33 WEIGHT:22)
rates (SCORE:0.26 WEIGHT:17)
bank (SCORE:0.17 WEIGHT:5)
*/
// SIMPLE FETCH VERSION CODE
// fetch(targetUrl)
// .then((response) => response.text())
// .then((text) => {
// const dom = new JSDOM(text);
// const textContent = Array.from(dom.window.document.querySelectorAll("p"))
// .map((e) => getInnerText(e))
// .join(" ");
// // print paragraphs
// // console.log(textContent);
// extractKeywords(textContent);
// });
console.log(targetUrl);

// Fetch the target page and let node-readability reduce the raw HTML to a
// simplified document containing only the main article. Keywords are then
// extracted from the text of the article's <p> elements.
readability(targetUrl, function (err, article, meta) {
  if (err != null || article == null) {
    // Report failures on stderr and signal them through the exit code.
    console.error(err != null ? err : "readability returned no article");
    process.exitCode = 1;
    return;
  }

  try {
    const html = article.content;

    // node-readability reports `content` as `false` when it could not find an
    // article. Without this guard the literal string "false" would be parsed
    // and a non-string value would be handed to fs.writeFile.
    if (typeof html !== "string") {
      console.error(`No readable article content found at ${targetUrl}`);
      process.exitCode = 1;
      return;
    }

    const dom = new JSDOM(html);
    const textContent = Array.from(dom.window.document.querySelectorAll("p"))
      .map((e) => e.textContent)
      .join(" ");
    // The plain text has been copied out, so the parsed window can be released.
    dom.window.close();

    saveLatestDocument(article, textContent);

    // Extract keywords from the converted plain text.
    extractKeywords(textContent, "retext");
  } finally {
    // Always close the article to clean up its jsdom instance and prevent
    // leaks, even when one of the steps above throws.
    article.close();
  }
});
/**
 * Extracts keywords and key-phrases from a long text and prints them to the
 * console.
 *
 * Two algorithms are available:
 * - "retext": retext with retext-pos and retext-keywords. Processing is
 *   asynchronous, so the results are printed after this function returns.
 * - "ke": keyword-extractor. Runs synchronously and prints the resulting array.
 *
 * Any other mode prints a warning and extracts nothing.
 *
 * @param {string} p - The long text to analyse.
 * @param {string} mode - Either "retext" or "ke".
 * @returns {void}
 */
function extractKeywords(p, mode) {
  // Scores are truncated (not rounded) to two decimal places for display.
  const truncateScore = (score) => Math.floor(score * 100) / 100;

  if (mode === "retext") {
    retext()
      .use(retextPos) // Make sure to use `retext-pos` before `retext-keywords`.
      .use(retextKeywords, { maximum: 5 })
      .process(p)
      .then((file) => {
        console.log("---- Keywords ----");
        file.data.keywords.forEach((keyword) => {
          // NOTE(review): the sample output in this file shows
          // WEIGHT:undefined for keywords, i.e. only key-phrases appear to
          // carry a weight. The format is kept as-is so the output is unchanged.
          console.log(
            `${toString(keyword.matches[0].node)} (SCORE:${truncateScore(
              keyword.score
            )} WEIGHT:${keyword.weight})`
          );
        });

        console.log("---- Key-phrases ----");
        file.data.keyphrases.forEach((phrase) => {
          const text = phrase.matches[0].nodes.map((d) => toString(d)).join("");
          console.log(
            `${text} (SCORE:${truncateScore(phrase.score)} WEIGHT:${
              phrase.weight
            })`
          );
        });
      })
      .catch((err) => {
        // Without a handler a processing failure would surface as an
        // unhandled promise rejection.
        console.error(err);
      });
    return;
  }

  if (mode === "ke") {
    const extraction_result = keyword_extractor.extract(p, {
      language: "english",
      remove_digits: true,
      return_changed_case: true,
      remove_duplicates: false,
      return_max_ngrams: 5,
    });
    console.log(extraction_result);
    return;
  }

  // An unsupported mode used to be a silent no-op; make the mistake visible.
  console.warn(
    `extractKeywords: unknown mode "${mode}" (expected "retext" or "ke")`
  );
}
/**
 * Logging aid: saves the most recently processed page under ./latestPage/ so
 * it can be inspected after a run.
 *
 * Writes `article.content` to ./latestPage/content.html and `textContent` to
 * ./latestPage/textContent.html. The directory is created when missing.
 * Failures are logged with console.error and never propagate to the caller.
 *
 * @param {{content: string}} article - Object whose `content` is the article HTML.
 * @param {string} textContent - Plain text extracted from the article.
 * @returns {Promise<void>} Settles once both writes have finished or failed;
 *   never rejects, so callers may safely ignore it.
 */
function saveLatestDocument(article, textContent) {
  // Read the HTML synchronously: the caller closes the article right after
  // this function returns, before the asynchronous writes get to run.
  const html = article.content;

  // Each write logs its own failure so that one failing file does not hide
  // the outcome of the other.
  const writeLogged = (file, data) =>
    fs.promises.writeFile(file, data).catch((err) => {
      console.error(err);
    });

  return fs.promises
    .mkdir("./latestPage", { recursive: true }) // writes fail with ENOENT otherwise
    .then(() =>
      Promise.all([
        writeLogged("./latestPage/content.html", html),
        writeLogged("./latestPage/textContent.html", textContent),
      ])
    )
    .then(
      () => undefined,
      (err) => {
        // Creating the directory failed; nothing was written.
        console.error(err);
      }
    );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment