Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
// deno run --allow-net parse-feed-urls.js
import { DOMParser } from "https://deno.land/x/deno_dom@v0.1.13-alpha/deno-dom-wasm.ts";
import * as path from "https://deno.land/std@0.106.0/path/mod.ts";
import { parse } from "https://cdn.skypack.dev/tldts";
// Fetch the feed list and turn every URL into a record we can tally.
const urls = await getFeedUrls();
const urlCollection = createUrlCollection(urls);
console.log("\nURLs parsed: %s\n", urlCollection.length);

// Most common resource names (file name without its extension).
const topNames = getTop5ByKey(urlCollection, "name").map(
  ([name, count], i) => `${i + 1}. ${name}: ${count}`
);
log("Resource name", topNames.join("\n"));

// How many feeds live directly under the site root versus in a sub-directory.
const rootCount = urlCollection.filter(({ dir }) => dir === "/").length;
const nestedCount = urlCollection.filter(({ dir }) => dir !== "/").length;
log(
  `Resource location`,
  `Root /*: ${rootCount}`,
  `Nested /**/*: ${nestedCount}`
);
console.log();

// Feeds with a file extension: top extensions, each followed by the most
// common file names that use that extension.
const urlCollectionWithExt = urlCollection.filter(({ ext }) => ext);
const extensionSections = getTop5ByKey(urlCollectionWithExt, "ext").map(
  ([ext, count], i) => {
    const heading = `${i + 1}. *${ext}: ${count}\n`;
    const sameExt = urlCollectionWithExt.filter((u) => u.ext === ext);
    const baseLines = getTop5ByKey(sameExt, "base").map(
      ([base, baseCount]) => ` ${base}: ${baseCount}`
    );
    return heading + baseLines.join("\n");
  }
);
log(
  `Resource with an extension: ${urlCollectionWithExt.length}`,
  extensionSections.join("\n")
);

// Feeds without a file extension (directory-style URLs).
const urlCollectionWithoutExt = urlCollection.filter(({ ext }) => !ext);
const extensionlessLines = getTop5ByKey(urlCollectionWithoutExt, "base").map(
  ([base, count], i) => `${i + 1}. /${base}/: ${count}`
);
log(
  `Resource without an extension: ${urlCollectionWithoutExt.length}`,
  extensionlessLines.join("\n")
);

// Most common registrable domains hosting the feeds.
const domainLines = getTop5ByKey(urlCollection, "domain").map(
  ([domain, count], i) => `${i + 1}. ${domain}: ${count}`
);
log("Domains", domainLines.join("\n"));
/**
* @typedef {Object} UrlCollection
* @property {string} url - The original URL pulled from the XML
* @property {string} domain - The domain derived from the URL
* @property {string} base
* @property {string} dir
* @property {string} ext
* @property {string} name
*
* base, dir, ext, and name are all extracted by running the Node-style
* `path.parse()` on the URL's pathname.
* Here’s how two example URLs would break down:
* 1. `/path/to/feed.xml`
* 2. `/feed/`
* base 1) `feed.xml` 2) `feed`
* ext 1) `.xml` 2) ``
* dir 1) `/path/to` 2) `/`
* name 1) `feed` 2) `feed`
*/
/**
 * Fetch the OPML feed list and collect every feed URL it declares.
 *
 * Any failure (network error, non-2xx response, unparseable document) is
 * logged with `console.error` and results in an empty array, so callers
 * always receive an array rather than `undefined`.
 *
 * @returns {Promise.<Array.<string>>} The `xmlurl` attribute value of every
 *   element that carries one; empty when the list could not be loaded.
 */
async function getFeedUrls() {
  const feedListUrl =
    "https://raw.githubusercontent.com/simevidas/web-dev-feeds/master/feeds.opml";
  try {
    const res = await fetch(feedListUrl);
    if (!res.ok) {
      // Without this check an error page would be parsed as if it were the feed list.
      throw new Error(
        `Failed to fetch feed list: ${res.status} ${res.statusText}`
      );
    }
    const text = await res.text();
    // Should be parsing this as 'application/xml' but it's not supported yet
    // Parsing as HTML works for our purposes now tho.
    const doc = new DOMParser().parseFromString(text, "text/html");
    const urls = [];
    doc.querySelectorAll("[xmlurl]").forEach((node) => {
      urls.push(node.getAttribute("xmlurl"));
    });
    return urls;
  } catch (e) {
    console.error(e);
    // Keep the documented "array of strings" contract even on failure.
    return [];
  }
}
/**
 * Take some URLs and turn them into a collection of data we can count on.
 *
 * Each URL is split into its domain (via tldts `parse()` on the origin) and
 * the `base`, `dir`, `ext` and `name` parts of its pathname (via
 * `path.parse()`). Strings that `new URL()` cannot parse are skipped with a
 * warning so one bad entry does not abort the whole report.
 *
 * @param {Array.<string>} urls
 * @returns {Array.<UrlCollection>} One record per parseable URL, in input order
 */
function createUrlCollection(urls) {
  const collection = [];
  for (const url of urls) {
    let parsedUrl;
    try {
      parsedUrl = new URL(url);
    } catch {
      console.warn(`Skipping invalid feed URL: ${url}`);
      continue;
    }
    const { base, dir, ext, name } = path.parse(parsedUrl.pathname);
    collection.push({
      url,
      domain: parse(parsedUrl.origin).domain,
      base,
      dir,
      ext,
      name,
    });
  }
  return collection;
}
/**
 * Print a titled section: the title, a divider rule, the remaining arguments
 * one per line, and finally an empty line to separate it from what follows.
 * @param {string} title
 * @param {...string} args - Lines that make up the section body
 */
function log(title, ...args) {
  const body = args.join("\n");
  [title, "----------------------------", body].forEach((line) => {
    console.log(line);
  });
  console.log();
}
/**
 * Count how often each value of `key` occurs across the collection and
 * return the five most frequent ones, highest count first. Values with equal
 * counts keep the order in which `Object.entries` lists them.
 * @param {Array.<UrlCollection>} collection
 * @param {string} key - Property of each record to tally
 * @returns {Array.<Array>} Up to five `[value, count]` pairs; `value` is the
 *   property value as a string (it is used as an object key)
 */
function getTop5ByKey(collection, key) {
  // A prototype-less object keeps inherited names such as "constructor" or
  // "toString" from being mistaken for an existing tally.
  const counts = Object.create(null);
  for (const item of collection) {
    const value = item[key];
    counts[value] = (counts[value] ?? 0) + 1;
  }
  return Object.entries(counts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 5);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment