Skip to content

Instantly share code, notes, and snippets.

Created December 11, 2023 12:18
Show Gist options
  • Save hubgit/fea70556ac746c88677d9e34fa7ff186 to your computer and use it in GitHub Desktop.
Save hubgit/fea70556ac746c88677d9e34fa7ff186 to your computer and use it in GitHub Desktop.
Fetch tracks played on the Independent Music Podcast
import { DOMParser, type Element, } from "";
// Single DOMParser instance shared by fetchDOM (whole pages) and fetchItem (the tracklisting fragment).
const parser = new DOMParser()
/**
 * Downloads a page and parses it into a DOM document.
 *
 * @param url - Address of the HTML page to fetch.
 * @returns The document produced by the shared `parser`.
 * @throws Error when the HTTP response is not OK, or when the parser returns no document.
 */
const fetchDOM = async (url: string) => {
  const response = await fetch(url)
  if (!response.ok) {
    // Carry the status and URL so a failing page can be identified during a long crawl.
    throw new Error(`Response was not ok (${response.status} ${response.statusText}) for ${url}`)
  }

  const html = await response.text()
  const doc = parser.parseFromString(html, 'text/html')
  if (!doc) {
    throw new Error(`No document parsed for ${url}`)
  }

  return doc
}
/**
 * Walks a paginated listing and yields one `{ url, title }` record per headline link.
 *
 * Each page is fetched with `fetchDOM`; after its links have been yielded, the next page
 * address is read from the document and the loop continues until no address is found.
 *
 * @param url - First listing page. Nothing is fetched when this is null, undefined or empty.
 */
async function* fetchCollection(url: string | null | undefined) {
  while (url) {
    const doc = await fetchDOM(url)

    for (const item of doc.querySelectorAll("h4[itemprop='headline'] > a") as Iterable<Element>) {
      // Named `href` rather than `url` so it does not shadow the page URL advanced below.
      const href = item.getAttribute('href')
      const title = item.textContent
      if (href && title) {
        yield { url: href, title }
      }
    }

    // NOTE(review): the selector is an empty string in this copy of the file; the
    // "next page" link selector has to be supplied here — confirm against the site markup.
    url = doc.querySelector("")?.getAttribute('href')
  }
}
/**
 * Fetches one episode page and yields the tracks listed in its `.entry-content` element.
 *
 * Every line of the form `artist – track (label)` or `artist – track` produces one
 * `{ artist, track, label }` record; `label` is undefined when the line has no trailing
 * parenthesised part. Lines that match neither form are skipped.
 *
 * @param url - Address of the episode page.
 */
async function* fetchItem(url: string) {
  const doc = await fetchDOM(url)

  // Drop the text from the start of a line through the first "Tracklisting"
  // (case-insensitive), then turn <br> tags into line breaks so each track is on its own line.
  const html = doc
    .querySelector('.entry-content')
    ?.innerHTML.replace(/^.+?Tracklisting/im, '')
    .replaceAll('<br>', '\n')
  if (!html) return

  // Re-parse the fragment to strip the remaining markup and keep only its text.
  const lines = parser.parseFromString(html, 'text/html')?.textContent.split('\n')
  if (!lines) return

  for (const line of lines) {
    const content = line.trim()

    // Try the form with a trailing "(label)" first; fall back to plain "artist – track".
    const matches =
      content.match(/^(?<artist>.+?) +– +(?<track>.+)( +\((?<label>.+?)\))/) ??
      content.match(/^(?<artist>.+?) +– +(?<track>.+)/)

    if (matches?.groups) {
      yield {
        artist: matches.groups.artist,
        track: matches.groups.track,
        label: matches.groups.label,
      }
    }
  }
}
// const jsonLinesWriter = async (path: string) => {
// const file = await, {
// create: true,
// write: true,
// truncate: true,
// })
// const stream = new TransformStream()
// stream.readable
// .pipeThrough(
// new TransformStream({
// transform(chunk, controller) {
// controller.enqueue(JSON.stringify(chunk))
// controller.enqueue('\n')
// },
// })
// )
// .pipeThrough(new TextEncoderStream())
// .pipeTo(file.writable)
// return stream.writable.getWriter()
// }
// const episodeWriter = await jsonLinesWriter('episodes.ndjson')
// for await (const {url, title} of fetchCollection('')) {
// console.log(url, title)
// const item = { url, title, tracks: []}
// for await (const track of fetchItem(url)) {
// item.tracks.push(track)
// }
// await episodeWriter.write(item)
// }
// Builds a writer for tab-separated output: each string[] row is joined with tabs by the
// transform below, passed through a TextEncoderStream, and the caller receives the writer
// of `stream.writable`.
//
// NOTE(review): this block is incomplete as it stands. The expression after `await` on the
// next line is missing, the braces are not closed, and nothing connects `stream` to the
// transforms or the transforms to `file`. The commented-out jsonLinesWriter in this file
// appears to show the intended shape (stream.readable -> transform -> TextEncoderStream ->
// file.writable) — confirm and restore before running.
const tsvLinesWriter = async (path: string) => {
const file = await, {
create: true,
write: true,
truncate: true,
// Untyped pass-through stream whose writable side is handed back to the caller.
const stream = new TransformStream()
new TransformStream<string[], string>({
transform: (items, controller) => {
// Fields are joined as-is, so a value containing a tab or a newline would corrupt the row.
controller.enqueue(items.join('\t')) // TODO: escape
.pipeThrough(new TextEncoderStream())
return stream.writable.getWriter()
// Entry point: crawl every episode and write one TSV row per track to tracks.tsv.
const trackWriter = await tsvLinesWriter('tracks.tsv')
await trackWriter.write(['artist', 'track', 'label', 'url'])

// NOTE(review): the start URL is an empty string in this copy of the file, and
// fetchCollection does nothing for an empty URL — supply the listing page address.
for await (const { url, title } of fetchCollection('')) {
  console.log(url, '\n\t', title)

  for await (const item of fetchItem(url)) {
    const { label } = item
    let { artist, track } = item

    if (/^\d+\./.test(artist)) {
      // switch artist and title if item starts with a list ordinal
      const swappedArtist = track
      // Trim so the space that followed the ordinal does not end up in the TSV field.
      track = artist.replace(/^\d+\./, '').trim()
      artist = swappedArtist
    }

    await trackWriter.write([artist, track, label, url])
  }
}

// Close the writer so the stream can finish and the output is flushed.
await trackWriter.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment