Created
December 11, 2023 12:18
-
-
Save hubgit/fea70556ac746c88677d9e34fa7ff186 to your computer and use it in GitHub Desktop.
Fetch tracks played on the Independent Music Podcast
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { DOMParser, type Element, } from "https://deno.land/x/deno_dom@v0.1.43/deno-dom-wasm.ts"; | |
// Single DOMParser instance shared by the functions in this file.
const parser = new DOMParser()

/**
 * Downloads `url` and parses the response body as an HTML document.
 *
 * @param url Address of the page to download.
 * @returns The parsed document.
 * @throws Error when the response status is not ok, or when the parser
 *   returns no document. Both messages name the URL so that a failure in the
 *   middle of a long crawl can be traced to a page.
 */
const fetchDOM = async (url: string) => {
  const response = await fetch(url)
  if (!response.ok) {
    // The body will never be read on this path; cancel it so it is released.
    await response.body?.cancel()
    throw new Error(`Response was not ok: ${response.status} ${response.statusText} (${url})`)
  }

  const html = await response.text()
  const doc = parser.parseFromString(html, 'text/html')
  if (!doc) {
    throw new Error(`No document parsed (${url})`)
  }
  return doc
}
/**
 * Walks a paginated listing and yields one entry per headline link.
 *
 * Each page is fetched with `fetchDOM`; every `h4[itemprop='headline'] > a`
 * that has both an `href` and text content is yielded. The walk then follows
 * the page's `a.next` link and stops when there is none.
 *
 * @param url First page of the listing; a nullish or empty value yields nothing.
 * @returns An async generator of `{ url, title }` objects. `url` is resolved
 *   against the page it was found on, so relative links come out absolute.
 */
async function* fetchCollection(url: string | null | undefined) {
  // Pages already fetched: a "next" link that points back must not loop forever.
  const visited = new Set<string>()

  // Resolves a possibly-relative href against the page it appeared on.
  const resolveHref = (href: string, base: string): string => {
    try {
      return new URL(href, base).href
    } catch {
      return href // not parseable as a URL: pass it on unchanged
    }
  }

  let pageUrl = url
  while (pageUrl && !visited.has(pageUrl)) {
    const currentUrl: string = pageUrl
    visited.add(currentUrl)
    const doc = await fetchDOM(currentUrl)

    for (const link of doc.querySelectorAll("h4[itemprop='headline'] > a") as Iterable<Element>) {
      const href = link.getAttribute('href')
      const title = link.textContent
      if (href && title) {
        yield { url: resolveHref(href, currentUrl), title }
      }
    }

    const nextHref = doc.querySelector('a.next')?.getAttribute('href')
    pageUrl = nextHref ? resolveHref(nextHref, currentUrl) : null
  }
}
/**
 * Yields the tracks listed on a single page.
 *
 * Takes the inner HTML of the page's `.entry-content` element, removes the
 * text from the start of a line up to and including the first "Tracklisting"
 * (case-insensitive), turns `<br>` into newlines, and re-parses the result to
 * get plain text. Each trimmed line of the form `artist – track (label)` or
 * `artist – track` (en dash surrounded by spaces) becomes one yielded object.
 *
 * @param url Address of the page to read.
 * @returns An async generator of `{ artist, track, label }`; `label` is
 *   undefined when the line has no trailing parenthesised part.
 */
async function* fetchItem(url: string) {
  const page = await fetchDOM(url)

  const markup = page
    .querySelector('.entry-content')
    ?.innerHTML.replace(/^.+?Tracklisting/im, '')
    .replaceAll('<br>', '\n')
  if (!markup) return

  // Round-trip through the parser to strip the remaining tags.
  const rows = parser.parseFromString(markup, 'text/html')?.textContent.split('\n')
  if (!rows) return

  // Tried in this order: the labelled form first, the bare form as fallback.
  const withLabel = /^(?<artist>.+?) +– +(?<track>.+)( +\((?<label>.+?)\))/
  const withoutLabel = /^(?<artist>.+?) +– +(?<track>.+)/

  for (const row of rows) {
    const text = row.trim()
    const groups = (text.match(withLabel) ?? text.match(withoutLabel))?.groups
    if (!groups) continue

    const { artist, track, label } = groups
    yield { artist, track, label }
  }
}
// const jsonLinesWriter = async (path: string) => { | |
// const file = await Deno.open(path, { | |
// create: true, | |
// write: true, | |
// truncate: true, | |
// }) | |
// const stream = new TransformStream() | |
// stream.readable | |
// .pipeThrough( | |
// new TransformStream({ | |
// transform(chunk, controller) { | |
// controller.enqueue(JSON.stringify(chunk)) | |
// controller.enqueue('\n') | |
// }, | |
// }) | |
// ) | |
// .pipeThrough(new TextEncoderStream()) | |
// .pipeTo(file.writable) | |
// return stream.writable.getWriter() | |
// } | |
// const episodeWriter = await jsonLinesWriter('episodes.ndjson') | |
// for await (const {url, title} of fetchCollection('https://independentmusicpodcast.net/episodes-independent-music-podcast/')) { | |
// console.log(url, title) | |
// | |
// const item = { url, title, tracks: []} | |
// for await (const track of fetchItem(url)) { | |
// item.tracks.push(track) | |
// } | |
// await episodeWriter.write(item) | |
// } | |
/**
 * Opens `path` for writing (created if missing, truncated if present) and
 * returns a writer that accepts one array of fields per row.
 *
 * Each row is written as its fields joined by tabs and terminated by a
 * newline, encoded through `TextEncoderStream`. Fields are escaped so that a
 * value can never break the row/column structure: backslash, tab, newline and
 * carriage return are written as `\\`, `\t`, `\n` and `\r`. A null or
 * undefined field is written as an empty string.
 *
 * @param path Destination file.
 * @returns The writer side of the pipeline. NOTE(review): closing it should
 *   end the pipe and thereby the file stream; confirm against Deno's
 *   `FsFile.writable` documentation.
 */
const tsvLinesWriter = async (path: string) => {
  const file = await Deno.open(path, {
    create: true,
    write: true,
    truncate: true,
  })

  const escapes: Record<string, string> = {
    '\\': '\\\\',
    '\t': '\\t',
    '\n': '\\n',
    '\r': '\\r',
  }

  // Makes one field safe to place between tab separators.
  const escapeField = (value: string | null | undefined): string =>
    (value ?? '').replace(/[\\\t\n\r]/g, (char) => escapes[char] ?? char)

  const stream = new TransformStream()

  stream.readable
    .pipeThrough(
      new TransformStream<ReadonlyArray<string | null | undefined>, string>({
        transform: (items, controller) => {
          controller.enqueue(items.map(escapeField).join('\t'))
          controller.enqueue('\n')
        },
      })
    )
    .pipeThrough(new TextEncoderStream())
    .pipeTo(file.writable)

  return stream.writable.getWriter()
}
// Crawl every episode in the listing and write one TSV row per track to
// tracks.tsv, with a header row first.
const trackWriter = await tsvLinesWriter('tracks.tsv')

// A leading list ordinal such as "12." plus any whitespace that follows it.
const listOrdinal = /^\d+\.\s*/

try {
  await trackWriter.write(['artist', 'track', 'label', 'url'])

  for await (const { url, title } of fetchCollection('https://independentmusicpodcast.net/episodes-independent-music-podcast/')) {
    console.log(url, '\n\t', title)

    for await (const item of fetchItem(url)) {
      // Switch artist and title if the item starts with a list ordinal, and
      // drop the ordinal together with the space after it.
      const numbered = listOrdinal.test(item.artist)
      const artist = numbered ? item.track : item.artist
      const track = numbered ? item.artist.replace(listOrdinal, '') : item.track

      // Lines without a parenthesised label carry no label value.
      await trackWriter.write([artist, track, item.label ?? '', url])
    }
  }
} finally {
  // Close the writer so the stream pipeline is ended, even after a failure.
  await trackWriter.close()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment