Skip to content

Instantly share code, notes, and snippets.

@hubgit
Created December 11, 2023 12:18
Show Gist options
  • Save hubgit/fea70556ac746c88677d9e34fa7ff186 to your computer and use it in GitHub Desktop.
Save hubgit/fea70556ac746c88677d9e34fa7ff186 to your computer and use it in GitHub Desktop.
Fetch tracks played on the Independent Music Podcast
import { DOMParser, type Element, } from "https://deno.land/x/deno_dom@v0.1.43/deno-dom-wasm.ts";
// Shared deno_dom parser instance, reused for every document parsed in this script.
const parser = new DOMParser()

/**
 * Downloads `url` and parses the response body as an HTML document.
 *
 * @param url Address of the page to download.
 * @returns The parsed document.
 * @throws Error when the response status is not OK (the message carries the
 *   status and the URL), or when the parser does not produce a document.
 */
const fetchDOM = async (url: string) => {
  const response = await fetch(url)
  if (!response.ok) {
    // The body is never read on this path, so cancel it to release the underlying resource.
    await response.body?.cancel()
    throw new Error(`Response was not ok: ${response.status} ${response.statusText} (${url})`)
  }
  const html = await response.text()
  const doc = parser.parseFromString(html, 'text/html')
  if (!doc) {
    throw new Error('No document parsed')
  }
  return doc
}
/**
 * Walks a paginated listing starting at `url` and yields one `{ url, title }`
 * pair for every `h4[itemprop='headline'] > a` link that has both an href and
 * text content. After each page the `a.next` link, if present, is followed.
 *
 * Links are resolved against the URL of the page they were found on, so
 * relative hrefs are yielded (and followed) as absolute URLs.
 *
 * @param url First page of the listing; a nullish value yields nothing.
 */
async function* fetchCollection(url: string | null | undefined) {
  // Pages already fetched: a "next" link pointing back at one of them ends the walk
  // instead of looping forever.
  const visited = new Set<string>()
  while (url && !visited.has(url)) {
    const pageUrl = url
    visited.add(pageUrl)
    const doc = await fetchDOM(pageUrl)
    for (const link of doc.querySelectorAll("h4[itemprop='headline'] > a") as Iterable<Element>) {
      const href = link.getAttribute('href')
      const title = link.textContent
      if (href && title) {
        yield { url: new URL(href, pageUrl).href, title }
      }
    }
    const next = doc.querySelector("a.next")?.getAttribute('href')
    url = next ? new URL(next, pageUrl).href : null
  }
}
/**
 * Fetches the page at `url` and yields one `{ artist, track, label }` record
 * for every line of its `.entry-content` that looks like "artist – track",
 * optionally followed by " (label)".
 *
 * Yields nothing when the page has no `.entry-content` markup or when the
 * markup cannot be re-parsed. `label` is undefined for lines matched without
 * a parenthesised suffix.
 *
 * @param url Address of the page to scrape.
 */
async function* fetchItem(url: string) {
  // The stricter pattern (with a trailing "(label)") is tried first; the looser one is the fallback.
  const withLabel = /^(?<artist>.+?) +– +(?<track>.+)( +\((?<label>.+?)\))/
  const withoutLabel = /^(?<artist>.+?) +– +(?<track>.+)/

  const page = await fetchDOM(url)
  const container = page.querySelector('.entry-content')
  // On the first line that mentions it, drop everything up to and including "Tracklisting",
  // then turn <br> tags into real line breaks so each entry ends up on its own line.
  const markup = container?.innerHTML
    .replace(/^.+?Tracklisting/im, '')
    .replaceAll('<br>', '\n')
  if (!markup) {
    return
  }

  // Re-parse the fragment to get its plain text, one candidate entry per line.
  const rows = parser.parseFromString(markup, 'text/html')?.textContent.split('\n')
  if (!rows) {
    return
  }

  for (const row of rows) {
    const text = row.trim()
    const groups = (text.match(withLabel) ?? text.match(withoutLabel))?.groups
    if (!groups) {
      continue
    }
    const { artist, track, label } = groups
    yield { artist, track, label }
  }
}
// const jsonLinesWriter = async (path: string) => {
// const file = await Deno.open(path, {
// create: true,
// write: true,
// truncate: true,
// })
// const stream = new TransformStream()
// stream.readable
// .pipeThrough(
// new TransformStream({
// transform(chunk, controller) {
// controller.enqueue(JSON.stringify(chunk))
// controller.enqueue('\n')
// },
// })
// )
// .pipeThrough(new TextEncoderStream())
// .pipeTo(file.writable)
// return stream.writable.getWriter()
// }
// const episodeWriter = await jsonLinesWriter('episodes.ndjson')
// for await (const {url, title} of fetchCollection('https://independentmusicpodcast.net/episodes-independent-music-podcast/')) {
// console.log(url, title)
//
// const item = { url, title, tracks: []}
// for await (const track of fetchItem(url)) {
// item.tracks.push(track)
// }
// await episodeWriter.write(item)
// }
// Characters that would break the tab-separated, one-record-per-line layout, and their escapes.
const TSV_ESCAPES: Record<string, string> = {
  '\\': '\\\\',
  '\t': '\\t',
  '\n': '\\n',
  '\r': '\\r',
}

// Renders a single field for TSV output; a missing value becomes an empty field.
const escapeTsvField = (value: string | null | undefined): string =>
  (value ?? '').replace(/[\\\t\n\r]/g, (char) => TSV_ESCAPES[char] ?? char)

/**
 * Opens `path` for writing (created if missing, truncated otherwise) and
 * returns a stream writer that accepts one row per `write()` call.
 *
 * Each row is an array of fields. Fields are joined with tabs and terminated
 * with a newline. Backslash, tab, line feed and carriage return inside a field
 * are written as `\\`, `\t`, `\n` and `\r` so that a field can never spill
 * into another column or row.
 *
 * @param path Destination file.
 * @returns Writer for the row stream; closing it ends the pipeline into the file.
 */
const tsvLinesWriter = async (path: string) => {
  const file = await Deno.open(path, {
    create: true,
    write: true,
    truncate: true,
  })
  const stream = new TransformStream()
  stream.readable
    .pipeThrough(
      new TransformStream<(string | null | undefined)[], string>({
        transform: (items, controller) => {
          controller.enqueue(items.map(escapeTsvField).join('\t') + '\n')
        },
      })
    )
    .pipeThrough(new TextEncoderStream())
    .pipeTo(file.writable)
  return stream.writable.getWriter()
}
// Crawl every episode in the listing and write one TSV row per track found.
const trackWriter = await tsvLinesWriter('tracks.tsv')
// Leading list ordinal such as "12." together with the whitespace that follows it.
const ordinalPrefix = /^\d+\.\s*/
try {
  await trackWriter.write(['artist', 'track', 'label', 'url'])
  for await (const { url, title } of fetchCollection('https://independentmusicpodcast.net/episodes-independent-music-podcast/')) {
    console.log(url, '\n\t', title)
    for await (const item of fetchItem(url)) {
      const { label } = item
      let { artist, track } = item
      if (ordinalPrefix.test(artist)) {
        // switch artist and title if item starts with a list ordinal,
        // dropping the ordinal and the whitespace after it from the title
        ;[artist, track] = [track, artist.replace(ordinalPrefix, '')]
      }
      await trackWriter.write([artist, track, label, url])
    }
  }
} finally {
  // Close the writer so the pipeline into the file is ended even if the crawl fails part-way.
  await trackWriter.close()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment