Skip to content

Instantly share code, notes, and snippets.

@Mic92
Created February 25, 2019 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Mic92/ad33560f10e2f8d10dd31d8ee8d9cc7e to your computer and use it in GitHub Desktop.
Save Mic92/ad33560f10e2f8d10dd31d8ee8d9cc7e to your computer and use it in GitHub Desktop.
Scraper for jedentageinset
#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p "python3.withPackages(ps: [ps.requests ps.youtube-dl])"
import dbm
import re
import subprocess
import xml.etree.ElementTree as ET
from io import StringIO
from typing import List, Optional
import requests
import youtube_dl
# Matches a line that embeds a SoundCloud player widget; group 1 is the
# widget URL up to (not including) the next double quote. The leading ``.*``
# is greedy, so with ``match`` the last such URL on the line is captured.
PLAYER_URL = re.compile(r'.*(https://w\.soundcloud\.com/player/[^"]+).*')
class Db:
    """Persistent record of the song URLs that were already downloaded.

    The record lives in a ``dbm`` database file that is created on first
    use. Instances can be used as context managers so the database is
    closed deterministically::

        with Db() as db:
            if not db.seen(url):
                db.mark_seen(url)
    """

    def __init__(self, path: str = "download.db") -> None:
        """Open the database at *path*, creating it when it does not exist."""
        # Flag "c": open read/write, create the file if it is missing.
        self.handle = dbm.open(path, "c")

    def __enter__(self) -> "Db":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying dbm database."""
        self.handle.close()

    def seen(self, url: str) -> bool:
        """Return True if *url* was previously passed to ``mark_seen``."""
        return url in self.handle

    def mark_seen(self, url: str) -> None:
        """Record *url* as downloaded and log it to stdout."""
        print(f"set {url}")
        self.handle[url] = "downloaded"
def scrape_player(player_url: str) -> Optional[str]:
    """Return the canonical track URL advertised by a player widget page.

    Fetches *player_url* and looks for the first
    ``<link rel="canonical" href="...">`` element.

    Args:
        player_url: URL of the embedded player page.

    Returns:
        The value of the canonical link's ``href`` attribute, or ``None``
        when the page has no such link or cannot be parsed as XML.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(player_url, timeout=30)
    try:
        # "start" events fire as soon as an opening tag (with its attributes)
        # has been read, so the link can be found without parsing the rest of
        # the document.
        for _event, el in ET.iterparse(StringIO(resp.text), events=("start",)):
            if el.tag == "link" and el.attrib.get("rel") == "canonical":
                return el.attrib.get("href")
    except ET.ParseError:
        # The page is HTML, not guaranteed well-formed XML; treat a parse
        # failure before the link is reached as "nothing found" instead of
        # aborting the whole run.
        return None
    return None
def download_songs(songs: List[str]) -> None:
    """Download every URL in *songs* that has not been downloaded before.

    Each successfully downloaded URL is recorded in the ``Db`` so later runs
    skip it. Prints a notice when nothing new was downloaded.

    Args:
        songs: track URLs to hand to youtube-dl.
    """
    db = Db()
    count = 0
    try:
        with youtube_dl.YoutubeDL({}) as ydl:
            for song in songs:
                if db.seen(song):
                    continue
                ydl.download([song])
                count += 1
                # Mark only after the download call returned, so a download
                # that raised is attempted again on the next run.
                db.mark_seen(song)
    finally:
        # Close the dbm database even when a download raises.
        db.handle.close()
    if count == 0:
        print("all songs already downloaded")
def main() -> None:
    """Scrape the feed for SoundCloud player embeds and download new songs.

    Every feed line matching ``PLAYER_URL`` is resolved to its canonical
    track URL via ``scrape_player``; the resulting URLs are passed to
    ``download_songs``.
    """
    url = "https://www.jedentageinset.de/feed"
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(url, timeout=30)
    songs = []
    for line in resp.text.split("\n"):
        match = PLAYER_URL.match(line)
        if not match:
            continue
        song_url = scrape_player(match.group(1))
        if song_url is not None:
            songs.append(song_url)
    if not songs:
        print("no songs found")
        # Nothing to download; returning here avoids the misleading
        # "all songs already downloaded" message from download_songs.
        return
    download_songs(songs)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment