Skip to content

Instantly share code, notes, and snippets.

@yekm
Last active February 9, 2018 08:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yekm/9bad9d100c1ded45133ff51538d6a641 to your computer and use it in GitHub Desktop.
Save yekm/9bad9d100c1ded45133ff51538d6a641 to your computer and use it in GitHub Desktop.
various snippets
# Extract lines with C-style comments (reads stdin, prints matching lines).
# Prints every line containing `//`, and every line from one containing `/*`
# through the next line containing `*/`, inclusive.
#   /\/\*/ bp   line contains `/*` -> jump to label p (block-comment mode)
#   /\/\// p    line contains `//` -> print it
#   b           end this cycle; -n suppresses the automatic print
#   :p; p       block-comment mode: print the current line
#   /\*\// b    line contains `*/` -> block finished, end this cycle
#   n; bp       otherwise load the next input line and loop back to p
# NOTE(review): `;` directly after a label/branch name looks GNU-sed specific —
# confirm before running this with other sed implementations.
sed -ne '/\/\*/ bp; /\/\// p; b; :p; p; /\*\// b; n; bp;'
# -*- coding: utf-8 -*-
import scrapy
import urllib.request
import os
def remove_prefix(text: str, prefix: str) -> str:
    """Return *text* without a leading *prefix*.

    If *text* does not start with *prefix*, it is returned unchanged. An
    empty *prefix* always matches and leaves *text* as it is.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
class RarchiveSpider(scrapy.Spider):
    """Crawl the Radiolab archive page and download each story's MP3.

    ``parse`` follows every ``/story/`` link found in the archive listing.
    ``parse_story`` reads the audio player attributes of a story page, saves
    the MP3 under ``episodes/`` (unless the file already exists) and yields
    one item describing the episode.
    """

    name = 'rarchive'
    allowed_domains = ['www.radiolab.org']
    start_urls = ['http://www.radiolab.org/archive/']

    def parse(self, response):
        """Yield a request to ``parse_story`` for every linked story."""
        links = response.xpath(
            '//div[@id="radiolab-archive"]//a[contains(@href, "/story/")]/@href'
        ).extract()
        for story in links:
            # hrefs may be site-relative; scrapy.Request rejects URLs that
            # have no scheme, so resolve them against the page URL first.
            yield scrapy.Request(response.urljoin(story), callback=self.parse_story)

    def parse_story(self, response):
        """Download the story's MP3 and yield its title, URL, season, episode.

        Pages without a downloadable audio player are skipped with a warning
        instead of raising ``IndexError``.
        """
        player = response.css('div.inline_audioplayer_wrapper').xpath('div[@data-width="620"]')
        title = player.xpath('@data-title').extract_first()
        mp3url = player.xpath('@data-download').extract_first()
        if title is None or mp3url is None:
            self.logger.warning('no downloadable audio player on %s', response.url)
            return

        se = response.xpath('//div[@class="seanum-epnum"]/text()').extract()
        #podtrac = 'https://www.podtrac.com/pts/redirect.mp3/'
        if not se:
            # Stories without season/episode markup are filed as s0e0.
            se = ["season 0 | ep 0"]
        # Expected shape: "<word> <season> | <word> <episode>"
        parts = se[0].split(' ')
        s = parts[1]
        e = parts[4]

        # A "/" in the title would be read as a directory separator.
        safe_title = title.replace('/', '_')
        # urlretrieve does not create missing directories.
        os.makedirs('episodes', exist_ok=True)
        filename = 'episodes/s{}e{} {}.mp3'.format(s, e, safe_title)
        if not os.path.isfile(filename):
            # NOTE: this is a blocking download inside a Scrapy callback; it
            # stalls the crawl until the file has been fetched.
            urllib.request.urlretrieve(mp3url, filename)

        yield {
            'title': title,
            'mp3': mp3url,
            'season': s,
            'episode': e,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment