Created
August 24, 2017 15:05
-
-
Save 0xa/ebc8eeccd123aaa452027d050de26f33 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Get Rick & Morty episodes ("rick" as in "https://ctoon.party/rick"):
    > scrapy runspider ctoonparty.py -o rick.json -a show=rick
(add -a season=2 to get only one season)

Extract the links only, then download them:
    > jq '.[].url' -r < rick.json > rick.txt
    > wget -c -i rick.txt
"""
import re

import scrapy
def get_best_source(sources):
    """Pick the highest-quality video source out of ``sources``.

    Args:
        sources: iterable of selector-like objects (the spider passes the
            result of ``response.css('video source')``). Each one must
            support ``.css('::attr(label)')`` and ``.css('::attr(src)')``,
            returning an object with ``extract_first()``.

    Returns:
        A ``(label, src)`` tuple for the first source labelled ``'1080p'``,
        or failing that the first one labelled ``'720p'``. ``None`` when no
        source carries either label.
    """
    ordered = ['1080p', '720p']

    # Read every element's attributes once instead of once per preferred
    # quality; keep only the first source seen for each label so that
    # duplicates resolve to the earliest element, as before.
    first_by_label = {}
    for s in sources:
        label = s.css('::attr(label)').extract_first()
        if label in ordered and label not in first_by_label:
            first_by_label[label] = s.css('::attr(src)').extract_first()

    for o in ordered:
        if o in first_by_label:
            return (o, first_by_label[o])
    return None
class CToonSpider(scrapy.Spider):
    """Crawl one show on ctoon.party and emit a video URL per episode.

    Spider arguments (``-a name=value`` on the scrapy command line):
        show: path component of the show's index page, e.g. ``rick`` for
            ``https://ctoon.party/rick``. Required; must not contain ``/``.
        season: optional season number; when given, only episode links
            under the ``#collapseNN`` element of the index are followed.

    Yields one ``{'url': ..., 'quality': ...}`` item per episode page.
    """

    name = 'ctoon'

    def __init__(self, *args, show=None, season=None, **kwargs):
        """Validate the spider arguments and derive the start URL.

        Raises:
            ValueError: if ``show`` is missing/empty or contains ``/``.
        """
        if not show or '/' in show:
            raise ValueError("use -a show=<name>")
        # Let scrapy.Spider run its own initialisation before we set ours.
        super().__init__(*args, **kwargs)
        self.show = show
        # Spider arguments arrive as strings; an absent or empty value means
        # "all seasons". "0" is kept as a real season number.
        if season is None or season == '':
            self.season = None
        else:
            self.season = int(season)
        self.base = 'https://ctoon.party/%s' % self.show
        # Escape the base URL so '.' and any regex metacharacters in the
        # show name are matched literally.
        self._episode_re = re.compile(re.escape(self.base) + r'/[0-9a-z]+/?')
        self.start_urls = [self.base]

    def parse(self, response):
        """Handle both the show index page and individual episode pages.

        A URL of the form ``<base>/<id>`` is treated as an episode page and
        yields a single item; any other URL is treated as the index and
        yields follow-up requests for its episode links.

        Raises:
            ValueError: if an episode page has no 1080p or 720p source.
        """
        if self._episode_re.match(response.url):
            # Episode page
            best = get_best_source(response.css('video source'))
            if not best:
                raise ValueError(
                    'no 1080p/720p video source found on %s' % response.url)
            quality, url = best
            yield {'url': url, 'quality': quality}
        else:
            # Index
            if self.season is not None:
                # NOTE(review): assumes the index wraps each season in an
                # element with id "collapseNN" (zero-padded) -- confirm
                # against the live markup.
                scope = '#collapse%02d' % self.season
            else:
                scope = ''
            for next_page in response.css(scope + ' .ep-entry a'):
                yield response.follow(next_page, self.parse)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment