Created
August 20, 2021 13:18
-
-
Save retorquere/74ea41fc7898bd5cb102155b71098cc2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import scrapy | |
from scrapy.pipelines.files import FilesPipeline | |
from scrapy.exceptions import CloseSpider | |
import sys | |
import os | |
# Pipeline registration: maps the pipeline name to its order value.
# NOTE(review): Scrapy resolves ITEM_PIPELINES keys as import paths and reads
# them from the crawler settings; confirm how this module-level dict is meant
# to reach the crawler and whether the key needs to be module-qualified.
ITEM_PIPELINES = {'SoundZipPipeline': 1}
class SoundZip(scrapy.Item):
    """Item describing one downloadable sound archive found by the spider."""

    # Absolute URL of the archive (the spider fills it via response.urljoin).
    url = scrapy.Field()
    # Product title of the sound, stripped of surrounding whitespace.
    name = scrapy.Field()
    # Relative target path ("<folder>/<name>.zip") produced by the classifier.
    path = scrapy.Field()
class SoundZipPipeline(FilesPipeline):
    """Files pipeline that downloads a SoundZip and stores it at its own path.

    The stock FilesPipeline reads download URLs from a ``file_urls`` field,
    which SoundZip does not have, so the request is built from ``item['url']``
    and the storage location is taken from ``item['path']``.
    """

    def get_media_requests(self, item, info):
        """Yield the single download request for *item*."""
        yield scrapy.Request(item['url'])

    def file_path(self, request=None, response=None, info=None, *, item=None):
        """Return the storage path (relative to the files store) for a download.

        Scrapy calls this with the request and, as a keyword, the item being
        processed. All parameters are optional so a bare ``file_path()`` call
        still returns ``None`` as the original stub did.
        """
        if item is not None:
            path = item.get('path')
            if path:
                return path
        if request is None:
            # Nothing to derive a path from.
            return None
        # No classified path available: fall back to the default naming.
        return super().file_path(request, response=response, info=info, item=item)
class DrumitSpider(scrapy.Spider):
    """Crawl the 2box sounds catalogue and emit a SoundZip per downloadable sound.

    Every response is handled by ``parse``: product pages yield an item,
    listing pages yield follow-up requests for products and further pages.
    """

    name = "2box_drumit_sounds_spider"
    start_urls = ['https://2box-drums.com/sounds/']

    # (keyword, target folder) pairs, checked in order; the first keyword found
    # in the product name wins, so more specific keywords (e.g. 'Disco Kick')
    # must precede generic ones (e.g. 'Kick'). A target of None means the
    # sound is not downloaded.
    CATEGORIES = [
        ('Clap', None),
        ('Darabouk', None),
        ('Pete Lockett', None),
        ('Synth', None),
        ('Tom', 'Tom'),
        ('HiHat', 'Cym/HiHat'),
        ('x-HiHat', 'Cym/HiHat'),
        ('Splash', 'Cym/Crash'),
        ('Crash', 'Cym/Crash'),
        ('China', 'Cym/China'),
        ('CH Bell', 'Cym/China'),
        ('Trash', 'Cym/China'),
        ('Ride', 'Cym/Ride'),
        ('Snare', 'Snare'),
        ('Snares', 'Snare'),
        ('Cross Stick', 'Snare'),
        ('Gretsch Maple', None),
        ('Heavy Rock', None),
        ('Shaker', None),
        ('Disco Kick', None),
        ('Kick', 'Kick'),
        ('Tambourin', 'Perc/Tambourine'),
        ('Tambourine', 'Perc/Tambourine'),
        ('Tabla', None),
        ('Calebass', None),
        ('Cabasa', None),
        ('Caxixi', None),
        ('Octaban', None),
        ('Block', None),
        ('Blip', None),
        ('Guiro', None),
    ]

    def classify(self, name):
        """Map a product name to its relative download path.

        A keyword matches case-insensitively when it appears as a
        space-delimited word (or word group) at the start, in the middle, or
        at the end of *name*.

        Returns:
            ``<folder>/<name>.zip`` for a categorised sound, or ``None`` when
            the matching keyword has no target folder (sound is skipped).

        Raises:
            CloseSpider: if no keyword matches, so the crawl stops and the
                category table can be extended.
        """
        # Loop-invariant: lower-case the name once.
        lname = name.lower()
        for cat, tgt in self.CATEGORIES:
            cat = cat.lower()
            if f' {cat} ' in lname or lname.endswith(f' {cat}') or lname.startswith(f'{cat} '):
                if tgt is None:
                    # Only these two keywords announce the skip; the others
                    # are dropped silently.
                    if cat in ['gretsch maple', 'heavy rock']:
                        print('Skipping', name)
                    return None
                return os.path.join(tgt, name + '.zip')
        # NOTE(review): folder names left as a note in the original source;
        # their purpose is not evident from the code.
        # Kick
        # Perc/Cowbell
        # Perc/Tambourine
        print('uncategorized', name)
        raise CloseSpider(name)

    def parse(self, response):
        """Yield a SoundZip for a product page and follow catalogue links."""
        # actual download
        name = response.css('h1.product_title ::text').get()
        # Select the link target, not the link's visible "Download" text.
        url = response.xpath('.//a[text() = "Download"]/@href').get()
        if name and url:
            name = name.strip()
            path = self.classify(name)
            # classify() returns None for sounds that must not be downloaded.
            if path:
                sound = SoundZip()
                sound['url'] = response.urljoin(url)
                sound['name'] = name
                sound['path'] = path
                yield sound

        # sounds on page
        for href in response.css('a.woocommerce-LoopProduct-link::attr(href)').getall():
            yield response.follow(href, callback=self.parse)

        # page of sounds
        for href in response.css('a.page-numbers::attr(href)').getall():
            yield response.follow(href, callback=self.parse)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment