Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python3
import scrapy
from scrapy.pipelines.files import FilesPipeline
from scrapy.exceptions import CloseSpider
import sys
import os
# Item pipeline registration: maps a pipeline name to its order value (1).
# NOTE(review): Scrapy normally expects the key to be a full import path and
# reads ITEM_PIPELINES from the project settings or a spider's
# ``custom_settings`` — confirm how this module-level dict is consumed.
ITEM_PIPELINES = {'SoundZipPipeline': 1}
class SoundZip(scrapy.Item):
    """Scraped item describing one downloadable sound archive."""

    # URL of the archive, as resolved against the product page by the spider.
    url = scrapy.Field()
    # Product title taken from the page heading.
    name = scrapy.Field()
    # Relative destination path chosen by the spider's classifier.
    path = scrapy.Field()
class SoundZipPipeline(FilesPipeline):
    """Files pipeline that stores each sound archive at its classified path.

    The stock ``FilesPipeline`` downloads the URLs listed in an item's
    ``file_urls`` field and names files after a hash of the URL.  ``SoundZip``
    items instead carry a single ``url`` and a precomputed relative ``path``,
    so both hooks are overridden here.

    NOTE(review): the original method body was missing from this copy of the
    file; this implementation is a reconstruction — confirm against the
    intended behaviour.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the archive referenced by the item.

        Args:
            item: The scraped item; its ``url`` field is the archive URL.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Yields:
            A single ``scrapy.Request`` when the item has a non-empty ``url``.
        """
        url = item.get('url')
        if url:
            yield scrapy.Request(url)

    def file_path(self, request=None, response=None, info=None, *, item=None):
        """Return the storage path (relative to ``FILES_STORE``) for a download.

        Args:
            request: The download request issued for the file.
            response: The download response, when available.
            info: Pipeline bookkeeping object supplied by Scrapy.
            item: The item being processed (passed by Scrapy 2.4+).

        Returns:
            The item's ``path`` when it is a non-empty string; otherwise the
            default hashed path computed by ``FilesPipeline``.
        """
        path = item.get('path') if item is not None else None
        # Only trust a real, non-empty string; anything else (missing field,
        # None, or a non-string value) falls back to the default naming.
        if isinstance(path, str) and path:
            return path
        return super().file_path(request, response=response, info=info)
class DrumitSpider(scrapy.Spider):
    """Crawl a sound shop and emit one ``SoundZip`` item per downloadable sound.

    Every response is handled by ``parse``, which treats the page as a
    product page (title + "Download" link), a listing page (product links),
    and a paginated index (page links) all at once.
    """

    name = "2box_drumit_sounds_spider"
    # NOTE(review): the start URL is empty in this copy of the file; Scrapy
    # cannot schedule a request for it — fill in the shop's listing page URL.
    start_urls = ['']

    # (keyword, target directory) pairs, checked in order: the first keyword
    # found as a whole word in the sound name wins.  A target of ``None``
    # means "recognised, but do not download".
    _CATEGORIES = (
        ('Clap', None),
        ('Darabouk', None),
        ('Pete Lockett', None),
        ('Synth', None),
        ('Tom', 'Tom'),
        ('HiHat', 'Cym/HiHat'),
        ('x-HiHat', 'Cym/HiHat'),
        ('Splash', 'Cym/Crash'),
        ('Crash', 'Cym/Crash'),
        ('China', 'Cym/China'),
        ('CH Bell', 'Cym/China'),
        ('Trash', 'Cym/China'),
        ('Ride', 'Cym/Ride'),
        ('Snare', 'Snare'),
        ('Snares', 'Snare'),
        ('Cross Stick', 'Snare'),
        ('Gretsch Maple', None),
        ('Heavy Rock', None),
        ('Shaker', None),
        ('Disco Kick', None),
        ('Kick', 'Kick'),
        ('Tambourin', 'Perc/Tambourine'),
        ('Tambourine', 'Perc/Tambourine'),
        ('Tabla', None),
        ('Calebass', None),
        ('Cabasa', None),
        ('Caxixi', None),
        ('Octaban', None),
        ('Block', None),
        ('Blip', None),
        ('Guiro', None),
    )
    # Target folders noted by the original author: Kick, Perc/Cowbell,
    # Perc/Tambourine.

    # Skipped categories whose skipping is announced on stdout.
    _ANNOUNCED_SKIPS = frozenset({'gretsch maple', 'heavy rock'})

    def classify(self, name):
        """Map a sound name to the relative path its archive is stored under.

        Args:
            name: Product title of the sound.

        Returns:
            ``<target dir>/<name>.zip`` for a downloadable category, or
            ``None`` when the matching category is marked as skipped.

        Raises:
            CloseSpider: If no category keyword occurs in ``name``; the
                spider is stopped so the category table can be extended.
        """
        # Padding with spaces lets one substring test cover a keyword at the
        # start, middle, or end of the name, or equal to the whole name.
        padded = f' {name.lower()} '
        for keyword, target in self._CATEGORIES:
            keyword = keyword.lower()
            if f' {keyword} ' not in padded:
                continue
            if target is None:
                if keyword in self._ANNOUNCED_SKIPS:
                    print('Skipping', name)
                # Never fall through to os.path.join with a None directory.
                return None
            return os.path.join(target, name + '.zip')
        print('uncategorized', name)
        raise CloseSpider(name)

    def parse(self, response):
        """Extract a sound from the page and follow product/pagination links.

        Args:
            response: The downloaded page.

        Yields:
            A ``SoundZip`` item when the page has a title and a "Download"
            link whose category is not skipped, followed by requests for
            every linked product page and pagination page.
        """
        # actual download
        name = (response.css('h1.product_title ::text').get() or '').strip()
        # Select the link target, not its visible text.
        url = response.xpath('.//a[text() = "Download"]/@href').get()
        if name and url:
            path = self.classify(name)
            if path:
                sound = SoundZip()
                sound['url'] = response.urljoin(url)
                sound['name'] = name
                sound['path'] = path
                yield sound

        # sounds on page
        product_links = response.css(
            'a.woocommerce-LoopProduct-link::attr(href)'
        ).getall()
        for href in product_links:
            yield response.follow(href, callback=self.parse)

        # page of sounds
        # NOTE(review): the pagination selector is empty in this copy of the
        # file; supply the selector for the shop's page links.
        for page in response.css(''):
            href = page.css('::attr(href)').get()
            if href:
                yield response.follow(href, callback=self.parse)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment