Skip to content

Instantly share code, notes, and snippets.

@asad-haider
Created May 12, 2023 10:26
Show Gist options
  • Save asad-haider/27f71ee66543026ac432f702a999a5a1 to your computer and use it in GitHub Desktop.
Save asad-haider/27f71ee66543026ac432f702a999a5a1 to your computer and use it in GitHub Desktop.
Spidey Pipeline Tutorial
const { Spidey, DiscardItemError } = require('spidey');
/**
 * Pipeline stage that derives the product ASIN from the item's URL.
 *
 * The ASIN is taken from the first `/dp/<10 upper-case letters or digits>`
 * segment found in `item.url` and stored on the item as `item.asin`.
 */
class AsinPipeline {
  constructor() {}

  /**
   * Adds an `asin` property to the item.
   *
   * @param {{ url?: string }} item - Item being processed; mutated in place.
   * @returns {object} The same item, with `asin` set to the extracted ASIN,
   *   or to `null` when the URL is missing or has no `/dp/<ASIN>` segment.
   */
  process(item) {
    // String.prototype.match returns null when nothing matches, so indexing
    // the result directly would throw a TypeError on non-product URLs.
    const match =
      typeof item.url === 'string'
        ? item.url.match(/\/dp\/([A-Z0-9]{10})/)
        : null;
    item.asin = match ? match[1] : null;
    return item;
  }
}
/**
 * Pipeline stage that rejects items which have no title.
 *
 * NOTE(review): the DiscardItemError is *returned*, not thrown. Confirm
 * against Spidey's pipeline contract that a returned error discards the item.
 */
class ValidationPipeline {
  /**
   * Checks the item for a truthy `title`.
   *
   * @param {{ title?: string }} item - Item being processed.
   * @returns {object|DiscardItemError} The unchanged item when it has a
   *   title; otherwise a new DiscardItemError with the message
   *   'No title found'.
   */
  process(item) {
    const hasTitle = Boolean(item.title);
    return hasTitle ? item : new DiscardItemError('No title found');
  }
}
/**
 * Spider that walks a fixed list of Amazon.de best-seller category pages,
 * follows every product link found on them, and saves each product's URL
 * and title.
 *
 * Items are configured to go through AsinPipeline and ValidationPipeline
 * and to be written as JSON to `output.json` (see the options passed to
 * the Spidey constructor below).
 */
class AmazonSpidey extends Spidey {
  constructor() {
    super({
      concurrency: 20,
      outputFormat: 'json',
      outputFileName: 'output.json',
      pipelines: [AsinPipeline, ValidationPipeline],
    });
  }

  /** Best-seller category listing pages used as crawl entry points. */
  categoryUrls = [
    'https://www.amazon.de/-/en/gp/bestsellers/beauty/64272031/ref=zg_bs_nav_beauty_1',
    'https://www.amazon.de/-/en/gp/bestsellers/beauty/122877031/ref=zg_bs_nav_beauty_1',
    'https://www.amazon.de/-/en/gp/bestsellers/beauty/122876031/ref=zg_bs_nav_beauty_1',
  ];

  /**
   * Issues one request per category URL; each response is handled by
   * {@link AmazonSpidey#parse}.
   */
  start() {
    for (const categoryUrl of this.categoryUrls) {
      this.request({ url: categoryUrl }, this.parse.bind(this));
    }
  }

  /**
   * Extracts the product links from a category listing page and requests
   * each distinct product page.
   *
   * @param {object} response - Response for a category page. `response.$`
   *   is used as a selector function over the page's DOM (presumably a
   *   cheerio-style API; verify against Spidey's documentation).
   */
  parse(response) {
    const baseUrl = 'https://www.amazon.de';
    const productUrls = new Set();

    response
      .$('#gridItemRoot .p13n-sc-uncoverable-faceout > a')
      .each((index, element) => {
        const href = response.$(element).attr('href');
        // Anchors without an href would otherwise yield the bogus URL
        // "https://www.amazon.deundefined".
        if (!href) {
          return;
        }
        try {
          // Resolving against the base handles both relative and
          // already-absolute hrefs; de-duplication happens on the result.
          productUrls.add(new URL(href, baseUrl).href);
        } catch (err) {
          // A single malformed link must not abort the whole listing page.
          console.warn(`Skipping malformed product link "${href}": ${err.message}`);
        }
      });

    for (const url of productUrls) {
      this.request({ url }, this.parseProduct.bind(this));
    }
  }

  /**
   * Reads the product title from a product page and saves it together
   * with the page URL.
   *
   * @param {object} response - Response for a product page.
   */
  parseProduct(response) {
    const url = response.url;
    const title = response.$('#productTitle').text().trim();
    // parse other information from product page
    this.save({ url, title });
  }
}
// Entry point: build the spider and kick off the crawl.
const amazonSpidey = new AmazonSpidey();
amazonSpidey.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment