Skip to content

Instantly share code, notes, and snippets.

@asad-haider
Created May 16, 2023 11:15
Show Gist options
  • Save asad-haider/0b813d819743af292e0d9bf249f2757b to your computer and use it in GitHub Desktop.
Save asad-haider/0b813d819743af292e0d9bf249f2757b to your computer and use it in GitHub Desktop.
Spidey tutorial with rotating proxy integration
const { MongoClient } = require('mongodb');
const { DiscardItemError, Spidey } = require('spidey');
/**
 * Pipeline stage that upserts every item into a MongoDB collection, using the
 * item's `asin` as the lookup key.
 *
 * The connection settings (`connectionString`, `database`, `collection`) are
 * read from the options object handed to the constructor.
 */
class MongoPipeline {
  options;
  client;
  collection;

  /**
   * @param {object} [options] - Settings object; `connectionString`,
   *   `database` and `collection` are the keys read by `start()`.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Opens the MongoDB connection and resolves the target collection.
   * @returns {Promise<void>}
   */
  async start() {
    this.client = new MongoClient(this.options?.connectionString);
    await this.client.connect();
    const db = this.client.db(this.options?.database);
    this.collection = db.collection(this.options?.collection);
  }

  /**
   * Closes the MongoDB connection, if one was ever created.
   * @returns {Promise<void>}
   */
  async complete() {
    // `start()` may never have run (or may have thrown before assigning the
    // client); closing must not fail with a TypeError in that case.
    await this.client?.close();
  }

  /**
   * Inserts the item, or updates the stored document with the same `asin`.
   * @param {object} data - Item to persist; `data.asin` is the upsert key.
   * @returns {Promise<object>} The same `data` object, unchanged.
   */
  async process(data) {
    // The stored document is not needed afterwards, so a plain upsert via
    // `updateOne` is enough; there is no reason to fetch it back.
    await this.collection.updateOne(
      { asin: data.asin },
      { $set: data },
      { upsert: true },
    );
    return data;
  }
}
/**
 * Pipeline stage that extracts the 10-character ASIN from the `/dp/<ASIN>`
 * segment of an item's URL and stores it on the item as `asin`.
 */
class AsinPipeline {
  /**
   * @param {object} item - Scraped item; `item.url` is searched for the ASIN.
   * @returns {object} The same item, with `asin` set.
   * @throws {DiscardItemError} When `item.url` is missing or contains no
   *   `/dp/<ASIN>` segment.
   */
  process(item) {
    const url = item?.url;
    const match =
      typeof url === 'string' ? url.match(/\/dp\/([A-Z0-9]{10})/) : null;

    // `String.prototype.match` returns null when nothing matches; indexing
    // that result directly would raise an opaque TypeError, so drop the item
    // with a clear reason instead.
    if (!match) {
      throw new DiscardItemError(`No ASIN found in URL: ${url}`);
    }

    item.asin = match[1];
    return item;
  }
}
/**
 * Pipeline stage that rejects items which have no title.
 */
class ValidationPipeline {
  /**
   * @param {object} item - Scraped item to validate.
   * @returns {object} The same item when it has a non-empty `title`.
   * @throws {DiscardItemError} When `item.title` is missing or empty.
   */
  process(item) {
    if (!item.title) {
      // The error has to be thrown: returning it would hand the error object
      // to the next stage as if it were the item.
      throw new DiscardItemError('No title found');
    }
    return item;
  }
}
/**
 * Spider that fetches Amazon.de beauty best-seller listings through the
 * ScraperAPI proxy endpoint, follows the product links found on them and
 * saves each product's URL and title.
 */
class AmazonSpidey extends Spidey {
  // Origin that product hrefs found on listing pages are resolved against.
  static #AMAZON_ORIGIN = 'https://www.amazon.de';

  // ScraperAPI endpoint that fetches the target URL on the spider's behalf.
  static #PROXY_ENDPOINT = 'http://api.scraperapi.com';

  constructor() {
    super({
      concurrency: 5,
      connectionString: 'mongodb://localhost:27017',
      database: 'amazon',
      collection: 'products',
      pipelines: [AsinPipeline, ValidationPipeline, MongoPipeline],
      // NOTE(review): a credential is committed in source. The environment
      // variable takes precedence; the literal is kept only as a
      // backward-compatible fallback and should be rotated and removed.
      // `||` (not `??`) so an empty environment variable also falls back.
      SCRAPER_API_KEY:
        process.env.SCRAPER_API_KEY || 'e4bf883896cacad2d4bf4b01dadfb0b8',
    });
  }

  categoryUrls = [
    'https://www.amazon.de/-/en/gp/bestsellers/beauty/64272031/ref=zg_bs_nav_beauty_1',
    'https://www.amazon.de/-/en/gp/bestsellers/beauty/122877031/ref=zg_bs_nav_beauty_1',
    'https://www.amazon.de/-/en/gp/bestsellers/beauty/122876031/ref=zg_bs_nav_beauty_1',
  ];

  options = this.getOptions();

  /**
   * Builds the proxy request URL for a target page.
   *
   * The target is passed as a query parameter, so it must be percent-encoded;
   * otherwise any `?` or `&` in the target would be read as extra proxy
   * parameters and the target URL would be truncated.
   *
   * @param {string} targetUrl - Absolute URL of the page to fetch.
   * @returns {string} Proxy URL carrying the API key and the encoded target.
   */
  #buildProxyUrl(targetUrl) {
    const proxyUrl = new URL(AmazonSpidey.#PROXY_ENDPOINT);
    proxyUrl.searchParams.set('api_key', this.options.SCRAPER_API_KEY);
    proxyUrl.searchParams.set('url', targetUrl);
    return proxyUrl.href;
  }

  /**
   * Queues one proxied request per category listing, handled by `parse`.
   */
  start() {
    for (const categoryUrl of this.categoryUrls) {
      this.request(
        { url: this.#buildProxyUrl(categoryUrl) },
        this.parse.bind(this),
      );
    }
  }

  /**
   * Collects the product links on a listing page and queues one proxied
   * request per distinct product, handled by `parseProduct`.
   *
   * @param {object} response - Listing response; `response.$` is used to
   *   query the page for product anchors.
   */
  parse(response) {
    // A Set so a product linked several times is requested only once.
    const productUrls = new Set();
    response
      .$('#gridItemRoot .p13n-sc-uncoverable-faceout > a')
      .each((index, element) => {
        const href = response.$(element).attr('href');
        // Anchors without an href would otherwise yield a bogus
        // ".../undefined" URL.
        if (href) {
          productUrls.add(new URL(href, AmazonSpidey.#AMAZON_ORIGIN).href);
        }
      });

    for (const url of productUrls) {
      // The direct product URL travels in `meta` because the request URL
      // itself points at the proxy.
      this.request(
        { url: this.#buildProxyUrl(url), meta: { url } },
        this.parseProduct.bind(this),
      );
    }
  }

  /**
   * Extracts the product title and saves it together with the product URL.
   *
   * @param {object} response - Product response; `response.meta.url` is the
   *   product URL that `parse` attached to the request.
   */
  parseProduct(response) {
    const url = response.meta.url;
    const title = response.$('#productTitle').text().trim();
    // parse other information from product page
    this.save({ url, title });
  }
}
// Script entry point: create the spider and queue its initial requests.
const amazonSpidey = new AmazonSpidey();
amazonSpidey.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment