Skip to content

Instantly share code, notes, and snippets.

@asad-haider
Created May 14, 2023 00:03
Show Gist options
  • Save asad-haider/ef41df17ef18a4528fdf8f000fb8a7a5 to your computer and use it in GitHub Desktop.
Save asad-haider/ef41df17ef18a4528fdf8f000fb8a7a5 to your computer and use it in GitHub Desktop.
Spidey pipeline tutorial with MongoDB integration
const { Spidey, DiscardItemError } = require("spidey");
const { MongoClient } = require("mongodb");
/**
 * Pipeline stage that stores each item in a MongoDB collection, inserting a
 * new document or updating the existing one whose `asin` equals the item's.
 */
class MongoPipeline {
  options;
  client;
  collection;

  /**
   * @param {object} [options] - Settings read by `start()`:
   *   `connectionString`, `database` and `collection`.
   *   NOTE(review): presumably the same options object given to the Spidey
   *   constructor — confirm against the Spidey pipeline contract.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Creates the client, connects, and resolves the target collection.
   * @returns {Promise<void>}
   */
  async start() {
    this.client = new MongoClient(this.options?.connectionString);
    await this.client.connect();
    const db = this.client.db(this.options?.database);
    this.collection = db.collection(this.options?.collection);
  }

  /**
   * Closes the MongoDB client if one was created.
   * @returns {Promise<void>}
   */
  async complete() {
    // `client` is still undefined when `start()` never ran or when the
    // MongoClient constructor threw; closing must not raise a second error.
    await this.client?.close();
  }

  /**
   * Upserts the item keyed by its `asin` field.
   * @param {object} data - Item to persist; `data.asin` is used as the filter.
   * @returns {Promise<object>} The same `data` object, unchanged.
   */
  async process(data) {
    await this.collection.findOneAndUpdate(
      { asin: data.asin },
      { $set: data },
      { upsert: true }
    );
    return data;
  }
}
/**
 * Pipeline stage that derives the product ASIN from the item's URL.
 */
class AsinPipeline {
  constructor() {}

  /**
   * Extracts the 10-character identifier that follows `/dp/` in `item.url`
   * and stores it on `item.asin`.
   *
   * @param {{ url: string }} item - Item carrying the product page URL.
   * @returns {object} The same item, with `asin` set.
   * @throws {DiscardItemError} When `item.url` is not a string or contains no
   *   `/dp/<ASIN>` segment, so the item cannot be keyed.
   */
  process(item) {
    const match =
      typeof item.url === "string"
        ? item.url.match(/\/dp\/([A-Z0-9]{10})/)
        : null;
    if (!match) {
      // Without this guard `match[1]` would fail with an opaque TypeError.
      throw new DiscardItemError(`No ASIN found in url: ${item.url}`);
    }
    item.asin = match[1];
    return item;
  }
}
/**
 * Pipeline stage that rejects items lacking a title.
 */
class ValidationPipeline {
  constructor() {}

  /**
   * Passes the item through when it has a non-empty `title`.
   *
   * @param {{ title?: string }} item - Item to validate.
   * @returns {object} The same item, unchanged.
   * @throws {DiscardItemError} When `item.title` is missing or empty.
   */
  process(item) {
    if (!item.title) {
      // Throw rather than return: a returned error object would be handed
      // back to the caller in place of the item.
      throw new DiscardItemError("No title found");
    }
    return item;
  }
}
/**
 * Spider that requests a fixed list of Amazon.de bestseller category pages,
 * follows the product links found on them, and saves each product's URL and
 * title.
 */
class AmazonSpidey extends Spidey {
  constructor() {
    super({
      concurrency: 20,
      // NOTE(review): these three keys match what MongoPipeline reads from
      // its options — presumably Spidey forwards this object to pipelines.
      connectionString: "mongodb://localhost:27017",
      database: "amazon",
      collection: "products",
      pipelines: [AsinPipeline, ValidationPipeline, MongoPipeline],
    });
  }

  categoryUrls = [
    "https://www.amazon.de/-/en/gp/bestsellers/beauty/64272031/ref=zg_bs_nav_beauty_1",
    "https://www.amazon.de/-/en/gp/bestsellers/beauty/122877031/ref=zg_bs_nav_beauty_1",
    "https://www.amazon.de/-/en/gp/bestsellers/beauty/122876031/ref=zg_bs_nav_beauty_1",
  ];

  /**
   * Issues one request per category URL, handled by `parse`.
   */
  start() {
    for (const categoryUrl of this.categoryUrls) {
      this.request({ url: categoryUrl }, this.parse.bind(this));
    }
  }

  /**
   * Collects the product links on a category page and issues one request per
   * distinct product URL, handled by `parseProduct`.
   *
   * @param {object} response - Response exposing a `$` selector function.
   */
  parse(response) {
    const baseUrl = "https://www.amazon.de";
    const productUrls = new Set();
    response
      .$("#gridItemRoot .p13n-sc-uncoverable-faceout > a")
      .each((index, element) => {
        const href = response.$(element).attr("href");
        // Anchors without an href would otherwise become ".../undefined".
        if (href) {
          // URL resolution handles both relative and already-absolute hrefs.
          productUrls.add(new URL(href, baseUrl).href);
        }
      });
    for (const url of productUrls) {
      this.request({ url }, this.parseProduct.bind(this));
    }
  }

  /**
   * Reads the product title from a product page and saves it with the URL.
   *
   * @param {object} response - Response exposing `url` and a `$` selector.
   */
  parseProduct(response) {
    const url = response.url;
    const title = response.$("#productTitle").text().trim();
    // parse other information from product page
    this.save({ url, title });
  }
}
// Script entry point: build the spider and kick off the crawl.
const amazonSpidey = new AmazonSpidey();
amazonSpidey.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment