Skip to content

Instantly share code, notes, and snippets.

@HelloWorld017
Created April 26, 2020 16:58
Show Gist options
  • Save HelloWorld017/e05d5dec7955e9346e992c5e93a4dfa9 to your computer and use it in GitHub Desktop.
Save HelloWorld017/e05d5dec7955e9346e992c5e93a4dfa9 to your computer and use it in GitHub Desktop.
Musinsa Crawler
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const promisePipe = require('promisepipe');
const signale = require('signale');
// Origin of the Musinsa store. It is the axios base URL and also the base
// against which scraped image paths are resolved.
const baseUrl = 'https://store.musinsa.com';

// Browser-like User-Agent header sent with every request made through `api`.
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36';

// Shared HTTP client. Responses come back as text unless a request overrides
// `responseType` (the image download requests 'stream').
const api = axios.create({
  baseURL: baseUrl,
  responseType: 'text',
  headers: { 'User-Agent': userAgent }
});
/**
 * Strips every non-digit character from an element's text and parses what is
 * left as a single base-10 integer (so "1,234 goods" becomes 1234).
 *
 * @param {{ text: function(): string }} elem - Object exposing `text()`; the
 *   callers in this file pass cheerio selections.
 * @returns {number} The parsed integer, or NaN when the text holds no digits.
 */
const getNumber = elem =>
  // Explicit radix: the digits are always meant as decimal.
  Number.parseInt(elem.text().replace(/[^0-9]/g, ''), 10);
/**
 * Downloads the list thumbnails of one Musinsa category into `dest`.
 *
 * Fetches the category's list page to read the total page count, then walks
 * the pages one at a time (pausing 2 s between pages) and saves every image
 * found on each page. Image URLs that could not be downloaded are collected
 * and, if there are any, written to `./errors.json` as a JSON array.
 *
 * @param {string} id - Category id interpolated into `/app/items/lists/${id}`.
 * @param {string} dest - Directory that receives the images; created
 *   (including missing parents) when it does not exist.
 * @param {?number} [maxPage=null] - Stop after this page number. A falsy
 *   value (null, 0) means "crawl every page".
 * @returns {Promise<void>} Resolves once all pages have been processed.
 *   Rejects if the directory cannot be created or a list page cannot be
 *   fetched; individual image failures are recorded instead of thrown.
 */
const crawl = async (id, dest, maxPage = null) => {
  // `recursive: true` creates missing parent directories and succeeds when
  // `dest` already exists, so only genuine failures (e.g. permissions)
  // propagate instead of being swallowed and resurfacing as write errors.
  await fs.promises.mkdir(dest, { recursive: true });

  const listUrl = `/app/items/lists/${id}`;

  /**
   * Downloads every image on one list page.
   *
   * @param {number} page - 1-based page number.
   * @returns {Promise<string[]>} Image URLs that failed to download.
   */
  const crawlPage = async page => {
    const { data: body } = await api(`${listUrl}?page=${page}`);
    const $ = cheerio.load(body);
    const listItems = $('.list-box .li_box').toArray();

    const images = [];
    for(const listItem of listItems) {
      const imageUrl = $(listItem)
        .find('.list_img img[data-original]')
        .attr('data-original');

      // An item without a matching image yields undefined, which `new URL`
      // would stringify into a bogus ".../undefined" address.
      if(!imageUrl) {
        continue;
      }

      images.push(new URL(imageUrl, baseUrl).toString());
    }

    const errors = [];
    for(const imageUrl of images) {
      try {
        const { data: imageStream } = await api({
          url: imageUrl,
          responseType: 'stream'
        });

        // Name the file after the URL path only, so a query string or
        // fragment never ends up in the file name.
        const fileName = path.basename(new URL(imageUrl).pathname);
        await promisePipe(
          imageStream,
          fs.createWriteStream(path.join(dest, fileName))
        );
      } catch(e) {
        signale.error(e);
        errors.push(imageUrl);
      }
    }

    return errors;
  };

  const { data: body } = await api(listUrl);
  const $ = cheerio.load(body);
  const goods = getNumber($('.box_num_goods'));
  const pages = getNumber($('.totalPagingNum'));
  signale.info(`Found ${goods} goods.`);

  if(Number.isNaN(pages)) {
    // Without a page count the loop below does not run; say so rather than
    // reporting a silent "success".
    signale.warn('Could not read the total page count; no pages will be crawled.');
  }

  // Math.min(null, pages) is 0, so only apply the cap when one was given.
  const lastPage = maxPage ? Math.min(maxPage, pages) : pages;

  const errors = [];
  for(let page = 1; page <= pages; page++) {
    const pageErrors = await crawlPage(page);
    if(pageErrors.length > 0) {
      signale.error(`${pageErrors.length} errors have been occurred while crawling`);
      errors.push(...pageErrors);
    }

    signale.success(`Crawled page ${page}/${lastPage}.`);

    if(maxPage && page >= maxPage) {
      signale.info("Touched the maximum page. Stopping.");
      break;
    }

    // Throttle between pages only; nothing follows the final page.
    if(page < pages) {
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  }

  if(errors.length > 0) {
    await fs.promises.writeFile('./errors.json', JSON.stringify(errors));
  }

  signale.success(`Done with ${errors.length} errors.`);
};
// crawl('002020', '../dataset/cardigan', 25);

// Script entry point. A rejected crawl (e.g. a list page that cannot be
// fetched) is logged and reflected in the exit code instead of surfacing as
// an unhandled promise rejection.
crawl('001001', '../dataset/tshirts', 25).catch(e => {
  signale.error(e);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment