Skip to content

Instantly share code, notes, and snippets.

@guillim
Last active March 28, 2023 06:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guillim/38f85ee1412b594a231e99ef8a8eb405 to your computer and use it in GitHub Desktop.
Save guillim/38f85ee1412b594a231e99ef8a8eb405 to your computer and use it in GitHub Desktop.
#cdiscount #dgm
//------------START------------------------------------------------------------------------------------------------
/**
 * Entry point of the page function: logs a start message, resolves the
 * effective `site` / `typeDeCrawl` and dispatches on the request label.
 *
 * Truthy values found in `context.customData` take precedence over the
 * `site` and `typeDeCrawl` arguments.
 *
 * @param {object} context - Crawler context; this function reads `log`, `customData` and `request.userData.label`.
 * @param {Function} $ - jQuery-like selector function, forwarded to the handlers.
 * @param {string} site - Fallback marketplace name.
 * @param {string} typeDeCrawl - Fallback crawl type.
 * @returns {Promise<*>} Whatever the matching handler resolves to, or `undefined` for an unknown label.
 */
async function start(context, $, site, typeDeCrawl) {
  context.log.info('remote file => start');

  const custom = context.customData;
  const effectiveSite = (custom && custom.site) || site;
  const effectiveType = (custom && custom.typeDeCrawl) || typeDeCrawl;
  const label = context.request.userData.label;

  if (label === 'home') {
    return await case_home(context, effectiveSite, effectiveType);
  }
  if (label === 'search') {
    return await case_search(context, $, effectiveSite, effectiveType);
  }
  if (label === 'product') {
    return await case_product(context, $, effectiveSite, effectiveType);
  }
  return undefined;
}
//------------HOME------------------------------------------------------------------------------------------------
/**
 * Seeds the crawl from the comma-separated `context.customData.initialList`.
 *
 * - `simple` / `profond`: every entry is a keyword; one `search` request is
 *   enqueued per keyword (whitespace runs become `+`, then `encodeURI`).
 * - `produit`: every entry is a product URL, enqueued with label `product`.
 *
 * Entries are trimmed and blank entries are skipped, so a list such as
 * `"tv, radio,"` no longer enqueues a URL with a leading space or an empty
 * search. Whatever `enqueueLabel` returns is awaited before returning.
 *
 * @param {object} context - Crawler context; `customData.initialList` is read here.
 * @param {string} site - Marketplace name stored as `p03_marketplaceName`.
 * @param {string} typeDeCrawl - `'simple'`, `'profond'` or `'produit'`.
 * @returns {Promise<undefined|{debugInfo: string}>} `undefined` once requests are enqueued,
 *   or a `debugInfo` object when the list is empty or the crawl type is unknown.
 */
async function case_home(context, site, typeDeCrawl) {
  const initialList = (context.customData && context.customData.initialList) ? context.customData.initialList : '';
  if (initialList === '') {
    return { debugInfo: 'case_home: initialList empty' };
  }

  const entries = initialList
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '');
  if (entries.length === 0) {
    // Only separators / whitespace: same outcome as an empty list.
    return { debugInfo: 'case_home: initialList empty' };
  }

  switch (typeDeCrawl) {
    case 'simple':
    case 'profond':
      await Promise.all(entries.map((keyword) => {
        const encodedKeyword = encodeURI(keyword.replace(/\s+/g, '+'));
        const searchUrl = 'https://www.cdiscount.com/search/10/' + encodedKeyword + '.html';
        return enqueueLabel(context, 'search', searchUrl, { c01_keyword: keyword });
      }));
      return undefined;
    case 'produit':
      await Promise.all(entries.map((url) => (
        enqueueLabel(context, 'product', url, { p03_marketplaceName: site })
      )));
      return undefined;
    default:
      return { debugInfo: 'case_home: bug in typeDeCrawl' };
  }
}
//------------SEARCH----------------------------------------------------------------------------------------------
/**
 * Scrapes a search-result page.
 *
 * For every product row (`.lpMain .jsPrdBlocContainer form`) a record is built
 * on top of a copy of `request.userData.interceptRequestData`. Rows without a
 * usable product URL are skipped. Sponsored and organic rows are ranked with
 * two independent counters, each starting at 1. Only positions up to 15 are
 * kept: in `'profond'` mode the product URL is enqueued with label `product`,
 * otherwise the record is added to the result. Headline offers (`.skwOffer`)
 * are then appended as sponsored-brand records in every mode.
 *
 * @param {object} context - Crawler context; uses `request.userData.interceptRequestData` and `waitFor`.
 * @param {Function} $ - jQuery-like selector function (must provide `$.extend`).
 * @param {string} site - Marketplace name stored on each record.
 * @param {string} typeDeCrawl - `'profond'` enqueues product pages instead of returning product rows.
 * @returns {Promise<object[]>} The collected records.
 */
async function case_search(context, $, site, typeDeCrawl) {
  const ROW_SELECTOR = '.lpMain .jsPrdBlocContainer form';
  const MAX_POSITION = 15;
  const baseData = context.request.userData.interceptRequestData;
  const result = [];
  const pendingEnqueues = [];

  try {
    // A jQuery object is always truthy, so the predicate must test `.length`.
    await context.waitFor(() => $(ROW_SELECTOR).length > 0, { timeoutMillis: 10000 });
  } catch (err) {
    // NOTE(review): assumes waitFor rejects on timeout - confirm. The page is
    // still parsed so headline offers / an empty result are reported as before.
    console.log('case_search: product rows not found, parsing page as is: ' + (err && err.message));
  }

  // Same for every row, so it is read once.
  const numberOfResults = parseInt($('#lpTitle > span').text().replace(/[^0-9]/g, ''), 10);
  let sponsoredRank = 0;
  let organicRank = 0;

  $(ROW_SELECTOR).each(function () {
    const row = $(this);
    let obj = $.extend({}, baseData);
    obj.c02_marketplaceName = site;
    obj.c03_NumberofResults = numberOfResults;
    obj.c06_itemURL = row.find('.prdtBILDetails a:eq(0)').attr('href');
    // A row without a link has no href; treat it like any other non-product row.
    obj.c04_asin = (typeof obj.c06_itemURL === 'string') ? reg(obj.c06_itemURL) : '';
    if (obj.c04_asin.length === 0) {
      return; // plain return: `return false` would stop .each() altogether
    }

    obj.c26_sponsoredBrand = false;
    if (row.find('.c-sponsoredMentions').length > 0) {
      sponsoredRank++;
      obj.c23_sponsoredProduct = true;
      obj.c07_position = sponsoredRank;
    } else {
      // organicRank only counts product rows, so no correction for skipped rows is needed.
      organicRank++;
      obj.c23_sponsoredProduct = false;
      obj.c07_position = organicRank;
    }

    obj.c05_itemTitle = row.find('.prdtBILTit').text().trim();
    obj.c14_priceRaw = row.find('.prdtBILPrice .price:eq(0)').text().trim();
    const commentsText = row.find('.prdtBILStar').text().trim();
    obj.c08_numberofcomments = commentsText ? tr(commentsText.replace(/[^0-9]/g, '')) : 0;
    // Add the boolean flags to the record.
    obj = addBooleansCdiscount($, this, obj);

    if (obj.c07_position > MAX_POSITION) {
      return;
    }
    if (typeDeCrawl === 'profond') {
      pendingEnqueues.push(enqueueLabel(context, 'product', obj.c06_itemURL, { p03_marketplaceName: site }));
    } else {
      result.push($.extend({}, obj));
    }
  });

  // Headline offers.
  $('.skwOffer').each(function (i) {
    const offer = $(this);
    let obj = $.extend({}, baseData);
    obj.c02_marketplaceName = site;
    obj.c23_sponsoredProduct = false;
    obj.c26_sponsoredBrand = true;
    obj.c06_itemURL = offer.find('a[href]:eq(0)').attr('href');
    obj.c07_position = i + 1;
    obj.c04_asin = (typeof obj.c06_itemURL === 'string') ? reg(obj.c06_itemURL) : '';
    obj.c05_itemTitle = offer.find('.skwOfferTitle').text().trim();
    const ratingText = offer.find('.skwRateContent').text().trim();
    obj.c08_numberofcomments = ratingText ? tr(ratingText.replace(',', '').replace(/[\(\)]/g, '')) : 0;
    // Add the boolean flags to the record.
    obj = addBooleansCdiscount($, this, obj);
    if (obj.c07_position <= MAX_POSITION) {
      result.push($.extend({}, obj));
    }
  });

  // Settle any enqueue promises before handing the page back.
  await Promise.all(pendingEnqueues);
  return result;
}
//------------PRODUCT-----------------------------------------------------------------------------------
/**
 * Scrapes a product page into the request's `interceptRequestData` object
 * (or a fresh object when none was attached) and returns it.
 *
 * Polls once per second until an `h1` exists. If none appears within
 * 10 seconds, `debugInfo` is set (including the `checkCaptcha` verdict) and the
 * object is returned without product fields. The result is returned through
 * the async function's promise, so callers awaiting it receive the object.
 *
 * @param {object} context - Crawler context; uses `request.url` and `request.userData.interceptRequestData`.
 * @param {Function} $ - jQuery-like selector function.
 * @param {string} site - Unused here; kept for signature parity with the other handlers.
 * @param {string} typeDeCrawl - Unused here; kept for signature parity with the other handlers.
 * @returns {Promise<object>} The populated product record.
 */
async function case_product(context, $, site, typeDeCrawl) {
  const POLL_INTERVAL_MILLIS = 1000;
  const TIMEOUT_MILLIS = 10000;
  const obj = context.request.userData.interceptRequestData ? context.request.userData.interceptRequestData : {};
  const startedAt = Date.now();

  // Cuts strings to `max` characters; the '...' marker is always appended,
  // matching the stored format used so far. Non-strings pass through.
  const truncate = (value, max) => ((typeof value === 'string') ? value.slice(0, max) + '...' : value);

  while (!$('h1').length) {
    if (Date.now() - startedAt > TIMEOUT_MILLIS) {
      obj.debugInfo = 'case_product: timeout after 10 seconds - check h1#title ? or is captcha true or false:' + checkCaptcha($);
      return obj;
    }
    await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MILLIS));
  }

  obj.p16_ImageURL = $('.fpImg img:eq(0)').attr('src');
  obj.p10_sellerTechnicalBrand = $('#descContent table tr:eq(1) td:eq(1)').text();
  obj.p08_description1 = truncate(tr($('.fpBulletPointReadMore p').text()), 1900);
  obj.p04_code = reg(context.request.url);
  obj.p05_itemTitle = tr($('h1').text());
  obj.p06_numberofcomments = Number(tr($(".fpCusto:contains('avis')").text().replace(/[^0-9]/g, '')));

  const description = $('#descContent');
  description.find('script').remove(); // keep inline script source out of the text
  obj.p09_description2 = truncate(tr(description.text()), 1900);

  // Seller: explicit seller name, then out-of-stock marker, then the
  // "Cdiscount à volonté" label, then the raw text of the seller block.
  obj.p02_sellerOfficial = tr($('#fpSellBy .fpSellerName:eq(0)').text());
  if (!obj.p02_sellerOfficial && $('.outOfStock').length > 0) {
    obj.p02_sellerOfficial = 'out of stock';
  }
  const cdiscountAVolonte = $('#fpSellBy')
    .filter(function () { return /Cdiscount\sà\svolonté/.test($(this).text()); })
    .text();
  if (!obj.p02_sellerOfficial && cdiscountAVolonte.length > 0) {
    obj.p02_sellerOfficial = 'Cdiscount à volonté';
  }
  if (!obj.p02_sellerOfficial) {
    obj.p02_sellerOfficial = tr($('#fpSellBy').text());
  }

  // A missing rating yields NaN, as before (the old 'NA' placeholder went through Number()).
  const ratingText = tr($("span[itemprop='ratingValue']:eq(0)").text());
  obj.p07_star = ratingText ? Number(String(ratingText).replace(',', '.')) : NaN;

  obj.p14_reviews = [];
  $('.infoCli').each(function () {
    const block = $(this);
    const review = {};
    review.p05_title = tr(block.find('.title').text());
    review.p02_note = getStarFromClasses(block);
    review.p04_texte = truncate(tr(block.find('> p').text()), 2900);
    const helpful = tr(block.find(".jsYesRat:contains('Oui') span").text());
    review.p01_helpfulReview = (helpful && helpful.match(/\([0-9]*\)/g)) ? helpful.replace(/[\(\)]/g, '') : 'NA';
    review.p06_verified = block.find('.achatCert').length > 0;
    obj.p14_reviews.push(review);
  });

  obj.p15_QuestionReponses = [];
  // Queried from the document: there is no element-bound `this` at this level.
  $('.fpFAQQuestion').each(function () {
    const entry = $(this);
    const qa = {};
    qa.p01_question = tr(entry.find('div:eq(0) .fpFAQQuestionText').text());
    qa.p02_reponse = tr(entry.find('div:eq(0) .fpAnswerContent p:eq(0)').text());
    obj.p15_QuestionReponses.push(qa);
  });

  return obj;
}
//-----------------------------------function-----------------------------------------------------
/**
 * Enqueues `url` with the given label and payload stored in `userData`.
 *
 * The value returned by `context.enqueueRequest` is passed back so callers can
 * await it or handle a failure; callers that ignore the return value behave as
 * before.
 *
 * @param {object} context - Crawler context providing `enqueueRequest`.
 * @param {string} label - Stored as `userData.label`.
 * @param {string} url - URL to enqueue.
 * @param {object} interceptRequestData - Stored as `userData.interceptRequestData`.
 * @returns {*} Whatever `context.enqueueRequest` returns (presumably a promise - verify against the crawler API).
 */
function enqueueLabel(context, label, url, interceptRequestData) {
  return context.enqueueRequest({
    userData: {
      label: label,
      interceptRequestData: interceptRequestData,
    },
    url: url,
  });
}
/**
 * Normalises scraped text: every run of two or more whitespace characters,
 * and every literal backslash + "n" sequence, becomes a single space; the
 * result is then trimmed. A lone whitespace character inside the text is left
 * untouched. Non-string values are returned unchanged.
 *
 * @param {*} text - Value to clean.
 * @returns {*} The cleaned string, or `text` itself when it is not a string.
 */
function tr(text) {
  if (typeof text !== 'string') {
    return text;
  }
  const collapsed = text.replace(/(\s\s+|\\n)/gi, ' ');
  return collapsed.trim();
}
//captcha alert
/**
 * Reports whether the page contains any `div` whose text includes one of the
 * known captcha phrases (English and French). Phrases are probed in order and
 * probing stops at the first hit.
 *
 * @param {Function} $ - jQuery-like selector function.
 * @returns {boolean} `true` when at least one phrase is found.
 */
function checkCaptcha($) {
  const captchaPhrases = [
    "make sure you're not a robot.",
    'ne suis pas un robot',
    "n'êtes pas un robot",
    'caractères que vous voyez',
    'the characters you see',
    'les caractères affichés',
  ];
  return captchaPhrases.some((phrase) => $('div:contains("' + phrase + '")').length !== 0);
}
/**
 * Adds two boolean flags to `interceptRequestData` (mutated in place) based on
 * whether matching descendants exist under `thisObject`:
 * `c31_isFirstChoice` for `.prdtBILLabel` and `c32_isCouponAvailable` for
 * `.prdtBILSpecial > div`.
 *
 * @param {Function} $ - jQuery-like selector function.
 * @param {*} thisObject - Element (or anything `$` accepts) to search within.
 * @param {object} interceptRequestData - Record to decorate.
 * @returns {object} The same `interceptRequestData` object.
 */
function addBooleansCdiscount($, thisObject, interceptRequestData) {
  const hasDescendant = (selector) => Boolean($(thisObject).find(selector).length);

  interceptRequestData.c31_isFirstChoice = hasDescendant('.prdtBILLabel');
  interceptRequestData.c32_isCouponAvailable = hasDescendant('.prdtBILSpecial > div');
  return interceptRequestData;
}
/**
 * Derives a 1-5 rating from the `stN<n>` class carried by the
 * `.ratingPosition` descendant(s) of `jQueryElement`.
 *
 * @param {object} jQueryElement - jQuery-like wrapper of a review block.
 * @returns {number|false} The highest `n` in 1..5 whose `stN<n>` class is present, or `false` when none is.
 */
function getStarFromClasses(jQueryElement) {
  // Scanning downwards and stopping at the first hit yields the highest match.
  for (let rating = 5; rating >= 1; rating--) {
    if (jQueryElement.find('.ratingPosition').hasClass('stN' + rating)) {
      return rating;
    }
  }
  return false;
}
/**
 * Extracts a product code from a URL: the last path segment ending in
 * `.html`, followed by the `idOffre=...` query parameter (with its leading
 * `?` when it has one) when present. All matches are concatenated in order.
 *
 * The dot before `html` is matched literally, so a path segment that merely
 * contains "html" (e.g. `/html/`) no longer counts as a product file name.
 *
 * @param {*} url - URL to inspect; non-string values yield `''` instead of throwing.
 * @returns {string} The extracted code, or `''` when nothing matches.
 */
function reg(url) {
  if (typeof url !== 'string') {
    return '';
  }
  const matches = url.match(/[^\/]*\.html|\??idOffre=[^&#]*/g);
  return matches ? matches.join('') : '';
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment