Skip to content

Instantly share code, notes, and snippets.

@rakeshtembhurne
Created January 22, 2021 05:27
Show Gist options
  • Save rakeshtembhurne/34f6d186f0ad457361060ffd3efde064 to your computer and use it in GitHub Desktop.
Save rakeshtembhurne/34f6d186f0ad457361060ffd3efde064 to your computer and use it in GitHub Desktop.
delete
"use strict";
const puppeteer = require("puppeteer");
const _ = require('lodash');
// Thin logging helper: forwards a single value to console.log.
const log = function log(data) {
  return console.log(data);
};
/**
 * Scrolls the page down step by step until the accumulated scroll distance
 * reaches the document body's scroll height, then resolves.
 *
 * The callback given to `page.evaluate` runs inside the page, so it must be
 * self-contained; the tunables are passed to it as a serializable argument.
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate`).
 * @param {object} [options]
 * @param {number} [options.distance=1000] - Pixels scrolled per tick; must be > 0.
 * @param {number} [options.intervalMs=1000] - Delay between ticks in milliseconds.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 * @throws {RangeError} If `distance` is not a positive number.
 */
async function autoScroll(page, { distance = 1000, intervalMs = 1000 } = {}) {
  // A non-positive step would never reach the bottom and the promise would hang.
  if (!(distance > 0)) {
    throw new RangeError('autoScroll: distance must be a positive number');
  }

  await page.evaluate(
    ({ step, delay }) =>
      new Promise((resolve) => {
        let scrolledSoFar = 0;
        const timer = setInterval(() => {
          // Re-read the height on every tick: lazy-loaded content can grow the page.
          const pageHeight = document.body.scrollHeight;
          window.scrollBy(0, step);
          scrolledSoFar += step;
          if (scrolledSoFar >= pageHeight) {
            clearInterval(timer);
            resolve();
          }
        }, delay);
      }),
    { step: distance, delay: intervalMs },
  );
}
/**
 * Reports whether a value should be treated as "nothing there".
 *
 * Empty values are: undefined, null, NaN, whitespace-only strings, and
 * objects/arrays without own enumerable keys. DOM nodes are never empty.
 *
 * This function is also injected into the browser page as source text
 * (via its string form), so it must stay a self-contained declaration that
 * does not reference anything else from this file.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True when the value is considered empty.
 */
function isEmpty(value) {
  if (value === undefined || value === null) {
    return true;
  }
  if (typeof value === 'number') {
    // `value === NaN` is always false; Number.isNaN is the correct test.
    return Number.isNaN(value);
  }
  if (typeof value === 'string') {
    return value.trim().length === 0;
  }
  if (typeof value === 'object') {
    // A found DOM element usually has no own enumerable properties, so the
    // key count below would wrongly report it as empty. `Node` only exists
    // in a browser context, hence the typeof guard.
    if (typeof Node !== 'undefined' && value instanceof Node) {
      return false;
    }
    return Object.keys(value).length === 0;
  }
  return false;
}
/**
 * Safely reads a nested value using a dotted / bracketed path such as
 * `'a.b[0].c'`.
 *
 * This function is also injected into the browser page as source text, so it
 * must stay a self-contained declaration.
 *
 * @param {*} obj - Root value to read from.
 * @param {string|number} path - Property path; `[n]` is treated like `.n`.
 * @param {*} [def] - Returned when the path resolves to `undefined` or when
 *   an intermediate level is null/undefined.
 * @returns {*} The resolved value, or `def`.
 */
function get(obj, path, def) {
  const levels = String(path)
    .replace(/\[/g, '.')
    .replace(/]/g, '')
    // A leading bracket ('[0].a') would otherwise produce an empty first key.
    .replace(/^\./, '')
    .split('.');

  let current = obj;
  for (const level of levels) {
    // Stop instead of throwing when the chain is broken part-way.
    if (current === undefined || current === null) {
      return def;
    }
    current = current[level];
  }
  return current === undefined ? def : current;
}
/**
 * Opens one Expedia hotel page in Puppeteer, scrapes hotel details, review
 * figures and amenities from the rendered DOM, and prints them as JSON.
 *
 * Relies on the file-level `puppeteer`, `log`, `autoScroll`, `isEmpty` and
 * `get` definitions. `isEmpty` and `get` are injected into the page as source
 * text so the in-page scraping code can call them.
 *
 * @returns {Promise<object>} The scraped details (the same object that is printed).
 */
async function main() {
  const url = "https://www.expedia.com.au/Sydney-Hotels-Shangri-La-Hotel.h11974.Hotel-Information?chkin=2021-01-29&chkout=2021-01-30&x_pwa=1&rfrr=HSR&pwa_ts=1610694652769&referrerUrl=aHR0cHM6Ly93d3cuZXhwZWRpYS5jb20uYXUvSG90ZWwtU2VhcmNo&useRewards=false&rm1=a2&regionId=178312&destination=Sydney+%28and+vicinity%29%2C+New+South+Wales%2C+Australia&destType=MARKET&neighborhoodId=6132025&trackingData=AAAAECNwmltY6Lf7yh2vFtnbNS7MoQH94W9EOm4f8HsXDajktBN-LFDIGJIiMR3aEYYZCwoqz2BI1Fz3Muwf5ZA47POaGKdEJ0-fCGkET8vsaRp_11FtevKgwiMtJ7mkZt-BIdxFMMuiF1oRr5uK-CqVxGkx8vZErSYXsQeSNwl4Ds93GvSjyYZRVv6ZItpseoYPjA-6SzY9MpdPmHSMYl6wSjx8kODFuxGooXum1OxN_VNa5ihRb9aaEKqhUO0Egq28oqKw4W_C5goxtZJj6-6VlIJPnYUYJFDOeQQZYxchv7ORw5h4-k7fAditKo6jYTdtTOlwiDhgaKNdH5BX0lDun_N0dyrg8KH8NYWqrzLCWg3AtIgC_wn75wfomBLY13qokhxe-Ab7U-3RL8srmFfZfyWi_Q6_CGZAUWpKQFX7dzn0zsoBykIZzd_pg4RHG8IliFor9lOHLW_A2EMcrDtYQhwRBDOnS4PyHyZ1nBcrXgE2-uSI0SU0YyH-GU6BKoP7QeE0t3czevPwxPOf2YuzLAYe83FVyyARO1VcXE-WrFB7BsuF4Carz-4w1il_XLnOwGq6o1f2cJn57xSX0RxaqugO13s_61do469ZP489Blkbqekjjn53FhAas3Rdqsq6KQFidEi6pow7CZOPHjceLzMnF6aES8KW_FOZa0UPBRZ64fwPRrdHnZdUu58F6F5-RovqHB6zaoVlYN_TI_d0Hx_8eY_BEtWyGxx9F086-RSetdkvZ3M5EXceoRvO1QBLdcevB6dFMD0eWls_kAy_X2FEnPogRUgpv1nbys3JcidwNfQHa-BaqOB7kqXa2OBb1YBp8t-6Qjs-INdXIZEl6r3clOhWKr4ZPAFWCUfRHoa536zq3dWceyj96Y8BnfvaGgpKU7hsWdSlABVrvniHZ1FZZO6x7RAwOG6J18BoHBXzmB_VgAheVJ9df-Ukg3qwXpVW4HhCFxTJnygMPGiEHMj3hWlcyRbfHQaoB8khIUzW1nGd2P9v-9_KH2sVu7fs3h7wHG3OciDJVlP5Lse6yZs%3D&rank=1&testVersionOverride=Buttercup%2C31936.102311.0%2C33775.98848.1%2C38414.114301.0%2C33739.99567.0%2C37898.109354.1%2C37930.113882.2%2C37949.107324.0%2C39046.114579.0&slots=HSR_A&position=1&beaconIssued=2021-01-15T07%3A10%3A51&sort=RECOMMENDED&top_dp=295&top_cur=AUD&semdtl=&selectedRoomType=202123389&selectedRatePlan=230207305";
  // const url = "https://www.expedia.com.au";
  // const url = "https://i-know-you-faked-user-agent.glitch.me/new-window";
  const HEADLESS = true;
  const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36";
  const HEADLESS_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36';
  const pause = ms => new Promise(resolve => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({
    headless: HEADLESS,
    args: [
      "--disable-setuid-sandbox",
      `--user-agent=${USER_AGENT}`,
      // '--proxy-server=socks325655://127.0.0.1:9050',
    ],
    ignoreHTTPSErrors: true
  });

  // try/finally guarantees the browser process is shut down even when a step
  // throws; otherwise Chromium keeps running and the Node process never exits.
  try {
    const page = await browser.newPage();

    // Force an Expedia referer on every outgoing request.
    await page.setRequestInterception(true);
    page.on('request', req => {
      const originalHeaders = req.headers();
      log({ reqHeaders: originalHeaders });
      const headers = { ...originalHeaders, referer: 'http://www.expedia.com.au/' };
      req.continue({ headers }).catch(err => log({ requestContinueError: err.message }));
    });

    // Mask a few navigator properties before any page script runs.
    await page.evaluateOnNewDocument(fakeUserAgent => {
      Object.defineProperty(navigator, 'platform', { get: () => 'MacIntel' });
      Object.defineProperty(navigator, 'productSub', { get: () => '20030107' });
      Object.defineProperty(navigator, 'vendor', { get: () => 'Apple Computer, Inc.' });
      const nativeOpen = window.open;
      window.open = (...args) => {
        const newPage = nativeOpen.call(window, ...args);
        // window.open returns null when the popup is blocked.
        if (newPage) {
          Object.defineProperty(newPage.navigator, 'userAgent', { get: () => fakeUserAgent });
        }
        // Page scripts expect the window handle back.
        return newPage;
      };
      window.open.toString = () => 'function open() { [native code] }';
    }, USER_AGENT);

    await page.setUserAgent(HEADLESS ? HEADLESS_USER_AGENT : USER_AGENT);
    await page.setViewport({ width: 1920, height: 1008 });

    // timeout: 0 disables the navigation timeout.
    await page.goto(url, { waitUntil: 'load', timeout: 0 });
    await page.waitForSelector('.main-region', { visible: true, timeout: 3000 })
      .catch(() => {
        // Best effort: scraping continues and missing sections come back empty.
        log('Page not found');
      });

    await pause(3000);

    // Make the helper functions callable from the in-page scraping code below.
    await page.addScriptTag({ content: `${isEmpty} ${get}` });

    // Use a public CDP session rather than the private page._client handle.
    const cdpSession = await page.target().createCDPSession();
    const cookies = await cdpSession.send('Network.getAllCookies');
    log(cookies);

    await autoScroll(page);

    const hotelDetails = await page.evaluate(async () => {
      // ---- in-page helpers (this callback is serialized, so keep it self-contained) ----
      const wait = ms => new Promise(resolve => setTimeout(resolve, ms));
      // querySelector on a root that may be missing.
      const query = (root, selector) => (isEmpty(root) ? null : root.querySelector(selector));
      // querySelectorAll on a root that may be missing; always yields an array.
      const queryAll = (root, selector) => (isEmpty(root) ? [] : Array.from(root.querySelectorAll(selector)));
      // Text of an element, or the fallback when the element or its text is missing.
      const textOf = (element, fallback) => (
        (!isEmpty(element) && !isEmpty(element.textContent)) ? element.textContent : fallback
      );
      // Clicks the element when present; reports whether a click happened.
      const clickIfPossible = element => {
        if (!isEmpty(element) && typeof element.click === 'function') {
          element.click();
          return true;
        }
        return false;
      };
      // Builds an object from <meta itemprop="..."> tags using an itemprop -> key map.
      // Unknown itemprops are skipped; later tags override earlier ones.
      const collectMeta = (metaTags, keyByItemprop) => {
        const collected = {};
        metaTags.forEach(meta => {
          const itemprop = meta.getAttribute('itemprop');
          if (Object.prototype.hasOwnProperty.call(keyByItemprop, itemprop)) {
            collected[keyByItemprop[itemprop]] = !isEmpty(meta.content) ? meta.content : null;
          }
        });
        return collected;
      };

      // Name
      const hotelName = textOf(document.querySelector('h1.uitk-type-display-700'), null);

      // Reviews: open the tab first so the review panel gets rendered.
      const reviewTab = Array.from(document.querySelectorAll('li.uitk-tab'))
        .find(tab => tab.textContent === "Reviews");
      clickIfPossible(query(reviewTab, 'a'));
      await wait(1000);

      const reviewBaseElement = document.getElementById('Reviews');
      const ratingBars = queryAll(reviewBaseElement, '.uitk-flex.uitk-flex-align-items-flex-end.uitk-flex-gap-one.uitk-type-200');
      const scoreCards = queryAll(reviewBaseElement, '.uitk-flex.uitk-flex-column.uitk-flex-item.uitk-flex-basis-half_width.all-y-margin-three');

      const scoreKeyByLabel = {
        'Cleanliness': 'cleanlinessOverMax',
        'Amenities': 'amenityScoreOverMax',
        'Staff & service': 'serviceAndStaffOverMax',
        'Property conditions & facilities': 'hotelConditionOverMax',
      };
      const reviewScoreDefault = {
        cleanlinessOverMax: '0/5',
        amenityScoreOverMax: '0/5',
        serviceAndStaffOverMax: '0/5',
        hotelConditionOverMax: '0/5',
      };
      const scoredCategories = {};
      scoreCards.forEach(card => {
        const label = textOf(card.querySelector('.uitk-type-300'), null);
        const value = textOf(card.querySelector('h3'), null);
        if (Object.prototype.hasOwnProperty.call(scoreKeyByLabel, label)) {
          scoredCategories[scoreKeyByLabel[label]] = value;
        }
      });
      // Defaults apply only when the page shows no score cards at all.
      const reviewScore = scoreCards.length === 0 ? reviewScoreDefault : scoredCategories;

      const ratingCounts = ratingBars.map(bar => {
        const title = textOf(bar.querySelector('.uitk-progress-bar-title'), "");
        const value = textOf(bar.querySelector('.uitk-progress-bar-description'), null);
        // The rating label is the part of the title before ' - '.
        return { rating: title.split(' - ')[0], value };
      });

      const overallRating = textOf(
        query(query(reviewBaseElement, '.uitk-type-900.uitk-type-regular'), 'span'),
        null
      );
      const countData = textOf(
        query(query(reviewBaseElement, '.uitk-flex.uitk-flex-column.all-t-padding-one'), 'button'),
        ''
      );
      // First whitespace-separated token of the button text.
      const count = countData.split(' ')[0];
      const superlative = textOf(query(reviewBaseElement, '.uitk-type-400.uitk-type-bold'), null);

      // About
      const aboutBaseElement = Array.from(document.querySelectorAll('.uitk-card-aloha-content-section'))
        .find(section => (
          !isEmpty(section.querySelector('h2')) &&
          section.querySelector('h2').textContent === 'About this property'
        ));
      const about = queryAll(aboutBaseElement, '.uitk-layout-grid-item.uitk-layout-grid-item-columnspan-medium-1.uitk-layout-grid-item-columnspan-large-2')
        .map(item => {
          const heading = textOf(item.querySelector('h3'), '');
          const body = textOf(item.querySelector('.uitk-flex.uitk-flex-gap-one.uitk-flex-wrap.all-t-padding-two'), '');
          return (`${heading}\n${body}`).trim();
        });

      // Address
      const addressBaseElement = document.querySelector('.uitk-flex.uitk-flex-column.uitk-spacing-margin-large-inlinestart-three.uitk-layout-grid-item-columnspan-large-5');
      const addressButton = query(addressBaseElement, 'button.uitk-link.all-t-padding-two.uitk-link-layout-inline.uitk-type-300');
      const overview = collectMeta(queryAll(addressButton, 'meta'), {
        name: 'city',
        addressRegion: 'state',
        streetAddress: 'streetAddress',
        addressCountry: 'countryCode',
        latitude: 'latitude',
        longitude: 'longitude',
      });
      overview.addressLine = textOf(
        query(query(addressBaseElement, '.uitk-flex-item.uitk-type-left.uitk-flex-grow-1'), 'span'),
        null
      );

      // Star Rating
      const ratingBaseElement = document.querySelector('.uitk-rating');
      const svgLength = queryAll(ratingBaseElement, 'svg.uitk-icon.uitk-icon-xsmall').length;
      const ratingContent = textOf(query(ratingBaseElement, 'span'), '');
      const starText = ratingContent.split(' ')[0];
      // Compare numerically: the icon count is a number, the label is text.
      const stars = (svgLength === Number(starText)) ? svgLength : starText;

      // Review score header
      const reviewHeaderElement = document.querySelector('[data-stid="content-hotel-reviewsummary"]');
      const reviewHeader = collectMeta(queryAll(reviewHeaderElement, 'meta'), {
        ratingValue: 'overallRating',
        reviewCount: 'count',
        description: 'superlative',
      });

      // Amenities: open the dialog, then read the list from it.
      const amenitiesBaseElement = document.querySelector('[data-stid="hotel-amenities-list"]');
      if (!clickIfPossible(query(amenitiesBaseElement, 'button'))) {
        console.log("Button not found");
      }
      await wait(1000);

      // Scrape dialog box
      const amenitiesDialog = document.querySelector('.uitk-dialog-layer.uitk-dialog-layer-responsive.layer-overlay-active');
      const amenitySections = queryAll(amenitiesDialog, '.uitk-spacing.uitk-spacing-padding-blockend-four');
      let popularSection = amenitySections.find(section => (
        section.querySelector('h3') && section.querySelector('h3').textContent === 'Popular amenities'
      ));
      if (isEmpty(popularSection)) {
        // Fall back to the first section that has no heading at all.
        popularSection = amenitySections.find(section => section.querySelector('h3') === null);
      }
      const topSummary = queryAll(popularSection, 'li')
        .map(item => ({ description: item.textContent }));

      return {
        hotel: {
          additionalInfo: {
            propertyDescription: {
              hotelDescription: about.join('\n'),
              about
            }
          },
          amenities: {
            topSummary
          },
          overview,
          hotelName,
          stars
        },
        reviews: {
          overallRating,
          ratingCounts,
          count,
          superlative,
          ...reviewScore
        },
        reviewHeader,
      };
    });

    console.log(JSON.stringify(hotelDetails, null, 2));
    return hotelDetails;
  } finally {
    await browser.close();
  }
}
// Entry point: run the scraper and report any failure instead of leaving an
// unhandled promise rejection; a non-zero exit code signals the error.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment