Created
January 15, 2020 15:39
-
-
Save AlexRatmansky/46dc2d408f3e73416d876a3953849e57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Full code to Scrape Amazon behind Login Wall - Optimized and Works in Headless Mode (Avoid BOT detection)
// Get addresses from Amazon Address Book
const puppeteer = require('puppeteer'); | |
(async () => { | |
// Syntactic Sugar | |
/**
 * Opens `url` in the script's Puppeteer `page` and waits for `page.goto` to settle.
 *
 * @param {string} url - Address to load.
 * @returns {Promise<void>} Resolves once the `goto` call has completed.
 */
const Navigate = async (url) => {
  await page.goto(url);
};
/**
 * Clicks the element matched by `selector` and then types `text` through the
 * page keyboard, so the text goes to whatever the click focused.
 *
 * @param {string} selector - CSS selector of the input to click first.
 * @param {string} text - Characters to type after the click.
 * @returns {Promise<void>} Resolves when typing has finished.
 */
const EnterText = async (selector, text) => {
  await page.click(selector);
  await page.keyboard.type(text);
};
/**
 * Clicks the element matched by `selector` and then waits for the page to move on.
 *
 * @param {string} selector - CSS selector of the element to click.
 * @param {number} [waitFor=-1] - When zero or positive, pause this many seconds
 *   after the click instead of waiting for a navigation event. When negative
 *   (the default), wait for the navigation triggered by the click.
 * @returns {Promise<void>} Resolves after the pause or the navigation completes.
 */
const ClickNavigate = async (selector, waitFor = -1) => {
  if (waitFor >= 0) {
    await page.click(selector);
    // A plain timer replaces the deprecated `page.waitFor(ms)`, which newer
    // Puppeteer releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, waitFor * 1000));
    return;
  }

  // Register the navigation waiter BEFORE clicking. If the click were awaited
  // first, a fast navigation could finish before the waiter exists and the
  // wait would hang until it times out.
  await Promise.all([page.waitForNavigation(), page.click(selector)]);
};
// Main Flow | |
// Run-time switches for this script.
const C_HEADLESS = true; // passed to puppeteer.launch as `headless`
const C_OPTIMIZE = true; // enables the request-interception step further down
const C_SLOWMOTION = 0; // slow down by X ms (passed to puppeteer.launch as `slowMo`)

// Launch the browser and open the single page that every helper in this script drives.
const browser = await puppeteer.launch({
  headless: C_HEADLESS,
  slowMo: C_SLOWMOTION,
});
const page = await browser.newPage();

// To ensure Amazon doesn't detect it as a Bot
await page.setExtraHTTPHeaders({
  'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
});
// No unwanted resources | |
// When optimisation is on, intercept every request and abort those whose
// resource type is not needed to read the address data.
if (C_OPTIMIZE) {
  await page.setRequestInterception(true);

  // Resource types to drop. A Set gives a direct membership test and avoids
  // index-based comparisons that can silently skip the first entry.
  const blockedResourceTypes = new Set([
    'image',
    'stylesheet',
    'media',
    'font',
    'texttrack',
    'object',
    'beacon',
    'csp_report',
    'imageset',
  ]);

  page.on('request', (request) => {
    // `resourceType` is a method and must be CALLED; comparing the function
    // object itself against the list never matches, so nothing was blocked.
    if (blockedResourceTypes.has(request.resourceType())) {
      request.abort();
    } else {
      request.continue();
    }
  });
}
// Creds | |
// Credentials are taken from the AMAZON_EMAIL / AMAZON_PASSWORD environment
// variables when set, so real secrets need not be written into this file.
// The original placeholders remain as the fallback values.
const USER_EMAIL = process.env.AMAZON_EMAIL || "YOUR_EMAIL_HERE";
const USER_PASSWORD = process.env.AMAZON_PASSWORD || "YOUR_PASSWORD_HERE";

// Home Page constants
const U_HOMEPAGE = 'https://amazon.com';
// Sign-in URL whose `openid.return_to` parameter points at the address book,
// so a successful login is sent straight to that page.
const U_LOGIN_PAGE = 'https://www.amazon.com/ap/signin?clientContext=135-8638983-8261231&openid.return_to=https%3A%2F%2Fwww.amazon.com%2Fa%2Faddresses&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&marketPlaceId=ATVPDKIKX0DER&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&pageId=usflex&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=900&siteState=clientContext%3D143-3525329-4850620%2CsourceUrl%3Dhttps%253A%252F%252Fwww.amazon.com%252Fa%252Faddresses%2Csignature%3Dnull';
const S_LOGIN_LINK = '#nav-link-accountList';
// Optimized the flow to reach the address book faster: the trick is to manually try to open the target page before logging in, which hits
// the Amazon login wall; capture that URL, which now has the return page set in the openid.return_to field.
// This helps to land on the target page directly after login without having to browse through the heavy Home page.
// Caution: Trying to go to the Address Book directly (or any page with sensitive information) will challenge the user with an additional password screen.
// Commented, since this is now optimized | |
// ------------------------------------------ | |
// // Go to Home Page | |
// await Navigate(U_HOMEPAGE) | |
// | |
// // Go to Login Page | |
// await ClickNavigate(S_LOGIN_LINK, 1) | |
// ------------------------------------------ | |
// Go directly to Login Page | |
await Navigate(U_LOGIN_PAGE); // USER-ACTION

// Login Page constants
const S_EMAIL_TEXT = '#ap_email';
const S_CONTINUE_BUTTON = '#continue';
const S_PASSWORD_TEXT = '#ap_password';
const S_SIGNIN_BUTTON = '#signInSubmit';

// Login - Step 1: submit the e-mail address.
await EnterText(S_EMAIL_TEXT, USER_EMAIL); // USER-ACTION
await ClickNavigate(S_CONTINUE_BUTTON); // USER-ACTION

// Login - Step 2: submit the password.
await EnterText(S_PASSWORD_TEXT, USER_PASSWORD); // USER-ACTION
await ClickNavigate(S_SIGNIN_BUTTON); // USER-ACTION

// Enter password again - Secondary Protection - This is required only if you
// try to land on the page with sensitive information directly. The prompt is
// therefore answered only when the password field is actually present again;
// clicking a missing selector would throw and abort the run.
const secondPasswordPrompt = await page.$(S_PASSWORD_TEXT);
if (secondPasswordPrompt !== null) {
  await EnterText(S_PASSWORD_TEXT, USER_PASSWORD); // USER-ACTION
  await ClickNavigate(S_SIGNIN_BUTTON); // USER-ACTION
}
// AddressBook constants | |
// URL of the address book page (referenced only by the commented-out direct navigation).
const U_ADDRESSBOOK = 'https://www.amazon.com/a/addresses';

// Selector matching one tile per saved address.
const S_ADDRESS_TILE = '.normal-desktop-address-tile';

// Selectors for the individual fields, looked up inside a single tile.
const S_ADDRESS_FULLNAME = '#address-ui-widgets-FullName';
const S_ADDRESS_LINEONE = '#address-ui-widgets-AddressLineOne';
const S_ADDRESS_LINETWO = '#address-ui-widgets-AddressLineTwo';
const S_ADDRESS_CITYSTATEPOSTALCODE = '#address-ui-widgets-CityStatePostalCode';
const S_ADDRESS_COUNTRY = '#address-ui-widgets-Country';
const S_ADDRESS_PHONENUMBER = '#address-ui-widgets-PhoneNumber';

// Selectors for the "default address" section of a tile and the icons inside it.
const S_ADDRESS_NODEFAULT = '.address-section-no-default';
const S_ADDRESS_DEFAULT = '.default-section';
const S_ADDRESS_DEFAULT_FRESH = '#ya-myab-fresh-address-icon';
const S_ADDRESS_DEFAULT_AMAZON = '#ya-myab-default-shipping-address-icon';
// Commented, since this is now optimized | |
// ------------------------------------------ | |
// // Go to AddressBook | |
// await Navigate(U_ADDRESSBOOK) | |
// ------------------------------------------ | |
// Get All Addresses | |
// Read every address tile on the current page, print the collected records,
// and always shut the browser down afterwards, even if scraping throws.
try {
  /**
   * Returns the innerHTML of the first element under `scope` that matches
   * `selector`, or an empty string when no such element exists.
   *
   * @param {object} scope - Element handle to search within.
   * @param {string} selector - CSS selector of the field element.
   * @returns {Promise<string>} The element's innerHTML, or '' when absent.
   */
  const readInnerHtml = async (scope, selector) => {
    const element = await scope.$(selector);
    if (element === null) {
      return '';
    }
    const property = await element.getProperty('innerHTML');
    return property.jsonValue();
  };

  /**
   * Extracts the number from the phone field text. The text is split on ':'
   * and the part after the first colon is kept; when there is no colon the
   * whole text is used instead of failing.
   *
   * @param {string} rawText - innerHTML of the phone-number element.
   * @returns {string} Trimmed phone number, or '' for empty input.
   */
  const parsePhoneNumber = (rawText) => {
    const segments = rawText.split(':');
    const value = segments.length > 1 ? segments[1] : segments[0];
    return value.trim();
  };

  const allAddressElements = await page.$$(S_ADDRESS_TILE);

  // One pending promise per tile; the tiles are read concurrently.
  const getAddresses = allAddressElements.map(async (addressElement) => {
    let defaultAddressforAmazon = false;
    let defaultAddressforFresh = false;

    // The default markers live inside an optional "default" section of the tile.
    const defaultAddressElement = await addressElement.$(S_ADDRESS_DEFAULT);
    if (defaultAddressElement !== null) {
      const amazonIcon = await defaultAddressElement.$(S_ADDRESS_DEFAULT_AMAZON);
      defaultAddressforAmazon = Boolean(amazonIcon);
      const freshIcon = await defaultAddressElement.$(S_ADDRESS_DEFAULT_FRESH);
      defaultAddressforFresh = Boolean(freshIcon);
    }

    const fullName = await readInnerHtml(addressElement, S_ADDRESS_FULLNAME);
    const addressLineOne = await readInnerHtml(addressElement, S_ADDRESS_LINEONE);
    const addressLineTwo = await readInnerHtml(addressElement, S_ADDRESS_LINETWO);
    const cityStatePostalCode = await readInnerHtml(addressElement, S_ADDRESS_CITYSTATEPOSTALCODE);
    const country = await readInnerHtml(addressElement, S_ADDRESS_COUNTRY);
    const phoneNumber = parsePhoneNumber(
      await readInnerHtml(addressElement, S_ADDRESS_PHONENUMBER)
    );

    return {
      FullName: fullName,
      AddressLineOne: addressLineOne,
      AddressLineTwo: addressLineTwo,
      CityStatePostalCode: cityStatePostalCode,
      Country: country,
      PhoneNumber: phoneNumber,
      DefaultAddressforAmazon: defaultAddressforAmazon,
      DefaultAddressforFresh: defaultAddressforFresh,
    };
  });

  const addresses = await Promise.all(getAddresses);
  console.log(addresses);
} finally {
  // Runs on success and on failure so the browser process is not left behind.
  await browser.close();
}
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment