Skip to content

Instantly share code, notes, and snippets.

@tedshd
Created June 23, 2020 10:36
Show Gist options
  • Save tedshd/4100b1cbefa2610c6dfdbff6cd5617e1 to your computer and use it in GitHub Desktop.
Save tedshd/4100b1cbefa2610c6dfdbff6cd5617e1 to your computer and use it in GitHub Desktop.
parse
const puppeteer = require('puppeteer');
const url = require('url');
const pLimit = require('p-limit');
// Concurrency limiter allowing one task at a time; it is not referenced
// anywhere else in the visible code.
const limit = pLimit(1);
// Demo invocation that runs whenever this file is loaded. The returned promise
// is handled explicitly so that a failed crawl is reported instead of
// surfacing as an unhandled promise rejection.
crawlerIG(['joanne_722']).catch((err) => {
  console.error('crawlerIG: crawl failed', err);
});
/**
 * Crawl Instagram profile pages with a headless browser and collect basic
 * profile data for every requested account.
 *
 * Profiles are visited one after another on a single page; the browser is
 * always closed before the function settles, even when a step throws.
 *
 * @param {string|string[]} ids A single account name or an array of account
 *   names, e.g. 'joanne_722' or ['joanne_722'].
 * @returns {Promise<Object|undefined>} Resolves to an object keyed by account
 *   name, or to undefined when `ids` is missing, empty or of the wrong type.
 *   Each value has the shape
 *   {
 *     avatar: '',
 *     post_count: '',
 *     follower_count: '',
 *     recent: [''],
 *     social_name: '',
 *     private_account: false / true,
 *     social_name_exist: true,
 *   }
 *   or, when the profile could not be found,
 *   {
 *     social_name: '',
 *     social_name_exist: false,
 *   }
 */
async function crawlerIG (ids) {
  if (!ids) {
    // TODO put error log
    console.error('crawlerIG: not set IG ids');
    return;
  }

  let idList = ids;
  if (typeof idList === 'string') {
    console.log('crawlerIG: ids trans to array');
    idList = [idList];
  }
  if (!Array.isArray(idList)) {
    // TODO put error log
    console.error('crawlerIG: ids must be a string or an array');
    return;
  }
  if (!idList.length) {
    // TODO put error log
    console.error('crawlerIG: ids is empty');
    return;
  }

  const parseResult = {};
  const urlArray = idList.map((id) => 'https://www.instagram.com/' + id + '/');

  // Plain timer-based pause; replaces the deprecated page.waitFor(ms).
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Visit one profile URL and store the scraped data in parseResult[user].
  async function fetchData (page, parseUrl) {
    const path = new URL(parseUrl).pathname;
    // '/name/' -> 'name'
    const user = path.slice(1, path.length - 1);
    let privateAccount = false;
    let socialNameExist = true;

    await page.goto(parseUrl, { waitUntil: 'networkidle2' });

    // A username input means Instagram served the login form instead.
    if (await page.$('input[name=username]') !== null) {
      console.log('login');
      // Empty strings are typed into both fields, exactly as the original did.
      await page.focus('input[name=username]');
      await page.keyboard.type('');
      await page.focus('input[name=password]');
      await page.keyboard.type('');
      await sleep(1000);
      await page.click('button[type=submit]');
      await sleep(2000);
      if (await page.$('section > main > div > div > div > div > button') !== null) {
        await page.click('button[type=button]'); // first login check save account data in browser
      }
    }
    await sleep(2000);

    // No link back to the profile path: either a private account (a header
    // button is present) or the account does not exist.
    if (await page.$('a[href="' + path + '"]') == null) {
      console.log('not profile');
      const headerButton = 'section > main > div > header > section > div > button';
      if (await page.$(headerButton) !== null) {
        await page.click(headerButton); // click follow button
        privateAccount = true;
      } else {
        socialNameExist = false;
      }
    }

    if (!socialNameExist) {
      parseResult[user] = {
        social_name: user,
        social_name_exist: false,
      };
      return;
    }

    await sleep(3000);
    console.log('parse data');
    // Runs inside the page, so it cannot reference variables from this scope.
    const data = await page.evaluate(() => {
      const firstWord = (node) => (node && node.innerText.split(' ')[0]) || '';
      const avatarImg = document.querySelector('header img');
      const counters = document.querySelectorAll('header section ul li');
      // Private or empty profiles have no article, so guard every lookup.
      const grid = document.querySelectorAll('article div')[0];
      const images = grid ? grid.querySelectorAll('img') : [];
      // The original loop stopped at `sum - 1` with sum = 10, i.e. it kept 9
      // images; that cap is preserved.
      // NOTE(review): confirm whether 10 images were intended.
      const maxRecent = 9;
      const recent = [];
      for (let i = 0; i < images.length && i < maxRecent; i++) {
        recent.push(images[i].src);
      }
      return {
        'avatar': (avatarImg && avatarImg.src) || '',
        'post_count': firstWord(counters[0]),
        'follower_count': firstWord(counters[1]),
        'recent': recent
      };
    });
    data['social_name'] = user;
    data['private_account'] = privateAccount;
    data['social_name_exist'] = socialNameExist;
    parseResult[user] = data;
  }

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // UA
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:64.0) Gecko/20100101 Firefox/64.0');
    // set cookie
    // NOTE(security): a session id is hard-coded as the fallback below. Prefer
    // supplying it through the IG_SESSION_ID environment variable and remove
    // the literal once callers have migrated.
    const cookies = [{
      'name': 'sessionid',
      'value': process.env.IG_SESSION_ID || '37549417879%3AK12RRPXVjcjTz0%3A5',
      'domain': '.instagram.com'
    }];
    await page.setCookie(...cookies);
    const cookiesSet = await page.cookies(urlArray[0]);
    // Log cookie names only so the session value does not end up in logs.
    console.log('IG cookie: ' + JSON.stringify(cookiesSet.map((cookie) => cookie.name)));
    console.log('urlArray: ', urlArray);

    // Sequential on purpose: every profile reuses the same page.
    for (const profileUrl of urlArray) {
      await fetchData(page, profileUrl);
    }
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }

  console.log('IG: ', parseResult);
  return parseResult;
}
// Public API of this module: expose the crawler under the name `crawlerIG`.
module.exports.crawlerIG = crawlerIG;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment