Created
February 25, 2021 09:52
-
-
Save jenhacool/58dd215f70f3743821e3ce58d3e589a2 to your computer and use it in GitHub Desktop.
Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// HTML parser (jQuery-like API).
import * as cheerio from 'cheerio';
// Redis-backed job queue.
const Queue = require('bull');
// HTTP client.
const axios = require('axios');
// Base URL of the crawled site. NOTE: the identifier is spelled "doimain" (sic);
// the Crawler class below references it under this exact name, so it is kept.
const doimain = 'http://www.findglocal.com';
// Redis connection string passed to every Bull queue created by Crawler.
const redisHost = 'redis://127.0.0.1:6379';
// Browser automation, used to get through the captcha challenge page.
const puppeteer = require('puppeteer');
// Promise-returning setTimeout, used as `await sleep(ms)`.
const sleep = require('util').promisify(setTimeout);
/**
 * Crawler: owns the database models and the Bull queues used by the crawl,
 * and knows how to fetch a page that sits behind an hCaptcha challenge by
 * solving the challenge through the 2captcha HTTP API.
 */
export default class Crawler {
  /** Database handle given to the constructor; must expose `getModel(name)`. */
  public db: any;
  /** Models obtained from `db.getModel(...)`. */
  public db_country: any;
  public db_city: any;
  public db_business: any;
  public db_category: any;
  public db_post: any;
  /** Bull queues, all created against `redisHost`. */
  public countryQue: any;
  public cityQue: any;
  public characterQue: any;
  public paginationQue: any;
  public categoryQue: any;
  public categoryQueReset: any;
  public dbBusinessQue: any;
  public dbCityQue: any;
  public postQue: any;
  /** Copies of the module-level settings ("doimain" spelling kept for callers). */
  public doimain: string;
  public redisHost: string;
  /** Declared for callers; not assigned anywhere in this class. */
  public requestSuccess: boolean;
  public timeout: any;
  public pageQue: any;
  public slugQue: any;

  /**
   * @param _db database handle exposing `getModel(name)`.
   */
  constructor(_db) {
    this.db = _db;

    this.db_country = this.db.getModel('Country');
    this.db_city = this.db.getModel('City');
    this.db_business = this.db.getModel('Business');
    this.db_category = this.db.getModel('Category');
    this.db_post = this.db.getModel('Post');

    // NOTE(review): countryQue is registered under the name 'LinkQue' - confirm intended.
    this.countryQue = new Queue('LinkQue', redisHost);
    this.cityQue = new Queue('CityQue', redisHost);
    this.characterQue = new Queue('CharacterQue', redisHost);
    this.paginationQue = new Queue('PaginationQue', redisHost);
    this.categoryQue = new Queue('CategoryQue', redisHost);
    // Same queue name as categoryQue, so both handles address 'CategoryQue'.
    this.categoryQueReset = new Queue('CategoryQue', redisHost);
    this.postQue = new Queue('PostQue', redisHost);
    this.dbBusinessQue = new Queue('dbBusinessQue', redisHost);
    this.dbCityQue = new Queue('dbCityQue', redisHost);
    this.pageQue = new Queue('pageQue', redisHost);
    this.slugQue = new Queue('slugQue', redisHost);

    this.doimain = doimain;
    this.redisHost = redisHost;

    // this.resetQue()
    this.start();
  }

  /**
   * Registers a processor on dbBusinessQue that completes every job
   * immediately without doing any work. The equivalent processor for
   * categoryQueReset is currently disabled.
   */
  public resetQue() {
    // this.categoryQueReset.process(async(job, done) => {
    //   done()
    // })
    this.dbBusinessQue.process(async (job, done) => {
      done();
    });
  }

  /** Entry point invoked at the end of the constructor; intentionally empty here. */
  public start() {}

  /**
   * Polls 2captcha once for the result of a previously submitted captcha.
   *
   * @param id captcha id returned by 2captcha's in.php.
   * @returns the raw reply (contains `OK|`) when the captcha is solved.
   * @throws Error when the request fails or the reply is not an OK reply; for
   *         non-OK replies the raw reply is attached as `captchaResponse`.
   */
  public async sendRequest(id) {
    // A plain async function is used (not `new Promise(async ...)`) so that a
    // failing HTTP call rejects instead of leaving the promise pending forever.
    // NOTE(review): the API key is a credential hard-coded in source; move to configuration.
    const response = await axios.get(`https://2captcha.com/res.php?key=50986ec6209e57b3a5919215d6cc78f4&action=get&id=${id}`);
    console.log(response.data);

    const reply = response.data;
    if (typeof reply === 'string' && reply.includes('OK|')) {
      return reply;
    }

    const failure: any = new Error(`2captcha result not available: ${reply}`);
    failure.captchaResponse = reply;
    throw failure;
  }

  /**
   * Waits until 2captcha has solved the captcha and returns the raw reply.
   * Retries every 5 seconds while the result is not ready (or the poll itself
   * failed), but stops as soon as 2captcha answers with an `ERROR...` reply,
   * because those are permanent and would otherwise be retried forever.
   *
   * @param id captcha id returned by 2captcha's in.php.
   * @returns the raw `OK|<token>` reply.
   * @throws Error carrying `captchaResponse` when 2captcha reports an error.
   */
  public async getToken(id) {
    // Loop instead of recursing so long waits do not build an ever-deeper promise chain.
    for (;;) {
      try {
        return await this.sendRequest(id);
      } catch (err) {
        const reply = err && err.captchaResponse;
        if (typeof reply === 'string' && reply.startsWith('ERROR')) {
          throw err;
        }
        // If captcha is not ready, try again after 5 seconds
        await sleep(5000);
      }
    }
  }

  /**
   * Fetches `url` after getting through its hCaptcha challenge page.
   *
   * Flow: submit the challenge to 2captcha, open the page in a browser, wait
   * for the solved token, inject it into the challenge form, then repeat the
   * request with axios using the cookies of the now-cleared browser session.
   *
   * @param url page to fetch.
   * @returns the axios response on HTTP 200, otherwise `undefined` (the
   *          failure is logged).
   */
  public async fetchData(url) {
    let browser;
    try {
      browser = await puppeteer.launch({
        headless: false,
        defaultViewport: null,
      });
      const page = await browser.newPage();

      // NOTE: replace the key below with your real 2captcha API key.
      // NOTE(review): the key is a credential hard-coded in source; move to configuration.
      // The page URL is encoded so that its own query string cannot corrupt this request.
      const submitted = await axios.get(`https://2captcha.com/in.php?key=50986ec6209e57b3a5919215d6cc78f4&method=hcaptcha&sitekey=33f96e6a-38cd-421b-bb68-7806e1764460&pageurl=${encodeURIComponent(url)}`);
      const [submitStatus, id] = String(submitted.data).split('|');
      if (submitStatus !== 'OK' || !id) {
        throw new Error(`2captcha rejected the captcha task: ${submitted.data}`);
      }

      await page.goto(url);
      await sleep(5000);

      const solved = await this.getToken(id);
      const token = solved.split('|')[1];

      // Start waiting for the navigation before submitting the form; registering
      // the wait afterwards can miss a navigation that has already begun.
      // Assumes the challenge form is present on the page; if it is not, the
      // evaluate call throws and the failure is reported by the catch below.
      await Promise.all([
        page.waitForNavigation({ waitUntil: 'networkidle0' }),
        page.evaluate((solvedToken) => {
          const textarea: any = document.querySelector('textarea[name="h-captcha-response"]');
          textarea.value = solvedToken;
          const form: any = document.querySelector('form.challenge-form');
          form.submit();
        }, token),
      ]);
      console.log('clicked');

      await page.waitForSelector('#logo');
      console.log('loaded');

      // Re-issue the request with axios, sending the cookies of the cleared
      // browser session rather than a stale hard-coded Cookie header.
      const sessionCookies = await page.cookies();
      const cookieHeader = sessionCookies
        .map((cookie) => `${cookie.name}=${cookie.value}`)
        .join('; ');
      // Presumably the clearance is tied to the browser's user agent, so the
      // same one is sent - TODO confirm against the target site.
      const userAgent = await browser.userAgent();

      const response = await axios({
        url: url,
        // `headers` are custom headers to be sent.
        // No If-Modified-Since: a 304 reply has no body and axios rejects it.
        headers: {
          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,vi;q=0.7",
          "Cache-Control": "max-age=0",
          "Connection": "keep-alive",
          "Cookie": cookieHeader,
          "Host": "www.findglocal.com",
          "Upgrade-Insecure-Requests": "1",
          "User-Agent": userAgent,
        },
      });

      if (response.status !== 200) {
        console.log("Error occurred while fetching data");
        return undefined;
      }
      return response;
    } catch (e) {
      // Keep the original message but also log the cause instead of discarding it.
      console.log('Error occurred while fetching data', e);
      return undefined;
    } finally {
      // Always release the browser; previously every call leaked a Chromium instance.
      if (browser) {
        try {
          await browser.close();
        } catch (closeError) {
          console.log('Error occurred while closing browser', closeError);
        }
      }
    }
  }

  /**
   * Logs the text of the page's header title.
   * @param $ cheerio-style selector function for a loaded document.
   */
  public _logTitle($) {
    const heading = $('.headerrow > .col-xs-12 > h1');
    console.log('Find :', heading.text());
  }

  /**
   * @param $ cheerio-style selector function for a loaded document.
   * @returns the anchors located directly in the header row.
   */
  public _findCharactersLink($) {
    return $('.headerrow > .col-xs-12 > a');
  }

  /**
   * @param $ cheerio-style selector function for a loaded document.
   * @returns the anchors inside the last pagination row of the document.
   */
  public _findPaginationLink($) {
    const lastPaginationRow = $('.paginationrow > .col-xs-12').last();
    return lastPaginationRow.find('a');
  }

  /**
   * @param $ cheerio-style selector function for a loaded document.
   * @returns the `.townlist` elements that are direct children of a `.row`.
   */
  public _findCities($) {
    return $('.row > .townlist');
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment