Skip to content

Instantly share code, notes, and snippets.

@jenhacool
Created February 25, 2021 09:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jenhacool/58dd215f70f3743821e3ce58d3e589a2 to your computer and use it in GitHub Desktop.
Save jenhacool/58dd215f70f3743821e3ce58d3e589a2 to your computer and use it in GitHub Desktop.
Crawler
// HTML parsing library (not referenced by the code visible in this file).
import * as cheerio from 'cheerio'
// Bull queue constructor; Crawler instantiates one queue per crawl stage.
const Queue = require('bull')
// HTTP client used for the 2captcha API calls and the final page request.
const axios = require('axios')
// Base URL of the crawled site (identifier spelling kept: it is mirrored by
// the public `Crawler.doimain` field).
const doimain = 'http://www.findglocal.com'
// Redis connection URL handed to every Bull queue.
const redisHost = 'redis://127.0.0.1:6379'
// Browser automation library used to load pages and submit the captcha form.
const puppeteer = require('puppeteer')
// Promise-returning setTimeout: `await sleep(ms)` pauses for `ms` milliseconds.
const sleep = require('util').promisify(setTimeout)
/**
 * Crawler that loads pages in a Puppeteer-driven browser, submits an hCaptcha
 * answer obtained from the 2captcha service, and then re-requests the page
 * with axios using the browser session's cookies.
 *
 * The constructor resolves the database models and creates one Bull queue per
 * crawl stage; `start()` is invoked at the end of construction.
 */
export default class Crawler {
  // Database handle and the models resolved from it in the constructor.
  public db: any;
  public db_country: any;
  public db_city: any;
  public db_business: any;
  public db_category: any;
  public db_post: any;

  // Bull queues, all created against the module-level `redisHost`.
  public countryQue: any;
  public cityQue: any;
  public characterQue: any;
  public paginationQue: any;
  public categoryQue: any;
  public categoryQueReset: any;
  public dbBusinessQue: any;
  public dbCityQue: any;
  public postQue: any;

  // Copies of the module-level site base URL and Redis URL.
  public doimain: string;
  public redisHost: string;

  // Declared but never assigned in the code visible here.
  public requestSuccess: boolean;
  public timeout: any;

  public pageQue: any;
  public slugQue: any;

  // 2captcha response body returned while a captcha is still being solved
  // (the spelling is the service's own).
  private static readonly CAPTCHA_NOT_READY = 'CAPCHA_NOT_READY';

  // Delay between two polls of the 2captcha result endpoint, in milliseconds.
  private static readonly CAPTCHA_POLL_INTERVAL_MS = 5000;

  // 2captcha API key, overridable through the TWOCAPTCHA_API_KEY environment
  // variable. The fallback is the key that used to be hard-coded at both call
  // sites; replace it with your real key.
  // NOTE(review): secrets should not live in source control — move this to
  // configuration and rotate the key.
  private captchaApiKey: string =
    process.env.TWOCAPTCHA_API_KEY || '50986ec6209e57b3a5919215d6cc78f4';

  /**
   * @param _db Database wrapper exposing `getModel(name)`.
   */
  constructor(_db) {
    this.db = _db;
    this.db_country = this.db.getModel('Country');
    this.db_city = this.db.getModel('City');
    this.db_business = this.db.getModel('Business');
    this.db_category = this.db.getModel('Category');
    this.db_post = this.db.getModel('Post');

    // Note: the country queue is registered under the name 'LinkQue'.
    this.countryQue = new Queue('LinkQue', redisHost);
    this.cityQue = new Queue('CityQue', redisHost);
    this.characterQue = new Queue('CharacterQue', redisHost);
    this.paginationQue = new Queue('PaginationQue', redisHost);
    this.categoryQue = new Queue('CategoryQue', redisHost);
    // Second handle created with the same queue name as `categoryQue`.
    this.categoryQueReset = new Queue('CategoryQue', redisHost);
    this.postQue = new Queue('PostQue', redisHost);
    this.dbBusinessQue = new Queue('dbBusinessQue', redisHost);
    this.dbCityQue = new Queue('dbCityQue', redisHost);
    this.pageQue = new Queue('pageQue', redisHost);
    this.slugQue = new Queue('slugQue', redisHost);

    this.doimain = doimain;
    this.redisHost = redisHost;

    // `resetQue()` is deliberately not called here; the original author left
    // that call disabled.
    this.start();
  }

  /**
   * Registers a processor on `dbBusinessQue` that completes every job
   * immediately without doing any work. An equivalent processor for
   * `categoryQueReset` was left disabled by the original author.
   */
  public resetQue() {
    this.dbBusinessQue.process(async (job, done) => {
      done();
    });
  }

  /** Entry point invoked by the constructor; currently does nothing. */
  public start() {}

  /**
   * Asks 2captcha once for the result of a previously submitted captcha.
   *
   * @param id Captcha id returned by the 2captcha submit endpoint.
   * @returns The raw response body when it contains `OK|`.
   * @throws Error whose message is the raw response body when the answer is
   *   not ready or the service reported an error; network errors raised by
   *   axios propagate unchanged.
   */
  public async sendRequest(id): Promise<string> {
    const endpoint = new URL('https://2captcha.com/res.php');
    endpoint.searchParams.set('key', this.captchaApiKey);
    endpoint.searchParams.set('action', 'get');
    endpoint.searchParams.set('id', String(id));

    // A plain `await` (instead of an async Promise executor) guarantees that
    // a failed HTTP call rejects this promise rather than leaving it pending.
    const response = await axios.get(endpoint.toString());
    console.log(response.data);

    const body = response.data;
    if (typeof body === 'string' && body.includes('OK|')) {
      return body;
    }
    throw new Error(String(body));
  }

  /**
   * Polls 2captcha until the captcha identified by `id` is solved.
   *
   * @param id Captcha id returned by the 2captcha submit endpoint.
   * @param maxAttempts Upper bound on polls (one every 5 seconds) before
   *   giving up; defaults to 60.
   * @returns The raw `OK|<token>` response body.
   * @throws Error when the service reports anything other than "not ready",
   *   when the HTTP call fails, or when `maxAttempts` polls are exhausted.
   */
  public async getToken(id, maxAttempts = 60): Promise<string> {
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        return await this.sendRequest(id);
      } catch (err) {
        // Only "not ready yet" is worth retrying; permanent service errors
        // and network failures would otherwise loop forever.
        const isNotReady =
          err instanceof Error && err.message === Crawler.CAPTCHA_NOT_READY;
        if (!isNotReady) {
          throw err;
        }
      }
      // If captcha is not ready, try again after 5 seconds.
      await sleep(Crawler.CAPTCHA_POLL_INTERVAL_MS);
    }
    throw new Error(
      `2captcha did not solve captcha ${id} within ${maxAttempts} attempts`,
    );
  }

  /**
   * Submits an hCaptcha solve request for `pageUrl` to 2captcha.
   *
   * @returns The captcha id to poll with `getToken`.
   * @throws Error when the service does not answer `OK|<id>`.
   */
  private async requestCaptchaSolve(pageUrl: string): Promise<string> {
    const endpoint = new URL('https://2captcha.com/in.php');
    endpoint.searchParams.set('key', this.captchaApiKey);
    endpoint.searchParams.set('method', 'hcaptcha');
    endpoint.searchParams.set('sitekey', '33f96e6a-38cd-421b-bb68-7806e1764460');
    // searchParams percent-encodes the target URL, so its own `?`/`&` cannot
    // corrupt the 2captcha query string.
    endpoint.searchParams.set('pageurl', String(pageUrl));

    const response = await axios.get(endpoint.toString());
    const body = String(response.data);
    const [status, id] = body.split('|');
    if (status !== 'OK' || !id) {
      throw new Error(`2captcha rejected the solve request: ${body}`);
    }
    return id;
  }

  /**
   * Writes the solved token into the challenge form, submits it and waits for
   * the resulting navigation to settle.
   *
   * @throws Error when the challenge form is not present on the page.
   */
  private async submitCaptchaToken(page: any, token: string): Promise<void> {
    const submitForm = async () => {
      await page.evaluate((captchaToken) => {
        const textarea: any = document.querySelector('textarea[name="h-captcha-response"]');
        const form: any = document.querySelector('form.challenge-form');
        if (!textarea || !form) {
          throw new Error('hCaptcha challenge form not found on page');
        }
        textarea.value = captchaToken;
        form.submit();
      }, token);
      console.log('clicked');
    };

    // The navigation waiter must be registered before the submit triggers the
    // navigation, otherwise a fast navigation can be missed and the wait
    // stalls until it times out.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle0' }),
      submitForm(),
    ]);
  }

  /**
   * Loads `url` in a browser, passes the hCaptcha challenge using 2captcha,
   * then fetches the page again with axios using the browser's cookies.
   *
   * The rendered HTML could alternatively be read straight from the Puppeteer
   * page; this method replays the request with axios so callers keep
   * receiving an axios response.
   *
   * @param url Page to fetch.
   * @returns The axios response on HTTP 200, otherwise `undefined` (failures
   *   are logged, not thrown).
   */
  public async fetchData(url) {
    let browser: any;
    try {
      browser = await puppeteer.launch({
        headless: false,
        defaultViewport: null,
      });
      const page = await browser.newPage();

      // The solve request is sent before the page is opened, as in the
      // original flow.
      const id = await this.requestCaptchaSolve(url);
      await page.goto(url);
      // Fixed pause kept from the original flow; presumably gives the
      // challenge page time to render — TODO confirm.
      await sleep(5000);

      const answer = await this.getToken(id);
      const token = answer.split('|')[1];
      await this.submitCaptchaToken(page, token);

      await page.waitForSelector('#logo');
      console.log('loaded');

      // Reuse the session that just passed the challenge: the axios request
      // must carry the cookies obtained by the browser.
      const cookies = await page.cookies();
      const cookieHeader = cookies
        .map((cookie: any) => `${cookie.name}=${cookie.value}`)
        .join('; ');
      // NOTE(review): clearance cookies are presumably tied to the
      // User-Agent that earned them, so the browser's own value is sent —
      // confirm against the site's behaviour.
      const userAgent = await browser.userAgent();

      const response = await axios({
        url: url,
        // `headers` are custom headers to be sent
        headers: {
          "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8,vi;q=0.7",
          "Cache-Control": "max-age=0",
          "Connection": "keep-alive",
          "Cookie": cookieHeader,
          "Host": "www.findglocal.com",
          // NOTE(review): a fixed If-Modified-Since may yield a 304 with no
          // body, which axios rejects by default — consider dropping it.
          "If-Modified-Since": "Wed, 04 Nov 2020 15:20:01 GMT",
          "Upgrade-Insecure-Requests": "1",
          "User-Agent": userAgent,
        },
      });

      if (response.status !== 200) {
        console.log("Error occurred while fetching data");
        return undefined;
      }
      return response;
    } catch (e) {
      // Keep the best-effort contract (no throw) but surface the cause.
      console.log('Error occurred while fetching data', e);
      return undefined;
    } finally {
      // Always release the browser process, on success and on failure.
      if (browser) {
        await browser
          .close()
          .catch((closeErr) => console.log('Failed to close browser', closeErr));
      }
    }
  }

  /**
   * Logs the text of the page heading.
   * @param $ Cheerio-style selector function for the loaded document.
   */
  public _logTitle($) {
    const title = $('.headerrow > .col-xs-12 > h1');
    console.log('Find :', title.text());
  }

  /**
   * @param $ Cheerio-style selector function for the loaded document.
   * @returns The anchors located directly in the header row.
   */
  public _findCharactersLink($) {
    return $('.headerrow > .col-xs-12 > a');
  }

  /**
   * @param $ Cheerio-style selector function for the loaded document.
   * @returns The anchors inside the last pagination row of the page.
   */
  public _findPaginationLink($) {
    const pagination = $('.paginationrow > .col-xs-12').last();
    return pagination.find('a');
  }

  /**
   * @param $ Cheerio-style selector function for the loaded document.
   * @returns The `.townlist` elements that are direct children of a `.row`.
   */
  public _findCities($) {
    return $('.row > .townlist');
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment