Skip to content

Instantly share code, notes, and snippets.

@ShaunLWM
Created December 29, 2018 05:23
Show Gist options
  • Save ShaunLWM/65ec009b65463fe10d8fa45f4a48220a to your computer and use it in GitHub Desktop.
Save ShaunLWM/65ec009b65463fe10d8fa45f4a48220a to your computer and use it in GitHub Desktop.
pcgamesdownload scraper
const request = require('request');
const cheerio = require('cheerio');
const async = require('async');
const fs = require('fs');
const JobManager = require('./JobManager');
/*
 * Listing crawler.
 *
 * Walks the paginated index at https://pcgames-download.com/page/<n>/ one page
 * at a time. For every page that loads it stores the raw HTML under ./pages/
 * and hands each post URL found on the page to the JobManager queue.
 *
 * A page that fails (transport error or non-200 status) is retried after the
 * same pause used between pages, at most MAX_ATTEMPTS_PER_PAGE times, and is
 * then skipped so one dead page cannot stall the crawl in an endless,
 * un-throttled retry loop.
 */
const LAST_PAGE = 296;
const PAGE_DELAY_MS = 2000;
const MAX_ATTEMPTS_PER_PAGE = 3;

const job = new JobManager();
let page = 1;
let attempts = 0;

// writeFileSync below throws when the target directory is missing.
if (!fs.existsSync('./pages')) {
  fs.mkdirSync('./pages');
}

/**
 * Books a failed attempt for the current page, moves on to the next page once
 * the attempt budget is used up, and resumes the loop after the usual pause.
 *
 * @param {Function} callback - iteration callback supplied by async.whilst
 */
function handleFailedPage(callback) {
  attempts++;
  if (attempts >= MAX_ATTEMPTS_PER_PAGE) {
    console.log(`Giving up on page ${page} after ${attempts} attempts`);
    page++;
    attempts = 0;
  }
  setTimeout(() => callback(null), PAGE_DELAY_MS);
}

async.whilst(
  function () { return page <= LAST_PAGE; },
  function (callback) {
    const pageUrl = `https://pcgames-download.com/page/${page}/`;
    requestPage(pageUrl, (error, response, body) => {
      if (error) {
        console.log('----- ERROR ------');
        console.log(pageUrl);
        console.error(error);
        console.log('----------------');
        return handleFailedPage(callback);
      }

      if (response.statusCode !== 200) {
        console.log('----- ERROR RESPONSE ------');
        console.log(pageUrl);
        console.error(response.statusCode);
        console.log('----------------');
        return handleFailedPage(callback);
      }

      fs.writeFileSync(`./pages/${page}.html`, body);

      const $ = cheerio.load(body);
      $('.post-container').each((index, element) => {
        const url = $(element).find('.post-title > a').attr('href');
        // A post without a usable link would only produce a job that cannot run.
        if (typeof url !== 'string' || url.length === 0) {
          return;
        }
        console.log(url);
        job.addPageLink(url);
      });

      page++;
      attempts = 0;
      setTimeout(() => {
        console.log('------------------------------');
        return callback(null);
      }, PAGE_DELAY_MS);
    });
  },
  function (err, n) {
    if (err) {
      console.error(err);
    }
    console.log('Done');
  }
);
/**
 * Fetches `url` through the `request` library, sending a fixed desktop-Chrome
 * User-Agent header.
 *
 * @param {string} url - address to fetch
 * @param {Function} callback - passed straight to `request`; the callers in
 *   this file read it as (error, response, body)
 * @returns {*} whatever `request` returns
 */
function requestPage(url, callback) {
  const headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
  };

  return request({ url, headers }, callback);
}
const Queue = require('bull');
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
const Arena = require('bull-arena');
const express = require('express');
// Express application that hosts the bull-arena middleware for the
// 'links queue' queue at the root path, served on port 8081.
const app = express();

const arenaConfig = {
  queues: [
    { name: 'links queue', hostId: 'Downloader' }
  ]
};
const arena = Arena(arenaConfig);

app.use('/', arena);
app.listen(8081, function onListening() {
  console.log('>> [jm] queue server listening on port 8081');
});
/**
 * Collects the href of every anchor inside `.post` that does not point back
 * at the pcgames-download site itself.
 *
 * @param {Function} $ - cheerio instance loaded with the post page
 * @returns {string[]} external link targets in document order
 */
function collectExternalLinks($) {
  const links = [];
  $('.post').find('a').each((i, element) => {
    const href = $(element).attr('href');
    if (typeof href !== 'undefined' && href !== null && !href.startsWith('https://pcgames-download') && !href.startsWith('http://pcgames-download.net')) {
      links.push(href);
    }
  });
  return links;
}

/**
 * Picks the file name (without extension) used to store a post page.
 *
 * Uses the third path segment, which is what the original code relied on
 * (presumably URLs shaped like /<a>/<b>/<slug>/ — TODO confirm against the
 * site). When that segment is missing or empty it falls back to the last
 * non-empty segment so pages are not all written to "undefined.html".
 *
 * @param {string} url - absolute URL of the post
 * @returns {string} name to use for the saved HTML file
 */
function pageFileName(url) {
  const segments = new URL(url).pathname.split('/');
  return segments[3] || segments.filter(Boolean).pop() || 'index';
}

/**
 * Appends `record` to the JSON array stored in ./links.json. A missing file
 * is treated as an empty list; any other read/parse/write failure is thrown.
 *
 * @param {{title: string, links: string[]}} record - entry to append
 */
function appendLinkRecord(record) {
  let records = [];
  try {
    records = JSON.parse(fs.readFileSync('./links.json', 'utf8'));
  } catch (err) {
    if (err.code !== 'ENOENT') {
      throw err;
    }
  }
  records.push(record);
  fs.writeFileSync('./links.json', JSON.stringify(records, null, 2));
}

/**
 * Parses a fetched post page and, when it contains at least one external
 * link, saves the HTML under ./pages/individual/ and records the title and
 * links in ./links.json.
 *
 * @param {string} url - URL the page was fetched from
 * @param {string} body - HTML of the page
 */
function storePost(url, body) {
  const $ = cheerio.load(body);
  const title = $('.post-title').text();
  const links = collectExternalLinks($);
  if (links.length === 0) {
    return;
  }
  fs.writeFileSync(`./pages/individual/${pageFileName(url)}.html`, body);
  appendLinkRecord({ title, links });
}

/**
 * Owns the Bull queue named 'links queue' and registers its processor: each
 * job carries a post URL in `job.data.url`, which is fetched and stored.
 *
 * Failures (transport error, non-200 status, or an exception while storing
 * the page) are passed to Bull's `done` callback so the job is recorded as
 * failed instead of being reported as a success or crashing the process.
 */
class JobManager {
  constructor() {
    this.pageQueue = new Queue('links queue');
    this.pageQueue.process((job, done) => {
      const { url } = job.data;
      console.log(`Processing: ${url}`);
      this.requestPage(url, (error, response, body) => {
        if (error) {
          console.log(url);
          console.error(error);
          return done(error);
        }

        if (response.statusCode !== 200) {
          console.log(url);
          console.error(`statusCode ${response.statusCode}`);
          return done(new Error(`statusCode ${response.statusCode} for ${url}`));
        }

        try {
          storePost(url, body);
        } catch (err) {
          // An exception thrown here would otherwise escape the request
          // callback and leave the job without a completion signal.
          console.log(url);
          console.error(err);
          return done(err);
        }

        return done();
      });
    });
  }

  /**
   * Queues a post URL for processing.
   *
   * @param {string} url - absolute URL of a post page
   */
  addPageLink(url) {
    this.pageQueue.add({ url });
  }

  /**
   * Fetches `url` through the `request` library with a fixed desktop-Chrome
   * User-Agent header.
   *
   * @param {string} url - address to fetch
   * @param {Function} callback - passed straight to `request`; read above as
   *   (error, response, body)
   * @returns {*} whatever `request` returns
   */
  requestPage(url, callback) {
    const options = {
      url,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
      }
    };
    return request(options, callback);
  }
}
module.exports = JobManager;
{
"name": "pcgames-download",
"version": "1.0.0",
"main": "index.js",
"license": "MIT",
"dependencies": {
"async": "^2.6.1",
"bull": "^3.5.2",
"bull-arena": "^2.5.2",
"cheerio": "^1.0.0-rc.2",
"request": "^2.88.0"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment