Skip to content

Instantly share code, notes, and snippets.

@GerardBouchar
Last active Oct 31, 2019
Embed
What would you like to do?

chrome-headless-fetch-charset.js

Fetches pages using a headless Chromium browser and returns the charset detected for each one.

Input file format

Text file, one URL per line

Output file format

JSON lines. Text file with one JSON entry per line.

{"url": "...", "charset": "..."}
{"url": "...", "charset": "..."}
...

Usage

In the base directory:

npm start < input_file.txt > output_file.jsonl
const puppeteer = require('puppeteer');
const fs = require('fs');
const proxy = process.env.PROXY;
const BATCH_SIZE = 64;
const TIMEOUT = 5*1000;
const USER_AGENT = process.env.USER_AGENT;
const BLOCK = ['.jpg', '.png', '.gif', '.css', '.js'];
async function getBrowser() {
return puppeteer.launch({
args:
proxy ? ['--proxy-server=' + proxy,] : []
});
}
function log(msg) {
process.stderr.write(msg + '\n');
}
async function preparePage(ctx) {
const page = await ctx.newPage();
if (USER_AGENT) await page.setUserAgent(USER_AGENT);
await page.setJavaScriptEnabled(false);
await page.setRequestInterception(true);
page.on('request', req => {
if (BLOCK.some(x=>req.url().endsWith(x))) req.abort();
else req.continue();
});
return page;
}
async function fetchUrl(ctx, url) {
let charset = 'unknown';
let page = null;
try {
page = await preparePage(ctx);
await page.goto(url, {
timeout: TIMEOUT,
waitUntil: 'domcontentloaded',
});
charset = await page.evaluate('document.characterSet');
} catch (err) {
log(`${url} : ${err}`);
}
if(page) await page.close();
return {url, charset};
}
function* getBatch(ctx, urls) {
yield* urls.map(url => fetchUrl(ctx, url));
}
function* makeBatches(list, size) {
const ret = [];
for(const elem of list) {
ret.push(elem);
if(ret.length == size) {
yield ret;
ret.length = 0;
}
}
if (ret.length > 0) yield ret;
}
// read an array of non-empty strings
function readList(file=0) {
return new Promise((resolve, fail) => {
fs.readFile(0, 'utf8', (err, str) => {
if(err) return fail(err);
lines = str.split('\n').filter(x=>x);
resolve(lines);
});
});
}
function printResult(res) {
console.log(JSON.stringify(res));
}
async function main() {
const browserP = getBrowser();
const urlsP = readList();
const urls = await urlsP;
const browser = await browserP;
let done = 0;
let toDo = urls.length;
for(const batch of makeBatches(urls, BATCH_SIZE)) {
const ctx = await browser.createIncognitoBrowserContext();
const results = getBatch(ctx, batch);
for(const res of results) {
printResult(await res);
done++;
log(`${done}/${toDo} : ${100*done/toDo|0}%`);
}
let pages = await browser.pages();
log(`${pages.length} opened pages: ${pages.map(p=>p.url())}`);
await ctx.close();
}
await browser.close();
log('done');
}
main();
{
"name": "charset-detect",
"version": "1.0.0",
"description": "Detect charset using a headless chrome",
"main": "chrome-headless-fetch-charset.js",
"dependencies": {
"puppeteer": "^1.5.0"
},
"devDependencies": {},
"scripts": {
"start": "node chrome-headless-fetch-charset.js",
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment