Skip to content

Instantly share code, notes, and snippets.

@SmartHypercube
Last active June 2, 2021 03:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SmartHypercube/49e1ef69076e67e7b64d44cd963c21b4 to your computer and use it in GitHub Desktop.
Save SmartHypercube/49e1ef69076e67e7b64d44cd963c21b4 to your computer and use it in GitHub Desktop.
#!/usr/bin/node
// 作为脚本时需要上面这行
// 用于 AWS Lambda 时不需要
'use strict';
// 用于 AWS Lambda 时需要下面这些库,以及添加 Layer: arn:aws:lambda:eu-west-2:764866452798:layer:chrome-aws-lambda:24
// 这个 Layer 的最新版本见 https://github.com/shelfio/chrome-aws-lambda-layer
const crypto = require('crypto');
const chromium = require('chrome-aws-lambda');
const aws = require('aws-sdk');
const s3 = new aws.S3();
// 作为脚本时需要下面这些库,其中 puppeteer 我只测试过 2.1.1 版本
const puppeteer = require('puppeteer');
const fs = require('fs');
// Adapted from https://github.com/puppeteer/puppeteer/issues/305#issuecomment-385145048
/**
 * Scrolls the page step by step from the top to the bottom, then jumps back
 * to the top. Presumably this is done so that lazily loaded content (e.g.
 * images) gets a chance to load before a snapshot is taken.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @returns {Promise<void>} Resolves once the bottom has been reached, or
 *   after 40 seconds, whichever comes first.
 */
async function scroll(page) {
  // The callback runs inside the page, so it must not reference any
  // variable from this file's scope.
  await page.evaluate(async () => {
    await new Promise((resolve) => {
      let pos = 0;
      function end() {
        // Stop both timers and go back to the top before returning.
        clearInterval(timer1);
        clearTimeout(timer2);
        window.scrollTo(0, 0);
        resolve();
      }
      // Scroll 200px every 0.1s until the bottom is reached. Both numbers
      // come from experiments, e.g. with images in WeChat articles.
      const timer1 = setInterval(() => {
        pos += 200;
        if (pos > document.body.scrollHeight) {
          end();
          // Do not fall through: scrolling to `pos` here would undo the
          // scroll-to-top that end() has just performed.
          return;
        }
        window.scrollTo(0, pos);
      }, 100);
      // Stop unconditionally after 40s:
      // 40s = AWS Lambda limit 60s - 10s of waiting around the page load
      //       - 10s safety margin.
      const timer2 = setTimeout(end, 40000);
    });
  });
}
/**
 * Opens `url` in a new page of `browser`, waits, scrolls through the page,
 * waits again and captures the page as an MHTML snapshot.
 *
 * @param {object} browser - Puppeteer browser instance (uses `newPage`).
 * @param {string} url - Address of the page to capture.
 * @returns {Promise<{title: string, data: string}>} The cleaned-up page title
 *   (falling back to "webpage" when empty) and the `data` field of the
 *   `Page.captureSnapshot` result.
 */
async function main(browser, url) {
  const sleep = (ms) => new Promise((done) => setTimeout(done, ms));

  const page = await browser.newPage();
  await page.goto(url);

  // Fixed 5s pauses before and after scrolling.
  await sleep(5000);
  await scroll(page);
  await sleep(5000);

  const session = await page.target().createCDPSession();
  const data = (await session.send('Page.captureSnapshot', {format: 'mhtml'})).data;

  // Replace '/', NUL and newline characters in the title; presumably so the
  // title is usable as a file name (callers append '.mhtml' to it).
  const rawTitle = await page.title();
  const title = rawTitle
    .replace(/\//g, '|')
    .replace(/\0/g, ' ')
    .replace(/\n/g, ' ') || "webpage";

  return {title, data};
}
// 用于 AWS Lambda 时需要配置 API Gateway 触发器,POST 请求,正文示例:
// {"url": "https://example.com/"}
// 响应示例:
// {"name": "Example", "https://0x01-mhtml-temp.s3.eu-west-2.amazonaws.com/12345678"}
exports.handler = async (event, context) => {
let browser = null;
try {
browser = await chromium.puppeteer.launch({
args: chromium.args.concat([
'--disable-file-system',
'--window-size=1024,768',
]),
executablePath: await chromium.executablePath,
headless: chromium.headless,
});
{title, data} = await main(browser, event.url);
// 用随机文件名保存到 AWS S3 并返回 URL,注意配置生命周期实现一段时间后自动删除
let nonce = crypto.randomBytes(16).hexSlice();
await s3.putObject({
Bucket: '0x01-mhtml-temp',
Key: nonce,
Body: data,
}).promise();
let url = 'https://0x01-mhtml-temp.s3.eu-west-2.amazonaws.com/';
url += nonce;
let name = title + '.mhtml';
return context.succeed({name, url});
} catch (error) {
return context.fail(error);
} finally {
if (browser) {
await browser.close();
}
}
};
// When run as a script: reads the URL from standard input, writes the page
// title followed by a newline, then the whole MHTML content, to standard
// output. Guarded so that it does not run when this file is loaded as a
// module (e.g. by AWS Lambda).
if (require.main === module) {
  (async () => {
    let browser = null;
    try {
      // Assign to the outer `browser` (no re-declaration) so the `finally`
      // block below can actually see and close it.
      browser = await puppeteer.launch({
        args: [
          '--no-sandbox',
          '--disable-file-system',
          '--window-size=1024,768',
        ],
      });
      // Drop the trailing newline (and other surrounding whitespace) that
      // typically comes with piped or typed input.
      const url = fs.readFileSync(0, 'utf-8').trim();
      const {title, data} = await main(browser, url);
      process.stdout.write(title);
      process.stdout.write('\n');
      process.stdout.write(data);
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  })().catch((error) => {
    // Report the failure and exit non-zero instead of leaving the promise
    // rejection unhandled.
    console.error(error);
    process.exitCode = 1;
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment