Last active
June 2, 2021 03:52
-
-
Save SmartHypercube/49e1ef69076e67e7b64d44cd963c21b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/node | |
// 作为脚本时需要上面这行 | |
// 用于 AWS Lambda 时不需要 | |
'use strict'; | |
// 用于 AWS Lambda 时需要下面这些库,以及添加 Layer: arn:aws:lambda:eu-west-2:764866452798:layer:chrome-aws-lambda:24 | |
// 这个 Layer 的最新版本见 https://github.com/shelfio/chrome-aws-lambda-layer | |
const crypto = require('crypto'); | |
const chromium = require('chrome-aws-lambda'); | |
const aws = require('aws-sdk'); | |
const s3 = new aws.S3(); | |
// 作为脚本时需要下面这些库,其中 puppeteer 我只测试过 2.1.1 版本 | |
const puppeteer = require('puppeteer'); | |
const fs = require('fs'); | |
// https://github.com/puppeteer/puppeteer/issues/305#issuecomment-385145048 | |
/**
 * Scrolls the page from top to bottom in small steps, then scrolls back to
 * the top, so that content which only loads when scrolled into view (the
 * original author tested against images in WeChat articles) has been loaded.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @returns {Promise<void>} Resolves once the in-page scrolling has finished.
 */
async function scroll(page) {
  await page.evaluate(async () => {
    await new Promise((resolve) => {
      // Stop both timers, go back to the top of the page and finish.
      function end() {
        clearInterval(timer1);
        clearTimeout(timer2);
        window.scrollTo(0, 0);
        resolve();
      }
      let pos = 0;
      // Scroll 200px every 0.1s until the bottom is reached; both numbers
      // were found by experiment.
      const timer1 = setInterval(() => {
        pos += 200;
        if (pos > document.body.scrollHeight) {
          end();
          // Must not fall through: scrolling to `pos` here would undo the
          // scroll-to-top that end() just performed.
          return;
        }
        window.scrollTo(0, pos);
      }, 100);
      // Unconditionally stop after 40s: 60s AWS Lambda limit, minus 10s of
      // waiting before/after scrolling, minus a 10s safety margin.
      const timer2 = setTimeout(end, 40000);
    });
  });
}
/**
 * Opens `url` in a new page, waits, scrolls through it, waits again and
 * captures the page as an MHTML snapshot via the DevTools protocol.
 *
 * @param {object} browser - Puppeteer browser instance.
 * @param {string} url - Address of the page to capture.
 * @returns {Promise<{title: string, data: string}>} The page title made safe
 *   for use as a file name ("webpage" when empty) and the MHTML snapshot.
 */
async function main(browser, url) {
  const sleep = (ms) => new Promise((done) => setTimeout(done, ms));

  const page = await browser.newPage();
  await page.goto(url);

  // Give the page 5s to settle before and after scrolling through it.
  await sleep(5000);
  await scroll(page);
  await sleep(5000);

  const session = await page.target().createCDPSession();
  const {data} = await session.send('Page.captureSnapshot', {format: 'mhtml'});

  // Replace characters that cannot appear in a file name or a one-line title.
  const rawTitle = await page.title();
  const cleaned = rawTitle
    .replace(/\//g, '|')
    .replace(/\0/g, ' ')
    .replace(/\n/g, ' ');
  const title = cleaned || "webpage";

  return {title, data};
}
// 用于 AWS Lambda 时需要配置 API Gateway 触发器,POST 请求,正文示例: | |
// {"url": "https://example.com/"} | |
// 响应示例: | |
// {"name": "Example.mhtml", "url": "https://0x01-mhtml-temp.s3.eu-west-2.amazonaws.com/12345678"} | |
exports.handler = async (event, context) => { | |
let browser = null; | |
try { | |
browser = await chromium.puppeteer.launch({ | |
args: chromium.args.concat([ | |
'--disable-file-system', | |
'--window-size=1024,768', | |
]), | |
executablePath: await chromium.executablePath, | |
headless: chromium.headless, | |
}); | |
{title, data} = await main(browser, event.url); | |
// 用随机文件名保存到 AWS S3 并返回 URL,注意配置生命周期实现一段时间后自动删除 | |
let nonce = crypto.randomBytes(16).hexSlice(); | |
await s3.putObject({ | |
Bucket: '0x01-mhtml-temp', | |
Key: nonce, | |
Body: data, | |
}).promise(); | |
let url = 'https://0x01-mhtml-temp.s3.eu-west-2.amazonaws.com/'; | |
url += nonce; | |
let name = title + '.mhtml'; | |
return context.succeed({name, url}); | |
} catch (error) { | |
return context.fail(error); | |
} finally { | |
if (browser) { | |
await browser.close(); | |
} | |
} | |
}; | |
// 作为脚本时,从标准输入读入 URL,先输出网页标题和换行符,再输出全部内容 | |
// Script mode: read the URL from standard input, then write the page title,
// a newline, and the full MHTML snapshot to standard output.
(async () => {
  let browser = null;
  try {
    // Assign to the outer variable (no `const`) so that the `finally` block
    // below can actually see and close the launched browser.
    browser = await puppeteer.launch({
      args: [
        '--no-sandbox',
        '--disable-file-system',
        '--window-size=1024,768',
      ],
    });
    // Drop the trailing newline (and any stray whitespace) that piped input
    // normally carries.
    const url = fs.readFileSync(0, 'utf-8').trim();
    const {title, data} = await main(browser, url);
    process.stdout.write(title);
    process.stdout.write('\n');
    process.stdout.write(data);
  } finally {
    if (browser) {
      await browser.close();
    }
  }
})().catch((error) => {
  // Report the failure explicitly instead of leaving the promise floating.
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment