Created
November 12, 2017 10:15
-
-
Save xcaptain/b95234777990d388d3158fbc4a042359 to your computer and use it in GitHub Desktop.
用typescript+puppeteer写的点评网页下载脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as puppeteer from 'puppeteer'; | |
import * as fs from 'fs'; | |
/**
 * Downloads Dianping shop pages with a headless browser (puppeteer) and saves
 * each page's rendered HTML to `data/shops/<last URL path segment>.html`.
 *
 * Pages whose target file already exists on disk are skipped.
 */
class DianpingDownloader {
  /** Directory (relative to the working directory) that receives the saved pages. */
  private static readonly outputDir = 'data/shops/';

  /** Desktop Chrome user agent sent with every page request. */
  private static readonly userAgent =
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36';

  /** URLs queued for download; empty until `setUrls` is called. */
  private urls: readonly string[] = [];

  /**
   * Sets the list of page URLs to download.
   *
   * @param urls Any number of page URLs; the last path segment of each URL
   *   becomes the output file name.
   */
  public setUrls(urls: readonly string[]): void {
    this.urls = urls;
  }

  /**
   * Starts downloading every configured URL.
   *
   * Downloads are started without waiting for each other, and this method
   * returns before they finish. A failed download is logged and does not
   * affect the remaining URLs.
   */
  public run(): void {
    console.log('start download dianping webpages', this.urls);
    this.urls.forEach((url) => {
      console.log('at this url', url);
      // Fire-and-forget: attach a handler so a rejection is reported instead of
      // surfacing as an unhandled promise rejection.
      this.downloadAPage(url).catch((err: unknown) => {
        console.error('download failed', url, err);
      });
    });
  }

  /**
   * Downloads one page and writes its HTML to disk.
   *
   * @param url Page URL to fetch.
   * @returns `false` when the target file already exists and the download was
   *   skipped, `true` once the file has been written.
   * @throws Rejects if the browser cannot be launched, navigation fails, or
   *   the file cannot be written.
   */
  private async downloadAPage(url: string): Promise<boolean> {
    const filename = DianpingDownloader.outputDir + url.split('/').slice(-1)[0] + '.html';
    if (this.fileExists(filename)) {
      console.log('文件已存在,不执行下载操作', filename);
      return false;
    }

    const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
    try {
      const page = await browser.newPage();
      // The user agent must be applied before navigation starts.
      await page.setUserAgent(DianpingDownloader.userAgent);
      await page.goto(url);
      const html = await page.evaluate(() => {
        return document.querySelectorAll('html')[0].outerHTML;
      });

      // Make sure the output directory exists so the write cannot fail on a
      // fresh checkout.
      await fs.promises.mkdir(DianpingDownloader.outputDir, { recursive: true });
      await fs.promises.writeFile(filename, html);
      console.info('file saved!', filename);
      return true;
    } finally {
      // Always release the browser process, even when navigation or the write
      // fails.
      await browser.close();
    }
  }

  /**
   * Reports whether a file is already present on disk.
   *
   * @param filename Path to check.
   * @returns `true` if the path exists, `false` otherwise.
   */
  private fileExists(filename: string): boolean {
    return fs.existsSync(filename);
  }
}
// Uncomment the lines below to run the downloader.
// let downloader = new DianpingDownloader();
// downloader.setUrls([
// 'http://www.dianping.com/shop/59473758',
// 'http://www.dianping.com/shop/5588380',
// ]);
// downloader.run();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment