Skip to content

Instantly share code, notes, and snippets.

@martinadamsdev
Created May 21, 2020 03:53
Show Gist options
  • Save martinadamsdev/75217423f86cc9106976d5beffca745b to your computer and use it in GitHub Desktop.
Save martinadamsdev/75217423f86cc9106976d5beffca745b to your computer and use it in GitHub Desktop.
A douban.com crawler!
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Douban group-topic scraper.
 *
 * Logs in to douban.com through the account/password tab, visits each page of
 * one group topic, collects the text of every reply paragraph, and writes all
 * replies to `douban.txt`, separated by a dashed divider.
 *
 * Any failure is logged and reflected in the process exit code; the browser is
 * always closed, even when a step throws.
 */
(async () => {
  // Divider placed between consecutive replies in the output file.
  const REPLY_SEPARATOR = '\n\n\n-----------------------------------------------------------\n\n';

  const browser = await puppeteer.launch({
    headless: false,
    timeout: 50000,
  });

  try {
    const page = await browser.newPage();
    // setViewport returns a promise; await it so the size applies before navigation.
    await page.setViewport({
      width: 1920,
      height: 1080,
    });

    // Open the Douban login page.
    await page.goto('https://accounts.douban.com/passport/login', {
      waitUntil: 'networkidle2', // treat the page as loaded once the network is (nearly) idle
    });

    // Switch to the account/password login tab.
    const accountTab = await page.$('.account-tab-account');
    if (accountTab === null) {
      throw new Error('Login page: account tab ".account-tab-account" not found');
    }
    await accountTab.click();

    // Type the credentials into the login form.
    const name = 'xxxxxxx';
    await page.type('input[id="username"]', name, { delay: 0 });
    const pwd = 'xxxxxxxx';
    await page.type('input[id="password"]', pwd, { delay: 1 });

    // Locate the login button.
    const loginElement = await page.$('div.account-form-field-submit > a');
    if (loginElement === null) {
      throw new Error('Login page: submit button not found');
    }

    // Register the navigation wait BEFORE clicking; otherwise a fast navigation
    // can complete before waitForNavigation() starts listening and the wait
    // would then hang until it times out.
    await Promise.all([
      page.waitForNavigation(),
      loginElement.click(),
    ]);

    // Base URL of the target topic; the page offset is appended to it.
    const url = 'https://www.douban.com/group/topic/112565224/?start=';
    // Pagination offsets to visit.
    const pages = [0, 100, 200, 300, 400, 500];

    /**
     * Loads one topic page and extracts the text of every reply.
     *
     * @param {string} url - Full URL of the topic page to load.
     * @returns {Promise<string[]>} The innerText of each reply paragraph, in DOM order.
     */
    async function next(url) {
      await page.goto(url, {
        waitUntil: 'networkidle2', // treat the page as loaded once the network is (nearly) idle
      });
      return page.$$eval('div.reply-doc.content > p', (paragraphs) =>
        paragraphs.map((paragraph) => paragraph.innerText)
      );
    }

    // Collect replies page by page, in thread order, so that the divider is
    // also inserted between the last reply of one page and the first of the next.
    const replies = [];
    for (const index of pages) {
      const res = await next(`${url}${index}`);
      replies.push(...res);
    }
    const data = replies.join(REPLY_SEPARATOR);

    // Write the result and wait for it to finish before closing the browser.
    await fs.promises.writeFile('douban.txt', data, 'utf8');
    console.log('写入成功');
  } finally {
    // Always release the browser process, even when a step above failed.
    await browser.close();
  }
})().catch((error) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment