Created
September 13, 2018 01:13
-
-
Save yangfch3/db008c851e548e2ffb59d1b1f4064218 to your computer and use it in GitHub Desktop.
[Chromeless 豆瓣数据抓取脚本] 以抓取一本书的猜你喜欢书籍列表信息为例 #crawl #douban
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { Chromeless } = require('chromeless')
const fs = require('fs')

// Single logging handle used by the whole script (currently the global console).
const myConsole = console
// Book title to search for, taken from the first command-line argument.
const bookName = process.argv[2]
// Accumulates one record per crawled book; crawlBookInfo pushes into it.
const bookInfoList = []

if (!bookName) {
  myConsole.log('未输入正确的书籍名称')
  // A missing argument is a usage error: stop before any browser work is
  // started and report a non-zero status so callers can detect the failure.
  process.exit(1)
}

// Shared browser session used by every crawl step.
// NOTE(review): waitTimeout is presumably in milliseconds - confirm against the Chromeless docs.
const chromeless = new Chromeless({
  waitTimeout: 30000
})
/**
 * Searches Douban for a book by name, opens the first search hit and collects
 * the links shown in that page's recommendation section.
 *
 * @param {string} bookName - Title to search for; it is URL-encoded before use.
 * @returns {Promise<string[]>} The href of every recommended book (empty when
 *   the section has no links).
 * @throws {Error} When the search page yields no result link to follow.
 */
async function crawlRelatedBookUrls(bookName) {
  const searchUrl = `https://book.douban.com/subject_search?search_text=${encodeURIComponent(bookName)}&cat=1001`

  // The callback reads `document`, i.e. it runs against the loaded page, so it
  // must not rely on any variable from this file.
  const bookUrl = await chromeless.goto(searchUrl).wait(1000).evaluate(() => {
    const cover = document.querySelector('#root div div div .item-root img')
    // The link to the book page is the element wrapping the cover image.
    const link = cover ? cover.parentElement : null
    return link ? link.getAttribute('href') : null
  })

  if (!bookUrl) {
    // Fail with a clear message instead of navigating to an empty URL.
    throw new Error(`No Douban search result found for "${bookName}"`)
  }

  const relatedBookUrls = await chromeless.goto(bookUrl).wait(5000).evaluate(() => {
    const links = document.querySelectorAll('#db-rec-section .content dl dt a')
    // Declared locally (no implicit global); anchors without an href are skipped.
    return Array.from(links, (link) => link.getAttribute('href')).filter(Boolean)
  })

  return relatedBookUrls
}
/**
 * Opens a Douban book page, extracts its basic metadata and appends the
 * resulting record to the module-level `bookInfoList`.
 *
 * Every field falls back to an empty string when the corresponding element
 * (or the publication date inside `#info`) cannot be found.
 *
 * @param {string} url - Address of the book page to crawl.
 * @returns {Promise<{name: string, author: string, scoreNum: string,
 *   score: string, category: string, pubTime: string}>} The record that was
 *   appended to `bookInfoList`.
 */
async function crawlBookInfo(url) {
  // The callback reads `document`, i.e. it runs against the loaded page, so
  // every helper it needs is defined inside it.
  const bookInfo = await chromeless.goto(url).wait(1000).evaluate(() => {
    // Text of the first element matching `selector`, or '' when there is none.
    const textOf = (selector) => {
      const node = document.querySelector(selector)
      return node ? node.innerText : ''
    }

    const tagLinks = document.querySelectorAll('#db-tags-section > div > span > a')

    // Capture the first dash-separated number after "出版" in full (e.g.
    // "2008-10"); exec() returns null when nothing matches.
    const pubMatch = /出版[\w\W]+?(\d+(?:-\d+)*)/.exec(textOf('#info'))

    return {
      name: textOf('#wrapper > h1 > span'),
      // Collapse line breaks / runs of whitespace to one space; letters are kept.
      author: textOf('#info > a:nth-child(2)').replace(/\s+/g, ' ').trim(),
      scoreNum: textOf('#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span'),
      score: textOf('#interest_sectl > div > div.rating_self.clearfix > strong'),
      // Tags separated by '|', without a leading separator.
      category: Array.from(tagLinks, (link) => link.innerText).join('|'),
      pubTime: pubMatch ? pubMatch[1] : ''
    }
  })

  bookInfoList.push(bookInfo)
  return bookInfo
}
/**
 * Runs the whole crawl: looks up the books related to `bookName`, then visits
 * each of them in turn so its record lands in `bookInfoList`.
 *
 * Pages are visited one after another because every step drives the same
 * shared `chromeless` session.
 *
 * @returns {Promise<Array>} The module-level `bookInfoList` once every related
 *   book has been crawled.
 */
async function exec() {
  const relatedBookUrls = await crawlRelatedBookUrls(bookName)
  for (const relatedUrl of relatedBookUrls) {
    await crawlBookInfo(relatedUrl)
  }
  return bookInfoList
}
// Entry point: crawl, print the records and save them to output.json.
// Whatever the outcome, the browser session is closed before the process
// exits, and any failure is reflected in a non-zero exit status.
exec()
  .then((res) => {
    myConsole.log(res)
    fs.writeFileSync('output.json', JSON.stringify(res, null, 4))
  })
  .catch((e) => {
    myConsole.log(e)
    process.exitCode = 1
  })
  // Runs after success and failure alike; waits for the session to close.
  .then(() => chromeless.end())
  .catch((e) => {
    myConsole.log(e)
    process.exitCode = 1
  })
  // process.exit() without an argument uses process.exitCode set above.
  .then(() => process.exit())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment