Skip to content

Instantly share code, notes, and snippets.

@sykwer
Last active November 25, 2017 01:03
Show Gist options
  • Save sykwer/33ea2492aaa8e1659d5559173e64ffb1 to your computer and use it in GitHub Desktop.
Save sykwer/33ea2492aaa8e1659d5559173e64ffb1 to your computer and use it in GitHub Desktop.
Airbnbのreviewデータをとってくる。使い方は下の方に。Airbnbのrooms#showページのhtml構造に深く依存しているコードなので2017/11/24現在以降正しく動く保証はありません。
const phantom = require("phantom")
const Papa = require("papaparse")
const fs = require("fs")
// Load the listing rows from URLs.csv and drop the leading title row.
const dataRows = Papa.parse(fs.readFileSync("URLs.csv", "utf8")).data
dataRows.shift()
const rowsCount = dataRows.length

// Zeroed counter template: one "year/month" key for each month of 2008 through 2017,
// inserted in chronological order so the key order doubles as the CSV column order.
const modelData = {}
const years = Array.from({ length: 10 }, (_, offset) => 2008 + offset)
const months = Array.from({ length: 12 }, (_, offset) => 1 + offset)
for (const year of years) {
  for (const month of months) {
    modelData[year + "/" + month] = 0
  }
}

// Create the timestamp-prefixed output CSV and write its header line.
const outputFileName = Date.now() + "_fetch_airbnb.csv"
const content = "url,id,lat,lon," + Object.keys(modelData).join(",") + "\n"
fs.writeFile(outputFileName, content, (err) => {
  if (err) throw err
  console.log("Output file " + outputFileName + " created!")
})
// entry point
// Launches PhantomJS and starts processing the rows. The instance is also stored in
// `_ph` so that the error handler can shut it down.
let _ph
phantom.create().then(function(ph) {
  _ph = ph
  // Return the promise so that a rejection coming out of nextRow reaches the catch below
  // instead of becoming an unhandled rejection.
  return nextRow(ph)
}).catch(function(e) {
  console.log(e)
  // `_ph` is still undefined when phantom.create() itself was the call that failed.
  if (_ph) {
    _ph.exit()
  }
})
// process one row
/**
 * Takes the next listing off `dataRows`, counts its reviews per month, appends one CSV
 * line to the output file and then continues with the following row. When no rows are
 * left it exits the PhantomJS instance and stops.
 *
 * @param {object} ph - PhantomJS instance obtained from `phantom.create()`.
 * @returns {Promise<void>} Settles once all remaining rows were processed; rejects with
 *   the first error raised while processing a row.
 */
async function nextRow(ph) {
  if (dataRows.length < 1) {
    console.log("Finished!")
    ph.exit()
    // Stop here; falling through would dereference an undefined row.
    return
  }

  const row = dataRows.shift()
  const roomId = row[2]
  const lat = row[3]
  const lon = row[4]
  if (!roomId) {
    // A blank CSV line (e.g. after a trailing newline) has no room id column.
    console.log("Skipping row without room id")
    return nextRow(ph)
  }

  const url = "https://www.airbnb.jp/rooms/" + roomId
  console.log(rowsCount - dataRows.length + "/" + rowsCount + ": Fetching data from " + url)

  // Fresh copy of the zeroed counters for this listing.
  const data = Object.assign({}, modelData)
  const page = await ph.createPage()
  try {
    const status = await page.open(url)
    console.log("Status: " + status)
    const pagesCount = await page.evaluate(checkPagesCount)
    for (let i = 1; i <= pagesCount; i++) {
      console.log("Scanning page: " + i + "/" + pagesCount)
      const dates = await page.evaluate(scanAndClick)
      dates.forEach(function(date) {
        // Only count months that have a column; incrementing an unknown key would
        // store NaN and add a column that the header line does not have.
        if (Object.prototype.hasOwnProperty.call(data, date)) {
          data[date] += 1
        } else {
          console.log("Ignoring review date without a column: " + date)
        }
      })
      // scanAndClick clicked the pagination button; wait before scanning again.
      await waitWhile(2000)
    }
  } finally {
    // Release the page even when scanning failed, so pages do not pile up across rows.
    await page.close()
  }

  // record data
  // Emit the counters in header order (the key order of modelData).
  const counts = Object.keys(modelData).map(function(key) {
    return data[key]
  })
  const content = [url, roomId, lat, lon].join(",") + "," + counts.join(",") + "\n"
  // Synchronous append: a write failure rejects this function's promise instead of
  // being thrown from inside a callback where nothing can catch it.
  fs.appendFileSync(outputFileName, content)
  console.log("Add data to" + outputFileName)

  return nextRow(ph)
}
// functions
//
/**
 * Returns a promise that resolves (with no value) after the given delay.
 *
 * @param {number} millisecond - Delay handed to setTimeout.
 * @returns {Promise<undefined>}
 */
function waitWhile(millisecond) {
  return new Promise((done) => {
    setTimeout(() => done(), millisecond)
  })
}
/**
 * Reports how many review pages the current document offers. It is passed to
 * page.evaluate, so it reads the global `document` of the loaded page.
 *
 * The count is parsed from the text of the first child of the second-to-last entry in
 * the first "list-unstyled" element (the last entry is the one scanAndClick clicks).
 *
 * @returns {number} The parsed page count, or 1 when there is no such list or no
 *   positive number can be read from it.
 */
function checkPagesCount() {
  // Written in ES5 syntax because the function is evaluated by the page's engine, not
  // by Node. NOTE(review): assumes that engine lacks ES2015 support - confirm.
  var pagination = document.getElementsByClassName("list-unstyled").item(0)
  if (!pagination) {
    return 1
  }
  var wrappers = pagination.childNodes
  var lastPageWrapper = wrappers.length >= 2 ? wrappers.item(wrappers.length - 2) : null
  var label = lastPageWrapper ? lastPageWrapper.childNodes.item(0) : null
  var count = label ? parseInt(label.innerText, 10) : NaN
  // NaN fails this comparison, so an unreadable label still yields one page to scan
  // instead of making the caller's loop run zero times.
  return count >= 1 ? count : 1
}
/**
 * Collects the review months shown in the current document and then clicks the
 * pagination button. It is passed to page.evaluate, so it reads the global `document`
 * of the loaded page.
 *
 * Every element with class "_150a3jym" whose text contains "<digits>年<digits>月"
 * contributes one "year/month" string. Year and month are extracted and parsed as
 * decimal numbers, so surrounding text, whitespace and leading zeros do not end up in
 * the result (" 2016年03月 " yields "2016/3").
 *
 * @returns {string[]} The "year/month" strings found, in document order.
 */
function scanAndClick() {
  // Written in ES5 syntax because the function is evaluated by the page's engine, not
  // by Node. NOTE(review): assumes that engine lacks ES2015 support - confirm.
  var datePattern = /(\d+)年(\d+)月/
  var nodes = document.getElementsByClassName("_150a3jym")
  var resultDates = []
  Array.prototype.forEach.call(nodes, function(node) {
    var found = datePattern.exec(node.innerText)
    if (found) {
      resultDates.push(parseInt(found[1], 10) + "/" + parseInt(found[2], 10))
    }
  })

  // Click the first child of the last entry in the first "list-unstyled" element,
  // when such an element exists and is clickable.
  var pagination = document.getElementsByClassName("list-unstyled").item(0)
  if (pagination) {
    var wrappers = pagination.childNodes
    var lastWrapper = wrappers.length > 0 ? wrappers.item(wrappers.length - 1) : null
    var button = lastWrapper ? lastWrapper.childNodes.item(0) : null
    if (button && typeof button.click === "function") {
      button.click()
    }
  }
  return resultDates
}
{
"name": "fetch_airbnb_reviews",
"version": "1.0.0",
"main": "index.js",
"license": "MIT",
"dependencies": {
"papaparse": "^4.3.6",
"phantom": "^4.0.9"
}
}
@sykwer
Copy link
Author

sykwer commented Nov 23, 2017

nodeのversionは>=8を想定

Usage

  • こんな感じのcsvファイルをファイル名URLs.csvでprojectのrootディレクトリにおく。(A~E列のみでよい)

2017-11-24 8 12 01

  • node ./index.js (上のfetch_airbnb_reviews.jsをファイル名index.jsとしておく)

Output

review数を月ごとに集計する
2017-11-24 8 20 50

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment