Skip to content

Instantly share code, notes, and snippets.

@dogancelik
Created August 7, 2013 12:36
Show Gist options
  • Save dogancelik/6173677 to your computer and use it in GitHub Desktop.
Save dogancelik/6173677 to your computer and use it in GitHub Desktop.
(PhantomJS) Parse Bleach episodes in Wikipedia
# PhantomJS script: scrapes the Bleach episode table from Wikipedia and
# writes it to a CSV file.
#
# The output path is taken from system.args[1] and falls back to
# "bleach_episodes.csv". Each row holds: episode number, English title,
# romaji title, kanji title and air date (YYYY-MM-DD), every field quoted.
page = require('webpage').create()
fs = require 'fs'
system = require 'system'

# Relay console output from the page context so the per-episode
# diagnostics logged inside page.evaluate show up on stdout.
page.onConsoleMessage = (msg) -> console.log(msg)

page.open "http://en.wikipedia.org/wiki/Bleach_episodes", (status) ->
  # Do not scrape a page that failed to load.
  unless status is "success"
    console.log "Unable to load page, status:", status
    phantom.exit 1
    return

  # The function below runs inside the page, so it can only use what the
  # page itself provides (here `$`); helpers must be defined inside it.
  # It returns the CSV text, or null as soon as one episode cannot be parsed.
  csv = page.evaluate ->
    # Zero-pad a one-digit month/day to two characters.
    pad = (n) -> (if n < 10 then "0" else "") + n

    # Format a Date as YYYY-MM-DD (getMonth() is zero-based).
    # NOTE(review): an unparseable date string yields "NaN-NaN-NaN" instead
    # of raising, so such rows are not reported as errors.
    formatDate = (date) ->
      date.getFullYear() + "-" + pad(date.getMonth() + 1) + "-" + pad(date.getDate())

    # Quote a CSV field, doubling embedded quotes so the row stays parseable.
    quote = (value) -> '"' + String(value).replace(/"/g, '""') + '"'

    ret = ""
    totalEps = 366
    for i in [1..totalEps]
      # Summary cell of the row anchored by #ep<i>; the next cell holds the air date.
      summary = $("#ep#{i}").parent().children("td.summary")
      try
        # match() returns null when the title layout differs; the failed
        # destructuring then raises and is reported below.
        [_, englishTitle, romajiTitle, kanjiTitle] = summary.text().match(/"(.*)"\s"(.*)"\s\((.*)\)/)
        # Drop bracketed footnote markers such as "[12]" before parsing the date.
        date = summary.next().text().replace(/\[.*\]/gi, '')
        date = formatDate(new Date(date))
      catch e
        console.log "Episode: ", i
        console.log "Error:", e
        return null
      ret += [quote(i), quote(englishTitle), quote(romajiTitle), quote(kanjiTitle), quote(date)].join(", ") + "\n"
    ret

  # A failed scrape must not clobber a previously written file.
  unless typeof csv is "string"
    console.log "Parsing failed, nothing written."
    phantom.exit 1
    return

  filename = system.args[1] or "bleach_episodes.csv"
  # Only remove a file that is actually there; mode "w" truncates anyway.
  fs.remove filename if fs.exists filename
  fs.write filename, csv, "w"
  phantom.exit()
@dogancelik
Copy link
Author

I've also used XPath. This was in my notes: //table[@class="wikitable" and position()<17]//td[position()<4]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment