Skip to content

Instantly share code, notes, and snippets.

@camman3d
Created March 23, 2015 14:52
Show Gist options
  • Save camman3d/bc8e3ee9b3eefb0871d6 to your computer and use it in GitHub Desktop.
Save camman3d/bc8e3ee9b3eefb0871d6 to your computer and use it in GitHub Desktop.
Extracts data from http://www.nuforc.org and writes it to a .CSV file
import java.io.{File, PrintWriter}
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import collection.JavaConversions._
/**
 * Created by josh on 3/20/15.
 *
 * Extracts data from http://www.nuforc.org and writes it to a .CSV file.
 *
 * The program reads the index page at [[Driver.indexURL]], follows the first
 * [[Driver.numPages]] links found inside tables on that page, turns the table rows
 * of every linked page into [[Driver.RowData]] records and writes all records to
 * `./data.csv`.
 *
 * Note: jsoup's Java collections are used as Scala collections through the implicit
 * Java-to-Scala conversions imported at the top of the file.
 */
object Driver {

  /** Address of the index page whose table links point at the pages to scrape. */
  val indexURL: String = "http://www.nuforc.org/webreports/ndxevent.html"

  /** Maximum number of linked pages that `main` downloads. */
  val numPages: Int = 50

  /** Number of `td` cells a table row needs in order to fill a [[RowData]]. */
  private val ColumnCount = 6

  /**
   * Downloads the index page and collects the absolute target of every link
   * located inside a table.
   *
   * Links whose `href` is missing or cannot be resolved yield an empty
   * `abs:href` value; those are dropped because connecting to an empty URL
   * later on would fail.
   *
   * @return the absolute URLs in document order
   */
  def getPageURLs: Vector[String] = {
    Jsoup.connect(indexURL).get()
      .select("table a")
      .map(_.attr("abs:href"))
      .filter(_.nonEmpty)
      .toVector
  }

  /** One table row: the text of its first six cells, in column order. */
  final case class RowData(
    date: String,
    city: String,
    state: String,
    shape: String,
    duration: String,
    summary: String
  )

  /**
   * Reads the text of the row's `td` cells and builds a [[RowData]] from the
   * first six of them.
   *
   * @return `None` when the row has fewer than six `td` cells
   */
  private def parseRow(row: Element): Option[RowData] = {
    val cells = row.select("td").map(_.text()).toVector
    cells match {
      case Vector(date, city, state, shape, duration, summary, _*) =>
        Some(RowData(date, city, state, shape, duration, summary))
      case _ =>
        None
    }
  }

  /**
   * Converts a table row into a [[RowData]] using the text of its first six
   * `td` cells; any additional cells are ignored.
   *
   * @param row the `tr` element to convert
   * @throws IndexOutOfBoundsException if the row has fewer than six `td` cells
   */
  def getRowData(row: Element): RowData =
    parseRow(row).getOrElse(
      throw new IndexOutOfBoundsException(
        s"Expected at least $ColumnCount <td> cells but found ${row.select("td").size}"
      )
    )

  /**
   * Downloads `url` and converts its table rows into [[RowData]] records.
   *
   * Every `tr` that is not the first child of its parent is considered. Rows
   * with fewer than six `td` cells are skipped (and reported on stdout) so that
   * a single unexpected row does not abort the whole extraction.
   *
   * @param url absolute address of the page to scrape
   * @return the extracted records in document order
   */
  def getData(url: String): Vector[RowData] = {
    println(s"Extracting data from $url")
    val rows = Jsoup.connect(url).get()
      .select("tr:not(:first-child)")
      .toVector
    val parsed = rows.flatMap(row => parseRow(row).toList)
    val skipped = rows.size - parsed.size
    if (skipped > 0) println(s"Skipped $skipped row(s) with fewer than $ColumnCount cells in $url")
    parsed
  }

  /**
   * Writes the records to `./data.csv`, one record per line, with the fields
   * separated by a tab character (the file is tab-separated despite its name).
   *
   * @param data the records to write
   * @throws java.io.IOException if the writer reports an error while writing
   */
  def writeCsv(data: Vector[RowData]): Unit = {
    println("Writing .CSV file")
    // Explicit UTF-8 so the output does not depend on the platform default
    // charset and non-ASCII characters are not lost.
    val out = new PrintWriter(new File("./data.csv"), "UTF-8")
    try {
      data.foreach(row => out.println(row.productIterator.mkString("\t")))
      // PrintWriter never throws on write failures; checkError() flushes the
      // stream and reveals whether any of the writes above went wrong.
      if (out.checkError()) {
        throw new java.io.IOException("Failed while writing ./data.csv")
      }
    } finally {
      // Always release the file handle, even when a write fails.
      out.close()
    }
  }

  /**
   * Entry point: scrapes the first [[numPages]] pages linked from the index
   * page and writes every extracted record to `./data.csv`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val data = getPageURLs
      .take(numPages)
      .flatMap(getData)
    println(s"${data.size} entries extracted")
    writeCsv(data)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment