Skip to content

Instantly share code, notes, and snippets.

@anothernoise
Forked from travisbrown/build.sbt
Created October 13, 2015 08:58
Show Gist options
  • Save anothernoise/fc50a8ee64d3eed56919 to your computer and use it in GitHub Desktop.
Save anothernoise/fc50a8ee64d3eed56919 to your computer and use it in GitHub Desktop.
Simple example of how to use Dispatch to access the Chronicling America API asynchronously.
// sbt build definition for the Chronicling America search example.
// Scala version the example is compiled against.
scalaVersion := "2.10.2"
// Third-party libraries used by the example code.
libraryDependencies ++= Seq(
// Dispatch core: provides the Http / url / as names imported by Searcher.
"net.databinder.dispatch" %% "dispatch-core" % "0.11.0",
// Dispatch's json4s (Jackson) module: backs `as.json4s.Json` in Searcher.
"net.databinder.dispatch" %% "dispatch-json4s-jackson" % "0.11.0",
// opencsv: provides the CSVWriter used by CsvOutput.
"net.sf.opencsv" % "opencsv" % "2.0"
)
/**
 * Asynchronous client for the Chronicling America page-search endpoint.
 *
 * Every method returns immediately; the HTTP work runs on the execution
 * context brought into scope by `dispatch.Defaults._`.
 */
object Searcher {
  import dispatch.{ Http, url }, dispatch.Defaults._, dispatch.as
  import org.json4s._
  import scala.concurrent.Future
  import scala.util._

  // json4s requires implicit formats in scope for the `extract` calls below.
  implicit val formats = DefaultFormats

  // Base request for the page-search endpoint; query parameters are added per call.
  val pageReq = url("http://chroniclingamerica.loc.gov/search/pages/results/")

  /**
   * Builds the search request.
   *
   * @param conj      terms sent as `andtext`, joined with single spaces
   * @param disj      terms sent as `ortext`, joined with single spaces
   * @param yearRange (first year, last year), sent as `date1` / `date2`
   * @param page      result page to request; when `None` no `page`
   *                  parameter is sent at all
   */
  def constructRequest(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int),
    page: Option[Int] = None
  ) = pageReq <<? Map(
    "format" -> "json",
    "date1" -> yearRange._1.toString,
    "date2" -> yearRange._2.toString,
    "dateFilterType" -> "yearRange",
    "andtext" -> conj.mkString(" "),
    "ortext" -> disj.mkString(" ")
  ) ++ page.map("page" -> _.toString)

  /**
   * Issues the request built by [[constructRequest]] and parses the response
   * body as JSON. Because of the `OK` handler, a non-OK response produces a
   * failed future rather than a parsed value.
   */
  def retrievePage(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int),
    page: Option[Int] = None
  ) = Http(constructRequest(conj, disj, yearRange, page) OK as.json4s.Json)

  /**
   * Retrieves every result page for the query and concatenates the items.
   *
   * The first page is fetched on its own because it carries the totals
   * needed to compute the page count; pages 2..pageCount are then requested
   * via `Future.traverse` and appended in page order.
   *
   * @return a future holding all items, or a failed future if any page
   *         request or JSON extraction fails
   */
  def search(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int)
  ): Future[List[Item]] = retrievePage(conj, disj, yearRange).flatMap { json =>
    val results = json.extract[ResultSet]
    // `2 to pageCount` is empty when there is at most one page.
    Future.traverse(2 to results.pageCount) { i =>
      retrievePage(conj, disj, yearRange, Some(i)).map(
        _.extract[ResultSet].items
      )
    }.map(results.items ++ _.flatten)
  }

  // Asynchronously begin downloads and print result when completed.
  /**
   * Runs [[search]] and writes the items to a CSV file at `path`.
   *
   * The CSV write happens inside `map`, so an exception raised while writing
   * (I/O error, malformed item) fails the future and is reported through the
   * same `Failure` branch as a download error, instead of escaping from the
   * completion callback.
   */
  def saveSearchResults(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int)
  )(path: String): Unit =
    search(conj, disj, yearRange).map { items =>
      println("Successfully downloaded %d items!".format(items.size))
      CsvOutput.writeItems(items)(path)
    }.onComplete {
      case Success(_) => ()
      case Failure(e) =>
        println("There was a problem: %s".format(e))
    }
}
// A single newspaper page, with the fields read from a search result.
case class Item(
  id: String,
  url: String,
  lccn: String,
  date: String,
  sequence: Int,
  title: String,
  ocr_eng: String
) {
  /**
   * The path segment that follows `/lccn/` in `id`.
   * Throws `MatchError` when `id` does not have that shape.
   */
  def pubId = {
    val Item.PubIdPattern(publication) = id
    publication
  }

  /**
   * `date` (eight digits, yyyymmdd) rendered as `yyyy-mm-dd`.
   * Throws `MatchError` when `date` is not exactly eight digits.
   */
  def formattedDate = {
    val Item.DatePattern(yyyy, mm, dd) = date
    "%s-%s-%s".format(yyyy, mm, dd)
  }
}

// Regular expressions used by Item to pick its fields apart.
object Item {
  // Eight digits split into year / month / day groups.
  val DatePattern = """(\d\d\d\d)(\d\d)(\d\d)""".r
  // Captures the first path segment after "/lccn/".
  val PubIdPattern = """/lccn/([^/]+)/.*""".r
}
// Represents a set of search query results.
case class ResultSet(totalItems: Int, itemsPerPage: Int, items: List[Item]) {
  /**
   * Number of pages needed to hold `totalItems` at `itemsPerPage` items per
   * page, i.e. the quotient rounded up.
   *
   * Returns 0 when either count is not positive; in particular a zero
   * `itemsPerPage` no longer raises `ArithmeticException` from the division.
   */
  def pageCount: Int =
    if (totalItems <= 0 || itemsPerPage <= 0) 0
    else {
      val fullPages = totalItems / itemsPerPage
      // A non-zero remainder means one more, partially filled, page.
      if (totalItems % itemsPerPage == 0) fullPages else fullPages + 1
    }
}
// The boring stuff: writing the CSV file.
object CsvOutput {
  import au.com.bytecode.opencsv.CSVWriter

  /**
   * Writes one CSV row per item to the file at `path`.
   *
   * Columns, in order: publication id, title, formatted date, sequence
   * number, URL, and the OCR text with newlines replaced by spaces.
   *
   * The writer is closed in a `finally` block, so the file handle is released
   * even when a row cannot be produced (for example when `pubId` or
   * `formattedDate` throws) or a write fails; the exception still propagates.
   *
   * NOTE(review): `java.io.FileWriter` uses the platform default charset;
   * confirm that is acceptable for the OCR text.
   */
  def writeItems(items: List[Item])(path: String): Unit = {
    val writer = new CSVWriter(new java.io.FileWriter(path))
    try {
      items.foreach { item =>
        writer.writeNext(
          Array(
            item.pubId,
            item.title,
            item.formattedDate,
            item.sequence.toString,
            item.url,
            item.ocr_eng.replaceAll("\n", " ")
          )
        )
      }
    } finally {
      writer.close()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment