Skip to content

Instantly share code, notes, and snippets.

@travisbrown
Last active December 20, 2015 09:09
Show Gist options
  • Save travisbrown/6105462 to your computer and use it in GitHub Desktop.
Save travisbrown/6105462 to your computer and use it in GitHub Desktop.
Simple example of how to use Dispatch to access the Chronicling America API asynchronously.
// sbt build definition for the Chronicling America search example.
scalaVersion := "2.10.2"

// Dispatch core: the asynchronous HTTP client.
libraryDependencies += "net.databinder.dispatch" %% "dispatch-core" % "0.11.0"

// Dispatch's json4s (Jackson) integration, used to read responses as JSON.
libraryDependencies += "net.databinder.dispatch" %% "dispatch-json4s-jackson" % "0.11.0"

// opencsv: a plain Java artifact, hence the single `%` (no Scala version suffix).
libraryDependencies += "net.sf.opencsv" % "opencsv" % "2.0"
/**
 * Asynchronous client for the Chronicling America page-search endpoint.
 *
 * Every method that talks to the network returns (or registers a callback on)
 * a `Future`; nothing in this object blocks the calling thread.
 */
object Searcher {
  import dispatch.{ Http, url }, dispatch.Defaults._, dispatch.as
  import org.json4s._
  import scala.Function.const
  import scala.concurrent.Future
  import scala.util._

  // Required implicitly by the `extract[...]` calls below.
  implicit val formats = DefaultFormats

  // Base request that every query is derived from.
  val pageReq = url("http://chroniclingamerica.loc.gov/search/pages/results/")

  /**
   * Builds the request for one page of search results.
   *
   * @param conj      terms sent space-separated as the `andtext` parameter
   * @param disj      terms sent space-separated as the `ortext` parameter
   * @param yearRange sent as `date1` (first element) and `date2` (second element)
   * @param page      page number to request; the `page` parameter is omitted when `None`
   */
  def constructRequest(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int),
    page: Option[Int] = None
  ) = {
    val (firstYear, lastYear) = yearRange
    val fixedParams = Map(
      "format" -> "json",
      "date1" -> firstYear.toString,
      "date2" -> lastYear.toString,
      "dateFilterType" -> "yearRange",
      "andtext" -> conj.mkString(" "),
      "ortext" -> disj.mkString(" ")
    )
    val pageParam = page.map(number => "page" -> number.toString)
    pageReq <<? (fixedParams ++ pageParam)
  }

  /**
   * Issues the request built by [[constructRequest]] and yields the response
   * body parsed as JSON. Parameters are passed through unchanged.
   */
  def retrievePage(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int),
    page: Option[Int] = None
  ) = {
    val request = constructRequest(conj, disj, yearRange, page)
    Http(request OK as.json4s.Json)
  }

  /**
   * Collects the items from every page of a query.
   *
   * The first response (requested without a `page` parameter) supplies the
   * page count; pages 2 through `pageCount` are then requested together and
   * their items appended, in page order, after the first page's items.
   */
  def search(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int)
  ) = {
    // Items of a single, explicitly numbered page.
    def itemsOnPage(number: Int) =
      retrievePage(conj, disj, yearRange, Some(number)).map(_.extract[ResultSet].items)

    retrievePage(conj, disj, yearRange).flatMap { firstJson =>
      val firstPage = firstJson.extract[ResultSet]
      // The range is empty when there is at most one page, so no extra requests are made.
      val laterPages = Future.traverse(2 to firstPage.pageCount)(itemsOnPage)
      laterPages.map(rest => firstPage.items ++ rest.flatten)
    }
  }

  /**
   * Starts a search and returns immediately. When the search completes, either
   * reports the number of items and writes them to `path` as CSV, or prints
   * the failure.
   */
  def saveSearchResults(
    conj: List[String],
    disj: List[String],
    yearRange: (Int, Int)
  )(path: String) = {
    val pending = search(conj, disj, yearRange)
    pending.onComplete {
      case Failure(e) =>
        println("There was a problem: %s".format(e))
      case Success(items) =>
        println("Successfully downloaded %d items!".format(items.size))
        CsvOutput.writeItems(items)(path)
    }
  }
}
// One newspaper page from a search result.
// NOTE(review): instances appear to be built by `extract[ResultSet]` in
// Searcher, so field names presumably mirror the JSON keys — confirm before renaming.
case class Item(
  id: String,
  url: String,
  lccn: String,
  date: String,
  sequence: Int,
  title: String,
  ocr_eng: String
) {
  /**
   * The path segment that follows `/lccn/` in `id`.
   * Throws `MatchError` when `id` does not have that shape.
   */
  def pubId = {
    val Item.PubIdPattern(publication) = id
    publication
  }

  /**
   * `date` rewritten from eight digits (`YYYYMMDD`) to `YYYY-MM-DD`.
   * Throws `MatchError` when `date` is not exactly eight digits.
   */
  def formattedDate = {
    val Item.DatePattern(yyyy, mm, dd) = date
    Seq(yyyy, mm, dd).mkString("-")
  }
}

// Regular expressions backing Item's derived values.
object Item {
  // Exactly eight digits, captured as groups of four, two and two.
  val DatePattern = """(\d\d\d\d)(\d\d)(\d\d)""".r
  // "/lccn/", then one slash-free segment (captured), then "/" and anything after it.
  val PubIdPattern = """/lccn/([^/]+)/.*""".r
}
// One response from the search endpoint: the items it carried plus the two
// counts that `pageCount` is derived from.
case class ResultSet(totalItems: Int, itemsPerPage: Int, items: List[Item]) {
  /**
   * `totalItems / itemsPerPage`, plus one extra page when the division leaves
   * a positive remainder. Throws `ArithmeticException` if `itemsPerPage` is 0.
   */
  def pageCount = {
    val fullPages = totalItems / itemsPerPage
    val leftover = totalItems % itemsPerPage
    // signum is 1 for a positive remainder and 0 for an exact fit.
    fullPages + math.signum(leftover)
  }
}
// The boring stuff: writing the CSV file.
object CsvOutput {
  import au.com.bytecode.opencsv.CSVWriter

  /**
   * Writes one CSV row per item to the file at `path`.
   *
   * Columns, in order: publication id, title, formatted date, sequence
   * number, URL, and the OCR text with each newline replaced by a space.
   *
   * The writer is closed even if producing or writing a row throws (for
   * example a `MatchError` from `item.pubId` or `item.formattedDate`), so the
   * underlying file handle is never leaked.
   *
   * @param items pages to write, in output order
   * @param path  destination file, opened with `java.io.FileWriter`
   */
  def writeItems(items: List[Item])(path: String) = {
    val writer = new CSVWriter(new java.io.FileWriter(path))
    try {
      items.foreach { item =>
        writer.writeNext(
          Array(
            item.pubId,
            item.title,
            item.formattedDate,
            item.sequence.toString,
            item.url,
            item.ocr_eng.replaceAll("\n", " ")
          )
        )
      }
    } finally {
      // Runs on both the normal and the exceptional path.
      writer.close()
    }
  }
}
@alexsroussi
Copy link

Hi, that's exactly what I needed.
However, I'm very new to Scala. Could you explain in a few lines the right way to assemble the pieces?
(I have already installed sbt.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment