Skip to content

Instantly share code, notes, and snippets.

View erraggy's full-sized avatar

Robbie Coleman erraggy

View GitHub Profile
/**
 * Schema for the HBase table named "site-metrics", whose row keys are Strings.
 *
 * Declares a "meta" family holding a String `name` column, and a
 * "searchesByDay" family typed with `DateMidnight` and `Long`
 * (presumably day -> search count; confirm against the HPaste `family` signature).
 */
class SiteMetricsTable
  extends HbaseTable[SiteMetricsTable, String, SiteMetricsRow](
    tableName = "site-metrics",
    rowKeyClass = classOf[String]
  ) {

  /** Wraps a deserialized result in the row type that belongs to this table. */
  def rowBuilder(result: DeserializedResult): SiteMetricsRow =
    new SiteMetricsRow(this, result)

  // Column family "meta" and the single named column declared inside it.
  val meta = family[String, String, Any]("meta")
  val name = column(meta, "name", classOf[String])

  // Column family "searchesByDay"; no individual columns are declared for it here.
  val searchMetrics = family[String, DateMidnight, Long]("searchesByDay")
}
/** Row type for `SiteMetricsTable`; passes the deserialized result and its table to `HRow`. */
class SiteMetricsRow(table: SiteMetricsTable, result: DeserializedResult)
  extends HRow[SiteMetricsTable, String](result, table)
WebCrawlingSchema.WebTable.query2.withKey("http://mycrawledsite.com/crawledpage.html")
.withColumns(_.title, _.lastCrawled)
.withFamilies(_.searchMetrics)
.singleOption() match {
case Some(pageRow) => {
println("Title: " + pageRow.column(_.title).getOrElse("No Title"))
println("Crawled on: " + pageRow.column(_.lastCrawled).getOrElse(new DateTime()))
pageRow.family(_.searchMetrics).foreach {
case (date: DateMidnight, views: Long) =>
// Builds a put against WebTable for the row keyed by the page URL, sets the
// title, lastCrawled, article and attributes columns plus two per-day entries
// in the searchMetrics family, then calls execute() on the assembled operation
// (presumably this is what sends the write to HBase; confirm against HPaste).
WebCrawlingSchema.WebTable
  .put("http://mycrawledsite.com/crawledpage.html")
  .value(_.title, "My Crawled Page Title")
  .value(_.lastCrawled, new DateTime())
  .value(_.article, "Jonsie went to the store. She didn't notice the spinning of the Earth, nor did the Earth notice the expansion of the Universe.")
  .value(_.attributes, Map("foo" -> "bar", "custom" -> "data"))
  // Long literals use the uppercase `L` suffix: a lowercase `l` is easily misread as the digit 1.
  .valueMap(_.searchMetrics, Map(new DateMidnight(2011, 6, 5) -> 3L, new DateMidnight(2011, 6, 4) -> 34L))
  .execute()
class WebTable extends HbaseTable[WebTable, String, WebPageRow](tableName = "pages", rowKeyClass = classOf[String]) {
def rowBuilder(result: DeserializedResult) = new WebPageRow(this, result)
val meta = family[String, String, Any]("meta")
val title = column(meta, "title", classOf[String])
val lastCrawled = column(meta, "lastCrawled", classOf[DateTime])
val content = family[String, String, Any]("text", compressed = true)
val article = column(content, "article", classOf[String])
val attributes = column(content, "attrs", classOf[Map[String, String]])
@erraggy
erraggy / grvxml.scala
Created June 19, 2013 21:08
Utility functions to preprocess XML content that has parsing issues
package object grvxml {
/** Matches an ampersand, one or more word characters, then a semicolon (e.g. `&nbsp;`). */
private[grvxml] val entityRegex = """&\w+;""".r

/**
 * Replaces each token in `input` that matches `entityRegex` with the value that
 * `namedEntityToHexEntity` holds for it; tokens with no entry are left as they are.
 *
 * @param input text to scan
 * @return `input` with every mapped token substituted
 */
def forceHexEntities(input: String): String =
  entityRegex.replaceAllIn(input, hit => {
    val entity = hit.matched
    // Fall back to the original token when the lookup finds no replacement.
    namedEntityToHexEntity.get(entity).getOrElse(entity)
  })
@erraggy
erraggy / gist:1903620
Created February 24, 2012 20:42
Aggregating values via MapReduce jobs
/**
 * Schema for the HBase table named "site-metrics", whose row keys are Strings.
 *
 * Declares a "meta" family holding a String `name` column, and a
 * "searchesByDay" family typed with `DateMidnight` and `Long`
 * (presumably day -> search count; confirm against the HPaste `family` signature).
 */
class SiteMetricsTable
  extends HbaseTable[SiteMetricsTable, String, SiteMetricsRow](
    tableName = "site-metrics",
    rowKeyClass = classOf[String]
  ) {

  /** Wraps a deserialized result in the row type that belongs to this table. */
  def rowBuilder(result: DeserializedResult): SiteMetricsRow =
    new SiteMetricsRow(this, result)

  // Column family "meta" and the single named column declared inside it.
  val meta = family[String, String, Any]("meta")
  val name = column(meta, "name", classOf[String])

  // Column family "searchesByDay"; no individual columns are declared for it here.
  val searchMetrics = family[String, DateMidnight, Long]("searchesByDay")
}
/** Row type for `SiteMetricsTable`; passes the deserialized result and its table to `HRow`. */
class SiteMetricsRow(table: SiteMetricsTable, result: DeserializedResult)
  extends HRow[SiteMetricsTable, String](result, table)
@erraggy
erraggy / gist:1903511
Created February 24, 2012 20:25
Querying values out of the WebTable
WebCrawlingSchema.WebTable.query2.withKey("http://mycrawledsite.com/crawledpage.html")
.withColumns(_.title, _.lastCrawled)
.withFamilies(_.searchMetrics)
.singleOption() match {
case Some(pageRow) => {
println("Title: " + pageRow.column(_.title).getOrElse("No Title"))
println("Crawled on: " + pageRow.column(_.lastCrawled).getOrElse(new DateTime()))
pageRow.family(_.searchMetrics).foreach {
case (date: DateMidnight, views: Long) =>
@erraggy
erraggy / gist:1903494
Created February 24, 2012 20:22
Putting values into the WebTable
// Builds a put against WebTable for the row keyed by the page URL, sets the
// title, lastCrawled, article and attributes columns plus two per-day entries
// in the searchMetrics family, then calls execute() on the assembled operation
// (presumably this is what sends the write to HBase; confirm against HPaste).
WebCrawlingSchema.WebTable
  .put("http://mycrawledsite.com/crawledpage.html")
  .value(_.title, "My Crawled Page Title")
  .value(_.lastCrawled, new DateTime())
  .value(_.article, "Jonsie went to the store. She didn't notice the spinning of the Earth, nor did the Earth notice the expansion of the Universe.")
  .value(_.attributes, Map("foo" -> "bar", "custom" -> "data"))
  // Long literals use the uppercase `L` suffix: a lowercase `l` is easily misread as the digit 1.
  .valueMap(_.searchMetrics, Map(new DateMidnight(2011, 6, 5) -> 3L, new DateMidnight(2011, 6, 4) -> 34L))
  .execute()
@erraggy
erraggy / gist:1903478
Created February 24, 2012 20:19
Creating a WebTable
class WebTable extends HbaseTable[WebTable, String, WebPageRow](tableName = "pages", rowKeyClass = classOf[String]) {
def rowBuilder(result: DeserializedResult) = new WebPageRow(this, result)
val meta = family[String, String, Any]("meta")
val title = column(meta, "title", classOf[String])
val lastCrawled = column(meta, "lastCrawled", classOf[DateTime])
val content = family[String, String, Any]("text", compressed = true)
val article = column(content, "article", classOf[String])
val attributes = column(content, "attrs", classOf[Map[String, String]])
@erraggy
erraggy / gist:1903276
Created February 24, 2012 19:51
Maven Dependency example for HPaste
<!-- Maven coordinates for HPaste: com.gravity:gravity-hpaste, version 0.1.11. -->
<dependency>
<groupId>com.gravity</groupId>
<artifactId>gravity-hpaste</artifactId>
<version>0.1.11</version>
</dependency>