David Portabella (dportabella)
Lausanne, Switzerland
@dportabella
dportabella / dist.scala
Last active March 24, 2017 19:14
compute distance in km between two postal codes
// using build.sbt: libraryDependencies += "org.apache.sis.core" % "sis-referencing" % "0.7"
// using Ammonite: import $ivy.`org.apache.sis.core:sis-referencing:0.7`, org.apache.sis.distance.DistanceUtils
case class Coordinates(lat: Double, lon: Double)

// parses a GeoNames-style postal code dump: tab-separated, country code and
// postal code in columns 0-1, latitude/longitude in columns 9-10
def readCoordinates(file: String): Map[String, Coordinates] = {
  def parseLine(line: String): (String, Coordinates) = {
    val c = line.split("\t")
    (c(0) + "-" + c(1), Coordinates(c(9).toDouble, c(10).toDouble))
  }
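  // (preview truncated; a plausible completion of the body, not the gist's own code)
  scala.io.Source.fromFile(file).getLines().map(parseLine).toMap
}

// hypothetical usage; assumes DistanceUtils.getHaversineDistance returns km
// and that a GeoNames postal code dump sits at the given path:
val coords = readCoordinates("allCountries.txt")
val a = coords("CH-1015")
val b = coords("CH-8001")
println(DistanceUtils.getHaversineDistance(a.lat, a.lon, b.lat, b.lon) + " km")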
@dportabella
dportabella / FilterArchive.scala
Created February 8, 2017 09:39
Example of how to filter a WARC archive using Spark and store the result back to a WARC archive
package application
import java.io._
import java.util
import org.apache.spark.rdd.RDD
import org.archive.format.warc.WARCConstants.WARCRecordType
import org.archive.io.warc.WARCRecordInfo
import org.warcbase.spark.archive.io.ArchiveRecord
import org.warcbase.spark.matchbox.RecordLoader
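// A rough sketch of the job's shape, assuming warcbase's RecordLoader and
// ArchiveRecord API; the write-back half (building WARCRecordInfo records and
// saving them as a new WARC) is elided here.
import org.apache.spark.{SparkConf, SparkContext}

object FilterArchive {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("FilterArchive"))
    val records: RDD[ArchiveRecord] = RecordLoader.loadArchives("/path/in.warc.gz", sc)
    val html = records.filter(_.getMimeType == "text/html")  // keep only HTML records
    // ...rebuild WARCRecordInfo objects from `html` and write them back
  }
}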
@dportabella
dportabella / DeserializeHadoopSequenceFileWithoutClassDeclaration.scala
Last active November 8, 2016 22:32
How to deserialize a Hadoop result sequence file outside Hadoop (or a Spark saveAsObjectFile result outside Spark) without having the class declaration
// resolvers += "dportabella-3rd-party-mvn-repo-releases" at "https://github.com/dportabella/3rd-party-mvn-repo/raw/master/releases/"
// libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3"
// libraryDependencies += "com.github.dportabella.3rd-party-mvn-repo" % "jdeserialize" % "1.0.0",
import java.io._
import org.apache.hadoop.conf._
import org.apache.hadoop.fs._
import org.apache.hadoop.io._
import org.unsynchronized.jdeserialize
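// A sketch of how the pieces likely fit together: read the raw value bytes out
// of the sequence file and let jdeserialize walk the Java serialization stream
// without the class declarations. The jdeserialize calls below are assumptions
// about its API, not verified against the library.
val reader = new SequenceFile.Reader(new Configuration(),
  SequenceFile.Reader.file(new Path("/path/to/part-00000")))
val value = new BytesWritable()
while (reader.next(NullWritable.get(), value)) {
  val in = new ByteArrayInputStream(value.copyBytes())
  val jd = new jdeserialize("record")   // assumed: a stream name used in its output
  jd.run(in, false)                     // assumed: parse and dump the stream
}
reader.close()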
@dportabella
dportabella / RunTestOnMultipleGithubRepos
Created November 8, 2016 21:13
An example Scala script that runs a test on all GitHub projects with a given name, including their forks and branches (requires Ammonite: brew install ammonite-repl)
#!/usr/bin/env amm
/* To run this script:
* $ chmod +x ./RunTestOnMultipleGithubRepos
* $ ./RunTestOnMultipleGithubRepos
*/
import ammonite.ops._
import scalaj.http._
import $ivy.`org.eclipse.jgit:org.eclipse.jgit:4.5.0.201609210915-r`, org.eclipse.jgit.api.Git
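// A sketch of the main building blocks (the repository name and query are made
// up for illustration): search GitHub over HTTP, clone each hit with JGit into
// a temp dir, then run the test there.
val json = Http("https://api.github.com/search/repositories")
  .param("q", "spark-examples in:name")
  .asString.body                          // scalaj-http: raw JSON of matching repos

val dir = tmp.dir()                       // ammonite.ops: fresh temporary directory
Git.cloneRepository()
  .setURI("https://github.com/dportabella/spark-examples.git")
  .setDirectory(dir.toIO)
  .call()                                 // JGit: clone the repository
// ...run the test in `dir`, then repeat for every fork and branch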
@dportabella
dportabella / deserialize_hadoop_sequence_file.scala
Last active November 8, 2016 21:42
How to deserialize a Hadoop result sequence file outside Hadoop (or a Spark saveAsObjectFile result outside Spark)
// libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3"
import java.io.{ByteArrayInputStream, ObjectInputStream}
import org.apache.hadoop.conf._
import org.apache.hadoop.fs._
import org.apache.hadoop.io._
val f = "/path/to/part-00000"
val reader = new SequenceFile.Reader(new Configuration(), SequenceFile.Reader.file(new Path(f)))
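// (preview truncated; a sketch of the read loop that likely follows)
// Spark's saveAsObjectFile writes NullWritable keys and BytesWritable values,
// each value holding a Java-serialized batch of elements:
val value = new BytesWritable()
while (reader.next(NullWritable.get(), value)) {
  val ois = new ObjectInputStream(new ByteArrayInputStream(value.copyBytes()))
  println(ois.readObject())  // this variant needs the class declarations on the classpath
  ois.close()
}
reader.close()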
@dportabella
dportabella / ExampleExecuteScalaFuturesInSerial.scala
Created September 13, 2016 20:16
Example of how to execute Scala futures serially, one after the other, without collecting their results
/*
Example of how to execute Scala futures serially, one after the other, without collecting their results.
Look at this instead if you need to collect the results of the futures (it also explains how the foldLeft works here):
https://gist.github.com/dportabella/4e7569643ad693433ec6b86968f589b8
*/
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration
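// The core of the technique: chain the futures with foldLeft + flatMap so each
// one starts only after the previous completes, discarding the values.
// (A sketch; the gist's actual tasks may differ.)
import scala.concurrent.{Await, Future}

def task(n: Int): Future[Unit] = Future { Thread.sleep(100); println(s"done: $n") }

val serial = List(10, 20, 30).foldLeft(Future.successful(())) { (acc, n) =>
  acc.flatMap(_ => task(n))   // next future is created only when `acc` completes
}
Await.ready(serial, Duration.Inf)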
@dportabella
dportabella / ExampleExecuteScalaFuturesInSerial.scala
Last active June 1, 2023 20:40
Explanation of how to execute Scala futures serially, one after the other
/*
Execute Scala futures serially, one after the other.
This gist explains the solution given in
http://www.michaelpollmeier.com/execute-scala-futures-in-serial-one-after-the-other-non-blocking
The three examples below produce the same result:
---
done: 10
done: 20
done: 30
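*/

// A sketch of the generic helper the linked post builds up to
// (names are illustrative, not necessarily the gist's exact code):
import scala.concurrent.{ExecutionContext, Future}

def serialiseFutures[A, B](items: Seq[A])(f: A => Future[B])
                          (implicit ec: ExecutionContext): Future[List[B]] =
  items.foldLeft(Future.successful(List.empty[B])) { (acc, item) =>
    for { done <- acc; b <- f(item) } yield b :: done
  }.map(_.reverse)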
@dportabella
dportabella / build.sbt
Last active May 31, 2016 02:07
sbt project for the spark distribution examples
val sparkVersion = "1.6.1"
val hbaseVersion = "0.98.7-hadoop2"
name := "spark-examples"
version := sparkVersion
javacOptions ++= Seq("-source", "1.8", "-target", "1.8", "-Xlint")
initialize := {
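  // (preview truncated; a typical initialize block validates the JDK, e.g.)
  val _ = initialize.value
  require(sys.props("java.specification.version") == "1.8", "this build needs Java 8")
}

// hypothetical sketch of the dependency section such a build declares,
// reusing the versions defined above:
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
  "org.apache.hbase" % "hbase-client" % hbaseVersion
)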
@dportabella
dportabella / PomDependenciesToSbt
Last active May 7, 2022 16:25
Script to convert Maven dependencies (and exclusions) from a pom.xml to sbt dependencies. Or run it online at http://goo.gl/wnHCjE
#!/usr/bin/env amm
// This script converts Maven dependencies from a pom.xml to sbt dependencies.
// It is based on the answers of George Pligor and Mike Slinn on http://stackoverflow.com/questions/15430346/
// - install https://github.com/lihaoyi/Ammonite
// - make this script executable: chmod +x PomDependenciesToSbt
// - run it from your shell (e.g. bash):
// $ ./PomDependenciesToSbt /path/to/pom.xml
import scala.xml._
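// The essence of the conversion (the real script also handles exclusions):
// parse the pom with scala-xml and print each dependency as an sbt line.
val pom = XML.loadFile("pom.xml")
(pom \ "dependencies" \ "dependency").foreach { d =>
  val g = (d \ "groupId").text
  val a = (d \ "artifactId").text
  val v = (d \ "version").text
  println(s"""libraryDependencies += "$g" % "$a" % "$v"""")
}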
@dportabella
dportabella / ExampleScalaAck.scala
Created August 22, 2014 22:46
This example Scala script applies a regex to all files recursively. It uses Apache Tika's UniversalEncodingDetector to keep only text files, and a regex to find all lines containing the word "super", except when it is part of a longer word such as "superstition" or "supernatural".
import java.io.File
import org.apache.tika.detect._
import org.apache.tika.metadata._
import org.apache.tika.mime._
import org.apache.tika.io._
import org.apache.tika.parser.txt._
import resource._
def recursiveListFiles(f: File): List[File] = {
val these = f.listFiles.toList
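  // (preview truncated; the usual completion of this recursive helper)
  these ++ these.filter(_.isDirectory).flatMap(recursiveListFiles)
}

// a regex of the kind the description mentions (a sketch, not the gist's own):
// match "super" unless it continues as "superstition" or "supernatural"
val superRegex = """super(?!stition|natural)""".r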