Skip to content

Instantly share code, notes, and snippets.

@kakakazuma
Created November 10, 2015 10:09
Show Gist options
  • Save kakakazuma/d6977b8fbd7c48c39c65 to your computer and use it in GitHub Desktop.
Spark sample application: reads a text file from S3 and counts the lines containing "Dog" and "Cat".
package sample
import org.apache.spark.{SparkConf, SparkContext}
object Sample1 {
  // SECURITY NOTE(review): hard-coded AWS credentials in source. Replace with
  // IAM instance roles, environment variables, or a Hadoop credentials
  // provider before using real keys — never commit secrets.
  val myAccessKey = "xxxxxxx"
  val mySecretKey = "yyyyyyy"
  val bucket = "your_bucket"
  val filepath = "your_path"

  /** Entry point: reads a text file from S3 (via the s3n connector) and
    * prints the total line count and how many lines contain "Dog" / "Cat".
    *
    * @param args unused command-line arguments
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sample")
    val sc = new SparkContext(conf)
    try {
      // Configure the native S3 filesystem with the credentials above.
      val hadoopConf = sc.hadoopConfiguration
      hadoopConf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
      hadoopConf.set("fs.s3n.awsAccessKeyId", myAccessKey)
      hadoopConf.set("fs.s3n.awsSecretAccessKey", mySecretKey)

      // cache(): this RDD is traversed three times below (two filters plus
      // the total count); without caching, each count re-reads S3.
      val s3data = sc.textFile(s"s3n://$bucket/$filepath").cache()
      val numDog = s3data.filter(_.contains("Dog")).count()
      val numCat = s3data.filter(_.contains("Cat")).count()
      val total = s3data.count()

      println("total lines: %s".format(total))
      println("Lines with Dog: %s, Lines with Cat: %s".format(numDog, numCat))
    } finally {
      // Always release the cluster resources, even if the job fails.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment