Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:02
Show Gist options
  • Save mychaelstyle/71463d32fcfcc239438b to your computer and use it in GitHub Desktop.
Save mychaelstyle/71463d32fcfcc239438b to your computer and use it in GitHub Desktop.
Apache Spark ドキュメント和訳 - Quick Start ref:
// sbt build definition for the Quick Start sample application.
// Settings are separated by blank lines, which older sbt 0.13.x releases require in .sbt files.
name := "Simple Project"

version := "1.0"

scalaVersion := "2.10.4"

// %% appends the Scala binary version to the artifact name (spark-core_2.10).
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.0"

// An empty URL is not a usable resolver; point at the Akka release repository.
resolvers += "Akka Repository" at "http://repo.akka.io/releases/"
# Your directory layout should look like this
$ find .
# Package a jar containing your application
$ sbt package
[info] Packaging {..}/{..}/target/scala-2.10/simple-project_2.10-1.0.jar
# Use spark-submit to run your application
$ YOUR_SPARK_HOME/bin/spark-submit \
--class "SimpleApp" \
--master local[4] \
target/scala-2.10/simple-project_2.10-1.0.jar
Lines with a: 46, Lines with b: 23
$ find .
# Package a jar containing your application
$ mvn package
[INFO] Building jar: {..}/{..}/target/simple-project-1.0.jar
# Use spark-submit to run your application
$ YOUR_SPARK_HOME/bin/spark-submit \
--class "SimpleApp" \
--master local[4] \
target/simple-project-1.0.jar
Lines with a: 46, Lines with b: 23
# Use spark-submit to run your application
$ YOUR_SPARK_HOME/bin/spark-submit \
--master local[4] \
SimpleApp.py
Lines with a: 46, Lines with b: 23
# For Scala and Java, use run-example:
./bin/run-example SparkPi
# For Python examples, use spark-submit directly:
./bin/spark-submit examples/src/main/python/
<name>Simple Project</name>
<id>Akka repository</id>
<dependency> <!-- Spark dependency -->
>>> textFile.filter(lambda line: "Spark" in line).count() # How many lines contain "Spark"?
scala> linesWithSpark.cache()
res7: spark.RDD[String] = spark.FilteredRDD@17e51082
scala> linesWithSpark.count()
res8: Long = 15
scala> linesWithSpark.count()
res9: Long = 15
scala> import java.lang.Math
import java.lang.Math
scala> textFile.map(line => line.split(" ").size).reduce((a, b) => Math.max(a, b))
res5: Int = 15
/* SimpleApp.java */
import org.apache.spark.SparkConf;
public class SimpleApp {
public static void main(String[] args) {
String logFile = "YOUR_SPARK_HOME/"; // Should be some file on your system
SparkConf conf = new SparkConf().setAppName("Simple Application");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> logData = sc.textFile(logFile).cache();
long numAs = logData.filter(new Function<String, Boolean>() {
public Boolean call(String s) { return s.contains("a"); }
long numBs = logData.filter(new Function<String, Boolean>() {
public Boolean call(String s) { return s.contains("b"); }
System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
from pyspark import SparkContext
# Count the lines containing 'a' and the lines containing 'b' in a text file.
logFile = "YOUR_SPARK_HOME/" # Should be some file on your system
sc = SparkContext("local", "Simple App")
# Cache the RDD because it is filtered twice below.
logData = sc.textFile(logFile).cache()
numAs = logData.filter(lambda s: 'a' in s).count()
numBs = logData.filter(lambda s: 'b' in s).count()
# Parenthesized single-argument form: valid in Python 3 and prints the same
# text under Python 2 (the bare print statement is a SyntaxError in Python 3).
print("Lines with a: %i, lines with b: %i" % (numAs, numBs))
/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
/** Counts the lines of a text file that contain "a" and "b" and prints both totals. */
object SimpleApp {
  /**
   * Entry point: builds a SparkContext, runs the two counts and prints them.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val logFile = "YOUR_SPARK_HOME/" // A path on your machine
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    try {
      // Cached because the same RDD is filtered twice below.
      val logData = sc.textFile(logFile, 2).cache()
      val numAs = logData.filter(line => line.contains("a")).count()
      val numBs = logData.filter(line => line.contains("b")).count()
      println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
    } finally {
      // Release the context's resources even if a job fails.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment