Spark setup commands
SPARK_TOP=$HOME/src/spark_top
mkdir -p $SPARK_TOP
rm -rf $SPARK_TOP/*
cd $SPARK_TOP
# Download Spark
curl -#ROL http://mirror.cc.columbia.edu/pub/software/apache/spark/spark-1.4.1/spark-1.4.1.tgz
tar xvf spark-1.4.1.tgz
cd spark-1.4.1
# Build takes ~25 minutes on my laptop (OS X, not a VM)
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
build/mvn -DskipTests clean package
cd ..
# Get the avro-mapred jar from maven.org
curl -#ROL http://central.maven.org/maven2/org/apache/avro/avro-mapred/1.7.6/avro-mapred-1.7.6.jar
# Grab the Redshift JDBC driver
curl -#ROL http://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC41-1.1.6.1006.jar
# Grab the PostgreSQL JDBC driver
curl -#ROL http://central.maven.org/maven2/org/postgresql/postgresql/9.4-1201-jdbc41/postgresql-9.4-1201-jdbc41.jar
# Download the AWS jars
curl -#ROL http://central.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.1/hadoop-aws-2.7.1.jar
curl -#ROL http://central.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.9.40/aws-java-sdk-s3-1.9.40.jar
curl -#ROL http://central.maven.org/maven2/com/amazonaws/aws-java-sdk-core/1.9.40/aws-java-sdk-core-1.9.40.jar
curl -#ROL http://central.maven.org/maven2/com/amazonaws/aws-java-sdk/1.9.40/aws-java-sdk-1.9.40.jar
# Clone and build the spark-redshift assembly
git clone https://github.com/databricks/spark-redshift.git
cd spark-redshift
build/sbt clean assembly
cd ..
# Launch spark-shell with the package and jars above:
# $SPARK_TOP/spark-1.4.1/bin/spark-shell \
#   --packages com.databricks:spark-avro_2.10:1.0.0 \
#   --jars $SPARK_TOP/spark-redshift/target/scala-2.10/spark-redshift-assembly-0.4.1-SNAPSHOT.jar,$SPARK_TOP/RedshiftJDBC41-1.1.6.1006.jar,$SPARK_TOP/avro-mapred-1.7.6.jar,$SPARK_TOP/aws-java-sdk-1.9.40.jar,$SPARK_TOP/aws-java-sdk-core-1.9.40.jar,$SPARK_TOP/aws-java-sdk-s3-1.9.40.jar,$SPARK_TOP/hadoop-aws-2.7.1.jar,$SPARK_TOP/postgresql-9.4-1201-jdbc41.jar
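Once the shell is up, a quick way to confirm the JDBC drivers actually made it onto the classpath is to resolve their classes by name. This is a minimal check I'm adding here, not part of the original setup; the class names come from the jars downloaded above:

// Paste into spark-shell: Class.forName throws ClassNotFoundException
// if the corresponding driver jar is missing from the classpath.
Class.forName("com.amazon.redshift.jdbc41.Driver")
Class.forName("org.postgresql.Driver")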
1-rs.scala
val url = "jdbc:redshift://rshost.us-west-2.redshift.amazonaws.com:5439/testdb?user=rsuser&password=rspassword"
val driver = "com.amazon.redshift.jdbc41.Driver"
val t = sqlContext.read.format("com.databricks.spark.redshift").
  option("jdbcdriver", driver).
  option("url", url).
  option("dbtable", "emp").
  option("tempdir", "s3://spark-temp-dir").
  load()
t.printSchema()
t.show()
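Once load() succeeds, the Redshift-backed DataFrame behaves like any other Spark SQL source. For example, a sketch using the stock Spark 1.4 API (not part of the original 1-rs.scala; the aggregate is just illustrative):

// Register the DataFrame as a temp table and query it with Spark SQL;
// column names (sal, deptno) follow the emp schema shown in the output below.
t.registerTempTable("emp")
sqlContext.sql("SELECT deptno, count(*) AS emps, avg(sal) AS avg_sal FROM emp GROUP BY deptno").show()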
spark-shell output
Loading 1-rs.scala
url: String = jdbc:redshift://rshost.us-west-2.redshift.amazonaws.com:5439/testdb?user=rsuser&password=rspassword
driver: String = com.amazon.redshift.jdbc41.Driver
t: org.apache.spark.sql.DataFrame = [empno: int, ename: string, job: string, mgr: int, hiredate: date, sal: decimal(7,2), comm: decimal(7,2), deptno: int]
root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: date (nullable = true)
 |-- sal: decimal(7,2) (nullable = true)
 |-- comm: decimal(7,2) (nullable = true)
 |-- deptno: integer (nullable = true)
15/08/06 10:19:14 INFO Parameters: Using default provider chain for AWS credentials, as none provided explicitly.
15/08/06 10:19:16 INFO MemoryStore: ensureFreeSpace(132712) called with curMem=0, maxMem=278019440
15/08/06 10:19:16 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 129.6 KB, free 265.0 MB)
15/08/06 10:19:16 INFO MemoryStore: ensureFreeSpace(14522) called with curMem=132712, maxMem=278019440
15/08/06 10:19:16 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 14.2 KB, free 265.0 MB)
15/08/06 10:19:16 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on localhost:57701 (size: 14.2 KB, free: 265.1 MB)
15/08/06 10:19:16 INFO SparkContext: Created broadcast 0 from newAPIHadoopFile at RedshiftRelation.scala:82
org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: s3://spark-temp-dir/19380690-5730-451a-b741-8245b27da674
  at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:285)
  at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:340)
  at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:95)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
  at scala.Option.getOrElse(Option.scala:120)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
  at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:121)
  at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:125)
  at org.apache.spark.sql.DataFrame.collect(DataFrame.scala:1269)
  at org.apache.spark.sql.DataFrame.head(DataFrame.scala:1203)
  at org.apache.spark.sql.DataFrame.take(DataFrame.scala:1262)
  at org.apache.spark.sql.DataFrame.showString(DataFrame.scala:176)
  at org.apache.spark.sql.DataFrame.show(DataFrame.scala:331)
  at org.apache.spark.sql.DataFrame.show(DataFrame.scala:338)
  at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:26)
  at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:31)
  at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:33)
  at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:35)
  at $iwC$$iwC$$iwC$$iwC.<init>(<console>:37)
  at $iwC$$iwC$$iwC.<init>(<console>:39)
  at $iwC$$iwC.<init>(<console>:41)
  at $iwC.<init>(<console>:43)
  at <init>(<console>:45)
  at .<init>(<console>:49)
  at .<clinit>(<console>)
  at .<init>(<console>:7)
  at .<clinit>(<console>)
  at $print(<console>)
  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
  at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:497)
  at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
  at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1338)
  at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
  at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
  at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
  at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
  at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
  at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
  at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
  at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1$$anonfun$apply$mcV$sp$1$$anonfun$apply$mcV$sp$2.apply(SparkILoop.scala:680)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1$$anonfun$apply$mcV$sp$1$$anonfun$apply$mcV$sp$2.apply(SparkILoop.scala:677)
  at scala.reflect.io.Streamable$Chars$class.applyReader(Streamable.scala:104)
  at scala.reflect.io.File.applyReader(File.scala:82)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(SparkILoop.scala:677)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1$$anonfun$apply$mcV$sp$1.apply(SparkILoop.scala:677)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1$$anonfun$apply$mcV$sp$1.apply(SparkILoop.scala:677)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$savingReplayStack(SparkILoop.scala:162)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1.apply$mcV$sp(SparkILoop.scala:676)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1.apply(SparkILoop.scala:676)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$interpretAllFrom$1.apply(SparkILoop.scala:676)
  at org.apache.spark.repl.SparkILoop.savingReader(SparkILoop.scala:167)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$interpretAllFrom(SparkILoop.scala:675)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$loadCommand$1.apply(SparkILoop.scala:740)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$loadCommand$1.apply(SparkILoop.scala:739)
  at org.apache.spark.repl.SparkILoop.withFile(SparkILoop.scala:733)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loadCommand(SparkILoop.scala:739)
  at org.apache.spark.repl.SparkILoop$$anonfun$standardCommands$7.apply(SparkILoop.scala:344)
  at org.apache.spark.repl.SparkILoop$$anonfun$standardCommands$7.apply(SparkILoop.scala:344)
  at scala.tools.nsc.interpreter.LoopCommands$LineCmd.apply(LoopCommands.scala:81)
  at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:809)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$loadFiles$1.apply(SparkILoop.scala:910)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$loadFiles$1.apply(SparkILoop.scala:908)
  at scala.collection.immutable.List.foreach(List.scala:318)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loadFiles(SparkILoop.scala:908)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:995)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
  at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
  at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
  at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
  at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1059)
  at org.apache.spark.repl.Main$.main(Main.scala:31)
  at org.apache.spark.repl.Main.main(Main.scala)
  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
  at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:497)
  at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:665)
  at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:170)
  at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:193)
  at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:112)
  at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
15/08/06 10:19:18 INFO SparkContext: Invoking stop() from shutdown hook
15/08/06 10:19:18 INFO SparkUI: Stopped Spark web UI at http://192.168.1.93:4040
15/08/06 10:19:18 INFO DAGScheduler: Stopping DAGScheduler
15/08/06 10:19:18 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
15/08/06 10:19:18 INFO Utils: path = /private/var/folders/wm/m8qq8wl94jg1m2wrw8tqkhcc0000gn/T/spark-0933a4c6-0c09-4c76-825e-c9de122e7d82/blockmgr-33de4e9a-f4bb-4b8d-9066-5fef9c3134e2, already present as root for deletion.
15/08/06 10:19:18 INFO MemoryStore: MemoryStore cleared
15/08/06 10:19:18 INFO BlockManager: BlockManager stopped
15/08/06 10:19:18 INFO BlockManagerMaster: BlockManagerMaster stopped
15/08/06 10:19:18 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
15/08/06 10:19:18 INFO SparkContext: Successfully stopped SparkContext
15/08/06 10:19:18 INFO Utils: Shutdown hook called
15/08/06 10:19:18 INFO Utils: Deleting directory /private/var/folders/wm/m8qq8wl94jg1m2wrw8tqkhcc0000gn/T/spark-0933a4c6-0c09-4c76-825e-c9de122e7d82
15/08/06 10:19:18 INFO Utils: Deleting directory /private/var/folders/wm/m8qq8wl94jg1m2wrw8tqkhcc0000gn/T/spark-79111ed2-2331-4f1f-ab1f-f8ee798871b5
15/08/06 10:19:18 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
15/08/06 10:19:18 INFO RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports.
15/08/06 10:19:18 INFO RemoteActorRefProvider$RemotingTerminator: Remoting shut down.
Gregs-MacBook-Pro:spark-scripts grahn$ aws s3 ls s3://spark-temp-dir/19380690-5730-451a-b741-8245b27da674/
2015-08-06 10:19:15        335 0000_part_00
2015-08-06 10:19:15        338 0001_part_00
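Note the mismatch: show() fails with "Input path does not exist" on the tempdir, yet aws s3 ls shows the unloaded files are sitting right there. My working assumption (not confirmed anywhere in this gist) is the URI scheme: with stock Apache Hadoop 2.7 jars, s3:// resolves to the legacy block-based S3FileSystem, which cannot list objects written natively to S3, while s3n:// uses the native filesystem. A sketch of that workaround, assuming the AWS credentials are exported in the shell's environment (fs.s3n.awsAccessKeyId and fs.s3n.awsSecretAccessKey are standard Hadoop configuration keys):

// Point Hadoop's native S3 filesystem at the same bucket and retry;
// sys.env(...) throws if the variables are not set in the environment.
sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", sys.env("AWS_ACCESS_KEY_ID"))
sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", sys.env("AWS_SECRET_ACCESS_KEY"))
// Same read as 1-rs.scala, but with an s3n:// tempdir instead of s3://
val t2 = sqlContext.read.format("com.databricks.spark.redshift").
  option("jdbcdriver", driver).
  option("url", url).
  option("dbtable", "emp").
  option("tempdir", "s3n://spark-temp-dir").
  load()
t2.show()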