Created
July 3, 2014 18:39
(no description provided)
Save quasiben/da0f4778fbc87d02c088 to your computer and use it in GitHub Desktop.
Anaconda Spark AMI Error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> sc.appName | |
u'PySparkShell' | |
>>> sc._conf.getAll() | |
[(u'spark.executor.extraLibraryPath', u'/root/ephemeral-hdfs/lib/native/'), (u'spark.executor.memory', u'6154m'), (u'spark.submit.pyFiles', u''), (u'spark.app.name', u'PySparkShell'), (u'spark.executor.extraClassPath', u'/root/ephemeral-hdfs/conf'), (u'spark.master', u'spark://XX-XXX-XXX-XXX.compute-1.amazonaws.com:7077')]
>>> file = sc.textFile("hdfs://XX-XXX-XXX-XXX.amazonaws.com:9000/user/root/chekhov.txt") | |
14/07/03 18:24:02 INFO storage.MemoryStore: ensureFreeSpace(34388) called with curMem=0, maxMem=309225062 | |
14/07/03 18:24:02 INFO storage.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 33.6 KB, free 294.9 MB) | |
>>> file.take(2) | |
14/07/03 18:24:08 WARN snappy.LoadSnappy: Snappy native library is available | |
14/07/03 18:24:08 INFO util.NativeCodeLoader: Loaded the native-hadoop library | |
14/07/03 18:24:08 INFO snappy.LoadSnappy: Snappy native library loaded | |
14/07/03 18:24:08 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
14/07/03 18:24:08 INFO spark.SparkContext: Starting job: runJob at PythonRDD.scala:290 | |
14/07/03 18:24:08 INFO scheduler.DAGScheduler: Got job 0 (runJob at PythonRDD.scala:290) with 1 output partitions (allowLocal=true) | |
14/07/03 18:24:08 INFO scheduler.DAGScheduler: Final stage: Stage 0(runJob at PythonRDD.scala:290) | |
14/07/03 18:24:08 INFO scheduler.DAGScheduler: Parents of final stage: List() | |
14/07/03 18:24:08 INFO scheduler.DAGScheduler: Missing parents: List() | |
14/07/03 18:24:08 INFO scheduler.DAGScheduler: Computing the requested partition locally | |
14/07/03 18:24:09 INFO rdd.HadoopRDD: Input split: hdfs://XX-XXX-XXX-XXX.compute-1.amazonaws.com:9000/user/root/chekhov.txt:0+205768 | |
14/07/03 18:24:09 INFO python.PythonRDD: Times: total = 639, boot = 571, init = 68, finish = 0 | |
14/07/03 18:24:09 INFO spark.SparkContext: Job finished: runJob at PythonRDD.scala:290, took 0.669292504 s | |
[u"Project Gutenberg's Plays by Chekhov, Second Series, by Anton Chekhov", u''] | |
>>> lines = file.filter(lambda x: len(x) > 0) | |
>>> lines.count() | |
14/07/03 18:25:46 WARN snappy.LoadSnappy: Snappy native library is available | |
14/07/03 18:25:46 INFO util.NativeCodeLoader: Loaded the native-hadoop library | |
14/07/03 18:25:46 INFO snappy.LoadSnappy: Snappy native library loaded | |
14/07/03 18:25:46 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
14/07/03 18:25:46 INFO spark.SparkContext: Starting job: count at <stdin>:1 | |
14/07/03 18:25:46 INFO scheduler.DAGScheduler: Got job 0 (count at <stdin>:1) with 2 output partitions (allowLocal=false) | |
14/07/03 18:25:46 INFO scheduler.DAGScheduler: Final stage: Stage 0(count at <stdin>:1) | |
14/07/03 18:25:46 INFO scheduler.DAGScheduler: Parents of final stage: List() | |
14/07/03 18:25:46 INFO scheduler.DAGScheduler: Missing parents: List() | |
14/07/03 18:25:46 INFO scheduler.DAGScheduler: Submitting Stage 0 (PythonRDD[2] at RDD at PythonRDD.scala:40), which has no missing parents | |
14/07/03 18:25:46 INFO scheduler.DAGScheduler: Submitting 2 missing tasks from Stage 0 (PythonRDD[2] at RDD at PythonRDD.scala:40) | |
14/07/03 18:25:46 INFO scheduler.TaskSchedulerImpl: Adding task set 0.0 with 2 tasks | |
14/07/03 18:25:46 INFO scheduler.TaskSetManager: Starting task 0.0:0 as TID 0 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:46 INFO scheduler.TaskSetManager: Serialized task 0.0:0 as 3289 bytes in 3 ms | |
14/07/03 18:25:46 INFO scheduler.TaskSetManager: Starting task 0.0:1 as TID 1 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:46 INFO scheduler.TaskSetManager: Serialized task 0.0:1 as 3289 bytes in 0 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 0 (task 0.0:0) | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Loss was due to org.apache.spark.api.python.PythonException | |
org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/root/spark/python/pyspark/worker.py", line 77, in main | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 286, in func | |
def func(s, iterator): return f(iterator) | |
File "/root/spark/python/pyspark/rdd.py", line 774, in <lambda> | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "/root/spark/python/pyspark/rdd.py", line 774, in <genexpr> | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "<stdin>", line 1, in <lambda> | |
TypeError: __import__() argument 1 must be string, not tuple | |
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:118) | |
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:148) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:81) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:261) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:228) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:112) | |
at org.apache.spark.scheduler.Task.run(Task.scala:51) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) | |
at java.lang.Thread.run(Thread.java:744) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Starting task 0.0:0 as TID 2 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Serialized task 0.0:0 as 3289 bytes in 0 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 1 (task 0.0:1) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Loss was due to org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/root/spark/python/pyspark/worker.py", line 77, in main | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 286, in func | |
def func(s, iterator): return f(iterator) | |
File "/root/spark/python/pyspark/rdd.py", line 774, in <lambda> | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "/root/spark/python/pyspark/rdd.py", line 774, in <genexpr> | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "<stdin>", line 1, in <lambda> | |
TypeError: __import__() argument 1 must be string, not tuple | |
[duplicate 1] | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Starting task 0.0:1 as TID 3 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Serialized task 0.0:1 as 3289 bytes in 1 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 2 (task 0.0:0) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Loss was due to org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/root/spark/python/pyspark/worker.py", line 77, in main | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 1532, in pipeline_func | |
return func(split, prev_func(split, iterator)) | |
File "/root/spark/python/pyspark/rdd.py", line 286, in func | |
def func(s, iterator): return f(iterator) | |
File "/root/spark/python/pyspark/rdd.py", line 774, in <lambda> | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "/root/spark/python/pyspark/rdd.py", line 774, in <genexpr> | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "<stdin>", line 1, in <lambda> | |
TypeError: __import__() argument 1 must be string, not tuple | |
[duplicate 2] | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Starting task 0.0:0 as TID 4 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Serialized task 0.0:0 as 3289 bytes in 1 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 3 (task 0.0:1) | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Loss was due to java.net.SocketException | |
java.net.SocketException: Connection reset | |
at java.net.SocketInputStream.read(SocketInputStream.java:196) | |
at java.net.SocketInputStream.read(SocketInputStream.java:122) | |
at java.io.BufferedInputStream.fill(BufferedInputStream.java:235) | |
at java.io.BufferedInputStream.read(BufferedInputStream.java:254) | |
at java.io.DataInputStream.readInt(DataInputStream.java:387) | |
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:95) | |
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:148) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:81) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:261) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:228) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:112) | |
at org.apache.spark.scheduler.Task.run(Task.scala:51) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) | |
at java.lang.Thread.run(Thread.java:744) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Starting task 0.0:1 as TID 5 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Serialized task 0.0:1 as 3289 bytes in 0 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 5 (task 0.0:1) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Loss was due to java.net.SocketException: Connection reset [duplicate 1] | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Starting task 0.0:1 as TID 6 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Serialized task 0.0:1 as 3289 bytes in 1 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 4 (task 0.0:0) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Loss was due to java.net.SocketException: Connection reset [duplicate 2] | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Starting task 0.0:0 as TID 7 on executor 0: ip-XX-XXX-XX-XX.ec2.internal (NODE_LOCAL) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Serialized task 0.0:0 as 3289 bytes in 1 ms | |
14/07/03 18:25:48 WARN scheduler.TaskSetManager: Lost TID 6 (task 0.0:1) | |
14/07/03 18:25:48 INFO scheduler.TaskSetManager: Loss was due to java.net.SocketException: Connection reset [duplicate 3] | |
14/07/03 18:25:48 ERROR scheduler.TaskSetManager: Task 0.0:1 failed 4 times; aborting job | |
14/07/03 18:25:48 INFO scheduler.TaskSchedulerImpl: Cancelling stage 0 | |
14/07/03 18:25:48 INFO scheduler.TaskSchedulerImpl: Stage 0 was cancelled | |
14/07/03 18:25:48 INFO scheduler.DAGScheduler: Failed to run count at <stdin>:1 | |
Traceback (most recent call last): | |
File "<stdin>", line 1, in <module> | |
File "/root/spark/python/pyspark/rdd.py", line 774, in count | |
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() | |
File "/root/spark/python/pyspark/rdd.py", line 765, in sum | |
return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) | |
File "/root/spark/python/pyspark/rdd.py", line 685, in reduce | |
vals = self.mapPartitions(func).collect() | |
File "/root/spark/python/pyspark/rdd.py", line 649, in collect | |
bytesInJava = self._jrdd.collect().iterator() | |
File "/root/spark/python/lib/py4j-0.8.1-src.zip/py4j/java_gateway.py", line 537, in __call__ | |
File "/root/spark/python/lib/py4j-0.8.1-src.zip/py4j/protocol.py", line 300, in get_return_value | |
py4j.protocol.Py4JJavaError14/07/03 18:25:48 WARN scheduler.TaskSetManager: Loss was due to org.apache.spark.SparkException | |
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed) | |
at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:144) | |
at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:148) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:81) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:261) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:228) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:112) | |
at org.apache.spark.scheduler.Task.run(Task.scala:51) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) | |
at java.lang.Thread.run(Thread.java:744) | |
14/07/03 18:25:48 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool | |
: An error occurred while calling o24.collect. | |
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0:1 failed 4 times, most recent failure: Exception failure in TID 6 on host ip-XX-XXX-XX-XX.ec2.internal: java.net.SocketException: Connection reset | |
java.net.SocketInputStream.read(SocketInputStream.java:196) | |
java.net.SocketInputStream.read(SocketInputStream.java:122) | |
java.io.BufferedInputStream.fill(BufferedInputStream.java:235) | |
java.io.BufferedInputStream.read(BufferedInputStream.java:254) | |
java.io.DataInputStream.readInt(DataInputStream.java:387) | |
org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:95) | |
org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:148) | |
org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:81) | |
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:261) | |
org.apache.spark.rdd.RDD.iterator(RDD.scala:228) | |
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:112) | |
org.apache.spark.scheduler.Task.run(Task.scala:51) | |
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187) | |
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) | |
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) | |
java.lang.Thread.run(Thread.java:744) | |
Driver stacktrace: | |
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1041) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1025) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1023) | |
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) | |
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) | |
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1023) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:631) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:631) | |
at scala.Option.foreach(Option.scala:236) | |
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:631) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1226) | |
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498) | |
at akka.actor.ActorCell.invoke(ActorCell.scala:456) | |
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237) | |
at akka.dispatch.Mailbox.run(Mailbox.scala:219) | |
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) | |
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) | |
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) | |
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) | |
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.