PySpark error log when creating a DataFrame containing an empty bytearray (Spark 2.0.2, Python 3.5.2, Windows).
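
For context, a minimal standalone script that reproduces the failure shown in the session below (a sketch, assuming a local Spark 2.0.x installation; the app name and master are arbitrary choices, not part of the original session):

from pyspark.sql import SparkSession
from py4j.protocol import Py4JJavaError

spark = SparkSession.builder.master('local[*]').appName('empty-bytearray-repro').getOrCreate()

# Schema inference types the column as binary, as in the session below.
df = spark.createDataFrame([[bytearray(b'')]], schema=['binaryCol'])

try:
    df.collect()  # fails: net.razorvine.pickle.PickleException, surfaced as Py4JJavaError
except Py4JJavaError as e:
    print('collect() failed:', e.java_exception.getMessage())

# A non-empty bytearray round-trips fine (see the end of the log):
df2 = spark.createDataFrame([[bytearray(b'0')]], schema=['binaryCol'])
print(df2.collect())  # [Row(binaryCol=bytearray(b'0'))]

spark.stop()

The captured session follows.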
λ pyspark
Python 3.5.2 |Continuum Analytics, Inc.| (default, Jul 5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel).
17/03/09 13:23:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.2
      /_/

Using Python version 3.5.2 (default, Jul 5 2016 11:41:13)
SparkSession available as 'spark'.
>>> from pyspark.sql import SQLContext
>>> sqlC = SQLContext(sc)
>>> rows = [[bytearray(b'')]]
>>> rows
[[bytearray(b'')]]
>>> cols = ['binaryCol']
>>> DF = sqlC.createDataFrame(rows, schema=cols)
>>> DF
DataFrame[binaryCol: binary]
>>> DF.collect()
[Stage 0:> (0 + 8) / 8]17/03/09 14:13:16 ERROR Executor: Exception in task 7.0 in stage 0.0 (TID 7)
net.razorvine.pickle.PickleException: invalid pickle data for bytearray; expected 1 or 2 args, got 0
    at net.razorvine.pickle.objects.ByteArrayConstructor.construct(ByteArrayConstructor.java:20)
    at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
    at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
    at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
    at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:152)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:151)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
17/03/09 14:13:16 WARN TaskSetManager: Lost task 7.0 in stage 0.0 (TID 7, localhost): net.razorvine.pickle.PickleException: invalid pickle data for bytearray; expected 1 or 2 args, got 0
    at net.razorvine.pickle.objects.ByteArrayConstructor.construct(ByteArrayConstructor.java:20)
    at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
    at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
    at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
    at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:152)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:151)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
17/03/09 14:13:16 ERROR TaskSetManager: Task 7 in stage 0.0 failed 1 times; aborting job
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Users\t-lupick\Dev\spark-2.0.2-bin-hadoop2.7\python\pyspark\sql\dataframe.py", line 310, in collect
    port = self._jdf.collectToPython()
  File "C:\Users\t-lupick\Dev\spark-2.0.2-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\java_gateway.py", line 1133, in __call__
  File "C:\Users\t-lupick\Dev\spark-2.0.2-bin-hadoop2.7\python\pyspark\sql\utils.py", line 63, in deco
    return f(*a, **kw)
  File "C:\Users\t-lupick\Dev\spark-2.0.2-bin-hadoop2.7\python\lib\py4j-0.10.3-src.zip\py4j\protocol.py", line 319, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o42.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 0.0 failed 1 times, most recent failure: Lost task 7.0 in stage 0.0 (TID 7, localhost): net.razorvine.pickle.PickleException: invalid pickle data for bytearray; expected 1 or 2 args, got 0
    at net.razorvine.pickle.objects.ByteArrayConstructor.construct(ByteArrayConstructor.java:20)
    at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
    at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
    at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
    at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:152)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:151)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1873)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1886)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1899)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1913)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:290)
    at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply$mcI$sp(Dataset.scala:2526)
    at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2523)
    at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:2523)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
    at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2546)
    at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:2523)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
    at java.lang.reflect.Method.invoke(Unknown Source)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:280)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:214)
    at java.lang.Thread.run(Unknown Source)
Caused by: net.razorvine.pickle.PickleException: invalid pickle data for bytearray; expected 1 or 2 args, got 0
    at net.razorvine.pickle.objects.ByteArrayConstructor.construct(ByteArrayConstructor.java:20)
    at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:707)
    at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:175)
    at net.razorvine.pickle.Unpickler.load(Unpickler.java:99)
    at net.razorvine.pickle.Unpickler.loads(Unpickler.java:112)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:152)
    at org.apache.spark.api.python.SerDeUtil$$anonfun$pythonToJava$1$$anonfun$apply$1.apply(SerDeUtil.scala:151)
    at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
    at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:246)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$4.apply(SparkPlan.scala:240)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:803)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    ... 1 more
>>> rows2 = [[bytearray(b'0')]]
>>> DF2 = sqlC.createDataFrame(rows2, schema=cols)
>>> DF2.collect()
[Row(binaryCol=bytearray(b'0'))]
>>>
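
The mismatch appears to come from how Python 3 pickles bytearrays: an empty bytearray reduces to a zero-argument constructor call, while a non-empty one reduces to one or two arguments, and Pyrolite's ByteArrayConstructor accepts only the one- and two-argument forms, which matches the "expected 1 or 2 args, got 0" message above. Until that is fixed upstream, one possible workaround (a sketch, not verified beyond the APIs already used in this log) is to map empty bytearrays to NULL before building the DataFrame:

from pyspark.sql.types import StructType, StructField, BinaryType

rows = [[bytearray(b'')], [bytearray(b'abc')]]

# Replace empty bytearrays with None; NULLs avoid the failing pickle path entirely.
safe_rows = [[v if v else None for v in row] for row in rows]

# An explicit nullable schema avoids inference over rows that may now hold None.
schema = StructType([StructField('binaryCol', BinaryType(), nullable=True)])
DF3 = sqlC.createDataFrame(safe_rows, schema)
print(DF3.collect())  # expected: [Row(binaryCol=None), Row(binaryCol=bytearray(b'abc'))]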