Created
April 30, 2019 21:40
-
-
Save zmjjmz/0af5cf9b059b4969951e825565e266aa to your computer and use it in GitHub Desktop.
from_json nightmare
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
An error occurred while calling o4971.count. | |
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 255.0 failed 4 times, most recent failure: Lost task 3.3 in stage 255.0 (TID 840, ip-172-32-98-36.ec2.internal, executor 1): java.lang.ClassCastException: java.lang.Boolean cannot be cast to java.lang.String | |
at org.apache.spark.sql.catalyst.json.JSONOptions$$anonfun$27.apply(JSONOptions.scala:84) | |
at scala.Option.map(Option.scala:146) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:84) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:43) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser$lzycompute(jsonExpressions.scala:555) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser(jsonExpressions.scala:552) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.nullSafeEval(jsonExpressions.scala:585) | |
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:331) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:102) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:92) | |
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969) | |
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760) | |
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) | |
at org.apache.spark.scheduler.Task.run(Task.scala:108) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) | |
at java.lang.Thread.run(Thread.java:748) | |
Driver stacktrace: | |
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504) | |
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) | |
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) | |
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) | |
at scala.Option.foreach(Option.scala:257) | |
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676) | |
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) | |
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094) | |
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) | |
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) | |
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) | |
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) | |
at org.apache.spark.rdd.RDD.collect(RDD.scala:935) | |
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) | |
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2435) | |
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2434) | |
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2842) | |
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) | |
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2841) | |
at org.apache.spark.sql.Dataset.count(Dataset.scala:2434) | |
at sun.reflect.GeneratedMethodAccessor65.invoke(Unknown Source) | |
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) | |
at java.lang.reflect.Method.invoke(Method.java:498) | |
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) | |
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) | |
at py4j.Gateway.invoke(Gateway.java:280) | |
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) | |
at py4j.commands.CallCommand.execute(CallCommand.java:79) | |
at py4j.GatewayConnection.run(GatewayConnection.java:214) | |
at java.lang.Thread.run(Thread.java:748) | |
Caused by: java.lang.ClassCastException: java.lang.Boolean cannot be cast to java.lang.String | |
at org.apache.spark.sql.catalyst.json.JSONOptions$$anonfun$27.apply(JSONOptions.scala:84) | |
at scala.Option.map(Option.scala:146) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:84) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:43) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser$lzycompute(jsonExpressions.scala:555) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser(jsonExpressions.scala:552) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.nullSafeEval(jsonExpressions.scala:585) | |
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:331) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:102) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:92) | |
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969) | |
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760) | |
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) | |
at org.apache.spark.scheduler.Task.run(Task.scala:108) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) | |
... 1 more | |
Traceback (most recent call last): | |
File "/mnt/yarn/usercache/livy/appcache/application_1555975660097_0024/container_1555975660097_0024_01_000001/pyspark.zip/pyspark/sql/dataframe.py", line 427, in count | |
return int(self._jdf.count()) | |
File "/mnt/yarn/usercache/livy/appcache/application_1555975660097_0024/container_1555975660097_0024_01_000001/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__ | |
answer, self.gateway_client, self.target_id, self.name) | |
File "/mnt/yarn/usercache/livy/appcache/application_1555975660097_0024/container_1555975660097_0024_01_000001/pyspark.zip/pyspark/sql/utils.py", line 63, in deco | |
return f(*a, **kw) | |
File "/mnt/yarn/usercache/livy/appcache/application_1555975660097_0024/container_1555975660097_0024_01_000001/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value | |
format(target_id, ".", name), value) | |
Py4JJavaError: An error occurred while calling o4971.count. | |
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 255.0 failed 4 times, most recent failure: Lost task 3.3 in stage 255.0 (TID 840, ip-172-32-98-36.ec2.internal, executor 1): java.lang.ClassCastException: java.lang.Boolean cannot be cast to java.lang.String | |
at org.apache.spark.sql.catalyst.json.JSONOptions$$anonfun$27.apply(JSONOptions.scala:84) | |
at scala.Option.map(Option.scala:146) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:84) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:43) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser$lzycompute(jsonExpressions.scala:555) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser(jsonExpressions.scala:552) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.nullSafeEval(jsonExpressions.scala:585) | |
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:331) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:102) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:92) | |
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969) | |
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760) | |
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) | |
at org.apache.spark.scheduler.Task.run(Task.scala:108) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) | |
at java.lang.Thread.run(Thread.java:748) | |
Driver stacktrace: | |
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1505) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1504) | |
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) | |
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) | |
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1504) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814) | |
at scala.Option.foreach(Option.scala:257) | |
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1732) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676) | |
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) | |
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2029) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2050) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2069) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2094) | |
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936) | |
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) | |
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) | |
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) | |
at org.apache.spark.rdd.RDD.collect(RDD.scala:935) | |
at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:278) | |
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2435) | |
at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2434) | |
at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2842) | |
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) | |
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2841) | |
at org.apache.spark.sql.Dataset.count(Dataset.scala:2434) | |
at sun.reflect.GeneratedMethodAccessor65.invoke(Unknown Source) | |
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) | |
at java.lang.reflect.Method.invoke(Method.java:498) | |
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) | |
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) | |
at py4j.Gateway.invoke(Gateway.java:280) | |
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) | |
at py4j.commands.CallCommand.execute(CallCommand.java:79) | |
at py4j.GatewayConnection.run(GatewayConnection.java:214) | |
at java.lang.Thread.run(Thread.java:748) | |
Caused by: java.lang.ClassCastException: java.lang.Boolean cannot be cast to java.lang.String | |
at org.apache.spark.sql.catalyst.json.JSONOptions$$anonfun$27.apply(JSONOptions.scala:84) | |
at scala.Option.map(Option.scala:146) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:84) | |
at org.apache.spark.sql.catalyst.json.JSONOptions.<init>(JSONOptions.scala:43) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser$lzycompute(jsonExpressions.scala:555) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.parser(jsonExpressions.scala:552) | |
at org.apache.spark.sql.catalyst.expressions.JsonToStructs.nullSafeEval(jsonExpressions.scala:585) | |
at org.apache.spark.sql.catalyst.expressions.UnaryExpression.eval(Expression.scala:331) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) | |
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:102) | |
at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:92) | |
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038) | |
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969) | |
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029) | |
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760) | |
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96) | |
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) | |
at org.apache.spark.scheduler.Task.run(Task.scala:108) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) | |
... 1 more |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import Row
from pyspark.sql.functions import from_json

# `spark` is the SparkSession supplied by the notebook environment
# (SparkMagic/Livy); it is not constructed in this snippet.

# Ad-hoc two-column row: a raw JSON string plus an unrelated id column.
unparsed_schema = Row('uber', 'userid')

# Multi-line JSON payload — spans several lines, hence the need for the
# multiLine parsing option below.
uber_str_modified = """
{
"a":2,
"b":3
}
"""
print(uber_str_modified)

uber_source = spark.createDataFrame([unparsed_schema(
    uber_str_modified,
    12345  # unimportant, there to illustrate the schema changes
)])

# Infer the payload's schema by parsing the JSON column directly with the
# DataFrameReader. Note: reader .option() stringifies Python values, so a
# bare Python True is accepted here.
uber_source_parsed_direct = spark.read.option('multiLine', True).json(
    uber_source.rdd.map(lambda row: row.uber))
uber_schema = uber_source_parsed_direct.schema

# FIX: from_json's options map is Map[String, String] on the JVM side.
# Passing a Python bool (True) made py4j ship a java.lang.Boolean, and
# JSONOptions threw "java.lang.Boolean cannot be cast to java.lang.String"
# at evaluation time (see the stack trace above). Unlike reader options,
# from_json does NOT stringify values for you — pass the string 'true'.
uber_source_parsed = uber_source.withColumn(
    'parsed_uber', from_json(uber_source.uber, uber_schema,
                             options={'multiLine': 'true'})
).cache()

uber_source_parsed_direct.printSchema()  # this works
print(uber_source_parsed_direct.count())  # 1
uber_source_parsed.printSchema()  # this works
print(uber_source_parsed.count())  # no longer raises ClassCastException
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment