Created January 18, 2017 03:01
Save rjurney/ec0d6b1ef050e3fbead2314255f4b6fa to your computer and use it in GitHub Desktop.
Error when I write from Spark to Elasticsearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> on_time_dataframe.repartition(1).write.format("org.elasticsearch.spark.sql")\
...     .option("es.resource","agile_data_science/on_time_performance")\
...     .mode("overwrite")\
...     .save()
[Stage 5:============================================> (3 + 1) / 4]17/01/18 02:52:43 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 10) | |
org.elasticsearch.hadoop.EsHadoopException: Could not write all entries [1000/1000] (Maybe ES was overloaded?). Error sample (first [5] error messages): | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
Bailing out... | |
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:250) | |
at org.elasticsearch.hadoop.rest.RestRepository.doWriteToIndex(RestRepository.java:196) | |
at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:159) | |
at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:67) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:745) | |
17/01/18 02:52:43 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 10, localhost, executor driver): org.elasticsearch.hadoop.EsHadoopException: Could not write all entries [1000/1000] (Maybe ES was overloaded?). Error sample (first [5] error messages): | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
Bailing out... | |
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:250) | |
at org.elasticsearch.hadoop.rest.RestRepository.doWriteToIndex(RestRepository.java:196) | |
at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:159) | |
at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:67) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:745) | |
17/01/18 02:52:43 ERROR TaskSetManager: Task 0 in stage 6.0 failed 1 times; aborting job | |
Traceback (most recent call last): | |
File "<stdin>", line 3, in <module> | |
File "/home/ubuntu/spark/python/pyspark/sql/readwriter.py", line 548, in save | |
self._jwrite.save() | |
File "/home/ubuntu/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__ | |
File "/home/ubuntu/spark/python/pyspark/sql/utils.py", line 63, in deco | |
return f(*a, **kw) | |
File "/home/ubuntu/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py", line 319, in get_return_value | |
py4j.protocol.Py4JJavaError: An error occurred while calling o42.save. | |
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 1 times, most recent failure: Lost task 0.0 in stage 6.0 (TID 10, localhost, executor driver): org.elasticsearch.hadoop.EsHadoopException: Could not write all entries [1000/1000] (Maybe ES was overloaded?). Error sample (first [5] error messages): | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
Bailing out... | |
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:250) | |
at org.elasticsearch.hadoop.rest.RestRepository.doWriteToIndex(RestRepository.java:196) | |
at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:159) | |
at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:67) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:745) | |
Driver stacktrace: | |
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422) | |
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) | |
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) | |
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) | |
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) | |
at scala.Option.foreach(Option.scala:257) | |
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605) | |
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594) | |
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) | |
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931) | |
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951) | |
at org.elasticsearch.spark.sql.EsSparkSQL$.saveToEs(EsSparkSQL.scala:94) | |
at org.elasticsearch.spark.sql.ElasticsearchRelation.insert(DefaultSource.scala:503) | |
at org.elasticsearch.spark.sql.DefaultSource.createRelation(DefaultSource.scala:94) | |
at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:426) | |
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:215) | |
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) | |
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) | |
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) | |
at java.lang.reflect.Method.invoke(Method.java:498) | |
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) | |
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) | |
at py4j.Gateway.invoke(Gateway.java:280) | |
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) | |
at py4j.commands.CallCommand.execute(CallCommand.java:79) | |
at py4j.GatewayConnection.run(GatewayConnection.java:214) | |
at java.lang.Thread.run(Thread.java:745) | |
Caused by: org.elasticsearch.hadoop.EsHadoopException: Could not write all entries [1000/1000] (Maybe ES was overloaded?). Error sample (first [5] error messages): | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
[agile_data_science][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest to [agile_data_science] containing [1000] requests] | |
Bailing out... | |
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:250) | |
at org.elasticsearch.hadoop.rest.RestRepository.doWriteToIndex(RestRepository.java:196) | |
at org.elasticsearch.hadoop.rest.RestRepository.writeToIndex(RestRepository.java:159) | |
at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:67) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.elasticsearch.spark.sql.EsSparkSQL$$anonfun$saveToEs$1.apply(EsSparkSQL.scala:94) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
... 1 more |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.