2018-07-11 08:31:09 INFO TaskSetManager:54 - Lost task 165.3 in stage 7.0 (TID 857) on 10.0.2.10, executor 2: java.lang.RuntimeException (Training failed due to exception in ParallelWrapper fit operation) [duplicate 22]
2018-07-11 08:31:09 INFO DAGScheduler:54 - Job 6 failed: treeAggregate at SharedTrainingMaster.java:611, took 26.675859 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 7.0 failed 4 times, most recent failure: Lost task 133.3 in stage 7.0 (TID 855, 10.0.2.10, executor 2): java.lang.RuntimeException: Training failed due to exception in ParallelWrapper fit operation
    at org.deeplearning4j.spark.parameterserver.pw.SharedTrainingWrapper.run(SharedTrainingWrapper.java:398)
    at org.deeplearning4j.spark.parameterserver.functions.SharedFlatMapPathsAdapter.call(SharedFlatMapPaths.java:49)
    at org.deeplearning4j.spark.parameterserver.functions.SharedFlatMapPathsAdapter.call(SharedFlatMapPaths.java:27)
    at org.datavec.spark.transform.BaseFlatMapFunctionAdaptee.call(BaseFlatMapFunctionAdaptee.java:24)
    at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:153)
    at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:153)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Cannot allocate new LongPointer(9): totalBytes = 2438M, physicalBytes = 21868M
    at org.bytedeco.javacpp.LongPointer.<init>(LongPointer.java:75)
    at org.nd4j.linalg.cpu.nativecpu.ops.NativeOpExecutioner.exec(NativeOpExecutioner.java:1714)
    at org.nd4j.linalg.convolution.Convolution.im2col(Convolution.java:215)
    at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.preOutput(ConvolutionLayer.java:359)
    at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.activate(ConvolutionLayer.java:403)
    at org.deeplearning4j.nn.graph.vertex.impl.LayerVertex.doForward(LayerVertex.java:112)
    at org.deeplearning4j.nn.graph.ComputationGraph.ffToLayerActivationsInWS(ComputationGraph.java:2062)
    at org.deeplearning4j.nn.graph.ComputationGraph.computeGradientAndScore(ComputationGraph.java:1408)
    at org.deeplearning4j.nn.graph.ComputationGraph.computeGradientAndScore(ComputationGraph.java:1377)
    at org.deeplearning4j.optimize.solvers.BaseOptimizer.gradientAndScore(BaseOptimizer.java:162)
    at org.deeplearning4j.optimize.solvers.StochasticGradientDescent.optimize(StochasticGradientDescent.java:53)
    at org.deeplearning4j.optimize.Solver.optimize(Solver.java:54)
    at org.deeplearning4j.nn.graph.ComputationGraph.fitHelper(ComputationGraph.java:1200)
    at org.deeplearning4j.nn.graph.ComputationGraph.fit(ComputationGraph.java:1149)
    at org.deeplearning4j.nn.graph.ComputationGraph.fit(ComputationGraph.java:1136)
    at org.deeplearning4j.nn.graph.ComputationGraph.fit(ComputationGraph.java:1016)
    at org.deeplearning4j.parallelism.trainer.DefaultTrainer.fit(DefaultTrainer.java:213)
    at org.deeplearning4j.parallelism.trainer.DefaultTrainer.run(DefaultTrainer.java:333)
    ... 3 more
Caused by: java.lang.OutOfMemoryError: Physical memory usage is too high: physicalBytes (21868M) > maxPhysicalBytes (21846M)
    at org.bytedeco.javacpp.Pointer.deallocator(Pointer.java:587)
    at org.bytedeco.javacpp.Pointer.init(Pointer.java:124)
    at org.bytedeco.javacpp.LongPointer.allocateArray(Native Method)
    at org.bytedeco.javacpp.LongPointer.<init>(LongPointer.java:68)
    ... 20 more
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2131)
    at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1092)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.fold(RDD.scala:1086)
    at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1155)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1131)
    at org.apache.spark.api.java.JavaRDDLike$class.treeAggregate(JavaRDDLike.scala:439)
    at org.apache.spark.api.java.AbstractJavaRDDLike.treeAggregate(JavaRDDLike.scala:45)
    at org.deeplearning4j.spark.parameterserver.training.SharedTrainingMaster.processResults(SharedTrainingMaster.java:611)
    at org.deeplearning4j.spark.parameterserver.training.SharedTrainingMaster.doIterationPaths(SharedTrainingMaster.java:857)
    at org.deeplearning4j.spark.parameterserver.training.SharedTrainingMaster.executeTrainingPathsHelper(SharedTrainingMaster.java:373)
    at org.deeplearning4j.spark.parameterserver.training.SharedTrainingMaster.executeTrainingPaths(SharedTrainingMaster.java:540)
    at org.deeplearning4j.spark.impl.graph.SparkComputationGraph.fitPaths(SparkComputationGraph.java:211)
    at org.deeplearning4j.distributed.SparkTrainingExperimentDriver.entryPoint(SparkTrainingExperimentDriver.java:315)
    at org.deeplearning4j.distributed.SparkTrainingExperimentDriver.main(SparkTrainingExperimentDriver.java:164)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:894)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.RuntimeException: Training failed due to exception in ParallelWrapper fit operation
    at org.deeplearning4j.spark.parameterserver.pw.SharedTrainingWrapper.run(SharedTrainingWrapper.java:398)
    at org.deeplearning4j.spark.parameterserver.functions.SharedFlatMapPathsAdapter.call(SharedFlatMapPaths.java:49)
    at org.deeplearning4j.spark.parameterserver.functions.SharedFlatMapPathsAdapter.call(SharedFlatMapPaths.java:27)
    at org.datavec.spark.transform.BaseFlatMapFunctionAdaptee.call(BaseFlatMapFunctionAdaptee.java:24)
    at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:153)
    at org.apache.spark.api.java.JavaRDDLike$$anonfun$fn$4$1.apply(JavaRDDLike.scala:153)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: Cannot allocate new LongPointer(9): totalBytes = 2438M, physicalBytes = 21868M
    at org.bytedeco.javacpp.LongPointer.<init>(LongPointer.java:75)
    at org.nd4j.linalg.cpu.nativecpu.ops.NativeOpExecutioner.exec(NativeOpExecutioner.java:1714)
    at org.nd4j.linalg.convolution.Convolution.im2col(Convolution.java:215)
    at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.preOutput(ConvolutionLayer.java:359)
    at org.deeplearning4j.nn.layers.convolution.ConvolutionLayer.activate(ConvolutionLayer.java:403)
    at org.deeplearning4j.nn.graph.vertex.impl.LayerVertex.doForward(LayerVertex.java:112)
    at org.deeplearning4j.nn.graph.ComputationGraph.ffToLayerActivationsInWS(ComputationGraph.java:2062)
    at org.deeplearning4j.nn.graph.ComputationGraph.computeGradientAndScore(ComputationGraph.java:1408)
    at org.deeplearning4j.nn.graph.ComputationGraph.computeGradientAndScore(ComputationGraph.java:1377)
    at org.deeplearning4j.optimize.solvers.BaseOptimizer.gradientAndScore(BaseOptimizer.java:162)
    at org.deeplearning4j.optimize.solvers.StochasticGradientDescent.optimize(StochasticGradientDescent.java:53)
    at org.deeplearning4j.optimize.Solver.optimize(Solver.java:54)
    at org.deeplearning4j.nn.graph.ComputationGraph.fitHelper(ComputationGraph.java:1200)
    at org.deeplearning4j.nn.graph.ComputationGraph.fit(ComputationGraph.java:1149)
    at org.deeplearning4j.nn.graph.ComputationGraph.fit(ComputationGraph.java:1136)
    at org.deeplearning4j.nn.graph.ComputationGraph.fit(ComputationGraph.java:1016)
    at org.deeplearning4j.parallelism.trainer.DefaultTrainer.fit(DefaultTrainer.java:213)
    at org.deeplearning4j.parallelism.trainer.DefaultTrainer.run(DefaultTrainer.java:333)
    ... 3 more
Caused by: java.lang.OutOfMemoryError: Physical memory usage is too high: physicalBytes (21868M) > maxPhysicalBytes (21846M)
    at org.bytedeco.javacpp.Pointer.deallocator(Pointer.java:587)
    at org.bytedeco.javacpp.Pointer.init(Pointer.java:124)
    at org.bytedeco.javacpp.LongPointer.allocateArray(Native Method)
    at org.bytedeco.javacpp.LongPointer.<init>(LongPointer.java:68)
    ... 20 more
2018-07-11 08:31:09 WARN TaskSetManager:66 - Lost task 134.3 in stage 7.0 (TID 858, 10.0.2.10, executor 2): TaskKilled (Stage cancelled)
2018-07-11 08:31:10 WARN TaskSetManager:66 - Lost task 127.0 in stage 7.0 (TID 811, 10.0.2.9, executor 0): TaskKilled (Stage cancelled)
2018-07-11 08:31:10 WARN TaskSetManager:66 - Lost task 149.0 in stage 7.0 (TID 833, 10.0.2.9, executor 0): TaskKilled (Stage cancelled)
2018-07-11 08:31:10 WARN TaskSetManager:66 - Lost task 148.1 in stage 7.0 (TID 842, 10.0.2.9, executor 0): TaskKilled (Stage cancelled)
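Note on the root cause: the failure is not a Java heap OOM but JavaCPP's off-heap allocator hitting its physical-memory ceiling (physicalBytes (21868M) > maxPhysicalBytes (21846M)) while im2col allocates during the convolution forward pass. The usual mitigation for ND4J/DL4J on Spark is to cap off-heap memory and raise the physical-bytes limit via the `org.bytedeco.javacpp.maxbytes` and `org.bytedeco.javacpp.maxphysicalbytes` system properties on each executor JVM. Below is a minimal sketch of how that configuration might look when building the SparkConf programmatically; the class name and all memory sizes are illustrative, not taken from this job, and must be tuned so heap + off-heap stays under the executor's actual RAM:

```java
import org.apache.spark.SparkConf;

// Hypothetical driver class, shown only to illustrate where the
// JavaCPP memory limits would be set for a DL4J-on-Spark job.
public class MemoryConfigSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
            // On-heap size for each executor JVM (illustrative value).
            .set("spark.executor.memory", "8g")
            // Off-heap limits enforced by JavaCPP on each executor:
            // maxbytes caps ND4J's off-heap allocations; maxphysicalbytes
            // is the hard ceiling whose breach produced the OOM above.
            // Keep heap (8g) + maxbytes (10G) below maxphysicalbytes (20G),
            // and maxphysicalbytes below the machine/container limit.
            .set("spark.executor.extraJavaOptions",
                 "-Dorg.bytedeco.javacpp.maxbytes=10G"
               + " -Dorg.bytedeco.javacpp.maxphysicalbytes=20G");
        // ... construct the JavaSparkContext and SparkComputationGraph
        // from 'conf' as usual.
    }
}
```

With spark-submit, the same properties can be passed via `--conf spark.executor.extraJavaOptions=...` (and `spark.driver.extraJavaOptions` for the driver) instead of setting them in code.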