Gist nomoa/2d630ad915af160d5f37ebd0af2fe86c — "mjolnir tests"
Created July 6, 2017 16:41
Save nomoa/2d630ad915af160d5f37ebd0af2fe86c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters.
vagrant@mjolnir:/vagrant$ venv/bin/tox | |
GLOB sdist-make: /vagrant/setup.py | |
flake8 inst-nodeps: /vagrant/.tox/dist/mjolnir-0.0.1.zip | |
flake8 installed: certifi==2017.4.17,chardet==3.0.4,clickmodels==1.0.2,configparser==3.5.0,enum34==1.1.6,flake8==3.3.0,idna==2.5,mccabe==0.6.1,mjolnir==0.0.1,pycodestyle==2.3.1,pyflakes==1.5.0,requests==2.18.1,urllib3==1.21.1 | |
flake8 runtests: PYTHONHASHSEED='1926447621' | |
flake8 runtests: commands[0] | flake8 mjolnir/ | |
pytest inst-nodeps: /vagrant/.tox/dist/mjolnir-0.0.1.zip | |
pytest installed: certifi==2017.4.17,chardet==3.0.4,clickmodels==1.0.2,configparser==3.5.0,decorator==4.0.11,enum34==1.1.6,findspark==1.1.0,flake8==3.3.0,future==0.16.0,hyperopt==0.1,idna==2.5,kafka==1.3.3,mccabe==0.6.1,mjolnir==0.0.1,networkx==1.11,nose==1.3.7,numpy==1.13.0,pluggy==0.4.0,py==1.4.34,py4j==0.10.6,pycodestyle==2.3.1,pyflakes==1.5.0,pymongo==3.4.0,pytest==3.1.3,requests==2.18.1,scipy==0.19.1,six==1.10.0,tox==2.7.0,urllib3==1.21.1,virtualenv==15.1.0 | |
pytest runtests: PYTHONHASHSEED='1926447621' | |
pytest runtests: commands[0] | pytest --pyargs mjolnir | |
====================================== test session starts ======================================= | |
platform linux2 -- Python 2.7.9, pytest-3.1.3, py-1.4.34, pluggy-0.4.0 | |
rootdir: /vagrant, inifile: | |
collected 22 items | |
mjolnir/test/test_dbn.py . | |
mjolnir/test/test_features.py F | |
mjolnir/test/test_metrics.py .. | |
mjolnir/test/test_norm_query.py F...... | |
mjolnir/test/test_sampling.py .. | |
mjolnir/test/test_spark.py . | |
mjolnir/test/training/test_tuning.py .... | |
mjolnir/test/training/test_xgboost.py .... | |
============================================ FAILURES ============================================ | |
________________________________________ test_collect_es _________________________________________ | |
spark_context = <pyspark.context.SparkContext object at 0x7f7b590206d0> | |
hive_context = <pyspark.sql.context.HiveContext object at 0x7f7b590205d0> | |
make_requests_session = <function f at 0x7f7b5900bb18> | |
def test_collect_es(spark_context, hive_context, make_requests_session): | |
def session_factory(): | |
return make_requests_session('requests/test_features.sqlite3') | |
r = pyspark.sql.Row('wikiid', 'query', 'hit_page_id') | |
source_data = { | |
'apple': [18978754, 36071326, 856], | |
'foo': [11178, 1140775, 844613] | |
} | |
rows = [r('enwiki', query, page_id) for query, ids in source_data.items() for page_id in ids] | |
df = spark_context.parallelize(rows).toDF() | |
df_result = mjolnir.features.collect_es(df, ['http://localhost:9200/_msearch'], | |
mjolnir.features.enwiki_features(), | |
{'enwiki': 'enwiki_content'}, | |
> session_factory=session_factory) | |
mjolnir/test/test_features.py:20: | |
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ | |
mjolnir/features.py:402: in collect_es | |
.toDF(['wikiid', 'query', 'hit_page_id', 'features']) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:57: in toDF | |
return sparkSession.createDataFrame(self, schema, sampleRatio) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:520: in createDataFrame | |
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:360: in _createFromRDD | |
struct = self._inferSchema(rdd, samplingRatio) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:331: in _inferSchema | |
first = rdd.first() | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1361: in first | |
rs = self.take(1) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1343: in take | |
res = self.context.runJob(self, takeUpToNumLeft, p) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/context.py:965: in runJob | |
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py:1133: in __call__ | |
answer, self.gateway_client, self.target_id, self.name) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/utils.py:63: in deco | |
return f(*a, **kw) | |
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ | |
answer = 'xro141', gateway_client = <py4j.java_gateway.GatewayClient object at 0x7f7b58f8ead0> | |
target_id = 'z:org.apache.spark.api.python.PythonRDD', name = 'runJob' | |
def get_return_value(answer, gateway_client, target_id=None, name=None): | |
"""Converts an answer received from the Java gateway into a Python object. | |
For example, string representation of integers are converted to Python | |
integer, string representation of objects are converted to JavaObject | |
instances, etc. | |
:param answer: the string returned by the Java gateway | |
:param gateway_client: the gateway client used to communicate with the Java | |
Gateway. Only necessary if the answer is a reference (e.g., object, | |
list, map) | |
:param target_id: the name of the object from which the answer comes from | |
(e.g., *object1* in `object1.hello()`). Optional. | |
:param name: the name of the member from which the answer comes from | |
(e.g., *hello* in `object1.hello()`). Optional. | |
""" | |
if is_error(answer)[0]: | |
if len(answer) > 1: | |
type = answer[1] | |
value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) | |
if answer[1] == REFERENCE_TYPE: | |
raise Py4JJavaError( | |
"An error occurred while calling {0}{1}{2}.\n". | |
> format(target_id, ".", name), value) | |
E Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob. | |
E : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 50, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
E process() | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
E serializer.dump_stream(func(split_index, iterator), outfile) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
E vs = list(itertools.islice(iterator, batch)) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
E yield next(iterator) | |
E File "/vagrant/mjolnir/features.py", line 387, in collect_partition | |
E with session_factory() as session: | |
E File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory | |
E return make_requests_session('requests/test_features.sqlite3') | |
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
E return MockSession(path) | |
E File "mjolnir/test/conftest.py", line 85, in __init__ | |
E "(digest text PRIMARY KEY, status_code int, content text)") | |
E OperationalError: database is locked | |
E | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
E at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
E at java.lang.Thread.run(Thread.java:748) | |
E | |
E Driver stacktrace: | |
E at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422) | |
E at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) | |
E at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) | |
E at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) | |
E at scala.Option.foreach(Option.scala:257) | |
E at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802) | |
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650) | |
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605) | |
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594) | |
E at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) | |
E at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628) | |
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918) | |
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931) | |
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944) | |
E at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441) | |
E at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala) | |
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) | |
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) | |
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) | |
E at java.lang.reflect.Method.invoke(Method.java:498) | |
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) | |
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) | |
E at py4j.Gateway.invoke(Gateway.java:280) | |
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) | |
E at py4j.commands.CallCommand.execute(CallCommand.java:79) | |
E at py4j.GatewayConnection.run(GatewayConnection.java:214) | |
E at java.lang.Thread.run(Thread.java:748) | |
E Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
E process() | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
E serializer.dump_stream(func(split_index, iterator), outfile) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
E vs = list(itertools.islice(iterator, batch)) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
E yield next(iterator) | |
E File "/vagrant/mjolnir/features.py", line 387, in collect_partition | |
E with session_factory() as session: | |
E File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory | |
E return make_requests_session('requests/test_features.sqlite3') | |
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
E return MockSession(path) | |
E File "mjolnir/test/conftest.py", line 85, in __init__ | |
E "(digest text PRIMARY KEY, status_code int, content text)") | |
E OperationalError: database is locked | |
E | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
E at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
E ... 1 more | |
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py:319: Py4JJavaError | |
-------------------------------------- Captured stderr call -------------------------------------- | |
17/07/06 16:25:56 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID 50) | |
org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
process() | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
vs = list(itertools.islice(iterator, batch)) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
yield next(iterator) | |
File "/vagrant/mjolnir/features.py", line 387, in collect_partition | |
with session_factory() as session: | |
File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory | |
return make_requests_session('requests/test_features.sqlite3') | |
File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
return MockSession(path) | |
File "mjolnir/test/conftest.py", line 85, in __init__ | |
"(digest text PRIMARY KEY, status_code int, content text)") | |
OperationalError: database is locked | |
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:748) | |
17/07/06 16:25:56 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 50, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
process() | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
vs = list(itertools.islice(iterator, batch)) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
yield next(iterator) | |
File "/vagrant/mjolnir/features.py", line 387, in collect_partition | |
with session_factory() as session: | |
File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory | |
return make_requests_session('requests/test_features.sqlite3') | |
File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
return MockSession(path) | |
File "mjolnir/test/conftest.py", line 85, in __init__ | |
"(digest text PRIMARY KEY, status_code int, content text)") | |
OperationalError: database is locked | |
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:748) | |
17/07/06 16:25:56 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1 times; aborting job | |
________________________________________ test_norm_query _________________________________________ | |
df_love = DataFrame[query: string, session_id: string, wikiid: string] | |
hive_context = <pyspark.sql.context.HiveContext object at 0x7f7b590205d0> | |
make_requests_session = <function f at 0x7f7b590e0e60> | |
def test_norm_query(df_love, hive_context, make_requests_session): | |
"""Very basic happy path test of query normalization""" | |
def session_factory(): | |
return make_requests_session('requests/test_norm_query.sqlite3') | |
# Make a fake stemmer() udf. We know everything in df_love | |
# stems to 'love' because that's how it was generated | |
hive_context.registerFunction("stemmer", lambda x, y: "love") | |
df_res = mjolnir.norm_query.transform( | |
df_love, | |
url_list=['http://localhost:9200/_msearch'], | |
min_sessions_per_query=10, | |
> session_factory=session_factory) | |
mjolnir/test/test_norm_query.py:27: | |
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ | |
mjolnir/norm_query.py:176: in transform | |
mjolnir.es_hits.transform(df_queries, url_list, indices, batch_size, top_n, session_factory) | |
mjolnir/es_hits.py:99: in transform | |
.toDF(['wikiid', 'query', 'norm_query', 'hit_page_ids'])) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:57: in toDF | |
return sparkSession.createDataFrame(self, schema, sampleRatio) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:520: in createDataFrame | |
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:360: in _createFromRDD | |
struct = self._inferSchema(rdd, samplingRatio) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:331: in _inferSchema | |
first = rdd.first() | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1361: in first | |
rs = self.take(1) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1343: in take | |
res = self.context.runJob(self, takeUpToNumLeft, p) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/context.py:965: in runJob | |
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py:1133: in __call__ | |
answer, self.gateway_client, self.target_id, self.name) | |
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/utils.py:63: in deco | |
return f(*a, **kw) | |
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ | |
answer = 'xro828', gateway_client = <py4j.java_gateway.GatewayClient object at 0x7f7b58f8ead0> | |
target_id = 'z:org.apache.spark.api.python.PythonRDD', name = 'runJob' | |
def get_return_value(answer, gateway_client, target_id=None, name=None): | |
"""Converts an answer received from the Java gateway into a Python object. | |
For example, string representation of integers are converted to Python | |
integer, string representation of objects are converted to JavaObject | |
instances, etc. | |
:param answer: the string returned by the Java gateway | |
:param gateway_client: the gateway client used to communicate with the Java | |
Gateway. Only necessary if the answer is a reference (e.g., object, | |
list, map) | |
:param target_id: the name of the object from which the answer comes from | |
(e.g., *object1* in `object1.hello()`). Optional. | |
:param name: the name of the member from which the answer comes from | |
(e.g., *hello* in `object1.hello()`). Optional. | |
""" | |
if is_error(answer)[0]: | |
if len(answer) > 1: | |
type = answer[1] | |
value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) | |
if answer[1] == REFERENCE_TYPE: | |
raise Py4JJavaError( | |
"An error occurred while calling {0}{1}{2}.\n". | |
> format(target_id, ".", name), value) | |
E Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob. | |
E : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 42.0 failed 1 times, most recent failure: Lost task 0.0 in stage 42.0 (TID 107, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
E process() | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
E serializer.dump_stream(func(split_index, iterator), outfile) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
E vs = list(itertools.islice(iterator, batch)) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
E yield next(iterator) | |
E File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids | |
E with session_factory() as session: | |
E File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory | |
E return make_requests_session('requests/test_norm_query.sqlite3') | |
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
E return MockSession(path) | |
E File "mjolnir/test/conftest.py", line 85, in __init__ | |
E "(digest text PRIMARY KEY, status_code int, content text)") | |
E OperationalError: database is locked | |
E | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
E at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
E at java.lang.Thread.run(Thread.java:748) | |
E | |
E Driver stacktrace: | |
E at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422) | |
E at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) | |
E at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) | |
E at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) | |
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) | |
E at scala.Option.foreach(Option.scala:257) | |
E at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802) | |
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650) | |
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605) | |
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594) | |
E at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) | |
E at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628) | |
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918) | |
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931) | |
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944) | |
E at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441) | |
E at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala) | |
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) | |
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) | |
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) | |
E at java.lang.reflect.Method.invoke(Method.java:498) | |
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) | |
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) | |
E at py4j.Gateway.invoke(Gateway.java:280) | |
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) | |
E at py4j.commands.CallCommand.execute(CallCommand.java:79) | |
E at py4j.GatewayConnection.run(GatewayConnection.java:214) | |
E at java.lang.Thread.run(Thread.java:748) | |
E Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
E process() | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
E serializer.dump_stream(func(split_index, iterator), outfile) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
E vs = list(itertools.islice(iterator, batch)) | |
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
E yield next(iterator) | |
E File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids | |
E with session_factory() as session: | |
E File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory | |
E return make_requests_session('requests/test_norm_query.sqlite3') | |
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
E return MockSession(path) | |
E File "mjolnir/test/conftest.py", line 85, in __init__ | |
E "(digest text PRIMARY KEY, status_code int, content text)") | |
E OperationalError: database is locked | |
E | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
E at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
E ... 1 more | |
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py:319: Py4JJavaError | |
-------------------------------------- Captured stderr call -------------------------------------- | |
17/07/06 16:32:24 ERROR Executor: Exception in task 0.0 in stage 42.0 (TID 107) | |
org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
process() | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
vs = list(itertools.islice(iterator, batch)) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
yield next(iterator) | |
File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids | |
with session_factory() as session: | |
File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory | |
return make_requests_session('requests/test_norm_query.sqlite3') | |
File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
return MockSession(path) | |
File "mjolnir/test/conftest.py", line 85, in __init__ | |
"(digest text PRIMARY KEY, status_code int, content text)") | |
OperationalError: database is locked | |
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:748) | |
17/07/06 16:32:24 WARN TaskSetManager: Lost task 0.0 in stage 42.0 (TID 107, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main | |
process() | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process | |
serializer.dump_stream(func(split_index, iterator), outfile) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream | |
vs = list(itertools.islice(iterator, batch)) | |
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft | |
yield next(iterator) | |
File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids | |
with session_factory() as session: | |
File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory | |
return make_requests_session('requests/test_norm_query.sqlite3') | |
File "/vagrant/mjolnir/test/conftest.py", line 73, in f | |
return MockSession(path) | |
File "mjolnir/test/conftest.py", line 85, in __init__ | |
"(digest text PRIMARY KEY, status_code int, content text)") | |
OperationalError: database is locked | |
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193) | |
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234) | |
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152) | |
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63) | |
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) | |
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) | |
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) | |
at org.apache.spark.scheduler.Task.run(Task.scala:99) | |
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282) | |
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) | |
at java.lang.Thread.run(Thread.java:748) | |
17/07/06 16:32:24 ERROR TaskSetManager: Task 0 in stage 42.0 failed 1 times; aborting job | |
============================= 2 failed, 20 passed in 804.44 seconds ============================== | |
ERROR: InvocationError: '/vagrant/.tox/pytest/bin/pytest --pyargs mjolnir' | |
____________________________________________ summary _____________________________________________ | |
flake8: commands succeeded | |
ERROR: pytest: commands failed | |
vagrant@mjolnir:/vagrant$ |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.