@nomoa, created July 6, 2017
mjolnir tests
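
Output of a tox run of the mjolnir test suite inside the Vagrant VM. flake8 passes; two pytest tests (test_collect_es and test_norm_query) fail with sqlite3 "OperationalError: database is locked", raised from the MockSession fixture in conftest.py, which suggests several Spark executor threads are opening the same fixture database concurrently. A hedged sketch of one possible fix follows the log.
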
vagrant@mjolnir:/vagrant$ venv/bin/tox
GLOB sdist-make: /vagrant/setup.py
flake8 inst-nodeps: /vagrant/.tox/dist/mjolnir-0.0.1.zip
flake8 installed: certifi==2017.4.17,chardet==3.0.4,clickmodels==1.0.2,configparser==3.5.0,enum34==1.1.6,flake8==3.3.0,idna==2.5,mccabe==0.6.1,mjolnir==0.0.1,pycodestyle==2.3.1,pyflakes==1.5.0,requests==2.18.1,urllib3==1.21.1
flake8 runtests: PYTHONHASHSEED='1926447621'
flake8 runtests: commands[0] | flake8 mjolnir/
pytest inst-nodeps: /vagrant/.tox/dist/mjolnir-0.0.1.zip
pytest installed: certifi==2017.4.17,chardet==3.0.4,clickmodels==1.0.2,configparser==3.5.0,decorator==4.0.11,enum34==1.1.6,findspark==1.1.0,flake8==3.3.0,future==0.16.0,hyperopt==0.1,idna==2.5,kafka==1.3.3,mccabe==0.6.1,mjolnir==0.0.1,networkx==1.11,nose==1.3.7,numpy==1.13.0,pluggy==0.4.0,py==1.4.34,py4j==0.10.6,pycodestyle==2.3.1,pyflakes==1.5.0,pymongo==3.4.0,pytest==3.1.3,requests==2.18.1,scipy==0.19.1,six==1.10.0,tox==2.7.0,urllib3==1.21.1,virtualenv==15.1.0
pytest runtests: PYTHONHASHSEED='1926447621'
pytest runtests: commands[0] | pytest --pyargs mjolnir
====================================== test session starts =======================================
platform linux2 -- Python 2.7.9, pytest-3.1.3, py-1.4.34, pluggy-0.4.0
rootdir: /vagrant, inifile:
collected 22 items
mjolnir/test/test_dbn.py .
mjolnir/test/test_features.py F
mjolnir/test/test_metrics.py ..
mjolnir/test/test_norm_query.py F......
mjolnir/test/test_sampling.py ..
mjolnir/test/test_spark.py .
mjolnir/test/training/test_tuning.py ....
mjolnir/test/training/test_xgboost.py ....
============================================ FAILURES ============================================
________________________________________ test_collect_es _________________________________________
spark_context = <pyspark.context.SparkContext object at 0x7f7b590206d0>
hive_context = <pyspark.sql.context.HiveContext object at 0x7f7b590205d0>
make_requests_session = <function f at 0x7f7b5900bb18>
def test_collect_es(spark_context, hive_context, make_requests_session):
    def session_factory():
        return make_requests_session('requests/test_features.sqlite3')
    r = pyspark.sql.Row('wikiid', 'query', 'hit_page_id')
    source_data = {
        'apple': [18978754, 36071326, 856],
        'foo': [11178, 1140775, 844613]
    }
    rows = [r('enwiki', query, page_id) for query, ids in source_data.items() for page_id in ids]
    df = spark_context.parallelize(rows).toDF()
    df_result = mjolnir.features.collect_es(df, ['http://localhost:9200/_msearch'],
                                            mjolnir.features.enwiki_features(),
                                            {'enwiki': 'enwiki_content'},
>                                           session_factory=session_factory)
mjolnir/test/test_features.py:20:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
mjolnir/features.py:402: in collect_es
.toDF(['wikiid', 'query', 'hit_page_id', 'features'])
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:57: in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:520: in createDataFrame
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:360: in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:331: in _inferSchema
first = rdd.first()
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1361: in first
rs = self.take(1)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1343: in take
res = self.context.runJob(self, takeUpToNumLeft, p)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/context.py:965: in runJob
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py:1133: in __call__
answer, self.gateway_client, self.target_id, self.name)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/utils.py:63: in deco
return f(*a, **kw)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
answer = 'xro141', gateway_client = <py4j.java_gateway.GatewayClient object at 0x7f7b58f8ead0>
target_id = 'z:org.apache.spark.api.python.PythonRDD', name = 'runJob'
def get_return_value(answer, gateway_client, target_id=None, name=None):
    """Converts an answer received from the Java gateway into a Python object.
    For example, string representation of integers are converted to Python
    integer, string representation of objects are converted to JavaObject
    instances, etc.
    :param answer: the string returned by the Java gateway
    :param gateway_client: the gateway client used to communicate with the Java
        Gateway. Only necessary if the answer is a reference (e.g., object,
        list, map)
    :param target_id: the name of the object from which the answer comes from
        (e.g., *object1* in `object1.hello()`). Optional.
    :param name: the name of the member from which the answer comes from
        (e.g., *hello* in `object1.hello()`). Optional.
    """
    if is_error(answer)[0]:
        if len(answer) > 1:
            type = answer[1]
            value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
            if answer[1] == REFERENCE_TYPE:
                raise Py4JJavaError(
                    "An error occurred while calling {0}{1}{2}.\n".
>                   format(target_id, ".", name), value)
E Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
E : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 50, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
E process()
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
E serializer.dump_stream(func(split_index, iterator), outfile)
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
E vs = list(itertools.islice(iterator, batch))
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
E yield next(iterator)
E File "/vagrant/mjolnir/features.py", line 387, in collect_partition
E with session_factory() as session:
E File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory
E return make_requests_session('requests/test_features.sqlite3')
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f
E return MockSession(path)
E File "mjolnir/test/conftest.py", line 85, in __init__
E "(digest text PRIMARY KEY, status_code int, content text)")
E OperationalError: database is locked
E
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
E at org.apache.spark.scheduler.Task.run(Task.scala:99)
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
E at java.lang.Thread.run(Thread.java:748)
E
E Driver stacktrace:
E at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
E at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
E at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
E at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
E at scala.Option.foreach(Option.scala:257)
E at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
E at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
E at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
E at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
E at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E at java.lang.reflect.Method.invoke(Method.java:498)
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
E at py4j.Gateway.invoke(Gateway.java:280)
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E at py4j.commands.CallCommand.execute(CallCommand.java:79)
E at py4j.GatewayConnection.run(GatewayConnection.java:214)
E at java.lang.Thread.run(Thread.java:748)
E Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
E process()
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
E serializer.dump_stream(func(split_index, iterator), outfile)
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
E vs = list(itertools.islice(iterator, batch))
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
E yield next(iterator)
E File "/vagrant/mjolnir/features.py", line 387, in collect_partition
E with session_factory() as session:
E File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory
E return make_requests_session('requests/test_features.sqlite3')
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f
E return MockSession(path)
E File "mjolnir/test/conftest.py", line 85, in __init__
E "(digest text PRIMARY KEY, status_code int, content text)")
E OperationalError: database is locked
E
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
E at org.apache.spark.scheduler.Task.run(Task.scala:99)
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
E ... 1 more
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py:319: Py4JJavaError
-------------------------------------- Captured stderr call --------------------------------------
17/07/06 16:25:56 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID 50)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
yield next(iterator)
File "/vagrant/mjolnir/features.py", line 387, in collect_partition
with session_factory() as session:
File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory
return make_requests_session('requests/test_features.sqlite3')
File "/vagrant/mjolnir/test/conftest.py", line 73, in f
return MockSession(path)
File "mjolnir/test/conftest.py", line 85, in __init__
"(digest text PRIMARY KEY, status_code int, content text)")
OperationalError: database is locked
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
17/07/06 16:25:56 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 50, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
yield next(iterator)
File "/vagrant/mjolnir/features.py", line 387, in collect_partition
with session_factory() as session:
File "/vagrant/mjolnir/test/test_features.py", line 7, in session_factory
return make_requests_session('requests/test_features.sqlite3')
File "/vagrant/mjolnir/test/conftest.py", line 73, in f
return MockSession(path)
File "mjolnir/test/conftest.py", line 85, in __init__
"(digest text PRIMARY KEY, status_code int, content text)")
OperationalError: database is locked
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
17/07/06 16:25:56 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1 times; aborting job
________________________________________ test_norm_query _________________________________________
df_love = DataFrame[query: string, session_id: string, wikiid: string]
hive_context = <pyspark.sql.context.HiveContext object at 0x7f7b590205d0>
make_requests_session = <function f at 0x7f7b590e0e60>
def test_norm_query(df_love, hive_context, make_requests_session):
    """Very basic happy path test of query normalization"""
    def session_factory():
        return make_requests_session('requests/test_norm_query.sqlite3')
    # Make a fake stemmer() udf. We know everything in df_love
    # stems to 'love' because that's how it was generated
    hive_context.registerFunction("stemmer", lambda x, y: "love")
    df_res = mjolnir.norm_query.transform(
        df_love,
        url_list=['http://localhost:9200/_msearch'],
        min_sessions_per_query=10,
>       session_factory=session_factory)
mjolnir/test/test_norm_query.py:27:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
mjolnir/norm_query.py:176: in transform
mjolnir.es_hits.transform(df_queries, url_list, indices, batch_size, top_n, session_factory)
mjolnir/es_hits.py:99: in transform
.toDF(['wikiid', 'query', 'norm_query', 'hit_page_ids']))
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:57: in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:520: in createDataFrame
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:360: in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/session.py:331: in _inferSchema
first = rdd.first()
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1361: in first
rs = self.take(1)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py:1343: in take
res = self.context.runJob(self, takeUpToNumLeft, p)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/context.py:965: in runJob
port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py:1133: in __call__
answer, self.gateway_client, self.target_id, self.name)
/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/sql/utils.py:63: in deco
return f(*a, **kw)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
answer = 'xro828', gateway_client = <py4j.java_gateway.GatewayClient object at 0x7f7b58f8ead0>
target_id = 'z:org.apache.spark.api.python.PythonRDD', name = 'runJob'
def get_return_value(answer, gateway_client, target_id=None, name=None):
    """Converts an answer received from the Java gateway into a Python object.
    For example, string representation of integers are converted to Python
    integer, string representation of objects are converted to JavaObject
    instances, etc.
    :param answer: the string returned by the Java gateway
    :param gateway_client: the gateway client used to communicate with the Java
        Gateway. Only necessary if the answer is a reference (e.g., object,
        list, map)
    :param target_id: the name of the object from which the answer comes from
        (e.g., *object1* in `object1.hello()`). Optional.
    :param name: the name of the member from which the answer comes from
        (e.g., *hello* in `object1.hello()`). Optional.
    """
    if is_error(answer)[0]:
        if len(answer) > 1:
            type = answer[1]
            value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
            if answer[1] == REFERENCE_TYPE:
                raise Py4JJavaError(
                    "An error occurred while calling {0}{1}{2}.\n".
>                   format(target_id, ".", name), value)
E Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
E : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 42.0 failed 1 times, most recent failure: Lost task 0.0 in stage 42.0 (TID 107, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
E process()
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
E serializer.dump_stream(func(split_index, iterator), outfile)
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
E vs = list(itertools.islice(iterator, batch))
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
E yield next(iterator)
E File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids
E with session_factory() as session:
E File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory
E return make_requests_session('requests/test_norm_query.sqlite3')
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f
E return MockSession(path)
E File "mjolnir/test/conftest.py", line 85, in __init__
E "(digest text PRIMARY KEY, status_code int, content text)")
E OperationalError: database is locked
E
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
E at org.apache.spark.scheduler.Task.run(Task.scala:99)
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
E at java.lang.Thread.run(Thread.java:748)
E
E Driver stacktrace:
E at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
E at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
E at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
E at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
E at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
E at scala.Option.foreach(Option.scala:257)
E at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
E at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
E at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
E at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
E at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
E at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
E at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
E at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
E at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
E at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
E at java.lang.reflect.Method.invoke(Method.java:498)
E at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
E at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
E at py4j.Gateway.invoke(Gateway.java:280)
E at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
E at py4j.commands.CallCommand.execute(CallCommand.java:79)
E at py4j.GatewayConnection.run(GatewayConnection.java:214)
E at java.lang.Thread.run(Thread.java:748)
E Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
E process()
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
E serializer.dump_stream(func(split_index, iterator), outfile)
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
E vs = list(itertools.islice(iterator, batch))
E File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
E yield next(iterator)
E File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids
E with session_factory() as session:
E File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory
E return make_requests_session('requests/test_norm_query.sqlite3')
E File "/vagrant/mjolnir/test/conftest.py", line 73, in f
E return MockSession(path)
E File "mjolnir/test/conftest.py", line 85, in __init__
E "(digest text PRIMARY KEY, status_code int, content text)")
E OperationalError: database is locked
E
E at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
E at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
E at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
E at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
E at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
E at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
E at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
E at org.apache.spark.scheduler.Task.run(Task.scala:99)
E at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
E at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
E at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
E ... 1 more
/opt/spark-2.1.0-bin-hadoop2.6/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py:319: Py4JJavaError
-------------------------------------- Captured stderr call --------------------------------------
17/07/06 16:32:24 ERROR Executor: Exception in task 0.0 in stage 42.0 (TID 107)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
yield next(iterator)
File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids
with session_factory() as session:
File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory
return make_requests_session('requests/test_norm_query.sqlite3')
File "/vagrant/mjolnir/test/conftest.py", line 73, in f
return MockSession(path)
File "mjolnir/test/conftest.py", line 85, in __init__
"(digest text PRIMARY KEY, status_code int, content text)")
OperationalError: database is locked
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
17/07/06 16:32:24 WARN TaskSetManager: Lost task 0.0 in stage 42.0 (TID 107, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/spark-2.1.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/spark-2.1.0-bin-hadoop2.6/python/pyspark/rdd.py", line 1339, in takeUpToNumLeft
yield next(iterator)
File "/vagrant/mjolnir/es_hits.py", line 85, in collect_partition_hit_page_ids
with session_factory() as session:
File "/vagrant/mjolnir/test/test_norm_query.py", line 17, in session_factory
return make_requests_session('requests/test_norm_query.sqlite3')
File "/vagrant/mjolnir/test/conftest.py", line 73, in f
return MockSession(path)
File "mjolnir/test/conftest.py", line 85, in __init__
"(digest text PRIMARY KEY, status_code int, content text)")
OperationalError: database is locked
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:748)
17/07/06 16:32:24 ERROR TaskSetManager: Task 0 in stage 42.0 failed 1 times; aborting job
============================= 2 failed, 20 passed in 804.44 seconds ==============================
ERROR: InvocationError: '/vagrant/.tox/pytest/bin/pytest --pyargs mjolnir'
____________________________________________ summary _____________________________________________
flake8: commands succeeded
ERROR: pytest: commands failed
vagrant@mjolnir:/vagrant$
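
Note on the failures: both tracebacks end in MockSession.__init__ (mjolnir/test/conftest.py, line 85) raising sqlite3.OperationalError while issuing a CREATE TABLE against the fixture database, which every Spark executor thread opens in local mode. Below is a minimal sketch of a lock-tolerant version of that constructor; it is an assumption, not the project's actual code (the real conftest.py is not part of this gist, and the table name "responses" is invented), but the column list matches the traceback.

import sqlite3

class MockSession(object):
    def __init__(self, path):
        # timeout= makes sqlite3 wait (here up to 30s) for a competing writer
        # to release its lock instead of failing immediately with
        # "OperationalError: database is locked".
        self.conn = sqlite3.connect(path, timeout=30.0)
        # IF NOT EXISTS tolerates several executor threads racing to create
        # the same table in the shared fixture file.
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS responses "
            "(digest text PRIMARY KEY, status_code int, content text)")
        self.conn.commit()

An alternative would be to have each session_factory() copy the fixture database to a per-worker temporary file, so executors never contend for the same writer lock at all.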