Skip to content

Instantly share code, notes, and snippets.

@david-caro
Last active November 27, 2017 15:53
Show Gist options
  • Save david-caro/249c545fb3d24a8d2908f75eeb6a6019 to your computer and use it in GitHub Desktop.
Save david-caro/249c545fb3d24a8d2908f75eeb6a6019 to your computer and use it in GitHub Desktop.
import re
import os
from invenio_db import db
from invenio_workflows import workflow_object_class, ObjectStatus, workflows
from inspirehep.modules.records.utils import open_url_or_path
# Host name of the site whose file URLs are looked for in error messages.
SITE_NAME = 'labs.inspirehep.net'
# Captures (as ``fpath``) the path that follows ``http(s)://<SITE_NAME>`` and
# precedes the literal `` into``. The host is escaped so that its dots only
# match literal dots instead of any character.
FNAME_REG = re.compile(
    r'https?://%s(?P<fpath>[^ ]+) into' % re.escape(SITE_NAME)
)
def has_name(task, task_name):
    """Check whether ``task_name`` shows up in the string form of ``task``.

    The name is only matched when it is surrounded by a single space on each
    side inside ``str(task)``.
    """
    padded_name = ' ' + task_name + ' '
    task_text = str(task)
    return padded_name in task_text
def get_task_index(workflow, task_name):
    """Locate the first task matching ``task_name`` in a nested task list.

    ``workflow`` is either a single callable or an iterable whose items are
    callables or further nested iterables. The search is depth first.

    Returns:
        list: the index path to the matching task (one index per nesting
        level), ``[0]`` if ``workflow`` itself is the matching callable, or
        an empty list when nothing matches.
    """
    if callable(workflow):
        return [0] if has_name(workflow, task_name) else []

    for position, step in enumerate(workflow):
        if not callable(step):
            # Nested group of tasks: search inside it and prefix our index.
            nested_path = get_task_index(step, task_name)
            if nested_path:
                return [position] + nested_path
        elif has_name(step, task_name):
            return [position]

    return []
def has_zero_size(fpath):
    """Return ``True`` when the file at ``fpath`` is 0 bytes long."""
    return os.path.getsize(fpath) == 0
def get_fname_from_error_msg(error_msg):
    """Extract the file path matched by ``FNAME_REG`` from ``error_msg``.

    Returns:
        str: the ``fpath`` group of the first match.

    Raises:
        Exception: if ``error_msg`` does not match ``FNAME_REG``.
    """
    found = FNAME_REG.search(error_msg)
    if found is None:
        raise Exception('Can\'t find file path in "%s"' % error_msg)
    return found.group('fpath')
def has_empty_figure(workflow):
    """Tell whether any figure listed in ``workflow.data`` is a 0-byte file.

    Each entry of ``workflow.data['figures']`` is resolved through
    ``workflow.files[<key>].file.uri``; a missing ``figures`` entry counts as
    "no figures". Checking stops at the first empty figure found.
    """
    figure_uris = (
        workflow.files[figure['key']].file.uri
        for figure in workflow.data.get('figures', ())
    )
    return any(has_zero_size(uri) for uri in figure_uris)
def fix_zero_size_figures(workflow):
    """Restart ``workflow`` from the plot extraction task if a figure is empty.

    When at least one figure is a 0-byte file, ``workflow.callback_pos`` is
    set to the index path of the ``arxiv_plot_extract`` task inside the
    ``article`` workflow definition, the workflow is saved and
    ``restart_current(delayed=True)`` is called on it.

    Returns:
        bool: ``True`` if the workflow was restarted, ``False`` if it had no
        empty figures and was left untouched.
    """
    if has_empty_figure(workflow):
        print('Restarting %s to fix figures' % workflow.id)
        workflow.callback_pos = get_task_index(
            workflows['article'].workflow,
            'arxiv_plot_extract',
        )
        workflow.save()
        workflow.restart_current(delayed=True)
        return True

    print('Workflow %s has no empty figures' % workflow.id)
    return False
def fix_zero_size_documents(workflow):
    """Re-download every document of ``workflow`` whose file is 0 bytes long.

    Each entry of ``workflow.data['documents']`` is resolved through
    ``workflow.files[<key>].file.uri`` and checked for emptiness; the empty
    ones are handed to ``download_documents`` as ``[key, document]`` pairs.

    Returns:
        The full ``workflow.data['documents']`` value (an empty tuple when
        the workflow has no such entry), NOT only the re-downloaded ones.
    """
    to_redownload = [
        [document['key'], document]
        for document in workflow.data.get('documents', ())
        if has_zero_size(workflow.files[document['key']].file.uri)
    ]

    if to_redownload:
        print('Workflow %s has documents to redownload: %s' % (
            workflow.id, to_redownload
        ))
        download_documents(workflow, to_redownload)
    else:
        # The workflow id was previously never interpolated here, so a
        # literal '%s' was printed.
        print('Workflow %s has no documents to redownload' % workflow.id)

    return workflow.data.get('documents', ())
def download_document(workflow, key, document):
    """Replace ``workflow.files[key]`` with a fresh copy of ``document``.

    The content comes from ``open_url_or_path`` called on the document's
    ``original_url``. After storing it, the operator is prompted (blocking on
    ``raw_input``) to check the downloaded file.

    Raises:
        Exception: if ``document`` has no ``original_url`` (or it is
        ``None``), as there is nowhere to download it from.
    """
    source_url = document.get('original_url')
    if source_url is None:
        failure = 'Can\'t redownload the document %s for the workflow %s' % (
            document, workflow.id
        )
        raise Exception(failure)

    workflow.files[key] = open_url_or_path(source_url)
    prompt = (
        'Downloaded document %s, can you check if it\'s ok?'
        % workflow.files[key].file.uri
    )
    # Only used to pause until the operator confirms; the answer is ignored.
    raw_input(prompt)
def download_documents(workflow, documents):
    """Run ``download_document`` for each ``(key, document)`` pair given.

    ``documents`` is an iterable of two-item sequences; they are processed in
    order and the first failure propagates to the caller.
    """
    for file_key, document in documents:
        download_document(workflow, file_key, document)
def fix_zero_size_errors():
    """Repair and restart the errored workflows that failed on an empty file.

    Steps:
      1. Query all workflow objects in ``ObjectStatus.ERROR`` and keep those
         whose ``_error_msg`` contains ``'seems to be empty'``.
      2. For each of them, re-download the 0-byte documents.
      3. For each of them, restart from the plot extraction task if any
         figure is empty, otherwise restart the current task.
      4. Commit the database session once at the end.
    """
    all_errors = workflow_object_class.query(status=ObjectStatus.ERROR)
    zero_size_errors = [
        error
        for error in all_errors
        # An errored workflow without a (non-empty) '_error_msg' is just
        # skipped instead of aborting the whole run with KeyError/TypeError.
        if 'seems to be empty' in (error.extra_data.get('_error_msg') or '')
    ]
    print('Got %s errors, from which, %s are zero size errors.' % (
        len(all_errors),
        len(zero_size_errors),
    ))

    for workflow in zero_size_errors:
        res = fix_zero_size_documents(workflow)
        # NOTE(review): ``res`` is the workflow's whole document list, so
        # this fires when the workflow has no documents at all.
        if not res:
            print(
                'workflow %s did not have any empty documents' % workflow.id
            )

    for workflow in zero_size_errors:
        was_restarted = fix_zero_size_figures(workflow)
        if not was_restarted:
            workflow.restart_current(delayed=True)

    db.session.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment