Last active
November 27, 2017 15:53
-
-
Save david-caro/249c545fb3d24a8d2908f75eeb6a6019 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
from invenio_db import db | |
from invenio_workflows import workflow_object_class, ObjectStatus, workflows | |
from inspirehep.modules.records.utils import open_url_or_path | |
# Host whose file URLs appear inside workflow error messages.
SITE_NAME = 'labs.inspirehep.net'

# Captures, as ``fpath``, everything between the site host and the literal
# " into" that follows the URL in the error message.  The host is escaped so
# that its dots only match literal dots instead of any character.
FNAME_REG = re.compile(
    r'https?://%s(?P<fpath>[^ ]+) into' % re.escape(SITE_NAME)
)
def has_name(task, task_name):
    """Tell whether ``task_name`` shows up as a whole word in ``str(task)``.

    The name is looked up with one space on each side, so a task whose
    string form merely contains ``task_name`` as part of a longer word does
    not match.  Presumably this targets the ``<function NAME at 0x...>``
    representation of plain functions -- confirm for other callables.
    """
    padded_name = ' ' + task_name + ' '
    task_repr = str(task)
    return padded_name in task_repr
def get_task_index(workflow, task_name):
    """Locate the task called ``task_name`` inside a nested workflow.

    ``workflow`` is either a single callable or an iterable whose items are
    callables or further nested iterables.

    Returns:
        A list of indexes, one per nesting level, leading to the first task
        whose string form matches ``task_name`` (see ``has_name``), or an
        empty list when nothing matches.  A bare callable that matches
        yields ``[0]``.

    NOTE(review): the nesting of the original branches was reconstructed;
    non-callable items are assumed to be searched recursively -- confirm.
    """
    if callable(workflow):
        return [0] if has_name(workflow, task_name) else []

    for position, step in enumerate(workflow):
        if not callable(step):
            # Nested group of tasks: prepend our position to the inner path.
            nested_path = get_task_index(step, task_name)
            if nested_path:
                return [position] + nested_path
            continue
        if has_name(step, task_name):
            return [position]

    return []
def has_zero_size(fpath):
    """Return ``True`` when the file at ``fpath`` is exactly 0 bytes long."""
    size_in_bytes = os.path.getsize(fpath)
    return size_in_bytes == 0
def get_fname_from_error_msg(error_msg):
    """Extract the file path embedded in a workflow error message.

    Returns:
        The ``fpath`` group captured by ``FNAME_REG``.

    Raises:
        Exception: if ``FNAME_REG`` does not match ``error_msg``.
    """
    found = FNAME_REG.search(error_msg)
    if not found:
        raise Exception('Can\'t find file path in "%s"' % error_msg)
    return found.group('fpath')
def has_empty_figure(workflow):
    """Return ``True`` if any figure attached to ``workflow`` is empty.

    Each entry of ``workflow.data['figures']`` (none when the key is absent)
    is resolved through ``workflow.files`` by its ``key`` and the size of
    the resulting file uri is checked, stopping at the first empty one.
    """
    return any(
        has_zero_size(workflow.files[figure['key']].file.uri)
        for figure in workflow.data.get('figures', ())
    )
def fix_zero_size_figures(workflow):
    """Restart ``workflow`` from the plot extraction task if a figure is empty.

    When at least one figure is empty, the workflow's ``callback_pos`` is
    pointed at the ``arxiv_plot_extract`` task of the ``article`` workflow
    definition, the workflow is saved and then restarted with
    ``delayed=True``.

    Returns:
        ``True`` if the workflow was restarted, ``False`` if it had no
        empty figures and was left untouched.
    """
    if has_empty_figure(workflow):
        print('Restarting %s to fix figures' % workflow.id)
        article_tasks = workflows['article'].workflow
        workflow.callback_pos = get_task_index(
            article_tasks,
            'arxiv_plot_extract',
        )
        workflow.save()
        workflow.restart_current(delayed=True)
        return True

    print('Workflow %s has no empty figures' % workflow.id)
    return False
def fix_zero_size_documents(workflow):
    """Re-download every document of ``workflow`` whose stored file is empty.

    Each entry of ``workflow.data['documents']`` is resolved through
    ``workflow.files`` by its ``key``; the ones whose file has zero size are
    handed to ``download_documents`` as ``[key, document]`` pairs.

    Returns:
        The workflow's whole ``documents`` list (``()`` when the key is
        absent) -- not only the documents that were re-downloaded.
    """
    to_redownload = [
        [document['key'], document]
        for document in workflow.data.get('documents', ())
        if has_zero_size(workflow.files[document['key']].file.uri)
    ]

    if to_redownload:
        print('Workflow %s has documents to redownload: %s' % (
            workflow.id, to_redownload
        ))
        download_documents(workflow, to_redownload)
    else:
        # Interpolate the id so the message names the workflow instead of
        # showing a literal "%s".
        print('Workflow %s has no documents to redownload' % workflow.id)

    return workflow.data.get('documents', ())
def download_document(workflow, key, document):
    """Fetch ``document`` again and store it under ``key`` in the workflow.

    The content is read from the document's ``original_url`` with
    ``open_url_or_path`` and assigned to ``workflow.files[key]``.  The
    operator is then prompted (blocking on stdin) to check the new file.

    Raises:
        Exception: if the document has no ``original_url``.
    """
    source_url = document.get('original_url')
    if source_url is not None:
        workflow.files[key] = open_url_or_path(source_url)
        downloaded_uri = workflow.files[key].file.uri
        raw_input(
            'Downloaded document %s, can you check if it\'s ok?'
            % downloaded_uri
        )
        return

    raise Exception(
        'Can\'t redownload the document %s for the workflow %s'
        % (document, workflow.id)
    )
def download_documents(workflow, documents):
    """Call ``download_document`` for each ``(key, document)`` pair, in order."""
    for entry in documents:
        file_key, document = entry
        download_document(workflow, file_key, document)
def fix_zero_size_errors():
    """Repair and restart every errored workflow that failed on an empty file.

    Steps:
      1. Query all workflow objects in ``ObjectStatus.ERROR`` and keep those
         whose ``_error_msg`` contains ``'seems to be empty'``.
      2. Re-download the empty documents of each selected workflow.
      3. For each selected workflow, restart it from plot extraction if it
         has empty figures; otherwise restart its current task.
      4. Commit the database session.

    NOTE(review): the original indentation was lost; the commit is assumed
    to run once, after both loops -- confirm.
    """
    all_errors = workflow_object_class.query(status=ObjectStatus.ERROR)
    zero_size_errors = [
        error
        for error in all_errors
        # Tolerant lookup: an errored workflow with a missing or empty
        # ``_error_msg`` is simply not a zero-size error, and must not abort
        # the whole run with a KeyError.
        if 'seems to be empty' in (error.extra_data.get('_error_msg') or '')
    ]
    print('Got %s errors, from which, %s are zero size errors.' % (
        len(all_errors),
        len(zero_size_errors),
    ))

    for workflow in zero_size_errors:
        res = fix_zero_size_documents(workflow)
        # NOTE(review): ``res`` is the workflow's full documents list, so
        # this message is only printed when there are no documents at all.
        if not res:
            print(
                'workflow %s did not have any empty documents' % workflow.id
            )

    for workflow in zero_size_errors:
        was_restarted = fix_zero_size_figures(workflow)
        if not was_restarted:
            workflow.restart_current(delayed=True)

    db.session.commit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment