Exports the descriptions of data quality checks from Pelican.
# Generates:
# https://docs.google.com/document/d/1l5SL1hUR9n8IDYVLSYsU85hdB3L_u2e3iD-txQPc0zU/edit
# https://docs.google.com/spreadsheets/d/1WPPOSf7xM9LR3VlTGx7wyzc8q_5xQ3HyMoB8eaEWHDM/edit#gid=0
# Gist: https://gist.github.com/jpmckinney/f3bdbb62620a9974ba1ff254392f6b6d
# GitHub: https://github.com/open-contracting/pelican/issues/63
# CRM: https://crm.open-contracting.org/issues/5908
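#
# Overview: read the frontend's check descriptions from en.json, copy them into the backend's
# django.po catalog as plain-text msgstr values, then write en.html (the source for the Google
# Doc above) and en.csv (the source for the Google Sheet above). The relative paths suggest
# running the script from the repository root, with en.json saved in the working directory.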
import csv
import json
import re
from html.parser import HTMLParser

# Third-party dependencies: pip install polib jsonpointer
import polib
from jsonpointer import JsonPointerException, resolve_pointer
def convert(html):
    return re.sub(r'<code>(.+?)</code>', r"""<span style="font-family:'Roboto Mono'">\1</span>""", html)
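# Illustrative example (not taken from en.json): convert('The <code>ocid</code> field')
# wraps the inline code in a Roboto Mono <span>, presumably so that Google Docs renders
# it in a monospace font when the HTML is imported.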
# https://github.com/django/django/blob/master/django/utils/html.py
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)


def strip_tags(value):
    s = MLStripper()
    s.feed(value)
    s.close()
    return s.get_data()
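# Illustrative example: strip_tags('<p>A <b>bold</b> claim</p>') returns 'A bold claim'.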
# Convert frontend/src/messages/en.js to JSON using a tool like https://www.convertonline.io/convert/js-to-json
with open('en.json') as f:
    data = json.load(f)
toc = {
    'resourceLevel': [
        'coherent',
        'consistent',
        'reference',
    ],
    'datasetLevel': [
        'distribution',
        'misc',
        'reference',
        'consistent',
    ],
}
replacements = {
    'field.exists': 'fieldDetail.coverage.exists',
    'field.non_empty': 'fieldDetail.coverage.non_empty',
    'field.': 'fieldDetail.quality.',
    'resource.': 'resourceLevel.',
    'dataset.': 'datasetLevel.',
}
field_replacements = {
    '.name': '.count_header',
    '.description': '.count_header_tooltip',
}

dataset_replacements = {
    '.description': '.description_long',
}
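# Before tags are stripped, replace block-level HTML with plain-text equivalents:
# paragraph and list boundaries become newlines, and list items get "- " bullets,
# so the resulting msgstr values stay readable as plain text.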
html_replacements = {
    r'<p>': '\n',
    r'</p>': '\n',
    r'<ul>': '\n',
    r'</ul>': '',
    r'<li>': '- ',
    r'</li>': '\n',
}
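# A msgid is a dot-separated key in the backend's namespace. The replacement tables above
# rename it into en.json's namespace, and the dots are then converted into a JSON Pointer.
# For example (hypothetical msgid, for illustration only):
#   'resource.coherent.dates.name' -> 'resourceLevel.coherent.dates.name'
#   -> '/resourceLevel/coherent/dates/name'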
filename = 'backend/dqt/locale/en/LC_MESSAGES/django.po'
po = polib.pofile(filename)
for entry in po:
    pointer = entry.msgid
    for old, new in replacements.items():
        pointer = pointer.replace(old, new)
    if pointer.startswith('fieldDetail.'):
        for old, new in field_replacements.items():
            pointer = pointer.replace(old, new)
    if pointer.startswith('datasetLevel.'):
        for old, new in dataset_replacements.items():
            pointer = pointer.replace(old, new)
    pointer = '/' + pointer.replace('.', '/')
    try:
        message = resolve_pointer(data, pointer)
        for pattern, replacement in html_replacements.items():
            message = re.sub(pattern, replacement, message)
        entry.msgstr = strip_tags(message).strip()
    except JsonPointerException as e:
        print(f'{entry.msgid} -> {pointer}: {e}')
po.save(filename)
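# Write en.html, the source for the Google Doc: an <h2> per section, an <h3> per category,
# and an <h4> heading plus a description paragraph per check.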
with open('en.html', 'w') as f:
    f.write('<h2>{}</h2>'.format(data['field']['all'][4:]))
    f.write(convert(data['field']['description']))
    for key in ('coverage', 'quality'):
        obj = data['fieldDetail'][key]
        obj.pop('failureSamplesPrefix')
        f.write('<h3>{}</h3>'.format(obj.pop('label')))
        for check in obj.values():
            f.write('<h4>{name}</h4><p>{description}</p>'.format(
                name=check['count_header'], description=convert(check['count_header_tooltip'])))

    for level, types in toc.items():
        f.write('<h2>{}</h2>'.format(data[level]['subheadline'][4:]))
        f.write(convert(data[level]['description']))
        for category in types:
            label = data[level][category].pop('categoryName', None)
            if label:
                f.write('<h3>{}</h3>'.format(label))
            for key, check in data[level][category].items():
                f.write('<h4>{name}</h4><p>{description}</p>'.format(
                    name=check['name'], description=convert(check.get('description_long', check['description']))))

    f.write('<h2>{}</h2>'.format(data['timeLevel']['subheadline'][4:]))
    f.write(convert(data['timeLevel']['description']))
    for key in ('ocid', 'phase_stable', 'tender_title'):
        check = data['timeLevel'][key]
        f.write('<h4>{name}</h4><p>{description}</p>'.format(
            name=check['name'], description=convert(check['descriptionLong'])))
# If we want to build a whitelist of tags recognized by Google Docs, we can use this file to test what's recognized:
# https://github.com/cbracco/html5-test-page
with open('en.html') as f:
    print("The tags used in en.html which need to be supported by Google Docs are:")
    for tag in sorted(set(re.findall(r'<[^=/][^>]*>', f.read()))):
        print(tag)
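# Write en.csv, the source for the Google Sheet: one row per resource-level check, with the
# level, the category (or category.key), the check's name and its description.
# Only resource-level checks are exported here.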
with open('en.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    level = 'resourceLevel'
    writer.writerow(['resource', '', data[level]['subheadline'][4:], data[level]['description']])
    for category in toc[level]:
        writer.writerow(['resource', category, data[level][category].get('categoryName', None), ''])
        for key, check in data[level][category].items():
            if key == 'categoryName':
                continue
            writer.writerow([
                'resource',
                f'{category}.{key}',
                check['name'],
                check.get('description_long', check['description']),
            ])