Created
July 24, 2015 16:05
-
-
Save DanielWeitzenfeld/f13bd8892dbe091eaa38 to your computer and use it in GitHub Desktop.
Script to calculate annotation coverage on a LookML repo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import yaml | |
import pandas as pd | |
# Generates CSVs with aggregate stats about your LookML:
#   % of dimensions and measures that have descriptions
#   % of dashboard elements with a note
# First arg should be the path to your LookML repo, e.g.:
#   python looker_annotation_coverage.py ~/PycharmProjects/looker-q
# Keys pulled out of every dimension/measure entry of a view file.
# Root-level keys are given as plain strings.
VIEW_COLUMNS = [
    'hidden',
    'dimension',
    'measure',
    'description',
    'type',
]
# Keys pulled out of every dashboard element.
# Root-level keys are plain strings; a nested path is a tuple of keys,
# outermost first.
DASHBOARD_COLUMNS = [
    'model',
    'name',
    'title',
    ('note', 'text'),
]
def flatten_nested_dict(dict, list_of_paths):
    """Extract the requested values from a possibly nested dict.

    Args:
        dict: the source mapping to read from.
        list_of_paths: paths to extract. A str names a root-level key; a
            tuple of str names a nested path, outermost key first.

    Returns:
        A flat dict with one entry per path. A str path keeps its own name
        as the key; a tuple path is keyed by its parts joined with '_'
        (e.g. ('note', 'text') -> 'note_text'). The value is whatever
        extract_value_from_dict returns for that path (None when absent).
    """
    return {
        (p if isinstance(p, str) else '_'.join(p)):
            extract_value_from_dict(dict, p)
        for p in list_of_paths
    }
def extract_value_from_dict(dict, key_or_keys):
    """Dig into a dict as directed by key_or_keys and return the value.

    Args:
        dict: the mapping to read from.
        key_or_keys: a str naming a root-level key, or a tuple of keys
            describing a nested path, outermost key first.

    Returns:
        The value found at the end of the path, or None when any key along
        the path is missing, when an intermediate value cannot be indexed
        by the next key (e.g. it is a string, a list or None), when the
        path is an empty tuple, or when key_or_keys is neither a str nor a
        tuple.
    """
    if isinstance(key_or_keys, str):
        path = (key_or_keys,)
    elif isinstance(key_or_keys, tuple):
        path = key_or_keys
    else:
        return None
    if not path:
        return None

    value = dict
    for key in path:
        # Index directly instead of testing `key in value` first: on a
        # string or list the membership test is a substring/element check
        # and can succeed even though the value is not a mapping.
        try:
            value = value[key]
        except (KeyError, TypeError, IndexError):
            return None
    return value
def _flattened_column_names(list_of_paths):
    """Return the column name flatten_nested_dict produces for each path."""
    return [p if isinstance(p, str) else '_'.join(p) for p in list_of_paths]


def _load_lookml(path):
    """Parse one LookML YAML file and return its first top-level entry.

    Returns an empty dict when the file parses to nothing (empty file or
    empty top-level list).
    """
    with open(path) as handle:
        # safe_load instead of yaml.load: the files are plain YAML data, and
        # yaml.load without an explicit Loader can construct arbitrary
        # Python objects (and is rejected outright by recent PyYAML).
        documents = yaml.safe_load(handle)
    return documents[0] if documents else {}


def _to_ascii(series):
    """Drop non-ASCII characters from a string column, keeping text (not bytes)."""
    return series.str.encode('ascii', errors='ignore').str.decode('ascii')


def _collect_view_fields(dirname, views):
    """Build one DataFrame with a row per dimension/measure of every view."""
    columns = _flattened_column_names(VIEW_COLUMNS)
    frames = []
    for filename in views:
        lookml = _load_lookml(os.path.join(dirname, filename))
        fields = lookml.get('fields') or []
        # Dimensions first, then measures, for each view.
        dimensions = [f for f in fields if 'dimension' in f]
        measures = [f for f in fields if 'measure' in f]
        rows = [flatten_nested_dict(f, VIEW_COLUMNS)
                for f in dimensions + measures]
        if not rows:
            continue  # a view without fields contributes no rows
        df = pd.DataFrame(rows, columns=columns)
        df['view_name'] = filename.replace('.view.lookml', '')
        frames.append(df)
    if not frames:
        # pd.concat raises on an empty list; keep the expected columns.
        return pd.DataFrame(columns=columns + ['view_name'])
    return pd.concat(frames)


def _collect_dashboard_elements(dirname, dashboards):
    """Build one DataFrame with a row per element of every dashboard."""
    columns = _flattened_column_names(DASHBOARD_COLUMNS)
    frames = []
    for filename in dashboards:
        lookml = _load_lookml(os.path.join(dirname, filename))
        elements = lookml.get('elements') or []
        rows = [flatten_nested_dict(e, DASHBOARD_COLUMNS) for e in elements]
        if not rows:
            continue  # a dashboard without elements contributes no rows
        df = pd.DataFrame(rows, columns=columns)
        df['dash_name'] = filename.replace('.dashboard.lookml', '')
        frames.append(df)
    if not frames:
        # pd.concat raises on an empty list; keep the expected columns.
        return pd.DataFrame(columns=columns + ['dash_name'])
    return pd.concat(frames)


def main(argv=None):
    """Compute annotation coverage for a LookML repo and write four CSVs.

    Args:
        argv: argument vector; defaults to sys.argv. argv[1] must be the
            path of the directory holding the *.lookml files.

    Writes, into the current working directory:
        all_looker_view_fields.csv, all_looker_dashboard_elements.csv,
        looker_view_stats.csv and looker_dash_stats.csv.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: python looker_annotation_coverage.py '
                 'PATH_TO_LOOKML_REPO')
    dirname = argv[1]

    lookml_files = [f for f in os.listdir(dirname) if f.endswith('.lookml')]
    dashboards = [f for f in lookml_files if f.endswith('.dashboard.lookml')]
    views = [f for f in lookml_files if f.endswith('.view.lookml')]

    # Views
    all_views = _collect_view_fields(dirname, views)
    all_views['is_dimension'] = all_views['dimension'].notnull()
    # NOTE(review): any explicit `hidden` value (including a false one)
    # marks the field as hidden here -- confirm that is intended.
    all_views['is_hidden'] = all_views['hidden'].notnull()
    all_views['has_description'] = all_views['description'].notnull()

    # Dashboards
    all_dash = _collect_dashboard_elements(dirname, dashboards)
    all_dash['has_note_text'] = all_dash['note_text'].notnull()
    all_dash['title'] = _to_ascii(all_dash['title'])
    all_dash['note_text'] = _to_ascii(all_dash['note_text'])

    # Aggregate stats
    g = all_views[~all_views['is_hidden']].groupby('view_name')
    view_stats = pd.DataFrame({'n_unhidden_fields': g.size(),
                               'percent_described': g.has_description.mean()})
    g = all_dash.groupby('dash_name')
    dash_stats = pd.DataFrame({'n_elements': g.size(),
                               'percent_noted': g.has_note_text.mean()})

    # Write CSVs
    all_views.to_csv('all_looker_view_fields.csv', index=False)
    all_dash.to_csv('all_looker_dashboard_elements.csv', index=False)
    view_stats.to_csv('looker_view_stats.csv')
    dash_stats.to_csv('looker_dash_stats.csv')
    # Parenthesised single string: valid on both Python 2 and Python 3.
    print('Files written: \n'
          '  all_looker_view_fields.csv,\n'
          '  all_looker_dashboard_elements.csv,\n'
          '  looker_view_stats.csv,\n'
          '  looker_dash_stats.csv')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment