@callahantiff
Created October 13, 2021 18:47
PheKnowLator - repairing metadata files
# Script Purpose: this script was built to address https://github.com/callahantiff/PheKnowLator/issues/116
# import needed libraries
import json
import os
import re
import pandas as pd
import pickle
import shutil
from datetime import datetime
from google.cloud import storage # type: ignore
from rdflib import Graph, Literal, Namespace, URIRef # type: ignore
from rdflib.namespace import RDF, RDFS, OWL # type: ignore
from tqdm import tqdm
from builds.build_utilities import * # type: ignore
from pkt_kg.__version__ import __version__ # type: ignore
from pkt_kg.utils import * # type: ignore
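# NOTE: assumes Google Cloud credentials are configured (e.g., via GOOGLE_APPLICATION_CREDENTIALS)
# and that the script runs from a PheKnowLator checkout so builds/ and pkt_kg are importable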
log_dir = 'temp/'
release = 'release_v' + __version__; bucket = storage.Client().get_bucket('pheknowlator')
bucket_files = [file.name.split('/')[2] for file in bucket.list_blobs(prefix='archived_builds/' + release + '/')]
builds = [x[0] for x in [re.findall(r'(?<=_)\d.*', x) for x in bucket_files] if len(x) > 0]
sorted_dates = sorted([datetime.strftime(datetime.strptime(str(x), '%d%b%Y'), '%Y-%m-%d').upper() for x in builds])
build = 'build_' + datetime.strftime(datetime.strptime(sorted_dates[-1], '%Y-%m-%d'), '%d%b%Y').upper()
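# e.g., a blob under 'archived_builds/release_v2.0.0/build_11FEB2021/' yields build date
# '2021-02-11'; the most recent date is then converted back into the 'build_DDMONYYYY' form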
# arch_builds = [file.name for file in bucket.list_blobs(prefix='archived_builds/release_v2.0.0/build_11FEB2021/')]
# y = sorted([x for x in arch_builds if x.endswith('NodeLabels.txt') or x.endswith('node_metadata_dict.pkl') or
# x.endswith('Triples_Integer_Identifier_Map.json')])
# for i in y: print("'" + i + "'")
directories = directory_list
for i in tqdm(range(0, len(directories))):
    result_set = directories[i]; meta_dict, node_label, node_int_map = None, None, None
    print('\n\n*** PROCESSING SET: {} ***'.format('/'.join(result_set[0].split('/')[0:7])))
    for f in result_set:  # locate and download the three target files for this build
        if f.endswith('NodeLabels.txt'):
            node_label_org = '/'.join(f.split('/')[:-1]); node_label = f.split('/')[-1]
        if f.endswith('node_metadata_dict.pkl'):
            node_meta_org = '/'.join(f.split('/')[:-1]); node_meta = f.split('/')[-1]
            bucket.blob(f).download_to_filename(log_dir + node_meta)
            meta_dict = pickle.load(open(log_dir + node_meta, 'rb'))
        if f.endswith('Triples_Integer_Identifier_Map.json'):
            node_int_org = '/'.join(f.split('/')[:-1]); node_int = f.split('/')[-1]
            bucket.blob(f).download_to_filename(log_dir + node_int)
            node_int_map = json.load(open(log_dir + node_int, 'r'))
    # all three files must have downloaded successfully for the set to be processed
    if meta_dict is not None and node_label is not None and node_int_map is not None:
        # fix the node metadata dictionary: strip newlines and collapse repeated whitespace
        if len(meta_dict['nodes']) > 0 and len(meta_dict['relations']) > 0:
            print('Updating node_metadata_dict')
            temp_copy = meta_dict.copy(); meta_dict = dict()
            for key, value in temp_copy.items():
                meta_dict[key] = {}
                for e_key, val in value.items():
                    d = {k: re.sub(r'\s\s+', ' ', v.replace('\n', ' ')) if v is not None else v
                         for k, v in val.items()}
                    meta_dict[key][e_key] = d
            del temp_copy; pickle.dump(meta_dict, open(log_dir + node_meta, 'wb'), protocol=4)
            uploads_data_to_gcs_bucket(bucket, node_meta_org + '/', log_dir, node_meta)
        else: raise ValueError('node_metadata_dict is empty!')
        # regenerate the node label file from the integer identifier map and the metadata dict
        if len(node_int_map) > 0:
            print('Generating Updated NodeLabels.txt')
            # map keys are either n3-formatted uris (e.g., '<http://...>') or plain strings
            entity_type = 'n3' if list(node_int_map.keys())[0].startswith('<') else 'string'
            if entity_type != 'n3': entities = set(k for k in node_int_map.keys() if k.startswith('http'))
            else: entities = set(URIRef(k.strip('<').strip('>')) for k in node_int_map.keys() if k.startswith('<http'))
            with open(log_dir + node_label, 'w', encoding='utf-8') as out:
                out.write('entity_type' + '\t' + 'integer_id' + '\t' + 'entity_uri' + '\t' +
                          'label' + '\t' + 'description/definition' + '\t' + 'synonym' + '\n')
                for x in entities:
                    if entity_type != 'n3': nid, nint = str(x), node_int_map[str(x)]
                    else: nid, nint = x.n3(), node_int_map[x.n3()]
                    if str(x) in meta_dict['nodes'].keys(): etyp, meta = 'NODES', meta_dict['nodes'][str(x)]
                    elif str(x) in meta_dict['relations'].keys(): etyp, meta = 'RELATIONS', meta_dict['relations'][str(x)]
                    else: meta, etyp, lab, dsc, syn = None, 'NA', 'NA', 'NA', 'NA'
                    if meta is not None:
                        lab = meta['Label'] if meta['Label'] is not None else 'None'
                        dsc = meta['Description'] if meta['Description'] is not None else 'None'
                        syn = meta['Synonym'] if meta['Synonym'] is not None else 'None'
                    # the file handle is utf-8 encoded, so no extra unicode handling is needed
                    out.write(etyp + '\t' + str(nint) + '\t' + nid + '\t' + lab + '\t' + dsc + '\t' + syn + '\n')
            # verify the cleaned node list's shape, then upload it to the gcs bucket
            header_key = list(meta_dict['nodes'].keys())[0]; headings = list(meta_dict['nodes'][header_key].keys())
            df = pd.read_csv(log_dir + node_label, header=0, sep='\t'); df_columns = list(df.columns)
            if len(headings) + 3 == len(df_columns) and len(df) > 0:
                uploads_data_to_gcs_bucket(bucket, node_label_org + '/', log_dir, node_label)
                shutil.rmtree(log_dir); os.mkdir(log_dir)  # reset the temp directory
            else: print(df_columns); raise ValueError('DF has the incorrect number of columns')
        else: raise ValueError('Triples_Integer_Identifier_Map.json is empty!')
    else: raise ValueError('One of the files did not download correctly!')
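
# NOTE: uploads_data_to_gcs_bucket() is imported from builds.build_utilities; a minimal
# sketch of the behavior assumed here (copy a local file into the given bucket location):
#
# def uploads_data_to_gcs_bucket(bucket, bucket_location, directory, filename):
#     blob = bucket.blob(bucket_location + filename)
#     blob.upload_from_filename(directory + filename)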
@callahantiff (Author):
The directory_list input used by the script looks as follows (a sketch for deriving it automatically follows the listing):

directory_list = [
['archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/inverse_relations/owl/PheKnowLator_v2.0.0_full_instance_inverseRelations_OWL_NodeLabels.txt', 
'archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/inverse_relations/owl/PheKnowLator_v2.0.0_full_instance_inverseRelations_OWL_Triples_Integer_Identifier_Map.json', 
'archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/inverse_relations/owl/node_metadata_dict.pkl'], 
['archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/inverse_relations/owlnets/PheKnowLator_v2.0.0_full_instance_inverseRelations_OWLNETS_INSTANCE_purified_NodeLabels.txt', 
'archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/inverse_relations/owlnets/PheKnowLator_v2.0.0_full_instance_inverseRelations_OWLNETS_INSTANCE_purified_Triples_Integer_Identifier_Map.json', 
'archived_builds/release_v2.0.0/build_11FEB2021/knowledge_graphs/instance_builds/inverse_relations/owlnets/node_metadata_dict.pkl']
]
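
For reference, here is a minimal sketch (assuming the bucket layout shown above and following the commented-out snippet in the script) that derives these file triples from the bucket listing rather than hard-coding them:

from collections import defaultdict
prefix = 'archived_builds/release_v2.0.0/build_11FEB2021/'
targets = ('NodeLabels.txt', 'node_metadata_dict.pkl', 'Triples_Integer_Identifier_Map.json')
keep = sorted(f.name for f in bucket.list_blobs(prefix=prefix) if f.name.endswith(targets))
groups = defaultdict(list)  # group the matched files by their parent directory
for f in keep: groups['/'.join(f.split('/')[:-1])].append(f)
directory_list = [sorted(v) for v in groups.values()]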

@callahantiff (Author):
This Gist is part of the solution for issue #116 in the PheKnowLator repository.
