Skip to content

Instantly share code, notes, and snippets.

@nickstenning
Created July 29, 2015 16:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nickstenning/4f51236739b4b1ad2177 to your computer and use it in GitHub Desktop.
Save nickstenning/4f51236739b4b1ad2177 to your computer and use it in GitHub Desktop.
{
"has_multiple_targets": 5,
"has_target": 44430,
"has_uri": 57754,
"target_missing_sources": 553,
"target_source_uri_mismatch": 1485,
"total": 57760
}
from __future__ import print_function
from collections import defaultdict
import fileinput
import json
def main():
    """Tally target/URI statistics for annotation documents and print them.

    Reads newline-delimited JSON documents through ``fileinput`` (the files
    named on the command line, or standard input when none are given).
    Documents whose ``_type`` is not ``'annotation'`` are ignored; for the
    rest, the ``_source`` payload is inspected and a set of counters is
    updated. The counters are finally printed to standard output as an
    indented JSON object with sorted keys.
    """
    # Skip whitespace-only lines (e.g. a trailing newline at the end of a
    # dump) so json.loads is never handed an empty document.
    docs = (json.loads(line) for line in fileinput.input() if line.strip())
    # defaultdict(int) means a category that never occurs is simply absent
    # from the printed output rather than reported as 0.
    stats = defaultdict(int)

    for doc in docs:
        if doc['_type'] != 'annotation':
            continue

        ann = doc['_source']

        # Optional filter, left disabled: ignore annotations whose 'updated'
        # value sorts before '2014-01-01'.
        # if ann['updated'] < '2014-01-01':
        #     continue

        stats['total'] += 1

        # Number of dict-valued entries in the annotation's target list.
        targets = num_targets(ann)
        if targets > 0:
            stats['has_target'] += 1
        if targets > 1:
            stats['has_multiple_targets'] += 1
        if targets > 0 and not targets_all_objects(ann):
            stats['target_not_objects'] += 1
        if targets > 0 and not targets_all_have_sources(ann):
            stats['target_missing_sources'] += 1

        uri = has_uri(ann)
        if uri:
            stats['has_uri'] += 1
        # Only meaningful when both a target and a uri exist to compare.
        if targets > 0 and uri and not target_sources_match_uri(ann):
            stats['target_source_uri_mismatch'] += 1

    print(json.dumps(stats, indent=4, sort_keys=True))
def num_targets(ann):
    """Return the number of dict-valued entries in ``ann['target']``.

    Returns 0 when ``ann`` has no ``'target'`` key or when the value stored
    under it is not a list. List entries that are not dicts are not counted.
    """
    if 'target' not in ann:
        return 0

    targets = ann['target']
    if not isinstance(targets, list):
        return 0

    return sum(1 for target in targets if isinstance(target, dict))
def targets_all_objects(ann):
    """Return True if every entry of ``ann['target']`` is a dict.

    An empty ``ann['target']`` yields True. Raises ``KeyError`` if ``ann``
    has no ``'target'`` key.
    """
    return all(isinstance(target, dict) for target in ann['target'])
def targets_all_have_sources(ann):
    """Return True if every entry of ``ann['target']`` is a dict with a
    ``'source'`` key.

    A non-dict entry counts as a failure, so the result is False as soon as
    one is encountered. An empty ``ann['target']`` yields True. Raises
    ``KeyError`` if ``ann`` has no ``'target'`` key.
    """
    return all(
        isinstance(target, dict) and 'source' in target
        for target in ann['target']
    )
def has_uri(ann):
    """Report whether ``ann`` contains a ``'uri'`` key."""
    uri_present = 'uri' in ann
    return uri_present
def target_sources_match_uri(ann):
    """Return True if every dict target's ``'source'`` equals ``ann['uri']``.

    Entries of ``ann['target']`` that are not dicts are skipped rather than
    treated as mismatches. A dict entry without a ``'source'`` key, or whose
    ``'source'`` differs from ``ann['uri']``, makes the result False. With no
    dict entries at all the result is True. Raises ``KeyError`` if ``ann``
    lacks ``'uri'`` or ``'target'``.
    """
    uri = ann['uri']
    return all(
        'source' in target and target['source'] == uri
        for target in ann['target']
        if isinstance(target, dict)  # non-dict entries are ignored here
    )
# Run the statistics report only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment