"""Report how many evaluation-set words occur in a SQLite ``vocabulary`` table.

Reads term and MeSH columns from the evaluation CSV, looks the resulting
symbols up in the database and prints coverage statistics.
"""
import csv
import sqlite3
from contextlib import closing

eval_file = "data/eval/MayoSRS_mesh.csv"
db_file = "data/pubtator/pubtator-20190725-6496be10.db"

# SQLite caps the number of bound parameters per statement (999 by default in
# older builds), so lookups are issued in chunks that stay well below that.
_MAX_SQL_PARAMS = 500


def _load_words(csv_path):
    """Return the list of lookup symbols extracted from the evaluation CSV.

    For every row, the ``TERM1``/``TERM2`` cells are lower-cased and split on
    whitespace, and every ``MESH1``/``MESH2`` cell other than the literal
    string ``'None'`` is lower-cased and prefixed with ``'\u03B5mesh_'``.
    Duplicates are kept; callers de-duplicate as needed.

    Raises:
        KeyError: if one of the four expected columns is missing.
    """
    words = []
    # newline='' is what the csv module recommends so quoted embedded
    # newlines are parsed correctly.
    with open(csv_path, newline='') as f:
        for row in csv.DictReader(f):
            for term in ('TERM1', 'TERM2'):
                words.extend(row[term].lower().split())
            for mesh in ('MESH1', 'MESH2'):
                if row[mesh] != 'None':
                    words.append('\u03B5mesh_' + row[mesh].lower())
    return words


def _lookup_vocabulary(conn, symbols):
    """Return all ``vocabulary`` rows whose ``symbol`` is in *symbols*, ordered by id.

    Values are bound as parameters instead of being pasted into the SQL text:
    a double-quoted token that happens to equal a column name would otherwise
    be resolved as that column (matching every row), and a token containing a
    quote would break the statement.
    """
    symbols = list(symbols)
    rows = []
    for start in range(0, len(symbols), _MAX_SQL_PARAMS):
        chunk = symbols[start:start + _MAX_SQL_PARAMS]
        placeholders = ','.join('?' * len(chunk))
        rows.extend(conn.execute(
            f'select * from vocabulary where symbol in ({placeholders}) order by id;',
            chunk,
        ))
    if len(symbols) > _MAX_SQL_PARAMS:
        # Each chunk arrives ordered by id; merging several chunks needs a
        # re-sort. Assumes column 0 is the id, as the rank report below does
        # -- TODO confirm against the table schema.
        rows.sort(key=lambda r: r[0])
    return rows


def main(eval_path=eval_file, db_path=db_file):
    """Print vocabulary coverage statistics for the evaluation words.

    Args:
        eval_path: CSV file with ``TERM1``, ``TERM2``, ``MESH1``, ``MESH2`` columns.
        db_path: SQLite database containing a ``vocabulary`` table with
            ``id`` and ``symbol`` columns.
    """
    unique_words = set(_load_words(eval_path))

    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = _lookup_vocabulary(conn, unique_words)

    # Column 1 is treated as the symbol text (presumably the schema is
    # (id, symbol, ...); verify against the database).
    found = {r[1] for r in rows}

    # Guard the empty case and format as a real percentage; the raw fraction
    # followed by '%' under-reported coverage by a factor of 100.
    ratio = len(rows) / len(unique_words) if unique_words else 0.0
    print(f'total words found {len(rows)}/{len(unique_words)} ({ratio:.2%})')

    if rows:  # nothing to rank when no word was found
        print(f'word ranks range from {rows[0][0]} - {rows[-1][0]}')
        print('10 less frequent words are ')
        for w in rows[-10:]:
            print(w)
    print(f'Not found words are {unique_words - found}')


if __name__ == '__main__':
    main()
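# NOTE: the lines below are web-page footer text captured along with the script; they are not code.
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment
# This comment has been minimized.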