Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import csv
import sqlite3
# Vocabulary coverage check: collect every term word and MeSH code used in the
# MayoSRS evaluation CSV and report which of them exist in the `vocabulary`
# table of the PubTator SQLite database.
eval_file = "data/eval/MayoSRS_mesh.csv"
db_file = "data/pubtator/pubtator-20190725-6496be10.db"

# Gather lookup symbols: lower-cased, whitespace-split term words plus MeSH
# codes rewritten as "\u03B5mesh_<code>" (the literal string 'None' marks a
# missing MeSH code in the CSV and is skipped).
words = []
# newline='' is what the csv module requires so that newlines embedded in
# quoted fields are parsed correctly.
with open(eval_file, newline='') as f:
    csv_reader = csv.DictReader(f)
    for row in csv_reader:
        for term in ['TERM1', 'TERM2']:
            words.extend(row[term].lower().split())
        for mesh in ['MESH1', 'MESH2']:
            if row[mesh] != 'None':
                words.append('\u03B5mesh_' + row[mesh].lower())

# De-duplicate once; every count and set difference below is over unique words.
unique_words = set(words)
# Sorted only to make the bound-parameter order deterministic between runs.
query_words = sorted(unique_words)

# Bind the words as parameters instead of splicing them into the SQL text:
# a word containing a double quote would break the statement, and SQLite
# resolves a double-quoted token as an identifier first, so a word equal to a
# column name would silently be compared against that column.
# NOTE: this uses one bound parameter per word; SQLite caps the number of
# parameters per statement (SQLITE_MAX_VARIABLE_NUMBER), so a much larger
# word list would need to be queried in batches.
placeholders = ','.join('?' * len(query_words))
conn = sqlite3.connect(db_file)
try:
    c = conn.cursor()
    c.execute(
        f'select * from vocabulary where symbol in ({placeholders}) order by id;',
        query_words,
    )
    rows = c.fetchall()
finally:
    # Always release the database handle, even if the query fails.
    conn.close()

# Column 1 of each row is read as the matched symbol, column 0 as its rank
# (rows are ordered by id).
found = [r[1] for r in rows]

total = len(unique_words)
# Guard the empty-input case instead of dividing by zero.
coverage = len(rows) / total if total else 0.0
# ':.2%' scales the ratio by 100, so the printed number really is a percentage.
print(f'total words found {len(rows)}/{total} ({coverage:.2%})')
if rows:
    # Only meaningful when at least one word matched; indexing an empty
    # result would raise IndexError.
    print(f'word ranks range from {rows[0][0]} - {rows[-1][0]}')
    print('10 less frequent words are ')
    # The last rows carry the highest ids (presumably the rarest words).
    for w in rows[-10:]:
        print(w)
print(f'Not found words are {unique_words - set(found)}')
@romanegloo

This comment has been minimized.

Copy link
Owner Author

@romanegloo romanegloo commented Aug 14, 2019

total words found 384/397 (0.9672544080604534%)
word ranks range from 1060 - 521307
10 less frequent words are
(72553, 'drawer', 0, 2192)
(89889, 'hypomotility', 0, 1171)
(110254, 'splinter', 0, 686)
(117604, 'antalgic', 0, 585)
(167687, 'arrythmia', 0, 259)
(177792, 'crohns', 0, 229)
(366093, 'raynauds', 0, 58)
(386631, 'prothombin', 0, 53)
(503762, 'myocaridal', 0, 34)
(521307, 'dysguesia', 0, 32)
Not found words are {'nodusum', 'gastrosomy', 'hemipareisis', 'arthriits', 'haletosis', 'buterfly', 'varicsoe', 'heberdens', '(ana)', 'εmesh_c562577', 'hemetemesis', 'dysparunia', 'celluitis'}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment