Skip to content

Instantly share code, notes, and snippets.

@lakeparkXPA
Last active November 14, 2023 01:27
Show Gist options
  • Save lakeparkXPA/51e58e2391ce2cc6676dddf684e0cdc4 to your computer and use it in GitHub Desktop.
Save lakeparkXPA/51e58e2391ce2cc6676dddf684e0cdc4 to your computer and use it in GitHub Desktop.
Word2Vec model of drug side effects
from gensim.models import Word2Vec
import pymssql
import numpy as np
import pandas as pd
import pickle
# Connection settings for the FAERS database served on localhost.
# NOTE(review): the credentials are hard-coded in the source.
db_settings = {
    'server': 'localhost',
    'user': 'junha',
    'password': 'password',
    'database': 'FAERS',
    'port': 1433,
}
conn = pymssql.connect(**db_settings)
# Cursor used to run the extraction query.
cur = conn.cursor()
print('SQL Data Loading...')
# Labels for the result columns, in the same order as the SELECT list below.
col = ['demo.primaryid', 'demo.event_dt', 'th.start_dt', 'demo.age', 'demo.age_cod', 'demo.sex', 'indi.indi_pt',
       'dr.prod_ai', 're.pt', 'ot.outc_cod']
# One row per distinct (report, drug, reaction, outcome, ...) combination.
# Drug and reaction rows are required (inner joins); outcome, therapy and
# indication are optional (left outer joins), so their columns may be NULL.
# Only drugs with a non-null PROD_AI and ROLE_COD 'PS' or 'SS' are kept.
sql = """select distinct demo.primaryid,demo.EVENT_DT, th.START_DT, demo.age, demo.age_cod, demo.sex, indi.indi_pt, dr.prod_ai, re.pt, ot.OUTC_COD
from dbo.demographic as demo
inner join dbo.drug as dr
on dr.PRIMARYID = demo.PRIMARYID
inner join dbo.reaction as re
on re.PRIMARYID = demo.PRIMARYID
left outer join dbo.outcome as ot
on ot.PRIMARYID = demo.PRIMARYID
left outer join dbo.therapy as th
on th.PRIMARYID = dr.PRIMARYID and th.DSG_DRUG_SEQ = dr.DRUG_SEQ
left outer join dbo.indication as indi
on indi.PRIMARYID = dr.PRIMARYID and indi.INDI_DRUG_SEQ = dr.DRUG_SEQ
where (dr.PROD_AI is not null and (dr.ROLE_COD = 'PS' or dr.ROLE_COD = 'SS'))"""
try:
    cur.execute(sql)
    data = cur.fetchall()
finally:
    # All rows are in memory now (or the query failed); nothing later in this
    # script touches the database, so release the cursor and the connection
    # instead of leaving them open for the rest of the run.
    cur.close()
    conn.close()
print('SQL Data Loaded!')
# Convert the fetched rows into a pandas DataFrame, which is easier to handle.
# Passing the labels to the constructor also works for an empty result set,
# where assigning `df.columns` afterwards would fail on a length mismatch.
df = pd.DataFrame(data, columns=col)
# drug -> list of distinct reaction terms reported with it (filled by the
# transform step, pickled at the end of the script).
drug_dt = {}
# drug -> {reaction term: number of rows seen} (filled by the transform step).
drug_ct = {}
# Not read or written anywhere else in this script.
indi_dt = {}
# Divisor that turns an age expressed in the given unit into years
# (e.g. 24 'MON' / 12 -> 2 years, 3 'DEC' / 0.1 -> 30 years).
age_cod = {'DEC' : 0.1, 'YR' : 1, 'MON' : 12,'WK' : 52.1429, 'DY' : 365, 'HR' : 8760, None : None}
# Display labels for the sex codes; not read anywhere else in this script.
sex = {'UNK' : 'Unknown', 'M' : 'Male', 'F' : 'Female', None : None}
# Normalise every missing value (NaN/NaT) to None.
# `df.replace(np.nan, None)` is not used because on pandas < 1.4 an explicit
# `value=None` is ignored and the call forward-fills from the previous row
# instead. Casting to object first keeps numeric columns from turning the
# inserted None back into NaN.
df = df.astype(object).where(df.notna(), None)
side_lst = []
print('Transforming To Form...')


def _date_digits(value):
    """Return a partial date (YYYY, YYYYMM or YYYYMMDD) as a plain digit string."""
    # Going through float tolerates values such as '2015.0' or 2015.0.
    return str(int(float(value)))


def _event_not_before_start(event_dt, start_dt):
    """Return True when the event date is on or after the therapy start date.

    Both dates may be given with year, month or day precision. They are
    compared at the precision of the coarser one, so a year-only start date
    is compared with the year of the event only. Rows where either date is
    missing are accepted because there is nothing to compare.
    """
    if pd.isna(event_dt) or pd.isna(start_dt):
        return True
    event = _date_digits(event_dt)
    start = _date_digits(start_dt)
    shortest = min(len(event), len(start))
    # Snap to the 4/6/8 digit precision both values can provide. Comparing
    # prefixes of different lengths (e.g. 201401 vs 2015) would always favour
    # the longer number, regardless of the actual dates.
    if shortest < 6:
        precision = 4
    elif shortest < 8:
        precision = 6
    else:
        precision = 8
    return int(event[:precision]) >= int(start[:precision])


def _age_in_years(age, unit):
    """Return the age converted to whole years as a string token, or None.

    None is returned when the age is missing or when its unit is missing or
    not listed in `age_cod`, so the caller can leave the token out.
    """
    if pd.isna(age):
        return None
    divisor = age_cod.get(unit)
    if divisor is None:
        return None
    return str(int(float(age) / divisor))


_sentence_fields = ['demo.event_dt', 'th.start_dt', 'demo.age', 'demo.age_cod', 'demo.sex',
                    'indi.indi_pt', 'dr.prod_ai', 're.pt', 'ot.outc_cod']
# Columns are selected by name, so the unpacking below does not depend on the
# column order of `df`; itertuples yields plain tuples without building a
# Series per row.
for (event_dt, start_dt, age, age_unit, sex_code,
     indi, drug, side, out) in df[_sentence_fields].itertuples(index=False, name=None):
    # Keep only rows where event_dt >= start_dt.
    if not _event_not_before_start(event_dt, start_dt):
        continue
    # Build the "sentence": age in years, sex, indication, drug, reaction,
    # outcome. Missing values are left out rather than added as None tokens.
    tokens = [_age_in_years(age, age_unit), sex_code, indi, drug, side, out]
    side_lst.append([tok for tok in tokens if not pd.isna(tok)])
    # Per-drug side-effect dictionaries: drug_dt keeps each reaction once in
    # first-seen order, drug_ct counts how many rows reported it.
    reaction_counts = drug_ct.setdefault(drug, {})
    if side not in reaction_counts:
        drug_dt.setdefault(drug, []).append(side)
    reaction_counts[side] = reaction_counts.get(side, 0) + 1
print('Transform Complete!')
print('Side2Vec...')
# CBOW (sg=0) embeddings with 200 dimensions over the sentences built above.
w2v_options = dict(window=3, min_count=1, workers=4, sg=0, seed=123)
try:
    # gensim >= 4.0 names the dimensionality parameter `vector_size`.
    model = Word2Vec(sentences=side_lst, vector_size=200, **w2v_options)
except TypeError:
    # gensim < 4.0 rejects `vector_size` and expects `size` instead.
    model = Word2Vec(sentences=side_lst, size=200, **w2v_options)
# Save the model.
model.save('/home/junha/python/side2vec.model')
# Save the side effects recorded per drug. `with` closes the file even when
# pickling fails.
with open('/home/junha/pickle/drug_dt.pkl', 'wb') as f:
    pickle.dump(drug_dt, f)
# Save the per-drug side-effect count dictionary.
with open('/home/junha/pickle/drug_ct.pkl', 'wb') as f:
    pickle.dump(drug_ct, f)
print('Side2Vec Done!')
def side_print(drug_name, side_effect='Thyroiditis', topn=100):
    """Print the drugs whose vectors are closest to `drug_name` + `side_effect`.

    The drug name is printed first, followed by one "token score" line for
    every neighbour returned by the model that is itself a key of `drug_dt`
    (i.e. a drug); other neighbouring tokens are skipped.

    Args:
        drug_name: Token of the drug to query.
        side_effect: Second positive token added to the query.
        topn: Number of neighbours requested from the model before filtering.
    """
    print(drug_name)
    try:
        neighbours = model.wv.most_similar(positive=[drug_name, side_effect], topn=topn)
    except KeyError as exc:
        # A token missing from the vocabulary should not abort the whole
        # report; say so and move on to the next drug.
        print('  skipped:', exc)
        return
    for token, score in neighbours:
        if token in drug_dt:
            print(token, score)
# Drugs to report on, one side_print() call each, in this order.
io_lst = [
    'IPILIMUMAB',
    'NIVOLUMAB',
    'PEMBROLIZUMAB',
    'DURVALUMAB',
    'AVELUMAB',
    'ATEZOLIZUMAB',
    'CEMIPLIMAB',
]
for drug in io_lst:
    side_print(drug)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment