Skip to content

Instantly share code, notes, and snippets.

@vsoch
Last active December 31, 2015 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsoch/7952145 to your computer and use it in GitHub Desktop.
Save vsoch/7952145 to your computer and use it in GitHub Desktop.
clinPheno.py connects to a database of clinical data and a data dictionary, searches the data dictionary (all fields) for strings of interest, then queries the clinical data for all matches, and prints two text files: 1) a text file of data, with questions in columns and items in rows, and 2) a text file of data dictionary fields. See the script header for details.
#!/usr/bin/python
"""
clinPheno: Methods for querying a SQL database of clinical
variables to establish a clinical phenotype. The script will
search a data dictionary (database) for a set of user specified
terms (in some input.txt file), and then extract the matches
from the clinical database, for import into R (see clinPhenoR.R)
# SAMPLE INPUT.TXT file:
# One search term per line, capitalization not important
eye
gaze
# SAMPLE USAGE:
import clinPheno as cp
# Create database object for clinical data
hostname = "my.database.com"
user = "user"
dbname = "dbname"
password = "password"
db = cp.Database(hostname,user,password,dbname)
# Create database object for data dictionary
hostname = "my.data.dictionary"
user = "user"
dbname = "dbname"
password = "password"
dd = cp.Database(hostname,user,password,dbname)
# Create Phenotype object from the databases
pheno = cp.Phenotype(db,dd)
# This is the table name for the data dictionary
ddTableName = "data_dictionary"
# Now, search the data dictionary using our input file
inputfile = "input.txt";
pheno.search(inputfile,ddTableName)
# Parse the data - you can specify multiple subject ID columns, and
# each will be tried. Be sure that no table has both!
subcolname = ["individual_id","subject_id"]
tableNameidx = 0 # col index with table names in data dictionary
quesNameidx = 1 # col index with column (field) names
pheno.parse(subcolname,tableNameidx,quesNameidx)
# Print results to file
pheno.printf("SFARI_eyegaze")
"""
__author__ = "Vanessa Sochat (vsochat@stanford.edu)"
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2013/12/10 $"
__license__ = "Python"
import os
import sys
import re
import MySQLdb
# DATABASE ----------------------------------------------------------------------------------
# A database object manages connections to the database
class Database:
    """Manages connections to a MySQL database and simple metadata queries.

    Each call to query() opens a fresh connection, runs one statement,
    fetches all rows, and closes the connection again, so no long-lived
    handle is kept between calls.
    """

    def __init__(self, host, user, pw, name):
        self.host = host      # database server hostname
        self.user = user      # login user
        self.password = pw    # login password
        self.dbname = name    # database (schema) name
        # Eagerly enumerate the tables in this schema (also validates creds)
        self.tables = self.getTables()

    # When user prints object, show all fields
    def __str__(self):
        return ("<host:" + self.host + ">" + "<user:" + self.user + ">" +
                "<pw:" + self.password + ">" + "<name:" + self.dbname + ">" +
                "<tables:" + str(len(self.tables)) + ">")

    # Connect to database
    def __connect__(self):
        """Open a connection; on failure print the reason and exit."""
        try:
            self.db = MySQLdb.connect(self.host, self.user, self.password, self.dbname)
        except MySQLdb.Error as e:
            # Catch only MySQL errors (the old bare except also hid plain
            # coding mistakes) and report why the connection failed.
            print("Could not connect to database (" + str(e) + "). Check credentials and try again.")
            sys.exit()

    # Get list of tables
    def getTables(self):
        """Return the names of all tables belonging to self.dbname."""
        temp = self.query("SELECT * FROM INFORMATION_SCHEMA.TABLES;")
        # row[1] is TABLE_SCHEMA, row[2] is TABLE_NAME
        return [t[2] for t in temp if t[1] == self.dbname]

    # Close connection to database
    def __disconnect__(self):
        self.db.close()

    # Get data dictionary
    def getDataDict(self, dicttablename):
        """Cache the data dictionary's column names (self.fields) and all of
        its rows (self.datadict).

        Note: the old version called __connect__() here as well, leaking a
        connection, because query() opens (and closes) its own.
        """
        # Parameterized value instead of string-built SQL
        temp = self.query(
            "SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = %s;",
            (dicttablename,))
        self.fields = [t[3] for t in temp]  # t[3] is COLUMN_NAME
        # We can't be certain about data dictionary format, so get all data fields.
        # Identifier names cannot be bound as parameters; dicttablename must
        # come from trusted code, never user input.
        self.datadict = self.query("SELECT * FROM " + dicttablename + ";")

    # get all column information from a table with unique values, etc.
    def getTableCols(self, tablename):
        """Return parallel lists (dbnames, tablenames, colnames, datatypes)
        describing every column of one table."""
        temp = self.query(
            "SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = %s;",
            (tablename,))
        colnames = list(); tabnames = list(); dbname = list(); dattype = list()
        # temp[*][1] is the database, 2 is table, 3 is colname, 7 is datatype
        for t in temp:
            colnames.append(t[3])
            dbname.append(t[1])
            tabnames.append(t[2])
            dattype.append(t[7])
        return dbname, tabnames, colnames, dattype

    # get unique values for a particular table and column
    def getSingleCol(self, tablename, colname):
        """Return the distinct values of one column, descending.

        Identifiers cannot be parameterized, so tablename/colname must be
        trusted (they come from INFORMATION_SCHEMA here).
        """
        return self.query("SELECT DISTINCT(" + colname + ") as Dat FROM " +
                          tablename + " ORDER BY Dat DESC;")

    # Query the database!
    def query(self, qstring, params=None):
        """Open a connection, execute one statement (optionally with bound
        parameters), and return all fetched rows.

        params is backward-compatible: omitted, behavior matches the old
        single-argument query().
        """
        self.__connect__()
        try:
            cursor = self.db.cursor()
            if params is None:
                cursor.execute(qstring)
            else:
                cursor.execute(qstring, params)
            temp = cursor.fetchall()
        finally:
            # Close even when execute() raises, so connections never leak
            self.__disconnect__()
        return temp
# PHENOTYPE
# -----------------------------------------------------------------------------
# A phenotype object brings together the database, tables, and builds the phenotype
class Phenotype:
def __init__(self,db,dd):
self.db = db # Database
self.dd = dd # Data dictionary
# When print phenotype object:
def __str__(self):
if self.matches:
return 'Matches: ' + str(len(self.matches)) + '\nDatabase: ' + str(self.db) + '\nData Dict: ' + str(self.dd)
else:
return 'Matches: 0\n' + 'Database: ' + str(self.db) + '\nData Dict: ' + str(self.dd)
# Search
def search(self,infile,datadicttable):
# Create search pattern
words = self.readInput(infile)
pattern = self.createRegexp(words)
# Now search data dictionary for pattern
self.dd.getDataDict(datadicttable)
# Get list of table, column id, variable type, and options
self.matches = self.matchPattern(self.dd.datadict,pattern)
print "New list of matches: found " + str(len(self.matches)) + "."
# Parse - uses list of matches from data dictionary to query database
# Subcolname should be the name of the column with subject IDs
def parse(self,subcolname,tablecolidx,quescolidx):
if not self.matches:
print "Error: please run Phenotype.search(infile) to find matches before querying!"
return
else:
self.db.__connect__() # connect to database
Metrics = dict() # This is a dictionary of metric objects, one per column match
subs = dict() # This is a dictionary of unique subjects, each is a data dictionary
metric_count = 0 # index for metrics
sub_count = 0 # index for subjects
for m in self.matches:
tname = m[tablecolidx]
cname = m[quescolidx]
found = False
for scolname in subcolname:
try:
temp = self.db.query("SELECT " + scolname + ", " + cname + " FROM " + tname + ";")
# Create new metric(table name, column name, data, type, options)
metric = Metric(m,temp,metric_count)
Metrics[cname] = metric
metric_count = metric_count+1
# Save unique subject IDS to dictionary
for s in temp:
subid = s[0]
# If the subject isn't in the list, add, add one to count
if subid not in subs.keys():
subject = Subject(subid,sub_count)
subs[subid] = subject
sub_count = sub_count + 1
found = True
except:
pass
if not found:
print "Table " + tname + " Question: " + m[1] + " not in database!"
# Save metrics and subjects
self.subjects = subs
self.metrics = Metrics
# Each subject and metric holds track of its index,
# and the length of each is the size of the final data
self.data = [[None for _ in range(len(self.metrics))] for _ in range(len(self.subjects))]
# Now, go through metrics, and add subject specific data
print "Parsing " + str(len(self.metrics)) + " metrics for " + str(len(self.subjects))
for question,m in self.metrics.iteritems():
dat = m.data
metidx = m.index # This is the metric index
for d in dat: # For each subid, value
subid = d[0]; val = d[1]
# Get subject index, add to data
subidx = self.subjects[subid].index
self.data[subidx][metidx] = val
print "Data parsing complete."
# Read input words, create regular expression
def readInput(self,infile):
words = []
filey = open(infile,'r')
for f in filey.readlines():
words.append(f.strip('\n'))
filey.close()
return words
# Create regular expression
def createRegexp(self,words):
pattern_strings = []
for w in words:
pattern_strings.append(w)
pattern_string = '|'.join(pattern_strings)
pattern = re.compile(pattern_string)
return pattern
# Find tables and columns that match our pattern
def matchPattern(self,datadict,pattern):
matches = list() # Will hold (table name, column name, variable type, options)
count = 1
for d in datadict:
# table_name, column_name, question_instruction, domain, options, scale, Mom, Dad, Proband, Siblin, Family, Other, notes, display_title, column_title, column_hint
temp = " ".join(d)
if pattern.search(temp.lower()):
matches.append(d)
count = count + 1
return matches
# Print self.data to file, as well as print a table of unique values, types, etc.
def printf(self,outfile):
nrow = len(self.data)
ncol = len(self.data[0])
print "Printing " + str(nrow) + " subjects for " + str(ncol) + " questions..."
# Get the correctly ordered list of subjects
sublist = []
suborder = []
for s,v in self.subjects.iteritems():
sublist.append(s)
suborder.append(v.index)
subs = [x for y,x in sorted(zip(suborder,sublist))]
# Get the correctly ordered list of metrics
metlist = []
metorder = []
for s,v in self.metrics.iteritems():
metlist.append(s)
metorder.append(v.index)
mets = [x for y,x in sorted(zip(metorder,metlist))]
# Print both an output data file, and dictionary file
outfdict = open(outfile + "_vars.txt",'w') # column names, ranges, type, and text
outf = open(outfile + "_data.txt",'w') # the data
# First print metric_column names and metric info to file
outf.write("subjectkey\t")
outfdict.write("\t".join(self.dd.fields) + "\n")
for m in mets:
outf.write(m + "\t")
outfdict.write("\t".join(self.metrics[m].fields) + "\n")
outf.write("\n")
# Print subid, followed by subject data
count = 0;
for r in range(nrow):
row = self.data[r]
outf.write(subs[count] + "\t" + "\t".join(str(x) for x in row) + "\n")
count = count+1
print "Done printing to file."
print "See " + outfile + "_data.txt for question data"
print "See " + outfile + "_vars.txt for data dictionary"
outfdict.close()
outf.close()
# METRIC -------------------------------------------------------------------------------
# A metric object holds results for a particular column in a table
class Metric:
    # data-dict fields, data, idx
    def __init__(self, d, data, idx):
        self.fields = d   # the full data-dictionary row for this question
        self.data = data  # (subject id, value) rows queried from the db
        self.index = idx  # column index of this metric in Phenotype.data

    def __str__(self):
        # The old __str__ referenced self.table/self.question/self.options,
        # which are never assigned anywhere, so printing a Metric always
        # raised AttributeError. Show the attributes that actually exist.
        return ("<fields:" + str(self.fields) + "><rows:" + str(len(self.data)) +
                "><index:" + str(self.index) + ">")
# SUBJECT -------------------------------------------------------------------------------
# A subject object holds subject ID and count
class Subject:
    """Pairs a subject identifier with its row index in the data matrix."""

    def __init__(self, subid, subindex):
        self.id = subid        # subject identifier as stored in the database
        self.index = subindex  # row position assigned during parsing

    def __str__(self):
        # Render as <id:...><index:...> for quick inspection (id must be a
        # string, exactly as the original concatenation required).
        return "".join(["<id:", self.id, "><index:", str(self.index), ">"])
# MAIN ----------------------------------------------------------------------------------
def usage():
    """Print the module docstring (the sample usage lives there)."""
    # Parenthesized single-argument print behaves identically under
    # Python 2 and 3; the old 'print __doc__' statement is py2-only.
    print(__doc__)
def main(args=None):
    """Entry point: this module is a library, so just show usage.

    args accepts sys.argv[1:] -- the old zero-parameter definition made
    the call below raise TypeError whenever the script was run directly.
    The arguments are currently unused.
    """
    print(__doc__)

if __name__ == "__main__":
    main(sys.argv[1:])
# clinPhenoR.R reads in clinical data extracted using clinPheno.py,
# and helps user to view questions and (manually, carefully!) recode
# This is an interactive script: it is meant to be stepped through in an R
# session, answering the readline() prompts for each question in turn.
# Read in the data dictionary
dd = read.csv("SFARI_eyegaze_vars.txt",sep="\t",header=TRUE)
# Read in the data
data = read.csv("SFARI_eyegaze_data.txt",sep="\t",header=TRUE)
# First column holds the subject IDs: use as rownames, then drop it
rownames(data) = data[,1]
data = data[,-1]
# Last column is empty, because of extra delimiter
# (fix this in export)
# NOTE(review): 1:ncol(data)-1 parses as (1:ncol(data))-1, i.e. 0:(ncol-1);
# it only works because a 0 index is silently dropped by R. 1:(ncol(data)-1)
# is what is meant -- confirm before relying on this line.
data = data[,1:ncol(data)-1]
# There should be one less column in dd, does not list subject field
# (fix this in export)
# MANUAL DIRECTION SCORING:
# Look at the options, and figure out max and min for each:
# Per-question bookkeeping vectors, one entry appended per prompt below:
# IDs to eliminate (not relevant questions)
delete = c()
# Max values
maxy = c()
# Min values
minny = c()
# Direction
direction = c()
# Recodes
recodes = c()
# NA vals
NaVals = c()
# Walk the data dictionary one question at a time, showing the question
# text and options, and asking the user how to score it.
for (o in 1:nrow(dd)) {
data_col_name = dd$name[o]
opt = as.character(dd$options[o])
ques = as.character(dd$column_title[o])
dat = data[as.character(data_col_name)]
cat("ID: ",as.character(data_col_name),"\n")
cat("?: ",ques,"\n")
cat("opt: ",opt,"\n\n")
# Ask user if we want to keep question
n = as.numeric(readline("Delete question? [0:No][1:Yes] "))
if (n == 1) {
# Deleted questions still get placeholder entries so every vector stays
# aligned with dd's rows until the -delete filtering below.
delete = c(delete,o)
recodes = c(recodes,"")
maxy = c(maxy,0)
minny = c(minny,0)
direction = c(direction,0)
} else { # if we want to keep the question
# Ask user if we want to recode question
n = as.numeric(readline("Question needs recode? (eg, if Yes/No, True/False) [0:No][1:Yes] "))
if (n == 1) {
n = readline("Enter string with Value:Recode separated by commas (eg, \"No:0,Yes:1\" ")
recodes = c(recodes,n)
} else {
recodes = c(recodes,"")
}
# Ask user for NA values
n = readline("Specify NA values, in row separated by commas: (eg, \"8,9\"")
NaVals = c(NaVals,n)
# Ask user for direction (0=normal, 1=aberrant)
n = as.numeric(readline("Question Values Scale? [0:lower=normal][1:lower=not normal]"))
direction = c(direction,n)
# Ask user for max value
n = as.numeric(readline("Max Value?"))
maxy = c(maxy,n)
# Ask user for min value
n = as.numeric(readline("Min Value?"))
minny = c(minny,n)
}
}
# First, delete questions that we don't want
# NOTE(review): if delete is empty (no question was flagged), dd[-delete,]
# evaluates to zero rows and wipes the whole table -- a future fix should
# guard these lines with if (length(delete) > 0); confirm.
dd = dd[-delete,]
test <- subset(data, select = -delete )
data = test
maxy = maxy[-delete]
minny = minny[-delete]
direction = direction[-delete]
recodes = recodes[-delete]
NaVals = NaVals[-delete]
# Manually rescoring each question
# q is set by hand to the column currently being rescored; the lines below
# are a worked example for question 15, edited per question as needed.
q = 15
d = data[,q]
# NOTE(review): max/min shadow the base R functions of the same name within
# this script -- intentional here, but avoid calling max()/min() afterwards.
max = maxy[q]
min = minny[q]
dir = direction[q]
rec = recodes[q]
na = NaVals[q]
# This will hold new data
tmp = array(dim=dim(data)[1])
cat("Max: ",max)
cat("Min: ",min)
cat("Dir: ",dir)
cat("Rec: ",rec)
cat("Na: ",na)
# Check for unique values
unique(d)
# Change Yes/No to 1/0
tmp[which(d=="yes")] = 1
tmp[which(d=="no")] = 0
# Change 2 to 1
tmp[which(d=="2")] = 1
tmp[which(d=="0")] = 0
tmp[which(d=="0")] = 0
tmp[which(d=="1")] = 1
tmp[which(d=="2")] = 2
tmp[which(d=="3")] = 3
# Also change direction
# NOTE(review): these direction assignments overwrite the mappings just
# above for the same raw values; presumably only one of the two recode
# blocks is meant to run for any given question -- confirm per question.
tmp[which(d=="0")] = 3
tmp[which(d=="1")] = 2
tmp[which(d=="2")] = 1
tmp[which(d=="3")] = 0
# Now normalize
tmp = ((tmp - min) / (max-min))
hist(tmp)
# Save
# NOTE(review): data2 is never defined in this script -- presumably it
# should be initialized as a copy of data before the rescoring loop begins;
# confirm before sourcing this file end-to-end.
data2[,q] = tmp
normdata = data2
# Save to file
save(data, normdata, dd, file=paste(getwd(),"/Simons_Eye_Data.rda",sep=''))
#!/usr/bin/python
# This script will create a quasi-data dictionary from a database,
# as a csv file for import into a database
import clinPheno as cp
# Create database object to connect to data
hostname = "hostname"
user = "root"
dbname = "databasename"
password = "password"
db = cp.Database(hostname, user, password, dbname)

# Collect per-table column metadata: parallel lists of lists, one inner
# list per table, aligned across the four outer lists.
tnames = list()
cnames = list()
dattypes = list()
dbnames = list()
for t in db.tables:
    dbname, tbname, colname, dattype = db.getTableCols(t)
    tnames.append(tbname)
    cnames.append(colname)
    dattypes.append(dattype)
    dbnames.append(dbname)

# Write the skeleton data dictionary as CSV. The with-statement guarantees
# the file is closed (and flushed) even if a row fails to format, which the
# old open()/close() pair did not.
with open("datadict.txt", 'w') as outf:
    # First print metric_column names and metric info to file
    # (header preserved verbatim; note the stray space before "Sibling")
    outf.write("table_name,column_name,question_instruction,domain,options,scale,Mom,Dad,Proband, Sibling,Family,Other,notes,display_title,column_title,column_hint\n")
    # zip pairs the per-table lists directly, instead of double-indexing
    # with range(len(...)); str() guards against non-string datatype values
    # (the old '+' concatenation raised TypeError on those).
    for tabs, cols, types in zip(tnames, cnames, dattypes):
        for tab, col, typ in zip(tabs, cols, types):
            outf.write(str(tab) + "," + str(col) + ",,,," + str(typ) + ",,,,,,,,,,\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment