Last active
December 31, 2015 07:09
-
-
Save vsoch/7952145 to your computer and use it in GitHub Desktop.
clinPheno.py connects to a database of clinical data, a data dictionary, searches the data dictionary (all fields) for strings of interest, and then queries the clinical data for all maches, and prints two text files: 1) a text file of data, with questions in columns and items in rows, and 2) a text file of data dictionary fields See script head…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
clinPheno: Methods for querying a SQL database of clinical | |
variables to establish a clinical phenotype. The script will | |
search a data dictionary (database) for a set of user specified | |
terms (in some input.txt file), and then extract the matches | |
from the clinical database, for import into R (see clinPhenoR.R) | |
# SAMPLE INPUT.TXT file: | |
# One search term per line, capitalization not important | |
eye | |
gaze | |
# SAMPLE USAGE: | |
import clinPheno as cp | |
# Create database object for clinical data | |
hostname = "my.database.com" | |
user = "user" | |
dbname = "dbname" | |
password = "password" | |
db = cp.Database(hostname,user,password,dbname) | |
# Create database object for data dictionary | |
hostname = "my.data.dictionary" | |
user = "user" | |
dbname = "dbname" | |
password = "password" | |
dd = cp.Database(hostname,user,password,dbname) | |
# Create Phenotype object from the databases | |
pheno = cp.Phenotype(db,dd) | |
# This is the table name for the data dictionary | |
ddTableName = "data_dictionary" | |
# Now, search the data dictionary using our input file | |
inputfile = "input.txt"; | |
pheno.search(inputfile,ddTableName) | |
# Parse the data - you can specify multiple subject ID columns, and | |
# each will be tried. Be sure that no table has both! | |
subcolname = ["individual_id","subject_id"] | |
tableNameidx = 0 # col index with table names in data dictionary | |
quesNameidx = 1 # col index with column (field) names | |
pheno.parse(subcolname,tableNameidx,quesNameidx) | |
# Print results to file | |
pheno.printf("SFARI_eyegaze") | |
""" | |
__author__ = "Vanessa Sochat (vsochat@stanford.edu)" | |
__version__ = "$Revision: 1.0 $" | |
__date__ = "$Date: 2013/12/10 $" | |
__license__ = "Python" | |
import os | |
import sys | |
import re | |
import MySQLdb | |
# DATABASE ----------------------------------------------------------------------------------
# A database object manages connections to the database
class Database:
    """Wraps one MySQL schema: credentials, table list, and one-shot queries.

    Every call to query() opens a fresh connection and closes it before
    returning, so instances can safely be kept around between queries.
    """

    def __init__(self, host, user, pw, name):
        self.host = host
        self.user = user
        self.password = pw
        self.dbname = name
        # Cache the table list up front (this also validates the credentials).
        self.tables = self.getTables()

    # When user prints object, show all fields
    def __str__(self):
        return "<host:" + self.host + ">" + "<user:" + self.user + ">" + "<pw:" + self.password + ">" + "<name:" + self.dbname + ">" + "<tables:" + str(len(self.tables)) + ">"

    # Connect to database; the open connection is stored on self.db
    def __connect__(self):
        try:
            self.db = MySQLdb.connect(self.host, self.user, self.password, self.dbname)
        except MySQLdb.Error as err:
            # BUG FIX: the original bare "except:" hid the real error (and
            # would even swallow KeyboardInterrupt); report it before exiting.
            print("Could not connect to database. Check credentials and try again.")
            print(err)
            sys.exit(1)

    # Get list of tables belonging to this schema
    def getTables(self):
        temp = self.query("SELECT * FROM INFORMATION_SCHEMA.TABLES;")
        tables = []
        for t in temp:
            # t[1] is TABLE_SCHEMA: keep only tables in the db of interest
            if t[1] == self.dbname:
                tables.append(t[2])
        return tables

    # Close connection to database
    def __disconnect__(self):
        self.db.close()

    # Get data dictionary: stores the dictionary table's column names in
    # self.fields and all of its rows in self.datadict
    def getDataDict(self, dicttablename):
        # BUG FIX: the original called self.__connect__() here even though
        # query() manages its own connection, leaking one open connection.
        # NOTE(review): table name is interpolated directly into SQL -- only
        # pass trusted, internally-generated table names.
        temp = self.query("Select * FROM INFORMATION_SCHEMA.COLUMNS where table_name = \"" + dicttablename + "\";")
        self.fields = []
        for t in temp:
            self.fields.append(t[3])  # t[3] is COLUMN_NAME
        # We can't be certain about data dictionary format, so get all data fields
        self.datadict = self.query("SELECT * FROM " + dicttablename + ";")

    # get all column information from a table, as parallel lists:
    # (database names, table names, column names, data types)
    def getTableCols(self, tablename):
        temp = self.query("Select * FROM INFORMATION_SCHEMA.COLUMNS where table_name = \"" + tablename + "\";")
        colnames = list()
        tabnames = list()
        dbname = list()
        dattype = list()
        # temp[*][1] is the database, 2 is table, 3 is colname, 7 is datatype
        for t in temp:
            colnames.append(t[3])
            dbname.append(t[1])
            tabnames.append(t[2])
            dattype.append(t[7])
        return dbname, tabnames, colnames, dattype

    # get unique values for a particular table and column
    def getSingleCol(self, tablename, colname):
        return self.query("SELECT DISTINCT(" + colname + ") as Dat FROM " + tablename + " ORDER BY Dat DESC;")

    # Query the database: open a connection, run qstring, return all rows
    def query(self, qstring):
        self.__connect__()
        try:
            cursor = self.db.cursor()
            cursor.execute(qstring)
            return cursor.fetchall()
        finally:
            # BUG FIX: always close the connection, even if execute() raises.
            self.__disconnect__()
# PHENOTYPE
# -----------------------------------------------------------------------------
# A phenotype object brings together the database, tables, and builds the phenotype
class Phenotype:
    """Searches a data dictionary for terms and extracts the matching columns.

    Typical flow: search() -> parse() -> printf().  See the module docstring
    for a complete usage example.
    """

    def __init__(self, db, dd):
        self.db = db  # Database with the clinical data
        self.dd = dd  # Database with the data dictionary

    # When print phenotype object:
    def __str__(self):
        # BUG FIX: self.matches does not exist until search() has run; use
        # getattr so printing a fresh object does not raise AttributeError.
        matches = getattr(self, "matches", None)
        if matches:
            return 'Matches: ' + str(len(matches)) + '\nDatabase: ' + str(self.db) + '\nData Dict: ' + str(self.dd)
        return 'Matches: 0\n' + 'Database: ' + str(self.db) + '\nData Dict: ' + str(self.dd)

    # Search the data dictionary for the terms listed in infile (one per line)
    def search(self, infile, datadicttable):
        # Create search pattern
        words = self.readInput(infile)
        pattern = self.createRegexp(words)
        # Now search data dictionary for pattern
        self.dd.getDataDict(datadicttable)
        # Keep the full data-dictionary row for every match
        self.matches = self.matchPattern(self.dd.datadict, pattern)
        print("New list of matches: found " + str(len(self.matches)) + ".")

    # Parse - uses list of matches from data dictionary to query database
    def parse(self, subcolname, tablecolidx, quescolidx):
        """Query the clinical database for every match found by search().

        subcolname is a list of candidate subject-ID column names; each is
        tried per table.  Be sure that no table has more than one of them!
        tablecolidx / quescolidx are the data-dictionary column indices
        holding the table name and the question (field) name.
        """
        # BUG FIX: original tested self.matches, crashing with AttributeError
        # when search() had never run; also removed the dangling
        # self.db.__connect__() -- query() manages its own connections.
        if not getattr(self, "matches", None):
            print("Error: please run Phenotype.search(infile) to find matches before querying!")
            return
        Metrics = dict()  # one Metric object per matched column
        subs = dict()     # unique subjects keyed by subject id
        metric_count = 0  # next column index in the output matrix
        sub_count = 0     # next row index in the output matrix
        for m in self.matches:
            tname = m[tablecolidx]
            cname = m[quescolidx]
            found = False
            for scolname in subcolname:
                # BUG FIX: the try only wraps the query now; the original
                # bare "except: pass" also swallowed errors from the metric
                # and subject bookkeeping below.
                try:
                    temp = self.db.query("SELECT " + scolname + ", " + cname + " FROM " + tname + ";")
                except Exception:
                    # This table may not have this subject column; try the next.
                    continue
                # Create new metric(data-dict row, data, index)
                Metrics[cname] = Metric(m, temp, metric_count)
                metric_count = metric_count + 1
                # Save unique subject IDs to dictionary
                for s in temp:
                    subid = s[0]
                    if subid not in subs:
                        subs[subid] = Subject(subid, sub_count)
                        sub_count = sub_count + 1
                found = True
            if not found:
                # BUG FIX: use quescolidx rather than the hardcoded m[1]
                print("Table " + tname + " Question: " + str(m[quescolidx]) + " not in database!")
        # Save metrics and subjects; each holds its own row/column index,
        # which sizes the final data matrix
        self.subjects = subs
        self.metrics = Metrics
        self.data = [[None for _ in range(len(self.metrics))] for _ in range(len(self.subjects))]
        # Now, go through metrics, and add subject specific data
        print("Parsing " + str(len(self.metrics)) + " metrics for " + str(len(self.subjects)))
        for question, m in self.metrics.items():
            metidx = m.index  # column index for this metric
            for d in m.data:  # each d is a (subject id, value) row
                subidx = self.subjects[d[0]].index
                self.data[subidx][metidx] = d[1]
        print("Data parsing complete.")

    # Read input words (search terms), one per line
    def readInput(self, infile):
        # BUG FIX: use a context manager so the file is closed on error too.
        with open(infile, 'r') as filey:
            return [line.strip('\n') for line in filey]

    # Create regular expression: one alternation of all the terms.
    # NOTE(review): terms are used as raw regex fragments (not escaped);
    # special characters in input.txt will be interpreted as regex syntax.
    def createRegexp(self, words):
        return re.compile('|'.join(words))

    # Find data-dictionary rows whose joined (lowercased) text matches pattern
    def matchPattern(self, datadict, pattern):
        matches = list()  # will hold full data-dictionary rows
        for d in datadict:
            # fields: table_name, column_name, question_instruction, ..., column_hint
            if pattern.search(" ".join(d).lower()):
                matches.append(d)
        return matches

    # Print self.data to file, as well as a table of the matched variables
    def printf(self, outfile):
        """Write <outfile>_data.txt (subjects x questions, tab-delimited)
        and <outfile>_vars.txt (the matched data-dictionary rows)."""
        # BUG FIX: guard against printf() before parse() produced any data.
        if not getattr(self, "data", None):
            print("No data to print - run search() and parse() first.")
            return
        nrow = len(self.data)
        ncol = len(self.data[0])
        print("Printing " + str(nrow) + " subjects for " + str(ncol) + " questions...")
        # Order subjects and metrics by their stored matrix indices
        subs = [name for _, name in sorted((v.index, s) for s, v in self.subjects.items())]
        mets = [name for _, name in sorted((v.index, m) for m, v in self.metrics.items())]
        with open(outfile + "_vars.txt", 'w') as outfdict, open(outfile + "_data.txt", 'w') as outf:
            # BUG FIX: join columns with tabs instead of appending a tab after
            # every field -- the old format produced a trailing empty column
            # that the downstream R reader had to strip ("fix this in export").
            outf.write("\t".join(["subjectkey"] + [str(m) for m in mets]) + "\n")
            outfdict.write("\t".join(self.dd.fields) + "\n")
            for m in mets:
                outfdict.write("\t".join(str(f) for f in self.metrics[m].fields) + "\n")
            for r in range(nrow):
                outf.write(str(subs[r]) + "\t" + "\t".join(str(x) for x in self.data[r]) + "\n")
        print("Done printing to file.")
        print("See " + outfile + "_data.txt for question data")
        print("See " + outfile + "_vars.txt for data dictionary")
# METRIC -------------------------------------------------------------------------------
# A metric object holds results for a particular column in a table
class Metric:
    """One matched question: its data-dictionary row, raw data, and column index."""

    def __init__(self, d, data, idx):
        self.fields = d    # full data-dictionary row describing this column
        self.data = data   # (subject_id, value) rows returned from the database
        self.index = idx   # column position in Phenotype.data

    def __str__(self):
        # BUG FIX: the original referenced self.table / self.question /
        # self.options, none of which are ever set, so printing a Metric
        # always raised AttributeError.
        return "<fields:" + str(self.fields) + "><index:" + str(self.index) + ">"
# SUBJECT -------------------------------------------------------------------------------
# A subject object holds subject ID and count
class Subject:
    """Pairs a subject identifier with its row index in the data matrix."""

    def __init__(self, subid, subindex):
        self.id = subid        # subject identifier as stored in the database
        self.index = subindex  # row index into Phenotype.data

    def __str__(self):
        # BUG FIX: coerce the id to str; integer IDs (common for database
        # keys) made the original "+" concatenation raise TypeError.
        return "<id:" + str(self.id) + "><index:" + str(self.index) + ">"
# MAIN ----------------------------------------------------------------------------------
def usage():
    """Print the module documentation.

    BUG FIX: the Python-2-only statement "print __doc__" is a syntax error
    under Python 3; the function form behaves identically on both.
    """
    print(__doc__)
def main(argv=None):
    """Command-line entry point: print the module documentation.

    BUG FIX: main() took no parameters but was invoked below as
    main(sys.argv[1:]), which raised TypeError on every run; argv is now
    accepted (currently unused) with a default for backward compatibility.
    """
    print(__doc__)

if __name__ == "__main__":
    main(sys.argv[1:])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# clinPhenoR.R reads in clinical data extracted using clinPheno.py,
# and helps user to view questions and (manually, carefully!) recode

# Read in the data dictionary (tab-delimited, written by Phenotype.printf)
dd = read.csv("SFARI_eyegaze_vars.txt",sep="\t",header=TRUE)
# Read in the data; the first column holds the subject IDs
data = read.csv("SFARI_eyegaze_data.txt",sep="\t",header=TRUE)
rownames(data) = data[,1]
data = data[,-1]
# Last column is empty, because of extra delimiter
# (fix this in export)
# BUG FIX: "1:ncol(data)-1" parses as (1:ncol(data)) - 1, i.e. 0:(ncol-1);
# it only did the right thing because R silently ignores a 0 index.
# Parenthesize so the code says what it means.
data = data[,1:(ncol(data)-1)]
# There should be one less column in dd, does not list subject field
# (fix this in export)

# MANUAL DIRECTION SCORING:
# Look at the options, and figure out max and min for each:
delete = c()      # question indices to drop (not relevant questions)
maxy = c()        # max value per question
minny = c()       # min value per question
direction = c()   # 0 = lower is normal, 1 = lower is not normal
recodes = c()     # "Value:Recode" strings per question
NaVals = c()      # values to treat as NA per question
# Interactively review every data-dictionary row: show the question and its
# options, then record delete / recode / NA / direction / max / min choices.
# Appends one entry per question to each of the scoring vectors above.
for (o in 1:nrow(dd)) {
# NOTE(review): assumes dd has a "name" column holding the data column id --
# confirm against the exported header (the exporter writes "column_name").
data_col_name = dd$name[o]
opt = as.character(dd$options[o])
ques = as.character(dd$column_title[o])
dat = data[as.character(data_col_name)]
cat("ID: ",as.character(data_col_name),"\n")
cat("?: ",ques,"\n")
cat("opt: ",opt,"\n\n")
# Ask user if we want to keep question
n = as.numeric(readline("Delete question? [0:No][1:Yes] "))
if (n == 1) {
delete = c(delete,o)
recodes = c(recodes,"")
maxy = c(maxy,0)
minny = c(minny,0)
direction = c(direction,0)
# BUG FIX: NaVals got no placeholder for deleted questions, so it fell out
# of alignment with dd rows and NaVals[-delete] later removed the wrong
# entries; keep every per-question vector the same length.
NaVals = c(NaVals,"")
} else { # if we want to keep the question
# Ask user if we want to recode question
n = as.numeric(readline("Question needs recode? (eg, if Yes/No, True/False) [0:No][1:Yes] "))
if (n == 1) {
n = readline("Enter string with Value:Recode separated by commas (eg, \"No:0,Yes:1\" ")
recodes = c(recodes,n)
} else {
recodes = c(recodes,"")
}
# Ask user for NA values
n = readline("Specify NA values, in row separated by commas: (eg, \"8,9\"")
NaVals = c(NaVals,n)
# Ask user for direction (0=normal, 1=aberrant)
n = as.numeric(readline("Question Values Scale? [0:lower=normal][1:lower=not normal]"))
direction = c(direction,n)
# Ask user for max value
n = as.numeric(readline("Max Value?"))
maxy = c(maxy,n)
# Ask user for min value
n = as.numeric(readline("Min Value?"))
minny = c(minny,n)
}
}
# First, delete questions that we don't want
# BUG FIX: when no questions were marked for deletion, "delete" is NULL and
# x[-delete] / subset(select = -delete) either errors or selects ZERO rows,
# wiping the data; only apply the removals when there is something to remove.
if (length(delete) > 0) {
  dd = dd[-delete,]
  data = subset(data, select = -delete)
  maxy = maxy[-delete]
  minny = minny[-delete]
  direction = direction[-delete]
  recodes = recodes[-delete]
  NaVals = NaVals[-delete]
}
# Manually rescoring each question: set q to the question (column) index,
# then run this block, keeping only the recode lines that apply.
q = 15
d = data[,q]
# NOTE(review): "max"/"min" shadow the builtin functions within this script.
max = maxy[q]
min = minny[q]
dir = direction[q]
rec = recodes[q]
na = NaVals[q]
# This will hold new data for the current question
tmp = array(dim=dim(data)[1])
cat("Max: ",max)
cat("Min: ",min)
cat("Dir: ",dir)
cat("Rec: ",rec)
cat("Na: ",na)
# Check for unique values
unique(d)
# NOTE(review): the assignments below are recode TEMPLATES -- later lines
# overwrite earlier ones for the same value (e.g. d=="2" is set three
# times); keep only the block matching the current question's coding.
# Change Yes/No to 1/0
tmp[which(d=="yes")] = 1
tmp[which(d=="no")] = 0
# Change 2 to 1
tmp[which(d=="2")] = 1
tmp[which(d=="0")] = 0
tmp[which(d=="0")] = 0
tmp[which(d=="1")] = 1
tmp[which(d=="2")] = 2
tmp[which(d=="3")] = 3
# Also change direction
tmp[which(d=="0")] = 3
tmp[which(d=="1")] = 2
tmp[which(d=="2")] = 1
tmp[which(d=="3")] = 0
# Now normalize to [0,1]
tmp = ((tmp - min) / (max-min))
hist(tmp)
# Save
# BUG FIX: data2 was assigned into before it ever existed, failing with
# "object 'data2' not found"; initialize it from the cleaned data once.
if (!exists("data2")) data2 = data
data2[,q] = tmp
normdata = data2
# Save to file
save(data, normdata, dd, file=paste(getwd(),"/Simons_Eye_Data.rda",sep=''))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# This script will create a quasi-data dictionary from a database,
# as a csv file for import into a database
import clinPheno as cp

# Connection settings for the clinical database
hostname = "hostname"
user = "root"
dbname = "databasename"
password = "password"
db = cp.Database(hostname, user, password, dbname)

# Column metadata, one entry per table; each entry is a list of values
# for that table's columns
tnames = list()
cnames = list()
dattypes = list()
dbnames = list()
for table in db.tables:
    db_name, tab_name, col_name, dat_type = db.getTableCols(table)
    tnames.append(tab_name)
    cnames.append(col_name)
    dattypes.append(dat_type)
    dbnames.append(db_name)

# Write the quasi data dictionary as a comma-separated file
outf = open("datadict.txt", 'w')
# Header row: the data-dictionary fields expected by clinPheno
outf.write("table_name,column_name,question_instruction,domain,options,scale,Mom,Dad,Proband, Sibling,Family,Other,notes,display_title,column_title,column_hint\n")
# One row per column: table, column, three blanks, datatype, ten blanks
for t in range(len(tnames)):
    for c in range(len(tnames[t])):
        row = tnames[t][c] + "," + cnames[t][c] + ",,,," + dattypes[t][c] + ",,,,,,,,,,\n"
        outf.write(row)
outf.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment