Skip to content

Instantly share code, notes, and snippets.

@vsoch
Last active December 31, 2015 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsoch/7952145 to your computer and use it in GitHub Desktop.
Save vsoch/7952145 to your computer and use it in GitHub Desktop.
clinPheno.py connects to a database of clinical data and a data dictionary, searches the data dictionary (all fields) for strings of interest, then queries the clinical data for all matches, and prints two text files: 1) a text file of data, with questions in columns and items in rows, and 2) a text file of data dictionary fields. See the script header for details.
#!/usr/bin/python
"""
clinPheno: Methods for querying a SQL database of clinical
variables to establish a clinical phenotype. The script will
search a data dictionary (database) for a set of user specified
terms (in some input.txt file), and then extract the matches
from the clinical database, for import into R (see clinPhenoR.R)
# SAMPLE INPUT.TXT file:
# One search term per line, capitalization not important
eye
gaze
# SAMPLE USAGE:
import clinPheno as cp
# Create database object for clinical data
hostname = "my.database.com"
user = "user"
dbname = "dbname"
password = "password"
db = cp.Database(hostname,user,password,dbname)
# Create database object for data dictionary
hostname = "my.data.dictionary"
user = "user"
dbname = "dbname"
password = "password"
dd = cp.Database(hostname,user,password,dbname)
# Create Phenotype object from the databases
pheno = cp.Phenotype(db,dd)
# This is the table name for the data dictionary
ddTableName = "data_dictionary"
# Now, search the data dictionary using our input file
inputfile = "input.txt";
pheno.search(inputfile,ddTableName)
# Parse the data - you can specify multiple subject ID columns, and
# each will be tried. Be sure that no table has both!
subcolname = ["individual_id","subject_id"]
tableNameidx = 0 # col index with table names in data dictionary
quesNameidx = 1 # col index with column (field) names
pheno.parse(subcolname,tableNameidx,quesNameidx)
# Print results to file
pheno.printf("SFARI_eyegaze")
"""
__author__ = "Vanessa Sochat (vsochat@stanford.edu)"
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2013/12/10 $"
__license__ = "Python"
import os
import sys
import re
import MySQLdb
# DATABASE ----------------------------------------------------------------------------------
# A database object manages connections to the database
class Database:
    """Manages connections to a MySQL database and simple metadata queries.

    Each call to query() opens a fresh connection, runs one statement,
    fetches all rows, and closes the connection again, so no long-lived
    handle is kept between calls.
    """

    def __init__(self, host, user, pw, name):
        self.host = host      # database server hostname
        self.user = user      # login user
        self.password = pw    # login password
        self.dbname = name    # database (schema) name
        # Eagerly enumerate the tables in this schema (also validates creds)
        self.tables = self.getTables()

    # When user prints object, show all fields
    def __str__(self):
        return ("<host:" + self.host + ">" + "<user:" + self.user + ">" +
                "<pw:" + self.password + ">" + "<name:" + self.dbname + ">" +
                "<tables:" + str(len(self.tables)) + ">")

    # Connect to database
    def __connect__(self):
        """Open a connection; on failure print the reason and exit."""
        try:
            self.db = MySQLdb.connect(self.host, self.user, self.password, self.dbname)
        except MySQLdb.Error as e:
            # Catch only MySQL errors (the old bare except also hid plain
            # coding mistakes) and report why the connection failed.
            print("Could not connect to database (" + str(e) + "). Check credentials and try again.")
            sys.exit()

    # Get list of tables
    def getTables(self):
        """Return the names of all tables belonging to self.dbname."""
        temp = self.query("SELECT * FROM INFORMATION_SCHEMA.TABLES;")
        # row[1] is TABLE_SCHEMA, row[2] is TABLE_NAME
        return [t[2] for t in temp if t[1] == self.dbname]

    # Close connection to database
    def __disconnect__(self):
        self.db.close()

    # Get data dictionary
    def getDataDict(self, dicttablename):
        """Cache the data dictionary's column names (self.fields) and all of
        its rows (self.datadict).

        Note: the old version called __connect__() here as well, leaking a
        connection, because query() opens (and closes) its own.
        """
        # Parameterized value instead of string-built SQL
        temp = self.query(
            "SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = %s;",
            (dicttablename,))
        self.fields = [t[3] for t in temp]  # t[3] is COLUMN_NAME
        # We can't be certain about data dictionary format, so get all data fields.
        # Identifier names cannot be bound as parameters; dicttablename must
        # come from trusted code, never user input.
        self.datadict = self.query("SELECT * FROM " + dicttablename + ";")

    # get all column information from a table with unique values, etc.
    def getTableCols(self, tablename):
        """Return parallel lists (dbnames, tablenames, colnames, datatypes)
        describing every column of one table."""
        temp = self.query(
            "SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = %s;",
            (tablename,))
        colnames = list(); tabnames = list(); dbname = list(); dattype = list()
        # temp[*][1] is the database, 2 is table, 3 is colname, 7 is datatype
        for t in temp:
            colnames.append(t[3])
            dbname.append(t[1])
            tabnames.append(t[2])
            dattype.append(t[7])
        return dbname, tabnames, colnames, dattype

    # get unique values for a particular table and column
    def getSingleCol(self, tablename, colname):
        """Return the distinct values of one column, descending.

        Identifiers cannot be parameterized, so tablename/colname must be
        trusted (they come from INFORMATION_SCHEMA here).
        """
        return self.query("SELECT DISTINCT(" + colname + ") as Dat FROM " +
                          tablename + " ORDER BY Dat DESC;")

    # Query the database!
    def query(self, qstring, params=None):
        """Open a connection, execute one statement (optionally with bound
        parameters), and return all fetched rows.

        params is backward-compatible: omitted, behavior matches the old
        single-argument query().
        """
        self.__connect__()
        try:
            cursor = self.db.cursor()
            if params is None:
                cursor.execute(qstring)
            else:
                cursor.execute(qstring, params)
            temp = cursor.fetchall()
        finally:
            # Close even when execute() raises, so connections never leak
            self.__disconnect__()
        return temp
# PHENOTYPE
# -----------------------------------------------------------------------------
# A phenotype object brings together the database, tables, and builds the phenotype
class Phenotype:
def __init__(self,db,dd):
self.db = db # Database
self.dd = dd # Data dictionary
# When print phenotype object:
def __str__(self):
if self.matches:
return 'Matches: ' + str(len(self.matches)) + '\nDatabase: ' + str(self.db) + '\nData Dict: ' + str(self.dd)
else:
return 'Matches: 0\n' + 'Database: ' + str(self.db) + '\nData Dict: ' + str(self.dd)
# Search
def search(self,infile,datadicttable):
# Create search pattern
words = self.readInput(infile)
pattern = self.createRegexp(words)
# Now search data dictionary for pattern
self.dd.getDataDict(datadicttable)
# Get list of table, column id, variable type, and options
self.matches = self.matchPattern(self.dd.datadict,pattern)
print "New list of matches: found " + str(len(self.matches)) + "."
# Parse - uses list of matches from data dictionary to query database
# Subcolname should be the name of the column with subject IDs
def parse(self,subcolname,tablecolidx,quescolidx):
if not self.matches:
print "Error: please run Phenotype.search(infile) to find matches before querying!"
return
else:
self.db.__connect__() # connect to database
Metrics = dict() # This is a dictionary of metric objects, one per column match
subs = dict() # This is a dictionary of unique subjects, each is a data dictionary
metric_count = 0 # index for metrics
sub_count = 0 # index for subjects
for m in self.matches:
tname = m[tablecolidx]
cname = m[quescolidx]
found = False
for scolname in subcolname:
try:
temp = self.db.query("SELECT " + scolname + ", " + cname + " FROM " + tname + ";")
# Create new metric(table name, column name, data, type, options)
metric = Metric(m,temp,metric_count)
Metrics[cname] = metric
metric_count = metric_count+1
# Save unique subject IDS to dictionary
for s in temp:
subid = s[0]
# If the subject isn't in the list, add, add one to count
if subid not in subs.keys():
subject = Subject(subid,sub_count)
subs[subid] = subject
sub_count = sub_count + 1
found = True
except:
pass
if not found:
print "Table " + tname + " Question: " + m[1] + " not in database!"
# Save metrics and subjects
self.subjects = subs
self.metrics = Metrics
# Each subject and metric holds track of its index,
# and the length of each is the size of the final data
self.data = [[None for _ in range(len(self.metrics))] for _ in range(len(self.subjects))]
# Now, go through metrics, and add subject specific data
print "Parsing " + str(len(self.metrics)) + " metrics for " + str(len(self.subjects))
for question,m in self.metrics.iteritems():
dat = m.data
metidx = m.index # This is the metric index
for d in dat: # For each subid, value
subid = d[0]; val = d[1]
# Get subject index, add to data
subidx = self.subjects[subid].index
self.data[subidx][metidx] = val
print "Data parsing complete."
# Read input words, create regular expression
def readInput(self,infile):
words = []
filey = open(infile,'r')
for f in filey.readlines():
words.append(f.strip('\n'))
filey.close()
return words
# Create regular expression
def createRegexp(self,words):
pattern_strings = []
for w in words:
pattern_strings.append(w)
pattern_string = '|'.join(pattern_strings)
pattern = re.compile(pattern_string)
return pattern
# Find tables and columns that match our pattern
def matchPattern(self,datadict,pattern):
matches = list() # Will hold (table name, column name, variable type, options)
count = 1
for d in datadict:
# table_name, column_name, question_instruction, domain, options, scale, Mom, Dad, Proband, Siblin, Family, Other, notes, display_title, column_title, column_hint
temp = " ".join(d)
if pattern.search(temp.lower()):
matches.append(d)
count = count + 1
return matches
# Print self.data to file, as well as print a table of unique values, types, etc.
def printf(self,outfile):
nrow = len(self.data)
ncol = len(self.data[0])
print "Printing " + str(nrow) + " subjects for " + str(ncol) + " questions..."
# Get the correctly ordered list of subjects
sublist = []
suborder = []
for s,v in self.subjects.iteritems():
sublist.append(s)
suborder.append(v.index)
subs = [x for y,x in sorted(zip(suborder,sublist))]
# Get the correctly ordered list of metrics
metlist = []
metorder = []
for s,v in self.metrics.iteritems():
metlist.append(s)
metorder.append(v.index)
mets = [x for y,x in sorted(zip(metorder,metlist))]
# Print both an output data file, and dictionary file
outfdict = open(outfile + "_vars.txt",'w') # column names, ranges, type, and text
outf = open(outfile + "_data.txt",'w') # the data
# First print metric_column names and metric info to file
outf.write("subjectkey\t")
outfdict.write("\t".join(self.dd.fields) + "\n")
for m in mets:
outf.write(m + "\t")
outfdict.write("\t".join(self.metrics[m].fields) + "\n")
outf.write("\n")
# Print subid, followed by subject data
count = 0;
for r in range(nrow):
row = self.data[r]
outf.write(subs[count] + "\t" + "\t".join(str(x) for x in row) + "\n")
count = count+1
print "Done printing to file."
print "See " + outfile + "_data.txt for question data"
print "See " + outfile + "_vars.txt for data dictionary"
outfdict.close()
outf.close()
# METRIC -------------------------------------------------------------------------------
# A metric object holds results for a particular column in a table
class Metric:
    # data-dict fields, data, idx
    def __init__(self, d, data, idx):
        self.fields = d   # the full data-dictionary row for this question
        self.data = data  # (subject id, value) rows queried from the db
        self.index = idx  # column index of this metric in Phenotype.data

    def __str__(self):
        # The old __str__ referenced self.table/self.question/self.options,
        # which are never assigned anywhere, so printing a Metric always
        # raised AttributeError. Show the attributes that actually exist.
        return ("<fields:" + str(self.fields) + "><rows:" + str(len(self.data)) +
                "><index:" + str(self.index) + ">")
# SUBJECT -------------------------------------------------------------------------------
# A subject object holds subject ID and count
class Subject:
    """Pairs a subject identifier with its row index in the data matrix."""

    def __init__(self, subid, subindex):
        self.id = subid        # subject identifier as stored in the database
        self.index = subindex  # row position assigned during parsing

    def __str__(self):
        # Render as <id:...><index:...> for quick inspection (id must be a
        # string, exactly as the original concatenation required).
        return "".join(["<id:", self.id, "><index:", str(self.index), ">"])
# MAIN ----------------------------------------------------------------------------------
def usage():
    """Print the module docstring (the sample usage lives there)."""
    # Parenthesized single-argument print behaves identically under
    # Python 2 and 3; the old 'print __doc__' statement is py2-only.
    print(__doc__)
def main(args=None):
    """Entry point: this module is a library, so just show usage.

    args accepts sys.argv[1:] -- the old zero-parameter definition made
    the call below raise TypeError whenever the script was run directly.
    The arguments are currently unused.
    """
    print(__doc__)

if __name__ == "__main__":
    main(sys.argv[1:])
# clinPhenoR.R reads in clinical data extracted using clinPheno.py,
# and helps user to view questions and (manually, carefully!) recode
# This is an interactive script: it is meant to be stepped through in an R
# session, answering the readline() prompts for each question in turn.
# Read in the data dictionary
dd = read.csv("SFARI_eyegaze_vars.txt",sep="\t",header=TRUE)
# Read in the data
data = read.csv("SFARI_eyegaze_data.txt",sep="\t",header=TRUE)
# First column holds the subject IDs: use as rownames, then drop it
rownames(data) = data[,1]
data = data[,-1]
# Last column is empty, because of extra delimiter
# (fix this in export)
# NOTE(review): 1:ncol(data)-1 parses as (1:ncol(data))-1, i.e. 0:(ncol-1);
# it only works because a 0 index is silently dropped by R. 1:(ncol(data)-1)
# is what is meant -- confirm before relying on this line.
data = data[,1:ncol(data)-1]
# There should be one less column in dd, does not list subject field
# (fix this in export)
# MANUAL DIRECTION SCORING:
# Look at the options, and figure out max and min for each:
# Per-question bookkeeping vectors, one entry appended per prompt below:
# IDs to eliminate (not relevant questions)
delete = c()
# Max values
maxy = c()
# Min values
minny = c()
# Direction
direction = c()
# Recodes
recodes = c()
# NA vals
NaVals = c()
# Walk the data dictionary one question at a time, showing the question
# text and options, and asking the user how to score it.
for (o in 1:nrow(dd)) {
data_col_name = dd$name[o]
opt = as.character(dd$options[o])
ques = as.character(dd$column_title[o])
dat = data[as.character(data_col_name)]
cat("ID: ",as.character(data_col_name),"\n")
cat("?: ",ques,"\n")
cat("opt: ",opt,"\n\n")
# Ask user if we want to keep question
n = as.numeric(readline("Delete question? [0:No][1:Yes] "))
if (n == 1) {
# Deleted questions still get placeholder entries so every vector stays
# aligned with dd's rows until the -delete filtering below.
delete = c(delete,o)
recodes = c(recodes,"")
maxy = c(maxy,0)
minny = c(minny,0)
direction = c(direction,0)
} else { # if we want to keep the question
# Ask user if we want to recode question
n = as.numeric(readline("Question needs recode? (eg, if Yes/No, True/False) [0:No][1:Yes] "))
if (n == 1) {
n = readline("Enter string with Value:Recode separated by commas (eg, \"No:0,Yes:1\" ")
recodes = c(recodes,n)
} else {
recodes = c(recodes,"")
}
# Ask user for NA values
n = readline("Specify NA values, in row separated by commas: (eg, \"8,9\"")
NaVals = c(NaVals,n)
# Ask user for direction (0=normal, 1=aberrant)
n = as.numeric(readline("Question Values Scale? [0:lower=normal][1:lower=not normal]"))
direction = c(direction,n)
# Ask user for max value
n = as.numeric(readline("Max Value?"))
maxy = c(maxy,n)
# Ask user for min value
n = as.numeric(readline("Min Value?"))
minny = c(minny,n)
}
}
# First, delete questions that we don't want
# NOTE(review): if delete is empty (no question was flagged), dd[-delete,]
# evaluates to zero rows and wipes the whole table -- a future fix should
# guard these lines with if (length(delete) > 0); confirm.
dd = dd[-delete,]
test <- subset(data, select = -delete )
data = test
maxy = maxy[-delete]
minny = minny[-delete]
direction = direction[-delete]
recodes = recodes[-delete]
NaVals = NaVals[-delete]
# Manually rescoring each question
# q is set by hand to the column currently being rescored; the lines below
# are a worked example for question 15, edited per question as needed.
q = 15
d = data[,q]
# NOTE(review): max/min shadow the base R functions of the same name within
# this script -- intentional here, but avoid calling max()/min() afterwards.
max = maxy[q]
min = minny[q]
dir = direction[q]
rec = recodes[q]
na = NaVals[q]
# This will hold new data
tmp = array(dim=dim(data)[1])
cat("Max: ",max)
cat("Min: ",min)
cat("Dir: ",dir)
cat("Rec: ",rec)
cat("Na: ",na)
# Check for unique values
unique(d)
# Change Yes/No to 1/0
tmp[which(d=="yes")] = 1
tmp[which(d=="no")] = 0
# Change 2 to 1
tmp[which(d=="2")] = 1
tmp[which(d=="0")] = 0
tmp[which(d=="0")] = 0
tmp[which(d=="1")] = 1
tmp[which(d=="2")] = 2
tmp[which(d=="3")] = 3
# Also change direction
# NOTE(review): these direction assignments overwrite the mappings just
# above for the same raw values; presumably only one of the two recode
# blocks is meant to run for any given question -- confirm per question.
tmp[which(d=="0")] = 3
tmp[which(d=="1")] = 2
tmp[which(d=="2")] = 1
tmp[which(d=="3")] = 0
# Now normalize
tmp = ((tmp - min) / (max-min))
hist(tmp)
# Save
# NOTE(review): data2 is never defined in this script -- presumably it
# should be initialized as a copy of data before the rescoring loop begins;
# confirm before sourcing this file end-to-end.
data2[,q] = tmp
normdata = data2
# Save to file
save(data, normdata, dd, file=paste(getwd(),"/Simons_Eye_Data.rda",sep=''))
#!/usr/bin/python
# This script will create a quasi-data dictionary from a database,
# as a csv file for import into a database
import clinPheno as cp
# Create database object to connect to data
hostname = "hostname"
user = "root"
dbname = "databasename"
password = "password"
db = cp.Database(hostname, user, password, dbname)

# Collect per-table column metadata: parallel lists of lists, one inner
# list per table, aligned across the four outer lists.
tnames = list()
cnames = list()
dattypes = list()
dbnames = list()
for t in db.tables:
    dbname, tbname, colname, dattype = db.getTableCols(t)
    tnames.append(tbname)
    cnames.append(colname)
    dattypes.append(dattype)
    dbnames.append(dbname)

# Write the skeleton data dictionary as CSV. The with-statement guarantees
# the file is closed (and flushed) even if a row fails to format, which the
# old open()/close() pair did not.
with open("datadict.txt", 'w') as outf:
    # First print metric_column names and metric info to file
    # (header preserved verbatim; note the stray space before "Sibling")
    outf.write("table_name,column_name,question_instruction,domain,options,scale,Mom,Dad,Proband, Sibling,Family,Other,notes,display_title,column_title,column_hint\n")
    # zip pairs the per-table lists directly, instead of double-indexing
    # with range(len(...)); str() guards against non-string datatype values
    # (the old '+' concatenation raised TypeError on those).
    for tabs, cols, types in zip(tnames, cnames, dattypes):
        for tab, col, typ in zip(tabs, cols, types):
            outf.write(str(tab) + "," + str(col) + ",,,," + str(typ) + ",,,,,,,,,,\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment