searchNdarLocal.py takes an input file (input.txt) with a single keyword on each line and searches a local database of NDAR behavioral data. Output includes: 1) outfile_vars.txt, with each question containing the search term(s) of interest on its own line, followed by an estimate of the min, max, and variable type; 2) outfile_data.txt, with subject …
#!/usr/bin/python
"""
searchNdar: reads in a set of strings from a file and searches behavioral
metrics to find questions / subscales that might be of interest

python searchNdarLocal.py --o outfile --i /home/vanessa/Documents/Work/NDAR/behavioral --w input.txt

--o is the outfile name, without extension
--i is the input directory, without a trailing slash
--w is the input text file, either in PWD or a full path

Example input file:
gaze
eye
contact
"""
__author__ = "Vanessa Sochat (vsochat@stanford.edu)"
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2013/11/01 $"
__license__ = "Python"

import os
import sys
import re
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as nu

debug = False

# SUBJECT ----------------------------------------------------------------------------------
# A Subject object organizes all metric data by subject
class Subject:
    def __init__(self, N):
        self.subs = []        # a list of dictionaries of subjects: subs[1] = {'metric': 'value'}
        self.subind = dict()  # a dictionary to index the list above
        self.N = N            # number of hits (columns from metrics)
        self.metrics = []     # list of column metrics and indices
        self.marker = 0       # holds index of the last metric added
        self.submarker = 0    # holds index of the last subject added
        self.data = []        # holds metric data
        self.suball = []      # a list of subjects specific to a metric
        self.vartype = []     # holds the (guessed) data type of a column
        self.labels = []      # all metric labels

    # addMetric: each of N metrics needs a consistent index
    def addMetric(self, metric):
        # First get all column names
        colnames = metric.label
        colvals = metric.data
        # Now add each column name with metric name to our list
        for n in range(0, len(colnames)):
            self.metrics.append(self.marker)
            self.data.append(colvals[n])
            self.suball.append(metric.subid)  # Save ALL subids here
            self.labels.append(colnames[n])
            self.marker = self.marker + 1
            # Save each subject's value to its dictionary (row 0 is the description row, so skip it)
            for s in range(1, len(metric.subid)):
                idx = self.subind[metric.subid[s]]  # Find subject index
                self.subs[idx][colnames[n]] = colvals[n][s]

    # addSubjects: adds all subjects to the dictionary
    def addSubjects(self, hit):
        for s in range(1, len(hit.subid)):              # For each subject
            if hit.subid[s] not in self.subind.keys():  # If they aren't added yet
                self.subs.append(dict())
                self.subs[self.submarker]['id'] = hit.subid[s]
                self.subind[hit.subid[s]] = self.submarker  # This is the index of the subject
                self.submarker = self.submarker + 1
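    # A sketch of the indexing scheme (hypothetical IDs): after adding subjects
    # "NDAR_A" and "NDAR_B", subind is {'NDAR_A': 0, 'NDAR_B': 1} and subs is
    # [{'id': 'NDAR_A'}, {'id': 'NDAR_B'}]; addMetric later fills in metric values as keys.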

    # When we get here, all subjects and metrics are added, and we have indices.
    def summaryStats(self):
        self.min = nu.zeros(shape=(len(self.metrics), 1))
        self.max = nu.zeros(shape=(len(self.metrics), 1))
        self.vartype = ['numerical'] * len(self.metrics)
        print "Estimating summary statistics..."
        # Iterate through metrics, estimating min/max from the parseable values
        for m in self.metrics:
            # Grab data
            print self.labels[m]
            data = list(self.data[m])
            subs = list(self.suball[m])
            # The first entry of each converted column is the description row, so skip it
            data = data[1:]  # All values specific to a particular column metric
            subs = subs[1:]  # All subjects that have the metric
            # Get an estimate of min/max for each column; note min/max start at 0,
            # so a column whose values are all positive will report a min of 0
            for s in range(0, len(subs)):
                if isinstance(data[s], basestring):  # nan values are floats, so skip them
                    # Only parse values without letters or punctuation
                    if not re.search("[A-Za-z]|[-]|[#]|[&]|[/]", data[s]):
                        val = data[s].strip('\'')
                        if self.min[m] > float(val): self.min[m] = float(val)
                        if self.max[m] < float(val): self.max[m] = float(val)

    # printSubjects: print results to file
    def printSubjects(self, outfile):
        print "Printing variables to file..."
        outfcols = open(outfile + "_vars.txt", 'w')  # single column of column names
        outf = open(outfile + "_data.txt", 'w')      # the data
        # First print metric_column names as the header row
        outf.write("subjectkey\t")
        for ee in self.metrics:
            outf.write(self.labels[ee] + "\t")
        outf.write("\n")
        print "There are " + str(len(self.metrics)) + " data items total for " + str(len(self.subs)) + " subjects"
        # Print variable summaries: min and max, or "text" if both are still zero
        for ee in self.metrics:
            label = self.labels[ee]
            if self.min[ee] == 0 and self.max[ee] == 0:
                outfcols.writelines(label + "\t" + "text" + "\n")
            else:
                outfcols.writelines(label + "\t" + str(self.min[ee]) + "\t" + str(self.max[ee]) + "\n")
        # Iterate through subjects, and then metrics, to print rows
        for s in range(0, len(self.subs)):
            subdata = self.subs[s]
            name = subdata['id']
            print "Printing subject " + name + " " + str(s) + " of " + str(len(self.subs))
            outf.write(name + "\t")
            # Print subject data, or NaN if the subject is missing the metric
            for ee in self.metrics:
                label = self.labels[ee]
                if label in subdata.keys():  # The metric is a key in the subject dictionary
                    outf.write(str(subdata[label]) + "\t")
                else:
                    outf.write('NaN' + "\t")
            outf.write('\n')
        outfcols.close()
        outf.close()

# METRIC ----------------------------------------------------------------------------------
# A Metric object holds results for a particular assessment
class Metric:
    def __init__(self, m, mfullfile, subid):
        self.name = m              # metric file name
        self.fullfile = mfullfile
        self.subid = subid         # list of subject IDs
        self.data = []             # list of column data
        self.label = []            # list of full labels

    # addColumn: adds a column of data to the Metric
    def addColumn(self, column, coldata, description):
        self.data.append(coldata)  # Save the column data
        self.label.append(self.name + "_" + column + "_" + description)
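    # For example (hypothetical names), a Metric named "cbcl.txt" given
    # addColumn("cbcl_eye", coldata, "Poor eye contact") stores the label
    # "cbcl.txt_cbcl_eye_Poor eye contact".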

# INPUT FUNCTIONS -----------------------------------------------------------------------
# Read input words, one per line
def readInput(infile):
    words = []
    filey = open(infile, 'r')
    for f in filey.readlines():
        words.append(f.strip('\n'))
    filey.close()
    return words

# Create a regular expression that matches any of the words
def createRegexp(words):
    pattern_string = '|'.join(words)
    pattern = re.compile(pattern_string)
    return pattern
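
# For example, the docstring's sample input file compiles to re.compile("gaze|eye|contact").
# Note the words are not escaped, so any regex metacharacters in the input file are
# interpreted as regex syntax rather than literal text.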

# Read metrics from the NDAR Data Dictionary
def readMetrics(indir, pattern):
    # Get files in directory
    files = [f for f in listdir(indir) if isfile(join(indir, f))]
    # This will be a list of metrics to return
    Metrics = []
    # Read in each file, search for pattern
    for f in files:
        fullfile = indir + '/' + f
        if debug: print "File is: " + fullfile
        # If we find a match anywhere in the file, parse the entire file
        if pattern.search(open(fullfile).read()):
            # Read in the csv, and save indexed columns with subject IDs
            df = pd.read_csv(fullfile, delimiter='\t')
            subjectids = df.subjectkey  # Subject IDs
            # First row is variable names, second is descriptions
            lines = open(fullfile).readlines()
            varnames = lines[0]
            descript = lines[1]
            colnums = []  # indices of columns to save
            if pattern.search(varnames.lower()) or pattern.search(descript.lower()):
                # Create a new metric object
                metric = Metric(f, fullfile, subjectids)
                # Find column indices with matches in the variable names...
                count = 0
                for v in varnames.split('\t'):
                    if pattern.search(v.lower()):
                        colnums.append(count)
                    count = count + 1
                # ...and in the descriptions
                count = 0
                for d in descript.split('\t'):
                    if pattern.search(d.lower()):
                        colnums.append(count)
                    count = count + 1
                # Only save unique values
                colnums = list(set(colnums))
                # Extract column names and descriptions
                varnames = varnames.split('\t')
                descript = descript.split('\t')
                varnames = [varnames[i] for i in colnums]
                descript = [descript[i] for i in colnums]
                # Add columns to the metric object
                for i in range(0, len(varnames)):
                    colname = varnames[i].strip().strip('"')  # strip any newline and quotes
                    coldata = df[colname]  # the column data
                    d = descript[i]        # the full description
                    metric.addColumn(colname, coldata, d)
                # Save the metric to the list
                if len(varnames) != 0: Metrics.append(metric)
    return Metrics
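
# readMetrics assumes each data dictionary file is tab-delimited with variable names
# in the first row and descriptions in the second, e.g. (a hypothetical file):
#   subjectkey<TAB>interview_age<TAB>cbcl_eye
#   "Subject key"<TAB>"Age in months"<TAB>"Poor eye contact"
#   NDAR_123<TAB>48<TAB>2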

# Print results to file - hits is a list of Metric objects
def printResults(hits, outfile):
    # First get the number of hits per assessment - each hit gets its own column
    N = 0  # the number of hits, across all metrics
    for h in hits:
        N = N + len(h.data)
    print "Found " + str(N) + " column hits."
    S = Subject(N)  # subject indices, each with a dict indexed by metric/column name
    for h in hits:
        S.addSubjects(h)  # First add the subjects
        S.addMetric(h)    # Now add the metric
    print "Number of metrics added is " + str(len(hits))
    # Calculate mins and maxes
    S.summaryStats()
    # Now print to file
    S.printSubjects(outfile)

# MAIN ----------------------------------------------------------------------------------
def usage():
    print __doc__

def main(argv):
    # We need an output file name, and an input file with words
    if len(argv) < 3: usage(); sys.exit()
    outfile = None
    # First cycle through the arguments to collect user variables
    for ar in range(0, len(argv)):
        if argv[ar] in ("-h", "--help"): usage(); sys.exit()
        elif argv[ar] in ("-i", "--i"): infile = argv[ar + 1]
        elif argv[ar] in ("-o", "--o"): outfile = str(argv[ar + 1])
        elif argv[ar] in ("-w", "--w"): wordfile = str(argv[ar + 1])
    # If the user has not specified an output file name, use the input file name
    if not outfile: outfile = os.path.basename(infile).split(".")[0]
    # Make sure variables are ok
    print "searchNdarLocal"
    print "-----------------------------------------------------------"
    print "Input file directory is: " + infile
    print "Word file is " + wordfile
    print "Output file name is " + outfile
    # Read words to search for in NDAR behavioral metrics
    print "Reading input files..."
    words = readInput(wordfile)
    # Create the regular expression
    pattern = createRegexp(words)
    # Get all metrics from the NDAR Data Dictionary
    metrics = readMetrics(infile, pattern)
    # Save and print metrics based on subject ID
    printResults(metrics, outfile)

if __name__ == "__main__":
    main(sys.argv[1:])