searchNdarLocal.py takes an input file (input.txt) with a single keyword on each line and searches a local database of NDAR behavioral data. Output includes: 1) outfile_vars.txt, with each question containing the search term(s) of interest on its own line, followed by an estimate of the min, max, and variable type; 2) outfile_data.txt, with subject …
#!/usr/bin/python
"""
searchNdar: reads in a set of strings from a file and searches behavioral
metrics to find questions / subscales that might be of interest

python searchNdarLocal.py --o outfile --i /home/vanessa/Documents/Work/NDAR/behavioral --w input.txt

--o is the outfile name, without extension
--i is the input directory, without a trailing slash
--w is the input text file, either in PWD or a full path

Example input file:
gaze
eye
contact
"""
__author__ = "Vanessa Sochat (vsochat@stanford.edu)"
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2013/11/01 $"
__license__ = "Python"

import os
import sys
import re
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as nu

debug = False

# SUBJECT ----------------------------------------------------------------------------------
# A Subject object organizes all metric data by subject
class Subject:
    def __init__(self, N):
        self.subs = []        # a list of dictionaries of subjects: subs[1] = {'metric': 'value'}
        self.subind = dict()  # a dictionary to index the list above
        self.N = N            # number of hits (columns from metrics)
        self.metrics = []     # list of column metrics and indices
        self.marker = 0       # holds index of the last metric added
        self.submarker = 0    # holds index of the last subject added
        self.data = []        # holds metric data
        self.suball = []      # a list of subjects specific to a metric
        self.vartype = []     # holds the (guessed) data type of a column
        self.labels = []      # all metric labels

    # addMetric: each of N metrics needs a consistent index
    def addMetric(self, metric):
        # First get all column names
        colnames = metric.label
        colvals = metric.data
        # Now add each column name with metric name to our list
        for n in range(0, len(colnames)):
            self.metrics.append(self.marker)
            self.data.append(colvals[n])
            self.suball.append(metric.subid)  # Save ALL subids here
            self.labels.append(colnames[n])
            self.marker = self.marker + 1
            # Save each subject's value to its dictionary (row 0 is the description row, so skip it)
            for s in range(1, len(metric.subid)):
                idx = self.subind[metric.subid[s]]  # Find subject index
                self.subs[idx][colnames[n]] = colvals[n][s]

    # addSubjects: adds all subjects to the dictionary
    def addSubjects(self, hit):
        for s in range(1, len(hit.subid)):              # For each subject
            if hit.subid[s] not in self.subind.keys():  # If they aren't added yet
                self.subs.append(dict())
                self.subs[self.submarker]['id'] = hit.subid[s]
                self.subind[hit.subid[s]] = self.submarker  # This is the index of the subject
                self.submarker = self.submarker + 1
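    # A sketch of the indexing scheme (hypothetical IDs): after adding subjects
    # "NDAR_A" and "NDAR_B", subind is {'NDAR_A': 0, 'NDAR_B': 1} and subs is
    # [{'id': 'NDAR_A'}, {'id': 'NDAR_B'}]; addMetric later fills in metric values as keys.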

    # When we get here, all subjects and metrics are added, and we have indices.
    def summaryStats(self):
        self.min = nu.zeros(shape=(len(self.metrics), 1))
        self.max = nu.zeros(shape=(len(self.metrics), 1))
        self.vartype = ['numerical'] * len(self.metrics)
        print "Estimating summary statistics..."
        # Iterate through metrics, estimating min/max from the parseable values
        for m in self.metrics:
            # Grab data
            print self.labels[m]
            data = list(self.data[m])
            subs = list(self.suball[m])
            # The first entry of each converted column is the description row, so skip it
            data = data[1:]  # All values specific to a particular column metric
            subs = subs[1:]  # All subjects that have the metric
            # Get an estimate of min/max for each column; note min/max start at 0,
            # so a column whose values are all positive will report a min of 0
            for s in range(0, len(subs)):
                if isinstance(data[s], basestring):  # nan values are floats, so skip them
                    # Only parse values without letters or punctuation
                    if not re.search("[A-Za-z]|[-]|[#]|[&]|[/]", data[s]):
                        val = data[s].strip('\'')
                        if self.min[m] > float(val): self.min[m] = float(val)
                        if self.max[m] < float(val): self.max[m] = float(val)

    # printSubjects: print results to file
    def printSubjects(self, outfile):
        print "Printing variables to file..."
        outfcols = open(outfile + "_vars.txt", 'w')  # single column of column names
        outf = open(outfile + "_data.txt", 'w')      # the data
        # First print metric_column names as the header row
        outf.write("subjectkey\t")
        for ee in self.metrics:
            outf.write(self.labels[ee] + "\t")
        outf.write("\n")
        print "There are " + str(len(self.metrics)) + " data items total for " + str(len(self.subs)) + " subjects"
        # Print variable summaries: min and max, or "text" if both are still zero
        for ee in self.metrics:
            label = self.labels[ee]
            if self.min[ee] == 0 and self.max[ee] == 0:
                outfcols.writelines(label + "\t" + "text" + "\n")
            else:
                outfcols.writelines(label + "\t" + str(self.min[ee]) + "\t" + str(self.max[ee]) + "\n")
        # Iterate through subjects, and then metrics, to print rows
        for s in range(0, len(self.subs)):
            subdata = self.subs[s]
            name = subdata['id']
            print "Printing subject " + name + " " + str(s) + " of " + str(len(self.subs))
            outf.write(name + "\t")
            # Print subject data, or NaN if the subject is missing the metric
            for ee in self.metrics:
                label = self.labels[ee]
                if label in subdata.keys():  # The metric is a key in the subject dictionary
                    outf.write(str(subdata[label]) + "\t")
                else:
                    outf.write('NaN' + "\t")
            outf.write('\n')
        outfcols.close()
        outf.close()

# METRIC ----------------------------------------------------------------------------------
# A Metric object holds results for a particular assessment
class Metric:
    def __init__(self, m, mfullfile, subid):
        self.name = m              # metric file name
        self.fullfile = mfullfile
        self.subid = subid         # list of subject IDs
        self.data = []             # list of column data
        self.label = []            # list of full labels

    # addColumn: adds a column of data to the Metric
    def addColumn(self, column, coldata, description):
        self.data.append(coldata)  # Save the column data
        self.label.append(self.name + "_" + column + "_" + description)
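    # For example (hypothetical names), a Metric named "cbcl.txt" given
    # addColumn("cbcl_eye", coldata, "Poor eye contact") stores the label
    # "cbcl.txt_cbcl_eye_Poor eye contact".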

# INPUT FUNCTIONS -----------------------------------------------------------------------
# Read input words, one per line
def readInput(infile):
    words = []
    filey = open(infile, 'r')
    for f in filey.readlines():
        words.append(f.strip('\n'))
    filey.close()
    return words

# Create a regular expression that matches any of the words
def createRegexp(words):
    pattern_string = '|'.join(words)
    pattern = re.compile(pattern_string)
    return pattern
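
# For example, the docstring's sample input file compiles to re.compile("gaze|eye|contact").
# Note the words are not escaped, so any regex metacharacters in the input file are
# interpreted as regex syntax rather than literal text.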

# Read metrics from the NDAR Data Dictionary
def readMetrics(indir, pattern):
    # Get files in directory
    files = [f for f in listdir(indir) if isfile(join(indir, f))]
    # This will be a list of metrics to return
    Metrics = []
    # Read in each file, search for pattern
    for f in files:
        fullfile = indir + '/' + f
        if debug: print "File is: " + fullfile
        # If we find a match anywhere in the file, parse the entire file
        if pattern.search(open(fullfile).read()):
            # Read in the csv, and save indexed columns with subject IDs
            df = pd.read_csv(fullfile, delimiter='\t')
            subjectids = df.subjectkey  # Subject IDs
            # First row is variable names, second is descriptions
            lines = open(fullfile).readlines()
            varnames = lines[0]
            descript = lines[1]
            colnums = []  # indices of columns to save
            if pattern.search(varnames.lower()) or pattern.search(descript.lower()):
                # Create a new metric object
                metric = Metric(f, fullfile, subjectids)
                # Find column indices with matches in the variable names...
                count = 0
                for v in varnames.split('\t'):
                    if pattern.search(v.lower()):
                        colnums.append(count)
                    count = count + 1
                # ...and in the descriptions
                count = 0
                for d in descript.split('\t'):
                    if pattern.search(d.lower()):
                        colnums.append(count)
                    count = count + 1
                # Only save unique values
                colnums = list(set(colnums))
                # Extract column names and descriptions
                varnames = varnames.split('\t')
                descript = descript.split('\t')
                varnames = [varnames[i] for i in colnums]
                descript = [descript[i] for i in colnums]
                # Add columns to the metric object
                for i in range(0, len(varnames)):
                    colname = varnames[i].strip().strip('"')  # strip any newline and quotes
                    coldata = df[colname]  # the column data
                    d = descript[i]        # the full description
                    metric.addColumn(colname, coldata, d)
                # Save the metric to the list
                if len(varnames) != 0: Metrics.append(metric)
    return Metrics
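
# readMetrics assumes each data dictionary file is tab-delimited with variable names
# in the first row and descriptions in the second, e.g. (a hypothetical file):
#   subjectkey<TAB>interview_age<TAB>cbcl_eye
#   "Subject key"<TAB>"Age in months"<TAB>"Poor eye contact"
#   NDAR_123<TAB>48<TAB>2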

# Print results to file - hits is a list of Metric objects
def printResults(hits, outfile):
    # First get the number of hits per assessment - each hit gets its own column
    N = 0  # the number of hits, across all metrics
    for h in hits:
        N = N + len(h.data)
    print "Found " + str(N) + " column hits."
    S = Subject(N)  # subject indices, each with a dict indexed by metric/column name
    for h in hits:
        S.addSubjects(h)  # First add the subjects
        S.addMetric(h)    # Now add the metric
    print "Number of metrics added is " + str(len(hits))
    # Calculate mins and maxes
    S.summaryStats()
    # Now print to file
    S.printSubjects(outfile)

# MAIN ----------------------------------------------------------------------------------
def usage():
    print __doc__

def main(argv):
    # We need an output file name, and an input file with words
    if len(argv) < 3: usage(); sys.exit()
    outfile = None
    # First cycle through the arguments to collect user variables
    for ar in range(0, len(argv)):
        if argv[ar] in ("-h", "--help"): usage(); sys.exit()
        elif argv[ar] in ("-i", "--i"): infile = argv[ar + 1]
        elif argv[ar] in ("-o", "--o"): outfile = str(argv[ar + 1])
        elif argv[ar] in ("-w", "--w"): wordfile = str(argv[ar + 1])
    # If the user has not specified an output file name, use the input file name
    if not outfile: outfile = os.path.basename(infile).split(".")[0]
    # Make sure variables are ok
    print "searchNdarLocal"
    print "-----------------------------------------------------------"
    print "Input file directory is: " + infile
    print "Word file is " + wordfile
    print "Output file name is " + outfile
    # Read words to search for in NDAR behavioral metrics
    print "Reading input files..."
    words = readInput(wordfile)
    # Create the regular expression
    pattern = createRegexp(words)
    # Get all metrics from the NDAR Data Dictionary
    metrics = readMetrics(infile, pattern)
    # Save and print metrics based on subject ID
    printResults(metrics, outfile)

if __name__ == "__main__":
    main(sys.argv[1:])