Last active
August 29, 2015 14:07
-
-
Save Krewn/32fbc5281c2166d3c1d5 to your computer and use it in GitHub Desktop.
Euclidean distance, correlation, and cos between sets of vectors with varying dimension.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################################### | |
## Author : Kevin Nelson ## | |
# # | |
## Mapping MicroArrayData back to AS ## | |
######################################################### | |
import sys | |
import csv | |
import pylab as pl | |
import matplotlib.pyplot as plt | |
import numpy | |
import os | |
import threading | |
#print "ImportsComplete" | |
####Dependancies | |
#AnaylizeData.py Data.tsv GbyAs.csv test | |
#GbyAs.tsv | |
present=1 | |
absent=-1 | |
def is_number(s): | |
try: | |
float(s) | |
return True | |
except ValueError: | |
return False | |
def clip(D): | |
#print '\n.\n..\n...\n....\n...\n..\n.\n' | |
#prDict2(D) | |
BM96={} | |
n=0 | |
b=12 | |
while(is_number(D[b][2])): | |
b+=1 | |
#print b | |
for k in range(11,b): | |
row=[] | |
for p in range (1,98,1): | |
if(is_number(D[k][p])): | |
if(float(D[k][p])>0 and float(D[k][p])!=999): | |
row.append(float(D[k][p])) | |
else: | |
row.append(0) | |
else: | |
row.append(D[k][p]) | |
BM96[k-11]=row | |
#print '\#' | |
#print BM96 | |
return BM96 | |
def Boolienize(data): | |
global present | |
global absent | |
BData={} | |
n=0 | |
try: | |
for k in range(0,len(data),1): #boolienize Data | |
row=[] | |
for p in data[k]: | |
if(is_number(p)): | |
if(float(p)>0 and float(p)!=999): | |
row.append(present) | |
else: | |
row.append(absent) | |
else: | |
row.append(p) | |
BData[k]=row | |
#print BData | |
return BData | |
except(KeyError): | |
for k in data: #boolienize Data | |
row=[] | |
for p in data[k]: | |
if(is_number(p)): | |
if(float(p)>0 and float(p)!=999): | |
row.append(present) | |
else: | |
row.append(absent) | |
else: | |
row.append(p) | |
BData[k]=row | |
#print BData | |
return BData | |
#data={} | |
def readFileT(n): | |
fileIN=n | |
Data={} | |
with open(fileIN) as csvfile: | |
tableMain=csv.reader(csvfile,delimiter='\t') | |
k=0 | |
for row in tableMain: | |
Data[k]=row | |
k+=1 #Read the Data into dict Data | |
return(Data) | |
def readFileC(n): | |
fileIN=n | |
Data={} | |
with open(fileIN) as csvfile: | |
tableMain=csv.reader(csvfile,delimiter=',') | |
k=0 | |
for row in tableMain: | |
Data[k]=row | |
k+=1 #Read the Data into dict Data | |
return(Data) | |
def printDic2d(nm_at): | |
for k in nm_at: | |
for k2 in nm_at[k]: | |
print str(k)+'\t'+str(k2)+'\t'+str(nm_at[k][k2]) | |
def prDict(Dists): | |
once = True | |
Head="\t" | |
tail="" | |
n=0 | |
for k in Dists: | |
if once == True and n>0: | |
for k2 in Dists[k]: | |
Head+=str(k2)+'\t' | |
once =False | |
#Head+=str(k)+"\t" | |
tail+=str(k)+"\t" | |
for k2 in Dists[k]: | |
try: | |
tail+=str(Dists[k][k2])+"\t" | |
except(TypeError): | |
tail+=str(k2)+"\t" | |
tail+="\n" | |
n+=1 | |
Head+="\n" | |
print Head+tail | |
print "\n\n" | |
def strDict(Dists): | |
once = True | |
Head="\t" | |
tail="" | |
n=0 | |
for k in Dists: | |
if once == True and n>0: | |
for k2 in Dists[k]: | |
Head+=str(k2)+'\t' | |
once =False | |
#Head+=str(k)+"\t" | |
tail+=str(k)+"\t" | |
for k2 in Dists[k]: | |
try: | |
tail+=str(Dists[k][k2])+"\t" | |
except(TypeError): | |
tail+=str(k2)+"\t" | |
tail+="\n" | |
n+=1 | |
Head+="\n" | |
return( Head+tail ) | |
def getBetterTables3d(data): | |
Data={} | |
for k in data: | |
Data[k]={} | |
for k2 in data[k]: | |
if data[k][k2][0]=="Name": | |
for k3 in range(1,len(data[k][k2])): | |
Data[k][data[k][k2][k3]]={} | |
for k4 in range(1,len(data[k][k2])): | |
Data[k][data[k][k2][k3]][data[k][k2][k4]]=data[k][k3][k4] | |
#print Data | |
for k in Data: | |
for k2 in Data[k]: | |
for k3 in Data[k][k2]: | |
try: | |
a=float(Data[k][k2][k3]) | |
except(ValueError): | |
print Data[k][k2][k3] | |
return Data | |
def getBetterTables2D(data): | |
Data={} | |
first=True | |
for k in data: | |
if first: | |
b = k | |
first=False | |
else: | |
First=True | |
Data[data[k][0]]={} | |
c=0 | |
for k2 in data[k]: | |
if First: | |
First=False | |
else: | |
c+=1 | |
Data[data[k][0]][data[b][c]]=k2 | |
# else: | |
# Data[data[k][0]]={} | |
# n=0 | |
# First=True | |
# for k2 in data[k]: | |
# if First == True: | |
# First = False | |
# else: | |
# Data[data[k][0]][data[0][n]]=k2 | |
# n+=1 | |
return Data | |
#def checkSys(): | |
# SysArgvs={} | |
# length=0 | |
# def noInput(): | |
# print '...BetterLuckNextTime...' | |
# k=0 | |
# while k<15: | |
# try: | |
# SysArgvs[k]=sys.agrv[k] | |
# k+=1 | |
# except(AttributeError): | |
# print SysArgvs | |
# return(SysArgvs,length) | |
# pass | |
# try: | |
# time=10.0 | |
# t = threading.Timer(time,noInput) | |
# t.start() | |
# mmk = str(raw_input("Type the word Arrdvark and press enter:\nYou have "+str(time)+" seconds.")) | |
# k=0 | |
# while(k<3): | |
# try: | |
# x = int(raw_input("1:Enter M value\n2:Enter M and BOOL for absents\n3:A_M_P")) | |
# if(x>3 or x<1): | |
# ValueError() | |
# break | |
# except ValueError: | |
# print "Enter 1,2 or 3 ... Try again:" | |
# k+=1 | |
# if(k==1): | |
# SysArgvs[0]=float(raw_input("Enter Moderates weight value:")) | |
# length=1 | |
# return(SysArgvs,length) | |
# if(k==2): | |
# for k2 in range(0,2): | |
# if(k2==0): | |
# SysArgvs[0]=float(raw_input("Enter Moderates weight value & (True/False -Absent Values from GXD-):")) | |
# if(k == 3): | |
# print 'Enter Calls' | |
# while k<3: | |
# try: | |
# if(k==0): | |
# x = float(raw_input("Absent values Will be Calculated As:")) | |
# if(k==1): | |
# x = float(raw_input("Moderate values Will be Calculated As:")) | |
# if(k==2): | |
# x = float(raw_input("Calls values Will be Calculated As:")) | |
# k+=1 | |
# except ValueError: | |
# print "Oops! That was no valid number. Try again..." | |
# k=1 | |
# except(AttributeError): | |
# print 'WoooBuddy' | |
# pass | |
#a = checkSys() | |
#print a | |
def getGbyAs(GbyAS , Data): | |
global present | |
global absent | |
GbyAs={} | |
genes=[] | |
for k in Data: | |
genes=Data[k].keys() | |
break | |
#print(genes) | |
for k in GbyAS: | |
#print(GbyAS[k][0]) | |
GbyAs[GbyAS[k][0]]={} | |
for k2 in genes: | |
GbyAs[GbyAS[k][0]][k2]=0. | |
#print(len(GbyAs)) | |
#print(len(genes)) | |
#prDict(GbyAs) | |
for k in GbyAS: | |
First = True | |
for k2 in GbyAS[k]: | |
if First:First=False | |
elif(k2[:len(k2)-1] in genes): | |
if(k2[-1:]=='-'): | |
GbyAs[GbyAS[k][0]][k2[0:len(k2)-1]]=absent | |
else: | |
GbyAs[GbyAS[k][0]][k2[0:len(k2)-1]]=present | |
#printDic2d(GbyAs) | |
return GbyAs | |
def kos(a): | |
cos = 0 | |
magA= 0 | |
magB= 0 | |
for k in a: | |
cos+=k[0]*k[1] | |
magA+=numpy.power(k[0],2) | |
magB+=numpy.power(k[1],2) | |
cos = cos / (numpy.power(magA,0.5)*numpy.power(magB,0.5)) | |
return(cos) | |
def corro(a): | |
n=0. | |
avgA=0. | |
avgB=0. | |
for k in a: | |
n+=1. | |
avgA+=k[0] | |
avgB+=k[1] | |
if(n==0):return(None) | |
else: | |
avgA/=n | |
avgB/=n | |
coro=0. | |
stdA=0. | |
stdB=0. | |
for k in a: | |
coro+=(k[0]-avgA)*(k[1]-avgB) | |
stdA+=numpy.power(k[0]-avgA,2) | |
stdB+=numpy.power(k[1]-avgB,2) | |
coro /=numpy.power(stdA*stdB,0.5) | |
return(coro) | |
def euc(a): | |
mag= 0. | |
for k in a: | |
mag+=numpy.power(k[0]-k[1],2) | |
mag = numpy.power(mag,0.5) | |
return(mag) #while these computations are O(n) data prep is O(n^2) as we search n keys n times, once for each key. | |
print '-init -ImportsComplete--' | |
def runComp(data1,data2,compType): | |
Data= getBetterTables2D(Boolienize(clip(readFileC(data1))))#'Data.tsv' | |
GbyAS=readFileT(data2)#'GbyAs.csv' | |
#print 'Data -As Read-' | |
#print Data | |
#print Data | |
#print GbyAS | |
#Data=getBetterTables2D(Data) | |
#prDict(Data) | |
#GbyAS=getBetterTables2D(GbyAS) | |
#prDict(Data) | |
#prDict(GbyAS) | |
GbyAS = getGbyAs(GbyAS,Data) | |
#print'GbyAS' | |
#prDict(GbyAS) | |
# CosAB={} #microArraySmaples x AS | |
# Expression = {} | |
# for k in Data: | |
# for k3 in Data[k]: | |
# if k3!="ProbeID#" and k3!= "Accession#" and k3 != "Gene" and k3 != "": | |
# CosAB[k3]={} | |
# for k2 in GbyAS: | |
# CosAB[k3][k2]=0. | |
# break | |
#print 'Data' | |
#prDict(Data) | |
# for k in Data: | |
# Expression[k]={} | |
# for k3 in Data[k]: | |
# if k3!="ProbeID#" and k3!= "Accession#" and k3 != "Gene": | |
# if Data[k][k3]=='P': | |
# Expression[k][k3]=1.0#sys.argv[1] | |
# if Data[k][k3]=='M': | |
# Expression[k][k3]=0.5#sys.argv[2] | |
# if Data[k][k3]=='A': | |
# Expression[k][k3]=-1.0#sys.argv[3] | |
Comp = {} | |
#printDic2d(GbyAS) | |
for k in GbyAS: #Cos(A,B) = A.B/(|A|*|B|) | |
Comp[k]={} | |
for k3 in Data: | |
Comp[k][k3]=[] | |
for k2 in GbyAS[k]: #Where A_n and B_n are Known | |
if(GbyAS[k][k2] != 0): | |
for k3 in Data: | |
#print('#'*25) | |
#print(GbyAS[k][k2]) | |
#print(Data[k3]) | |
a= 0 if GbyAS[k][k2] == -1 else 1 | |
Comp[k][k3].append([a,Data[k3][k2]]) | |
CosAB={} | |
for k in Comp: | |
CosAB[k]={} | |
for k2 in Comp[k]: | |
#print k | |
#print k2 | |
#print Comp[k][k2] | |
#print Comp[k][k2] | |
if(compType == "cos"):CosAB[k][k2]=kos(Comp[k][k2]) | |
elif(compType == "eucdist"):CosAB[k][k2]=euc(Comp[k][k2]) | |
elif(compType == "corolate"):CosAB[k][k2]=corro(Comp[k][k2]) | |
#print CosAB[k2][k] | |
#print 'CosAB' | |
#prDict(CosAB) | |
#print 'Expression' | |
#prDict(Expression) | |
return(CosAB) | |
temp=open('Cos.tsv','w') | |
temp.write(strDict(runComp('_mousePlate1.csv','GbyAs.csv',"cos"))) | |
temp.close() | |
temp=open('EucDist.tsv','w') | |
temp.write(strDict(runComp('_mousePlate1.csv','GbyAs.csv',"eucdist"))) | |
temp.close() | |
temp = open('Corolate.tsv','w') | |
temp.write(strDict(runComp('_mousePlate1.csv','GbyAs.csv',"corolate"))) | |
temp.close() | |
temp=open('LocalData.tsv','w') | |
temp.write(strDict(getBetterTables2D(Boolienize(clip(readFileC('_mousePlate1.csv')))))) | |
temp.close() | |
temp = open('gxdData.tsv','w') | |
temp.write(strDict(getGbyAs(readFileT('GbyAs.csv'),getBetterTables2D(Boolienize(clip(readFileC('_mousePlate1.csv'))))))) | |
temp.close() | |
#for k in Data: | |
# for k2 in GbyAS: | |
# dot=0.0 | |
# maga=0.0 | |
# magb=0.0 | |
# for k3 in GbyAS: | |
# dot+= | |
# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment