Skip to content

Instantly share code, notes, and snippets.

@Krewn
Last active August 29, 2015 14:07
Show Gist options
  • Save Krewn/32fbc5281c2166d3c1d5 to your computer and use it in GitHub Desktop.
Save Krewn/32fbc5281c2166d3c1d5 to your computer and use it in GitHub Desktop.
Euclidean distance, correlation, and cos between sets of vectors with varying dimension.
#########################################################
## Author : Kevin Nelson ##
# #
## Mapping MicroArrayData back to AS ##
#########################################################
import sys
import csv
import pylab as pl
import matplotlib.pyplot as plt
import numpy
import os
import threading
#print "ImportsComplete"
####Dependancies
#AnaylizeData.py Data.tsv GbyAs.csv test
#GbyAs.tsv
present=1
absent=-1
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
def clip(D):
#print '\n.\n..\n...\n....\n...\n..\n.\n'
#prDict2(D)
BM96={}
n=0
b=12
while(is_number(D[b][2])):
b+=1
#print b
for k in range(11,b):
row=[]
for p in range (1,98,1):
if(is_number(D[k][p])):
if(float(D[k][p])>0 and float(D[k][p])!=999):
row.append(float(D[k][p]))
else:
row.append(0)
else:
row.append(D[k][p])
BM96[k-11]=row
#print '\#'
#print BM96
return BM96
def Boolienize(data):
global present
global absent
BData={}
n=0
try:
for k in range(0,len(data),1): #boolienize Data
row=[]
for p in data[k]:
if(is_number(p)):
if(float(p)>0 and float(p)!=999):
row.append(present)
else:
row.append(absent)
else:
row.append(p)
BData[k]=row
#print BData
return BData
except(KeyError):
for k in data: #boolienize Data
row=[]
for p in data[k]:
if(is_number(p)):
if(float(p)>0 and float(p)!=999):
row.append(present)
else:
row.append(absent)
else:
row.append(p)
BData[k]=row
#print BData
return BData
#data={}
def readFileT(n):
fileIN=n
Data={}
with open(fileIN) as csvfile:
tableMain=csv.reader(csvfile,delimiter='\t')
k=0
for row in tableMain:
Data[k]=row
k+=1 #Read the Data into dict Data
return(Data)
def readFileC(n):
fileIN=n
Data={}
with open(fileIN) as csvfile:
tableMain=csv.reader(csvfile,delimiter=',')
k=0
for row in tableMain:
Data[k]=row
k+=1 #Read the Data into dict Data
return(Data)
def printDic2d(nm_at):
for k in nm_at:
for k2 in nm_at[k]:
print str(k)+'\t'+str(k2)+'\t'+str(nm_at[k][k2])
def prDict(Dists):
once = True
Head="\t"
tail=""
n=0
for k in Dists:
if once == True and n>0:
for k2 in Dists[k]:
Head+=str(k2)+'\t'
once =False
#Head+=str(k)+"\t"
tail+=str(k)+"\t"
for k2 in Dists[k]:
try:
tail+=str(Dists[k][k2])+"\t"
except(TypeError):
tail+=str(k2)+"\t"
tail+="\n"
n+=1
Head+="\n"
print Head+tail
print "\n\n"
def strDict(Dists):
once = True
Head="\t"
tail=""
n=0
for k in Dists:
if once == True and n>0:
for k2 in Dists[k]:
Head+=str(k2)+'\t'
once =False
#Head+=str(k)+"\t"
tail+=str(k)+"\t"
for k2 in Dists[k]:
try:
tail+=str(Dists[k][k2])+"\t"
except(TypeError):
tail+=str(k2)+"\t"
tail+="\n"
n+=1
Head+="\n"
return( Head+tail )
def getBetterTables3d(data):
Data={}
for k in data:
Data[k]={}
for k2 in data[k]:
if data[k][k2][0]=="Name":
for k3 in range(1,len(data[k][k2])):
Data[k][data[k][k2][k3]]={}
for k4 in range(1,len(data[k][k2])):
Data[k][data[k][k2][k3]][data[k][k2][k4]]=data[k][k3][k4]
#print Data
for k in Data:
for k2 in Data[k]:
for k3 in Data[k][k2]:
try:
a=float(Data[k][k2][k3])
except(ValueError):
print Data[k][k2][k3]
return Data
def getBetterTables2D(data):
Data={}
first=True
for k in data:
if first:
b = k
first=False
else:
First=True
Data[data[k][0]]={}
c=0
for k2 in data[k]:
if First:
First=False
else:
c+=1
Data[data[k][0]][data[b][c]]=k2
# else:
# Data[data[k][0]]={}
# n=0
# First=True
# for k2 in data[k]:
# if First == True:
# First = False
# else:
# Data[data[k][0]][data[0][n]]=k2
# n+=1
return Data
#def checkSys():
# SysArgvs={}
# length=0
# def noInput():
# print '...BetterLuckNextTime...'
# k=0
# while k<15:
# try:
# SysArgvs[k]=sys.agrv[k]
# k+=1
# except(AttributeError):
# print SysArgvs
# return(SysArgvs,length)
# pass
# try:
# time=10.0
# t = threading.Timer(time,noInput)
# t.start()
# mmk = str(raw_input("Type the word Arrdvark and press enter:\nYou have "+str(time)+" seconds."))
# k=0
# while(k<3):
# try:
# x = int(raw_input("1:Enter M value\n2:Enter M and BOOL for absents\n3:A_M_P"))
# if(x>3 or x<1):
# ValueError()
# break
# except ValueError:
# print "Enter 1,2 or 3 ... Try again:"
# k+=1
# if(k==1):
# SysArgvs[0]=float(raw_input("Enter Moderates weight value:"))
# length=1
# return(SysArgvs,length)
# if(k==2):
# for k2 in range(0,2):
# if(k2==0):
# SysArgvs[0]=float(raw_input("Enter Moderates weight value & (True/False -Absent Values from GXD-):"))
# if(k == 3):
# print 'Enter Calls'
# while k<3:
# try:
# if(k==0):
# x = float(raw_input("Absent values Will be Calculated As:"))
# if(k==1):
# x = float(raw_input("Moderate values Will be Calculated As:"))
# if(k==2):
# x = float(raw_input("Calls values Will be Calculated As:"))
# k+=1
# except ValueError:
# print "Oops! That was no valid number. Try again..."
# k=1
# except(AttributeError):
# print 'WoooBuddy'
# pass
#a = checkSys()
#print a
def getGbyAs(GbyAS , Data):
global present
global absent
GbyAs={}
genes=[]
for k in Data:
genes=Data[k].keys()
break
#print(genes)
for k in GbyAS:
#print(GbyAS[k][0])
GbyAs[GbyAS[k][0]]={}
for k2 in genes:
GbyAs[GbyAS[k][0]][k2]=0.
#print(len(GbyAs))
#print(len(genes))
#prDict(GbyAs)
for k in GbyAS:
First = True
for k2 in GbyAS[k]:
if First:First=False
elif(k2[:len(k2)-1] in genes):
if(k2[-1:]=='-'):
GbyAs[GbyAS[k][0]][k2[0:len(k2)-1]]=absent
else:
GbyAs[GbyAS[k][0]][k2[0:len(k2)-1]]=present
#printDic2d(GbyAs)
return GbyAs
def kos(a):
cos = 0
magA= 0
magB= 0
for k in a:
cos+=k[0]*k[1]
magA+=numpy.power(k[0],2)
magB+=numpy.power(k[1],2)
cos = cos / (numpy.power(magA,0.5)*numpy.power(magB,0.5))
return(cos)
def corro(a):
n=0.
avgA=0.
avgB=0.
for k in a:
n+=1.
avgA+=k[0]
avgB+=k[1]
if(n==0):return(None)
else:
avgA/=n
avgB/=n
coro=0.
stdA=0.
stdB=0.
for k in a:
coro+=(k[0]-avgA)*(k[1]-avgB)
stdA+=numpy.power(k[0]-avgA,2)
stdB+=numpy.power(k[1]-avgB,2)
coro /=numpy.power(stdA*stdB,0.5)
return(coro)
def euc(a):
mag= 0.
for k in a:
mag+=numpy.power(k[0]-k[1],2)
mag = numpy.power(mag,0.5)
return(mag) #while these computations are O(n) data prep is O(n^2) as we search n keys n times, once for each key.
print '-init -ImportsComplete--'
def runComp(data1,data2,compType):
Data= getBetterTables2D(Boolienize(clip(readFileC(data1))))#'Data.tsv'
GbyAS=readFileT(data2)#'GbyAs.csv'
#print 'Data -As Read-'
#print Data
#print Data
#print GbyAS
#Data=getBetterTables2D(Data)
#prDict(Data)
#GbyAS=getBetterTables2D(GbyAS)
#prDict(Data)
#prDict(GbyAS)
GbyAS = getGbyAs(GbyAS,Data)
#print'GbyAS'
#prDict(GbyAS)
# CosAB={} #microArraySmaples x AS
# Expression = {}
# for k in Data:
# for k3 in Data[k]:
# if k3!="ProbeID#" and k3!= "Accession#" and k3 != "Gene" and k3 != "":
# CosAB[k3]={}
# for k2 in GbyAS:
# CosAB[k3][k2]=0.
# break
#print 'Data'
#prDict(Data)
# for k in Data:
# Expression[k]={}
# for k3 in Data[k]:
# if k3!="ProbeID#" and k3!= "Accession#" and k3 != "Gene":
# if Data[k][k3]=='P':
# Expression[k][k3]=1.0#sys.argv[1]
# if Data[k][k3]=='M':
# Expression[k][k3]=0.5#sys.argv[2]
# if Data[k][k3]=='A':
# Expression[k][k3]=-1.0#sys.argv[3]
Comp = {}
#printDic2d(GbyAS)
for k in GbyAS: #Cos(A,B) = A.B/(|A|*|B|)
Comp[k]={}
for k3 in Data:
Comp[k][k3]=[]
for k2 in GbyAS[k]: #Where A_n and B_n are Known
if(GbyAS[k][k2] != 0):
for k3 in Data:
#print('#'*25)
#print(GbyAS[k][k2])
#print(Data[k3])
a= 0 if GbyAS[k][k2] == -1 else 1
Comp[k][k3].append([a,Data[k3][k2]])
CosAB={}
for k in Comp:
CosAB[k]={}
for k2 in Comp[k]:
#print k
#print k2
#print Comp[k][k2]
#print Comp[k][k2]
if(compType == "cos"):CosAB[k][k2]=kos(Comp[k][k2])
elif(compType == "eucdist"):CosAB[k][k2]=euc(Comp[k][k2])
elif(compType == "corolate"):CosAB[k][k2]=corro(Comp[k][k2])
#print CosAB[k2][k]
#print 'CosAB'
#prDict(CosAB)
#print 'Expression'
#prDict(Expression)
return(CosAB)
temp=open('Cos.tsv','w')
temp.write(strDict(runComp('_mousePlate1.csv','GbyAs.csv',"cos")))
temp.close()
temp=open('EucDist.tsv','w')
temp.write(strDict(runComp('_mousePlate1.csv','GbyAs.csv',"eucdist")))
temp.close()
temp = open('Corolate.tsv','w')
temp.write(strDict(runComp('_mousePlate1.csv','GbyAs.csv',"corolate")))
temp.close()
temp=open('LocalData.tsv','w')
temp.write(strDict(getBetterTables2D(Boolienize(clip(readFileC('_mousePlate1.csv'))))))
temp.close()
temp = open('gxdData.tsv','w')
temp.write(strDict(getGbyAs(readFileT('GbyAs.csv'),getBetterTables2D(Boolienize(clip(readFileC('_mousePlate1.csv')))))))
temp.close()
#for k in Data:
# for k2 in GbyAS:
# dot=0.0
# maga=0.0
# magb=0.0
# for k3 in GbyAS:
# dot+=
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment