Created
May 26, 2014 11:11
-
-
Save mrgloom/fdd8134d1916650898a6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import time | |
def read_data(): | |
#M x N | |
data= np.loadtxt("data_3d.txt",delimiter=" ", skiprows=1, usecols=(0,1,2)) | |
print data.shape | |
# print data | |
return data | |
def matrix_append_row(fm,tp,mat): | |
#check if number of columns is the same | |
rows= fm.shape[0] + mat.shape[0] | |
new_fm = np.memmap(fm.filename, mode='r+', dtype= tp, shape=(rows,fm.shape[1])) | |
new_fm[fm.shape[0]:rows,:]= mat[:] | |
return new_fm | |
def generate_and_store_data(): | |
#create memmap file and append generated data in cycle | |
cols= 1000 | |
#rows = batch*iter | |
batch= 10 | |
iter= 10000 | |
#can't create empty array - need to store first entry by copy not by append | |
fm= np.memmap('A.npy', dtype='uint8', mode='w+', shape=(batch,cols)) | |
for i in xrange(iter): | |
data= np.random.rand(batch,cols)*100 # [0-1]*100 | |
# print i | |
# t0= time.time() | |
if i==0: | |
fm[:]= data[:] | |
else: | |
fm= matrix_append_row(fm,'uint8',data) | |
# print (time.time()-t0) | |
# fm.flush() | |
# print fm.shape | |
return fm | |
#so covariance matrix is depend on dimension of vector | |
#cov matrix consist of covariance vectors | |
# cov(a,a) cov(a,b) | |
# cov(a,b) cov(b,b) | |
def cov(a, b): # a,b one dimensional vectors? | |
if len(a) != len(b): | |
return | |
a_mean = np.mean(a) | |
b_mean = np.mean(b) | |
sum = 0 | |
for i in range(0, len(a)): | |
sum += ((a[i] - a_mean) * (b[i] - b_mean)) | |
return sum/(len(a)-1) | |
def cov(fmat): | |
M= fmat.shape[0] | |
N= fmat.shape[1] | |
fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N)) | |
# for r in xrange(M): | |
# for c in xrange(N): | |
# fcov[]= cov | |
def cov_mat(fmat): | |
#if fmat centered then 2*(A.T*A)/(n-1) covariance matrix | |
M= fmat.shape[0] | |
N= fmat.shape[1] | |
fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N)) | |
#assinging don't copy? | |
# fmat_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(N,M)) | |
fmat_tr= fmat.T | |
np.dot(fmat_tr,fmat,out=fcov) | |
fcov*= 2/(N-1) | |
return fcov | |
#need to determine k - just compute reconstruction error? or visualy inspect reconstructed images? | |
def pca(data,k): | |
M= data.shape[0] | |
N= data.shape[1] | |
print data.shape | |
#get mean | |
mean= np.mean(data,axis=0) | |
print mean.shape | |
# print mean | |
#M x N | |
data_c= (data-mean) | |
print data_c.shape | |
# print data_c | |
#N x N | |
#calculate covariance matrix | |
covData=np.cov(data_c,rowvar=0) | |
print covData.shape | |
t0= time.time() | |
eigenvalues, eigenvectors = np.linalg.eig(covData)# main time? | |
print (time.time()-t0) | |
print eigenvalues.shape # N long | |
print eigenvectors.shape # N x N | |
# print eigenvalues | |
# print eigenvectors | |
#sort and get k largest eigenvalues | |
idx = eigenvalues.argsort()[-k:][::-1] | |
print idx | |
eigenvalues = eigenvalues[idx] # k long | |
eigenvectors = eigenvectors[:,idx] # N x k | |
print eigenvalues.shape | |
print eigenvectors.shape | |
# print eigenvalues | |
# print eigenvectors | |
#projection | |
pr= np.dot(data_c,eigenvectors) # (M N) * (N k) = (M k) | |
print pr.shape | |
#reconstruction | |
rec= np.dot(pr, eigenvectors.T) #(M k) * (N k).T = (M N) | |
print rec.shape | |
print (data_c-rec) | |
#so only limitation is windows x32 at creation time of memmap file | |
#why don't work with large k | |
#assume M >> N | |
def pca_mm(fdata,k): | |
M= fdata.shape[0] | |
N= fdata.shape[1] | |
print fdata.shape | |
#get mean | |
mean= np.mean(fdata,axis=0) | |
print mean.shape | |
# print mean | |
#M x N | |
fdata_c= np.memmap('data_c.npy', dtype='float32', mode='w+', shape=(M,N)) | |
# fdata_c= fdata-mean | |
for i in xrange(M): | |
fdata_c[i,:]= fdata[i,:]-mean | |
print fdata_c.shape | |
# print data_c | |
#N x N | |
#calculate covariance matrix | |
# covData=np.cov(fdata_c,rowvar=0) # must be memmaped array | |
# fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N)) | |
# fcov= np.cov(data_c,rowvar=0) | |
t0= time.time() | |
fcov= cov_mat(fdata_c) | |
print (time.time()-t0) | |
# covData=np.cov(fdata_c,rowvar=0) | |
# print fcov | |
# print covData | |
print fcov.shape | |
t0= time.time() | |
eigenvalues, eigenvectors = np.linalg.eig(fcov)# fcov small don't depend on samples count only on dimensions | |
print (time.time()-t0) | |
print eigenvalues.shape # N long | |
print eigenvectors.shape # N x N | |
# print eigenvalues | |
# print eigenvectors | |
#sort and get k largest eigenvalues | |
idx = eigenvalues.argsort()[-k:][::-1] | |
print idx | |
eigenvalues = eigenvalues[idx] # k long | |
eigenvectors = eigenvectors[:,idx] # N x k | |
print eigenvalues.shape | |
print eigenvectors.shape | |
# print eigenvalues | |
# print eigenvectors | |
#projection | |
fpr= np.memmap('pr.npy', dtype='float32', mode='w+', shape=(M,k)) | |
np.dot(fdata_c,eigenvectors,out=fpr) # (M N) * (N k) = (M k) | |
print fpr.shape | |
#reconstruction | |
frec= np.memmap('rec.npy', dtype='float32', mode='w+', shape=(M,N)) | |
np.dot(fpr, eigenvectors.T,out=frec) #(M k) * (N k).T = (M N) | |
print frec.shape | |
#reconstruction error | |
# print (fdata_c-frec) | |
print (fdata_c[1,:]-frec[1,:]) | |
#need to write func to test operations with numpy array and memory consumtions | |
def memmap_operations(): | |
#create array | |
rows= 250000 | |
cols= 1000 | |
fA= np.memmap('A.npy', dtype='float32', mode='w+', shape=(rows,cols)) | |
# fA1= np.memmap('A1.npy', dtype='float32', mode='w+', shape=(rows,cols)) # can't create another one big memmap | |
# print fA[1:2,3:7] | |
print fA.nbytes/1024/1024 # 953 mb | |
#operations on matrix | |
fA+=1 # memory consumption is up | |
# fA=fA+1 #fails memory error creates temp copy? | |
# for r in xrange(rows): # too slow | |
# for c in xrange(cols): | |
# fA[r,c]+=1 | |
# for r in xrange(rows): #memory up | |
# fA[r,:]+=1 | |
A_tr= fA.T # no copy? | |
t0= time.time() | |
res= np.dot(A_tr,fA) # only small res in memory? | |
print (time.time()-t0) | |
#to create another memmap we need extra memory? | |
# t0= time.time() | |
# fA_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(cols,rows)) | |
# res= np.dot(fA_tr,fA) | |
# print (time.time()-t0) | |
print fA[1:2,3:7] | |
print 'done' | |
#maybe principal promblem is we can't use very large number to pass to numpy.array | |
def memmap_size_test(): | |
cols= 100 | |
rows= 1000L | |
for powers in np.arange(1,10): | |
l1 = rows*(10**powers) | |
print('try %d elements'%(l1)) | |
# print('number bytes required %f GB'%(l1*8/1e9)) | |
try: | |
# why change 1 dimension? and why use float64 | |
fp = np.memmap('test.map',dtype='float32', mode='w+',shape=(l1,cols)) | |
print('working with %d mb'%(fp.nbytes/1024/1024)) | |
del fp | |
except Exception as e: | |
print(repr(e)) | |
print 'done' | |
memmap_size_test() | |
# memmap_operations() | |
# fm= generate_and_store_data() | |
# k= 999 | |
# t0= time.time() | |
# pca_mm(fm,k) | |
# print (time.time()-t0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment