Skip to content

Instantly share code, notes, and snippets.

@mrgloom
Created May 23, 2014 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrgloom/7509836d9b657cf9eaa4 to your computer and use it in GitHub Desktop.
Save mrgloom/7509836d9b657cf9eaa4 to your computer and use it in GitHub Desktop.
pca memmap
import numpy as np
import time
def read_data():
#M x N
data= np.loadtxt("data_3d.txt",delimiter=" ", skiprows=1, usecols=(0,1,2))
print data.shape
# print data
return data
def matrix_append_row(fm,tp,mat):
#check if number of columns is the same
rows= fm.shape[0] + mat.shape[0]
new_fm = np.memmap(fm.filename, mode='r+', dtype= tp, shape=(rows,fm.shape[1]))
new_fm[fm.shape[0]:rows,:]= mat[:]
return new_fm
def generate_and_store_data():
#create memmap file and append generated data in cycle
cols= 1000
#rows = batch*iter
batch= 10
iter= 10000
#can't create empty array - need to store first entry by copy not by append
fm= np.memmap('A.npy', dtype='uint8', mode='w+', shape=(batch,cols))
for i in xrange(iter):
data= np.random.rand(batch,cols)*100 # [0-1]*100
# print i
# t0= time.time()
if i==0:
fm[:]= data[:]
else:
fm= matrix_append_row(fm,'uint8',data)
# print (time.time()-t0)
# fm.flush()
# print fm.shape
return fm
#so covariance matrix is depend on dimension of vector
#cov matrix consist of covariance vectors
# cov(a,a) cov(a,b)
# cov(a,b) cov(b,b)
def cov(a, b): # a,b one dimensional vectors?
if len(a) != len(b):
return
a_mean = np.mean(a)
b_mean = np.mean(b)
sum = 0
for i in range(0, len(a)):
sum += ((a[i] - a_mean) * (b[i] - b_mean))
return sum/(len(a)-1)
def cov(fmat):
M= fmat.shape[0]
N= fmat.shape[1]
fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))
# for r in xrange(M):
# for c in xrange(N):
# fcov[]= cov
def cov_mat(fmat):
#if fmat centered then 2*(A.T*A)/(n-1) covariance matrix
M= fmat.shape[0]
N= fmat.shape[1]
fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))
#assinging don't copy?
# fmat_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(N,M))
fmat_tr= fmat.T
np.dot(fmat_tr,fmat,out=fcov)
fcov*= 2/(N-1)
return fcov
#need to determine k - just compute reconstruction error? or visualy inspect reconstructed images?
def pca(data,k):
M= data.shape[0]
N= data.shape[1]
print data.shape
#get mean
mean= np.mean(data,axis=0)
print mean.shape
# print mean
#M x N
data_c= (data-mean)
print data_c.shape
# print data_c
#N x N
#calculate covariance matrix
covData=np.cov(data_c,rowvar=0)
print covData.shape
t0= time.time()
eigenvalues, eigenvectors = np.linalg.eig(covData)# main time?
print (time.time()-t0)
print eigenvalues.shape # N long
print eigenvectors.shape # N x N
# print eigenvalues
# print eigenvectors
#sort and get k largest eigenvalues
idx = eigenvalues.argsort()[-k:][::-1]
print idx
eigenvalues = eigenvalues[idx] # k long
eigenvectors = eigenvectors[:,idx] # N x k
print eigenvalues.shape
print eigenvectors.shape
# print eigenvalues
# print eigenvectors
#projection
pr= np.dot(data_c,eigenvectors) # (M N) * (N k) = (M k)
print pr.shape
#reconstruction
rec= np.dot(pr, eigenvectors.T) #(M k) * (N k).T = (M N)
print rec.shape
print (data_c-rec)
#so only limitation is windows x32 at creation time of memmap file
#why don't work with large k
#assume M >> N
def pca_mm(fdata,k):
M= fdata.shape[0]
N= fdata.shape[1]
print fdata.shape
#get mean
mean= np.mean(fdata,axis=0)
print mean.shape
# print mean
#M x N
fdata_c= np.memmap('data_c.npy', dtype='float32', mode='w+', shape=(M,N))
# fdata_c= fdata-mean
for i in xrange(M):
fdata_c[i,:]= fdata[i,:]-mean
print fdata_c.shape
# print data_c
#N x N
#calculate covariance matrix
# covData=np.cov(fdata_c,rowvar=0) # must be memmaped array
# fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))
# fcov= np.cov(data_c,rowvar=0)
t0= time.time()
fcov= cov_mat(fdata_c)
print (time.time()-t0)
# covData=np.cov(fdata_c,rowvar=0)
# print fcov
# print covData
print fcov.shape
t0= time.time()
eigenvalues, eigenvectors = np.linalg.eig(fcov)# fcov small don't depend on samples count only on dimensions
print (time.time()-t0)
print eigenvalues.shape # N long
print eigenvectors.shape # N x N
# print eigenvalues
# print eigenvectors
#sort and get k largest eigenvalues
idx = eigenvalues.argsort()[-k:][::-1]
print idx
eigenvalues = eigenvalues[idx] # k long
eigenvectors = eigenvectors[:,idx] # N x k
print eigenvalues.shape
print eigenvectors.shape
# print eigenvalues
# print eigenvectors
#projection
fpr= np.memmap('pr.npy', dtype='float32', mode='w+', shape=(M,k))
np.dot(fdata_c,eigenvectors,out=fpr) # (M N) * (N k) = (M k)
print fpr.shape
#reconstruction
frec= np.memmap('rec.npy', dtype='float32', mode='w+', shape=(M,N))
np.dot(fpr, eigenvectors.T,out=frec) #(M k) * (N k).T = (M N)
print frec.shape
#reconstruction error
# print (fdata_c-frec)
print (fdata_c[1,:]-frec[1,:])
#need to write func to test operations with numpy array and memory consumtions
def memmap_operations():
#create array
rows= 250000
cols= 1000
fA= np.memmap('A.npy', dtype='float32', mode='w+', shape=(rows,cols))
# fA1= np.memmap('A1.npy', dtype='float32', mode='w+', shape=(rows,cols)) # can't create another one big memmap
# print fA[1:2,3:7]
print fA.nbytes/1024/1024 # 953 mb
#operations on matrix
fA+=1 # memory consumption is up
# fA=fA+1 #fails memory error creates temp copy?
# for r in xrange(rows): # too slow
# for c in xrange(cols):
# fA[r,c]+=1
# for r in xrange(rows): #memory up
# fA[r,:]+=1
A_tr= fA.T # no copy?
t0= time.time()
res= np.dot(A_tr,fA) # only small res in memory?
print (time.time()-t0)
#to create another memmap we need extra memory?
# t0= time.time()
# fA_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(cols,rows))
# res= np.dot(fA_tr,fA)
# print (time.time()-t0)
print fA[1:2,3:7]
print 'done'
def memmap_x32():
baseNumber = 3000000L
for powers in np.arange(1,7):
l1 = baseNumber*10**powers
print('working with %d elements'%(l1))
print('number bytes required %f GB'%(l1*8/1e9))
try:
fp = np.memmap('test.map',dtype='float64', mode='w+',shape=(1,l1))
#works
print('works')
del fp
except Exception as e:
print(repr(e))
print 'done'
memmap_x32()
# memmap_operations()
# fm= generate_and_store_data()
# k= 999
# t0= time.time()
# pca_mm(fm,k)
# print (time.time()-t0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment