mrgloom/gist:fdd8134d1916650898a6

## gistfile1.txt
import numpy as np
import time

def read_data():
	#M x N
	data= np.loadtxt("data_3d.txt",delimiter=" ", skiprows=1, usecols=(0,1,2))
	print data.shape
	# print data
	return data

def matrix_append_row(fm,tp,mat):
	#check if number of columns is the same
	rows= fm.shape[0] + mat.shape[0]
	new_fm = np.memmap(fm.filename, mode='r+', dtype= tp, shape=(rows,fm.shape[1]))
	new_fm[fm.shape[0]:rows,:]= mat[:]
	return new_fm

def generate_and_store_data():
	#create memmap file and append generated data in cycle
	cols= 1000
	#rows = batch*iter
	batch= 10
	iter= 10000

	#can't create empty array - need to store first entry by copy not by append
	fm= np.memmap('A.npy', dtype='uint8', mode='w+', shape=(batch,cols))

	for i in xrange(iter):
		data= np.random.rand(batch,cols)*100  # [0-1]*100
		# print i
		# t0= time.time()
		if i==0:
			fm[:]= data[:]
		else:
			fm= matrix_append_row(fm,'uint8',data)
		# print (time.time()-t0)

		# fm.flush()
		# print fm.shape

	return fm

#so covariance matrix is depend on dimension of vector
#cov matrix consist of covariance vectors
# cov(a,a)  cov(a,b)
# cov(a,b)  cov(b,b)
def cov(a, b):  # a,b one dimensional vectors?

    if len(a) != len(b):
        return

    a_mean = np.mean(a)
    b_mean = np.mean(b)

    sum = 0

    for i in range(0, len(a)):
        sum += ((a[i] - a_mean) * (b[i] - b_mean))

    return sum/(len(a)-1)

def cov(fmat):
	M= fmat.shape[0]
	N= fmat.shape[1]
	fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))

	# for r in xrange(M):
		# for c in xrange(N):
			# fcov[]= cov

def cov_mat(fmat):
	#if fmat centered then 2*(A.T*A)/(n-1) covariance matrix
	M= fmat.shape[0]
	N= fmat.shape[1]
	fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))

	#assinging don't copy?
	# fmat_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(N,M))
	fmat_tr= fmat.T
	np.dot(fmat_tr,fmat,out=fcov)
	fcov*= 2/(N-1)

	return fcov

#need to determine k - just compute reconstruction error? or visualy inspect reconstructed images?
def pca(data,k):
	M= data.shape[0]
	N= data.shape[1]
	print data.shape
	#get mean
	mean= np.mean(data,axis=0)
	print mean.shape
	# print mean

	#M x N
	data_c= (data-mean)
	print data_c.shape
	# print data_c

	#N x N
	#calculate covariance matrix
	covData=np.cov(data_c,rowvar=0)
	print covData.shape

	t0= time.time()
	eigenvalues, eigenvectors = np.linalg.eig(covData)# main time?
	print (time.time()-t0)

	print eigenvalues.shape # N long
	print eigenvectors.shape # N x N
	# print eigenvalues
	# print eigenvectors

	#sort and get k largest eigenvalues
	idx = eigenvalues.argsort()[-k:][::-1]
	print idx

	eigenvalues = eigenvalues[idx] # k long
	eigenvectors = eigenvectors[:,idx] # N x k
	print eigenvalues.shape
	print eigenvectors.shape
	# print eigenvalues
	# print eigenvectors

	#projection
	pr= np.dot(data_c,eigenvectors) # (M N) * (N k) = (M k)
	print pr.shape
	#reconstruction
	rec= np.dot(pr, eigenvectors.T) #(M k) * (N k).T = (M N)
	print rec.shape

	print (data_c-rec)

#so only limitation is windows x32 at creation time of memmap file
#why don't work with large k
#assume M >> N
def pca_mm(fdata,k):
	M= fdata.shape[0]
	N= fdata.shape[1]
	print fdata.shape
	#get mean
	mean= np.mean(fdata,axis=0)
	print mean.shape
	# print mean

	#M x N
	fdata_c= np.memmap('data_c.npy', dtype='float32', mode='w+', shape=(M,N))
	# fdata_c= fdata-mean
	for i in xrange(M):
		fdata_c[i,:]= fdata[i,:]-mean
	print fdata_c.shape
	# print data_c

	#N x N
	#calculate covariance matrix
	# covData=np.cov(fdata_c,rowvar=0)   # must be memmaped array
	# fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))
	# fcov= np.cov(data_c,rowvar=0)
	t0= time.time()
	fcov= cov_mat(fdata_c)
	print (time.time()-t0)
	# covData=np.cov(fdata_c,rowvar=0)
	# print fcov
	# print covData
	print fcov.shape

	t0= time.time()
	eigenvalues, eigenvectors = np.linalg.eig(fcov)# fcov small don't depend on samples count only on dimensions
	print (time.time()-t0)

	print eigenvalues.shape # N long
	print eigenvectors.shape # N x N
	# print eigenvalues
	# print eigenvectors

	#sort and get k largest eigenvalues
	idx = eigenvalues.argsort()[-k:][::-1]
	print idx

	eigenvalues = eigenvalues[idx] # k long
	eigenvectors = eigenvectors[:,idx] # N x k
	print eigenvalues.shape
	print eigenvectors.shape
	# print eigenvalues
	# print eigenvectors

	#projection
	fpr= np.memmap('pr.npy', dtype='float32', mode='w+', shape=(M,k))
	np.dot(fdata_c,eigenvectors,out=fpr) # (M N) * (N k) = (M k)
	print fpr.shape
	#reconstruction
	frec= np.memmap('rec.npy', dtype='float32', mode='w+', shape=(M,N))
	np.dot(fpr, eigenvectors.T,out=frec) #(M k) * (N k).T = (M N)
	print frec.shape

	#reconstruction error
	# print (fdata_c-frec)
	print (fdata_c[1,:]-frec[1,:])

#need to write func to test operations with numpy array and memory consumtions
def memmap_operations():
	#create array
	rows= 250000
	cols= 1000
	fA= np.memmap('A.npy', dtype='float32', mode='w+', shape=(rows,cols))
	# fA1= np.memmap('A1.npy', dtype='float32', mode='w+', shape=(rows,cols)) # can't create another one big memmap

	# print fA[1:2,3:7]
	print fA.nbytes/1024/1024 # 953 mb

	#operations on matrix
	fA+=1 # memory consumption is up
	# fA=fA+1 #fails memory error creates temp copy?
	# for r in xrange(rows): # too slow
		# for c in xrange(cols):
			# fA[r,c]+=1
	# for r in xrange(rows): #memory up
		# fA[r,:]+=1

	A_tr= fA.T # no copy?

	t0= time.time()
	res= np.dot(A_tr,fA) # only small res in memory?
	print (time.time()-t0)

	#to create another memmap we need extra memory?
	# t0= time.time()
	# fA_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(cols,rows))
	# res= np.dot(fA_tr,fA)
	# print (time.time()-t0)

	print fA[1:2,3:7]

	print 'done'

#maybe principal promblem is we can't use very large number to pass to numpy.array
def memmap_size_test():
	cols= 100
	rows= 1000L

	for powers in np.arange(1,10):
		l1 = rows*(10**powers)
		print('try %d elements'%(l1))
		# print('number bytes required %f GB'%(l1*8/1e9))
		try:
			# why change 1 dimension? and why use float64
			fp = np.memmap('test.map',dtype='float32', mode='w+',shape=(l1,cols))
			print('working with %d mb'%(fp.nbytes/1024/1024))
			del fp
		except Exception as e:
			print(repr(e))

	print 'done'

memmap_size_test()
# memmap_operations()

# fm= generate_and_store_data()
# k= 999
# t0= time.time()
# pca_mm(fm,k)
# print (time.time()-t0)
	import numpy as np
	import time

	def read_data():
	#M x N
	data= np.loadtxt("data_3d.txt",delimiter=" ", skiprows=1, usecols=(0,1,2))
	print data.shape
	# print data
	return data

	def matrix_append_row(fm,tp,mat):
	#check if number of columns is the same
	rows= fm.shape[0] + mat.shape[0]
	new_fm = np.memmap(fm.filename, mode='r+', dtype= tp, shape=(rows,fm.shape[1]))
	new_fm[fm.shape[0]:rows,:]= mat[:]
	return new_fm

	def generate_and_store_data():
	#create memmap file and append generated data in cycle
	cols= 1000
	#rows = batch*iter
	batch= 10
	iter= 10000

	#can't create empty array - need to store first entry by copy not by append
	fm= np.memmap('A.npy', dtype='uint8', mode='w+', shape=(batch,cols))

	for i in xrange(iter):
	data= np.random.rand(batch,cols)100 # [0-1]100
	# print i
	# t0= time.time()
	if i==0:
	fm[:]= data[:]
	else:
	fm= matrix_append_row(fm,'uint8',data)
	# print (time.time()-t0)

	# fm.flush()
	# print fm.shape

	return fm

	#so covariance matrix is depend on dimension of vector
	#cov matrix consist of covariance vectors
	# cov(a,a) cov(a,b)
	# cov(a,b) cov(b,b)
	def cov(a, b): # a,b one dimensional vectors?

	if len(a) != len(b):
	return

	a_mean = np.mean(a)
	b_mean = np.mean(b)

	sum = 0

	for i in range(0, len(a)):
	sum += ((a[i] - a_mean) * (b[i] - b_mean))

	return sum/(len(a)-1)

	def cov(fmat):
	M= fmat.shape[0]
	N= fmat.shape[1]
	fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))

	# for r in xrange(M):
	# for c in xrange(N):
	# fcov[]= cov

	def cov_mat(fmat):
	#if fmat centered then 2(A.TA)/(n-1) covariance matrix
	M= fmat.shape[0]
	N= fmat.shape[1]
	fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))

	#assinging don't copy?
	# fmat_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(N,M))
	fmat_tr= fmat.T
	np.dot(fmat_tr,fmat,out=fcov)
	fcov*= 2/(N-1)

	return fcov

	#need to determine k - just compute reconstruction error? or visualy inspect reconstructed images?
	def pca(data,k):
	M= data.shape[0]
	N= data.shape[1]
	print data.shape
	#get mean
	mean= np.mean(data,axis=0)
	print mean.shape
	# print mean

	#M x N
	data_c= (data-mean)
	print data_c.shape
	# print data_c

	#N x N
	#calculate covariance matrix
	covData=np.cov(data_c,rowvar=0)
	print covData.shape

	t0= time.time()
	eigenvalues, eigenvectors = np.linalg.eig(covData)# main time?
	print (time.time()-t0)

	print eigenvalues.shape # N long
	print eigenvectors.shape # N x N
	# print eigenvalues
	# print eigenvectors

	#sort and get k largest eigenvalues
	idx = eigenvalues.argsort()[-k:][::-1]
	print idx

	eigenvalues = eigenvalues[idx] # k long
	eigenvectors = eigenvectors[:,idx] # N x k
	print eigenvalues.shape
	print eigenvectors.shape
	# print eigenvalues
	# print eigenvectors

	#projection
	pr= np.dot(data_c,eigenvectors) # (M N) * (N k) = (M k)
	print pr.shape
	#reconstruction
	rec= np.dot(pr, eigenvectors.T) #(M k) * (N k).T = (M N)
	print rec.shape

	print (data_c-rec)

	#so only limitation is windows x32 at creation time of memmap file
	#why don't work with large k
	#assume M >> N
	def pca_mm(fdata,k):
	M= fdata.shape[0]
	N= fdata.shape[1]
	print fdata.shape
	#get mean
	mean= np.mean(fdata,axis=0)
	print mean.shape
	# print mean

	#M x N
	fdata_c= np.memmap('data_c.npy', dtype='float32', mode='w+', shape=(M,N))
	# fdata_c= fdata-mean
	for i in xrange(M):
	fdata_c[i,:]= fdata[i,:]-mean
	print fdata_c.shape
	# print data_c

	#N x N
	#calculate covariance matrix
	# covData=np.cov(fdata_c,rowvar=0) # must be memmaped array
	# fcov= np.memmap('cov.npy', dtype='float32', mode='w+', shape=(N,N))
	# fcov= np.cov(data_c,rowvar=0)
	t0= time.time()
	fcov= cov_mat(fdata_c)
	print (time.time()-t0)
	# covData=np.cov(fdata_c,rowvar=0)
	# print fcov
	# print covData
	print fcov.shape

	t0= time.time()
	eigenvalues, eigenvectors = np.linalg.eig(fcov)# fcov small don't depend on samples count only on dimensions
	print (time.time()-t0)

	print eigenvalues.shape # N long
	print eigenvectors.shape # N x N
	# print eigenvalues
	# print eigenvectors

	#sort and get k largest eigenvalues
	idx = eigenvalues.argsort()[-k:][::-1]
	print idx

	eigenvalues = eigenvalues[idx] # k long
	eigenvectors = eigenvectors[:,idx] # N x k
	print eigenvalues.shape
	print eigenvectors.shape
	# print eigenvalues
	# print eigenvectors

	#projection
	fpr= np.memmap('pr.npy', dtype='float32', mode='w+', shape=(M,k))
	np.dot(fdata_c,eigenvectors,out=fpr) # (M N) * (N k) = (M k)
	print fpr.shape
	#reconstruction
	frec= np.memmap('rec.npy', dtype='float32', mode='w+', shape=(M,N))
	np.dot(fpr, eigenvectors.T,out=frec) #(M k) * (N k).T = (M N)
	print frec.shape

	#reconstruction error
	# print (fdata_c-frec)
	print (fdata_c[1,:]-frec[1,:])

	#need to write func to test operations with numpy array and memory consumtions
	def memmap_operations():
	#create array
	rows= 250000
	cols= 1000
	fA= np.memmap('A.npy', dtype='float32', mode='w+', shape=(rows,cols))
	# fA1= np.memmap('A1.npy', dtype='float32', mode='w+', shape=(rows,cols)) # can't create another one big memmap

	# print fA[1:2,3:7]
	print fA.nbytes/1024/1024 # 953 mb

	#operations on matrix
	fA+=1 # memory consumption is up
	# fA=fA+1 #fails memory error creates temp copy?
	# for r in xrange(rows): # too slow
	# for c in xrange(cols):
	# fA[r,c]+=1
	# for r in xrange(rows): #memory up
	# fA[r,:]+=1

	A_tr= fA.T # no copy?

	t0= time.time()
	res= np.dot(A_tr,fA) # only small res in memory?
	print (time.time()-t0)

	#to create another memmap we need extra memory?
	# t0= time.time()
	# fA_tr= np.memmap('A_tr.npy', dtype='float32', mode='w+', shape=(cols,rows))
	# res= np.dot(fA_tr,fA)
	# print (time.time()-t0)

	print fA[1:2,3:7]

	print 'done'

	#maybe principal promblem is we can't use very large number to pass to numpy.array
	def memmap_size_test():
	cols= 100
	rows= 1000L

	for powers in np.arange(1,10):
	l1 = rows(10*powers)
	print('try %d elements'%(l1))
	# print('number bytes required %f GB'%(l1*8/1e9))
	try:
	# why change 1 dimension? and why use float64
	fp = np.memmap('test.map',dtype='float32', mode='w+',shape=(l1,cols))
	print('working with %d mb'%(fp.nbytes/1024/1024))
	del fp
	except Exception as e:
	print(repr(e))

	print 'done'

	memmap_size_test()
	# memmap_operations()

	# fm= generate_and_store_data()
	# k= 999
	# t0= time.time()
	# pca_mm(fm,k)
	# print (time.time()-t0)