mrgloom/gist:11046631

## gistfile1.txt
# Test saving and loading a NumPy matrix.
# Test matrix multiplication in memory and using memmap.

# In the memmap case, batch processing should not be needed.

# HDF5 and PyTables could also be tested for matrix multiplication.

# Matrix multiplication can be used for PCA (randomized PCA would be smarter).

# Needs to be tested on an x64 machine.

import numpy as np
import time
import struct

# Benchmark dimensions: test_mult_ram builds A as rows x cols and B as
# cols x rows, so the product A.B has shape rows x rows.
rows=100000
cols=1000

def create_matrix(rows, cols):
    """Build a random ``rows`` x ``cols`` test matrix of dtype uint8.

    Samples are drawn with ``np.random.rand`` (uniform on [0, 1)), scaled
    by 100 and truncated by the integer cast, so every entry is in 0..99.
    """
    # Open question carried over from the original author: image data
    # spans 0..255 -- is uint8 the right element type here?
    scaled = np.random.rand(rows, cols) * 100
    return scaled.astype('uint8')

def save_matrix(filename, data):
    """Write ``data`` to ``filename`` in NumPy's binary ``.npy`` format."""
    np.save(file=filename, arr=data)

def load_matrix(filename):
    """Read the array stored in the ``.npy`` file ``filename`` and return it."""
    return np.load(filename)

def load_npy_to_memmap(filename, dtype, shape):
    """Memory-map the array payload of a ``.npy`` file, skipping its header.

    A ``.npy`` file starts with a 6-byte magic string (byte 0x93 followed by
    ``NUMPY``), one byte each for the major and minor format version, and a
    little-endian header length: an unsigned 16-bit value for format 1.x,
    an unsigned 32-bit value for formats 2.x and 3.x. The header text
    follows, and the raw array data starts right after it.

    Args:
        filename: Path of an existing ``.npy`` file.
        dtype: Element type of the stored array. It is taken from the
            caller, not parsed from the header.
        shape: Shape of the stored array. Also taken from the caller.

    Returns:
        An ``np.memmap`` over the data section of the file, opened with
        ``np.memmap``'s default mode.

    Raises:
        ValueError: If the file does not start with the ``.npy`` magic
            string, is truncated, or has an unknown major version.
    """
    magic = b'\x93NUMPY'
    # The file is binary: text mode would decode or translate the bytes.
    with open(filename, 'rb') as f:
        prefix = f.read(len(magic) + 2)
        if len(prefix) != len(magic) + 2 or not prefix.startswith(magic):
            raise ValueError('%r is not a .npy file' % (filename,))
        # Slice (not index) so this yields a byte string on Python 2 and 3.
        major = struct.unpack('<B', prefix[len(magic):len(magic) + 1])[0]
        if major == 1:
            length_format = '<H'
        elif major in (2, 3):
            length_format = '<I'
        else:
            raise ValueError(
                'unsupported .npy format version %d in %r' % (major, filename))
        length_size = struct.calcsize(length_format)
        raw_length = f.read(length_size)
        if len(raw_length) != length_size:
            raise ValueError('%r has a truncated .npy header' % (filename,))
        header_len = struct.unpack(length_format, raw_length)[0]

    # Data begins after magic + version + length field + header text.
    offset = len(prefix) + length_size + header_len
    return np.memmap(filename, dtype=dtype, shape=shape, offset=offset)

def test_mult_ram():
    """Round-trip two random matrices through .npy files, then time A.B in RAM.

    Uses the module-level ``rows`` and ``cols``: A is rows x cols and B is
    cols x rows. Both are written to ``A.npy`` / ``B.npy`` in the current
    directory and read back before the multiplication. Prints two progress
    markers, the size of each matrix in whole MiB, and the elapsed time of
    ``np.dot`` in seconds. Returns nothing.
    """
    A = create_matrix(rows, cols)
    print('a')  # progress marker: A generated
    save_matrix("A.npy", A)
    print('aa')  # progress marker: A written to disk
    A = load_matrix("A.npy")
    # Floor division keeps the MiB figure an integer on Python 3 as well.
    print(A.nbytes // 1024 // 1024)

    B = create_matrix(cols, rows)
    save_matrix("B.npy", B)
    B = load_matrix("B.npy")
    print(B.nbytes // 1024 // 1024)

    # The original had a bare `pause` name here, which raised NameError and
    # aborted the run before anything was timed; it has been removed.

    # NOTE: both operands are uint8, so the result dtype is uint8 and the
    # sums wrap modulo 256. The rows x rows result is allocated fully in RAM.
    t0 = time.time()
    np.dot(A, B)
    print(time.time() - t0)

def test_memmap(block_rows=1000):
    """Time A.B with memory-mapped inputs, writing the product into C.npy.

    Expects ``A.npy`` (rows x cols, uint8) and ``B.npy`` (cols x rows,
    uint8) in the current directory, using the module-level ``rows`` and
    ``cols``. The product is computed one horizontal block of A at a time
    and each block is stored straight into the output mapping, so only
    ``block_rows`` x ``rows`` result elements are held in RAM at once.
    Prints the elapsed time in seconds. Returns nothing.

    Args:
        block_rows: Number of rows of A multiplied per step.

    Raises:
        ValueError: If ``block_rows`` is less than 1.
    """
    if block_rows < 1:
        raise ValueError('block_rows must be at least 1')

    # A plain np.memmap over a .npy file would read the header bytes as
    # data, so the inputs go through load_npy_to_memmap, which skips it.
    fA = load_npy_to_memmap('A.npy', dtype='uint8', shape=(rows, cols))
    fB = load_npy_to_memmap('B.npy', dtype='uint8', shape=(cols, rows))
    # The output is a raw, headerless mapping despite its .npy file name;
    # mode 'w+' creates (or overwrites) the file at its full size up front.
    fC = np.memmap('C.npy', dtype='uint16', mode='w+', shape=(rows, rows))

    t0 = time.time()
    # The original did `fC = np.dot(fA, fB)`, which rebinds fC to a new
    # in-memory array and never writes to the mapping. Assigning into
    # slices of fC stores each block in the mapped file instead.
    for start in range(0, rows, block_rows):
        stop = min(start + block_rows, rows)
        # NOTE: uint8 operands give a uint8 block (sums wrap modulo 256),
        # which is then widened to uint16 on assignment.
        fC[start:stop] = np.dot(fA[start:stop], fB)
    fC.flush()
    print(time.time() - t0)


# Run the benchmarks only when executed as a script, so importing this
# module does not start the multi-gigabyte workload. test_mult_ram runs
# first because it writes the A.npy / B.npy files that test_memmap reads.
if __name__ == "__main__":
    test_mult_ram()
    test_memmap()
	#test save and load of numpy matrix
	#test matrix multiplication in memory and using memmap

	#in case of memmap no need to use batch processing

	#also can test hdf5 and pytables for matrix mult

	#can use matrix mult for pca (more smart to use randompca)

	#need to test it on x64 machine

	import numpy as np
	import time
	import struct

	# Benchmark dimensions: test_mult_ram builds A as rows x cols and B as
	# cols x rows, so the product A.B has shape rows x rows.
	rows=100000
	cols=1000

	def create_matrix(rows, cols):
	    """Build a random ``rows`` x ``cols`` test matrix of dtype uint8.

	    Samples are drawn with ``np.random.rand`` (uniform on [0, 1)), scaled
	    by 100 and truncated by the integer cast, so every entry is in 0..99.
	    """
	    # Open question carried over from the original author: image data
	    # spans 0..255 -- is uint8 the right element type here?
	    scaled = np.random.rand(rows, cols) * 100
	    return scaled.astype('uint8')

	def save_matrix(filename, data):
	    """Write ``data`` to ``filename`` in NumPy's binary ``.npy`` format."""
	    np.save(filename, data)

	def load_matrix(filename):
	    """Read the array stored in the ``.npy`` file ``filename`` and return it."""
	    return np.load(filename)

	def load_npy_to_memmap(filename, dtype, shape):
	    """Memory-map the array payload of a ``.npy`` file, skipping its header.

	    A ``.npy`` file starts with a 6-byte magic string (byte 0x93 followed by
	    ``NUMPY``), one byte each for the major and minor format version, and a
	    little-endian header length: an unsigned 16-bit value for format 1.x,
	    an unsigned 32-bit value for formats 2.x and 3.x. The header text
	    follows, and the raw array data starts right after it.

	    Args:
	        filename: Path of an existing ``.npy`` file.
	        dtype: Element type of the stored array. It is taken from the
	            caller, not parsed from the header.
	        shape: Shape of the stored array. Also taken from the caller.

	    Returns:
	        An ``np.memmap`` over the data section of the file, opened with
	        ``np.memmap``'s default mode.

	    Raises:
	        ValueError: If the file does not start with the ``.npy`` magic
	            string, is truncated, or has an unknown major version.
	    """
	    magic = b'\x93NUMPY'
	    # The file is binary: text mode would decode or translate the bytes.
	    with open(filename, 'rb') as f:
	        prefix = f.read(len(magic) + 2)
	        if len(prefix) != len(magic) + 2 or not prefix.startswith(magic):
	            raise ValueError('%r is not a .npy file' % (filename,))
	        # Slice (not index) so this yields a byte string on Python 2 and 3.
	        major = struct.unpack('<B', prefix[len(magic):len(magic) + 1])[0]
	        if major == 1:
	            length_format = '<H'
	        elif major in (2, 3):
	            length_format = '<I'
	        else:
	            raise ValueError(
	                'unsupported .npy format version %d in %r' % (major, filename))
	        length_size = struct.calcsize(length_format)
	        raw_length = f.read(length_size)
	        if len(raw_length) != length_size:
	            raise ValueError('%r has a truncated .npy header' % (filename,))
	        header_len = struct.unpack(length_format, raw_length)[0]

	    # Data begins after magic + version + length field + header text.
	    offset = len(prefix) + length_size + header_len
	    return np.memmap(filename, dtype=dtype, shape=shape, offset=offset)

	def test_mult_ram():
	    """Round-trip two random matrices through .npy files, then time A.B in RAM.

	    Uses the module-level ``rows`` and ``cols``: A is rows x cols and B is
	    cols x rows. Both are written to ``A.npy`` / ``B.npy`` in the current
	    directory and read back before the multiplication. Prints two progress
	    markers, the size of each matrix in whole MiB, and the elapsed time of
	    ``np.dot`` in seconds. Returns nothing.
	    """
	    A = create_matrix(rows, cols)
	    print('a')  # progress marker: A generated
	    save_matrix("A.npy", A)
	    print('aa')  # progress marker: A written to disk
	    A = load_matrix("A.npy")
	    # Floor division keeps the MiB figure an integer on Python 3 as well.
	    print(A.nbytes // 1024 // 1024)

	    B = create_matrix(cols, rows)
	    save_matrix("B.npy", B)
	    B = load_matrix("B.npy")
	    print(B.nbytes // 1024 // 1024)

	    # The original had a bare `pause` name here, which raised NameError and
	    # aborted the run before anything was timed; it has been removed.

	    # NOTE: both operands are uint8, so the result dtype is uint8 and the
	    # sums wrap modulo 256. The rows x rows result is allocated fully in RAM.
	    t0 = time.time()
	    np.dot(A, B)
	    print(time.time() - t0)

	def test_memmap(block_rows=1000):
	    """Time A.B with memory-mapped inputs, writing the product into C.npy.

	    Expects ``A.npy`` (rows x cols, uint8) and ``B.npy`` (cols x rows,
	    uint8) in the current directory, using the module-level ``rows`` and
	    ``cols``. The product is computed one horizontal block of A at a time
	    and each block is stored straight into the output mapping, so only
	    ``block_rows`` x ``rows`` result elements are held in RAM at once.
	    Prints the elapsed time in seconds. Returns nothing.

	    Args:
	        block_rows: Number of rows of A multiplied per step.

	    Raises:
	        ValueError: If ``block_rows`` is less than 1.
	    """
	    if block_rows < 1:
	        raise ValueError('block_rows must be at least 1')

	    # A plain np.memmap over a .npy file would read the header bytes as
	    # data, so the inputs go through load_npy_to_memmap, which skips it.
	    fA = load_npy_to_memmap('A.npy', dtype='uint8', shape=(rows, cols))
	    fB = load_npy_to_memmap('B.npy', dtype='uint8', shape=(cols, rows))
	    # The output is a raw, headerless mapping despite its .npy file name;
	    # mode 'w+' creates (or overwrites) the file at its full size up front.
	    fC = np.memmap('C.npy', dtype='uint16', mode='w+', shape=(rows, rows))

	    t0 = time.time()
	    # The original did `fC = np.dot(fA, fB)`, which rebinds fC to a new
	    # in-memory array and never writes to the mapping. Assigning into
	    # slices of fC stores each block in the mapped file instead.
	    for start in range(0, rows, block_rows):
	        stop = min(start + block_rows, rows)
	        # NOTE: uint8 operands give a uint8 block (sums wrap modulo 256),
	        # which is then widened to uint16 on assignment.
	        fC[start:stop] = np.dot(fA[start:stop], fB)
	    fC.flush()
	    print(time.time() - t0)



	# Run the benchmarks only when executed as a script, so importing this
	# module does not start the multi-gigabyte workload. test_mult_ram runs
	# first because it writes the A.npy / B.npy files that test_memmap reads.
	if __name__ == "__main__":
	    test_mult_ram()
	    test_memmap()