daskol/llama.cpp-quantize.py

## llama.cpp-quantize.py
import numpy as np


def pack(xs, dtype='q4_0'):
    """Quantize an array to 4-bit codes packed two per byte (``q4_0``).

    Every element is scaled by ``7 / max(|xs|)``, rounded to the nearest
    integer in ``[-7, 7]`` and biased by 8 so that it fits an unsigned
    nibble. Elements at even positions of the flattened input are stored in
    the low nibble of a byte, elements at odd positions in the high nibble.

    Args:
        xs: Array-like of real numbers with an even number of elements. It
            is flattened and processed as float32.
        dtype: Quantization scheme; only ``'q4_0'`` is accepted.

    Returns:
        One-dimensional int8 array of ``size // 2`` packed bytes followed by
        an 8-byte footer: the maximal absolute value and the quantization
        step, both stored as float32 in native byte order.

    Raises:
        AssertionError: If ``dtype`` is not ``'q4_0'`` or the number of
            elements is odd.
    """
    assert dtype == 'q4_0', 'Only quantized int4 type is supported.'
    # Work in float32 regardless of the input dtype so that the footer below
    # is always exactly 8 bytes long (float64/int inputs used to emit 12).
    values = np.asarray(xs, dtype=np.float32).ravel()
    assert values.size % 2 == 0, 'Only arrays of even length.'
    # Estimate magnitude of array elements and its inverse. An empty input
    # has no maximum, so it is treated like an all-zero one.
    amax = np.float32(np.abs(values).max()) if values.size else np.float32(0)
    magnitude = np.float32(amax / np.float32(0b0111))
    precision = np.float32(1) / magnitude if magnitude else np.float32(0)
    # Quantize elements. Rounding to nearest (instead of truncating toward
    # zero) halves the worst-case error and keeps the largest element at
    # code 7 even if the scaled value lands a hair below 7.0.
    codes = (np.rint(values * precision) + 8).astype(np.uint8)
    # Codes lie in [1, 15], so the shifted high nibble still fits in uint8.
    packed = codes[::2] | (codes[1::2] << 4)
    # Append magnitude to the end of int8 array for unpacking.
    footer = amax.tobytes() + magnitude.tobytes()
    return np.hstack([packed.view(np.int8), np.frombuffer(footer, np.int8)])


def unpack(xs: np.ndarray) -> np.ndarray:
    """Restore float32 values from a byte sequence produced by ``pack``.

    The last 8 bytes are read as two native-order float32 numbers: the
    maximal absolute value (unused here) and the quantization step. Every
    preceding byte carries two 4-bit codes biased by 8: the low nibble is
    decoded first, the high nibble second.

    Args:
        xs: One-dimensional int8 array: packed bytes followed by the 8-byte
            footer.

    Returns:
        One-dimensional float32 array with two elements per packed byte.

    Raises:
        AssertionError: If ``xs`` is not one-dimensional or is shorter than
            the footer.
    """
    assert xs.ndim == 1, 'Only int8 sequences are supported.'
    assert xs.size >= 8, 'Too short array.'
    # Restore magnitude of quantization.
    _, magnitude = np.frombuffer(xs[-8:].tobytes(), np.float32)
    # Reinterpret payload bytes as unsigned: masking a signed int8 array with
    # 0xf0 overflows under NumPy >= 2 and needs a silent upcast before that.
    payload = xs[:-8].view(np.uint8)
    low = (payload & 0x0f).astype(np.float32) - 8
    high = (payload >> 4).astype(np.float32) - 8
    # Restore sequence elements to array with stride 2 (interleaving).
    restored = np.empty((payload.size, 2), np.float32)
    restored[:, 0] = magnitude * low
    restored[:, 1] = magnitude * high
    # Flatten array in order to restore sequence of elements.
    return restored.ravel()


def test_pack_unpack():
    """Round-trip the integers -50..49 through ``pack`` and ``unpack``.

    Prints the original, packed and restored arrays together with the
    absolute and relative errors, then checks that no element is off by
    more than one quantization step.

    Raises:
        AssertionError: If the round-trip error exceeds one quantization
            step.
    """
    # Deterministic input centred around zero.
    xs = np.arange(100)
    xs -= xs.size // 2
    print('original')
    print(xs)
    print('packed')
    ys = pack(xs.astype(np.float32))
    print(ys)
    print('unpacked')
    zs = unpack(ys)
    print(zs)
    print('absolute errors')
    aerr = zs - xs
    print(aerr)
    print('relateive error')
    rerr = np.linalg.norm(aerr) / np.linalg.norm(xs)
    print(rerr)
    # The quantization grid has a spacing of max(|xs|) / 7, so a restored
    # element can never be farther than one step from the original; the
    # small factor absorbs float32 round-off in the stored step.
    step = np.abs(xs).max() / 0b0111
    assert np.abs(aerr).max() <= step * (1 + 1e-5), \
        'Round-trip error exceeds one quantization step.'
	import numpy as np


	def pack(xs, dtype='q4_0'):
	assert dtype == 'q4_0', 'Only quantized int4 type is supported.'
	assert xs.size % 2 == 0, 'Only arrays of even length.'
	# Estimate magnitude of array elements and its inverse.
	amax = abs(xs).max()
	magnitude = amax / 0b0111
	precision = np.float32(1) / magnitude if magnitude else np.float32(0)
	# Quantize elements.
	xs = xs.flatten()
	xs = (xs * precision).astype(np.int8) + 8
	ys = xs[::2] | (xs[1::2] << 4)
	# Append magnitude to the end of int8 array for unpacking.
	footer = amax.tobytes() + magnitude.astype(np.float32).tobytes()
	zs = np.frombuffer(footer, np.int8)
	return np.hstack([ys, zs])


	def unpack(xs: np.ndarray):
	assert xs.ndim == 1, 'Only int8 sequences are supported.'
	assert xs.size >= 8, 'Too short array.'
	# Restore magnitude of quantization.
	amax, magnitude = np.frombuffer(xs[-8:].tobytes(), np.float32)
	# Restore sequence elements to array with stride 2 (interleaving).
	xs = xs[:-8]
	zs = np.zeros((xs.size, 2), np.float32)
	zs[:, 0] = magnitude * ((xs & 0x0f) - 8)
	zs[:, 1] = magnitude * (((xs & 0xf0) >> 4) - 8)
	# Flatten array in order to restore sequence of elements.
	return zs.flatten()


	def test_pack_unpack():
	xs = np.random.randn(100)
	xs = np.arange(100)
	xs -= xs.size // 2
	print('original')
	print(xs)
	print('packed')
	ys = pack(xs.astype(np.float32))
	print(ys)
	print('unpacked')
	zs = unpack(ys)
	print(zs)
	print('absolute errors')
	aerr = zs - xs
	print(aerr)
	print('relateive error')
	rerr = np.linalg.norm(aerr) / np.linalg.norm(xs)
	print(rerr)