Skip to content

Instantly share code, notes, and snippets.

@cyyynthia
Last active March 31, 2023 14:19
Show Gist options
Save cyyynthia/43784451936e2a608566c42b0bacceac to your computer and use it in GitHub Desktop.
Quick and dirty script to measure the evolution of performance of LLaMA.cpp
/*!
* BSD Zero Clause License
* Copyright (c) Cynthia Rey
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
* INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
* LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
* OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THIS SOFTWARE.
*/
import { spawn, execSync } from 'child_process'
import { open } from 'fs/promises'
// CLI arguments passed to every ./main run; kept identical across commits so
// the timing numbers are comparable.
const LLAMA_PARAMETERS = [
'-t', '4',
'-p', 'Here is a long story about how programming came to be:',
'-n', '256',
'-c', '1024',
'--top_k', '40',
'--top_p', '0.95',
'--repeat_last_n', '64',
'--repeat_penalty', '1.1',
]
// Oldest commit to benchmark; iteration starts here and walks forward in time.
const LLAMA_START_AT = '2d64715ad475f192a4004a52d134c67ccb6f44ad'
// First commits that require the GGMF / GGJT model file formats, respectively.
const LLAMA_GGMF_SINCE = '074bea2eb1f1349a0118239c4152914aecaa1be4'
const LLAMA_GGJT_SINCE = '78ca9838ee36660a776e97e3391b6fb5dcaacf7f'
// Only commits touching one of these files are benchmarked (see getCommits).
// NOTE(review): assumes the pre-"examples/" layout commits are caught via
// `git log --follow` rename tracking — confirm if older commits are skipped.
const IMPORTANT_FILES = [
'ggml.c', 'ggml.h',
'llama.cpp', 'llama.h',
'examples/common.cpp', 'examples/common.h',
'examples/main/main.cpp',
]
// Path of the quantized 7B model converted to each historical file format.
const FORMATS = {
unversioned: './models/7B/ggml-model-q4_0.unversioned.bin',
ggmf: './models/7B/ggml-model-q4_0.ggmf.bin',
ggjt: './models/7B/ggml-model-q4_0.bin',
}
// Results file: one row per (commit, format) run, header written once up front.
const csv = await open('./result.csv', 'w')
// Await the write so the header is flushed before any result row can be
// appended — the original fire-and-forget call was a floating promise.
await csv.appendFile('commit,format,load time,sample time,prompt eval time,eval time,total time,eval token time,max memory rss (KB)\n')
/**
 * Read the resident set size of a process via `ps`.
 *
 * The original implementation skipped a fixed 6 bytes to jump over the
 * "  RSS\n" header, which silently returns NaN/garbage whenever the header
 * width differs. Parsing the last output line is robust to any header format.
 *
 * @param {number} pid - Process ID to inspect.
 * @returns {number} RSS as reported by `ps -o rss` (KB on Linux), or NaN if
 *   the output could not be parsed.
 */
function getMemoryUsageOf (pid) {
const out = execSync(`ps -p ${pid} -o rss`).toString().trim()
// First line is the column header; the value is on the last line.
return Number(out.split('\n').pop().trim())
}
/**
 * Run ./main once for the given commit/format, sample its memory usage while
 * it runs, parse the timing summary from stderr, and append a CSV row.
 *
 * Resolves on success or on a timeout DNF; rejects when the process cannot be
 * spawned or its output cannot be parsed (the caller logs those as runtime
 * DNFs). The original version only ever resolved: a spawn failure crashed the
 * process with an unhandled 'error' event, and a parse failure threw inside
 * the 'exit' handler, so the caller's catch branch was unreachable.
 *
 * @param {string} commit - Commit hash being benchmarked (for the CSV row).
 * @param {'unversioned'|'ggmf'|'ggjt'} format - Key into FORMATS.
 * @returns {Promise<void>}
 */
async function runLlama (commit, format) {
return new Promise((resolve, reject) => {
const llama = spawn('./main', [ ...LLAMA_PARAMETERS, '-m', FORMATS[format] ])
// Without this, a missing/broken ./main emits an unhandled 'error' event.
llama.on('error', reject)
// Keep used memory information
// Ugly way of doing it but I cba to make it better :D
let usedMemory = 0
const memCheck = setInterval(() => (usedMemory = Math.max(usedMemory, getMemoryUsageOf(llama.pid))), 5e3)
// Kill runs that take longer than 5 minutes and mark them as DNF.
let dnf = false
const to = setTimeout(() => {
dnf = true
llama.kill('SIGKILL')
console.log('Warn: DNF (timeout)')
}, 300e3)
const stderrChunks = []
llama.stderr.on('data', (d) => stderrChunks.push(d))
llama.on('exit', () => {
clearInterval(memCheck)
clearTimeout(to)
// DNF; go next
if (dnf) return resolve()
try {
const stderr = Buffer.concat(stderrChunks).toString()
// Timings appear in a fixed order; consume them one matchAll step at a time.
const timings = stderr.matchAll(/(?:llama_print_timings|main):.*?time =\s+([\d\.]+)/g)
const tokTime = stderr.match(/eval time.*?\(\s+([\d\.]+) ms per run|\/\s+([\d.]+) ms/)
const loadTime = timings.next().value[1]
const sampleTime = timings.next().value[1]
// Older commits did not print a "prompt eval time" line at all.
const promptEvalTime = stderr.includes('prompt eval time') ? timings.next().value[1] : ''
const evalTime = timings.next().value[1]
const totalTime = timings.next().value[1]
// Per-token eval time: format changed over time, hence the two capture groups.
const tokenTime = tokTime?.[1] ?? tokTime?.[2] ?? ''
csv.appendFile(`${commit},${format},${loadTime},${sampleTime},${promptEvalTime},${evalTime},${totalTime},${tokenTime},${usedMemory}\n`)
resolve()
} catch (err) {
// Output format we can't parse — surface it as a runtime DNF to the caller.
reject(err)
}
})
})
}
/**
 * Check out the given commit and rebuild ./main from scratch.
 * Throws (from execSync) if any step fails — e.g. a compile error — which the
 * caller treats as a DNF for that commit.
 *
 * @param {string} commit - Commit hash to build.
 */
function checkoutLlama (commit) {
// Clean first so stale objects from the previous commit never leak into the build.
const steps = ['make clean', `git checkout ${commit}`, 'make -j main']
for (const step of steps) {
execSync(step, { stdio: 'ignore' })
}
}
/**
 * List the commit hashes to benchmark, oldest first.
 *
 * Walks the full history, keeps only commits that touched one of the
 * IMPORTANT_FILES (per-file `git log --follow`), and drops everything older
 * than LLAMA_START_AT.
 *
 * @returns {string[]} Commit hashes in chronological (oldest-first) order.
 */
function getCommits () {
// This function excludes commits which did not modify llama-related files.
// It's not made in the most efficient way, but whatever :D
const history = execSync('git log --pretty=format:"%H"').toString().split('\n')
const relevant = new Set(
IMPORTANT_FILES.flatMap((f) =>
execSync(`git log --pretty=format:"%H" --follow '${f}'`).toString().split('\n')
)
)
// git log is newest-first; slice up to the start commit, then flip to oldest-first.
const sinceStart = history.slice(0, history.indexOf(LLAMA_START_AT) + 1)
return sinceStart.reverse().filter((c) => relevant.has(c))
}
const commits = getCommits()
// Commits before ggmfChangeIdx use the unversioned model file, commits before
// ggjtChangeIdx use GGMF, and everything after uses GGJT.
// NOTE(review): if either marker commit were filtered out of `commits`,
// indexOf would return -1 and every run would fall through to 'ggjt' —
// assumed the markers always touch IMPORTANT_FILES; verify if formats look wrong.
const ggmfChangeIdx = commits.indexOf(LLAMA_GGMF_SINCE)
const ggjtChangeIdx = commits.indexOf(LLAMA_GGJT_SINCE)
for (let i = 0; i < commits.length; i++) {
const commit = commits[i]
// Pick the model format this commit's loader expects.
const format = i < ggmfChangeIdx
? 'unversioned'
: i < ggjtChangeIdx
? 'ggmf'
: 'ggjt'
// i + 1 so progress reads 1/N..N/N instead of the off-by-one 0/N..(N-1)/N.
console.log('Processing commit %s (%d/%d)', commit, i + 1, commits.length)
try {
checkoutLlama(commit)
} catch {
// Build failure on this commit: skip it and move on.
console.log('Warn: DNF (compile error)')
continue
}
try {
await runLlama(commit, format)
} catch {
// Spawn or output-parse failure: skip it and move on.
console.log('Warn: DNF (runtime error)')
continue
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment