nauful/gist:45b38a87142010a401b710f56ce08baa

## gistfile1.txt
void opt_parse_path_alt(const byte* window, ParsePath* table, const MatchNode* match_table, State& state, int opt_current_p, int opt_start_at, int opt_next_at, int block_end) {
		const int MAX_STATE_SIZE = 0x100;
		const int MAX_STATE_MASK = MAX_STATE_SIZE - 1;

		struct CarriedState {
			byte match_hist;
			BitNibble2 cmd[4];

			RepState rep;
		};

		struct IncrementalState {
			//byte prev_match_ctx;
			//uint32 prev_match_pos;
		};

		uint32 prev[OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1];
		uint32 next[OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1];

		CarriedState carried_state[MAX_STATE_SIZE];
		IncrementalState incremental_state[OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1];
		const int stack_mem = (sizeof(next) + sizeof(prev) + sizeof(carried_state) + sizeof(incremental_state) + 1023) >> 10;

		memset(prev, -1, sizeof(prev));
		memset(next, -1, sizeof(next));
		memset(incremental_state, -1, sizeof(incremental_state));

		uint32 path_len = block_end - opt_start_at;
		uint32 path_offs = opt_current_p - opt_start_at;

		ASSERT(path_len <= OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1);

		table[path_len].cmd = CMD_LITERAL;
		table[path_len].carried_cost = 0;
		table[path_len].local_cost = 0;

		for (uint32 op = 0; op < path_offs; op++) {
			table[op].carried_cost = -1;
			prev[op] = 0;
		}

		for (int i = 0; i < 4; i++) {
			carried_state[path_offs & MAX_STATE_MASK].cmd[i] = state.cmd[i][0];
		}
		carried_state[path_offs & MAX_STATE_MASK].match_hist = state.match_hist;
		carried_state[path_offs & MAX_STATE_MASK].rep = state.rep;
		//incremental_state[path_offs].prev_match_ctx = -1;
		//incremental_state[path_offs].prev_match_pos = -1;

		table[path_offs].carried_cost = 0;
		prev[path_offs] = -1;

		for (uint32 p = path_offs + 1; p <= path_len; p++) {
			table[p].carried_cost = -1;
		}

		for (uint32 p = path_offs; p < path_len; p++) {
			uint32 abs_p = p + opt_start_at;

			int pb = state.pos_align_bits ? (p & state.pos_align_bits) : 0;
			int ctx = abs_p > 0 ? window[abs_p - 1] : 0;

			uint32 literal_cost = state.PriceLiteralCachedCmd(window[abs_p], pb, ctx,
				carried_state[p & MAX_STATE_MASK].cmd[carried_state[p & MAX_STATE_MASK].match_hist].Price(CMD_LITERAL));
			uint32 np = 1 + p;
			if (literal_cost + table[p].carried_cost < table[np].carried_cost) {
				prev[np] = p;
				table[np].cmd = CMD_LITERAL;
				table[np].carried_cost = literal_cost + table[p].carried_cost;

				carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
				carried_state[np & MAX_STATE_MASK].cmd[carried_state[np & MAX_STATE_MASK].match_hist].Update(CMD_LITERAL);
				carried_state[np & MAX_STATE_MASK].match_hist = ((carried_state[np & MAX_STATE_MASK].match_hist << 1) + 0) & 3;
				//incremental_state[np].prev_match_ctx = -1;
				//incremental_state[np].prev_match_pos = -1;
			}

			for (uint32 rep_index = 0; rep_index < 4; rep_index++) {
				uint32 rp = abs_p - carried_state[p & MAX_STATE_MASK].rep.entry[rep_index];
				if (rp >= abs_p) {
					continue;
				}

				int max_len = try_match(window, rp, abs_p, block_end);
				if (max_len >= MAX_STATE_SIZE) { max_len = MAX_STATE_MASK; }
				if (p + max_len > path_len) { max_len = path_len - p; }

				if (max_len < MATCH_MIN_BASE) {
					continue;
				}

				//int test_step = (max_len - MATCH_MIN_BASE) >> 3;
				//test_step += test_step == 0;

				int test_len = max_len;
				while (test_len >= MATCH_MIN_BASE) {
					ASSERT_MATCH(!memcmp(window + abs_p, window + rp, test_len));
					ASSERT_MATCH(rp + test_len <= opt_start_at + path_len);
					uint32 lv = test_len - MATCH_MIN_BASE;

					uint32 np = p + test_len;
					uint32 rep_cost = state.PriceRepCachedCmd(lv, rep_index, pb, ctx,
						carried_state[p & MAX_STATE_MASK].cmd[carried_state[p & MAX_STATE_MASK].match_hist].Price(CMD_REP4));
					if (rep_cost + table[p].carried_cost < table[np].carried_cost) {
						prev[np] = p;
						table[np].cmd = CMD_REP4;
						table[np].rep_index = rep_index;
						table[np].dict_match_pos = rp;
						table[np].carried_cost = rep_cost + table[p].carried_cost;

						carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
						carried_state[np & MAX_STATE_MASK].rep.Update(abs_p - rp);
						carried_state[np & MAX_STATE_MASK].cmd[carried_state[np & MAX_STATE_MASK].match_hist].Update(CMD_REP4);
						carried_state[np & MAX_STATE_MASK].match_hist = ((carried_state[np & MAX_STATE_MASK].match_hist << 1) + 1) & 3;
						//incremental_state[np].prev_match_ctx = ctx;
						//incremental_state[np].prev_match_pos = abs_p;
					}

					--test_len;
				}
			}

			//uint32 prev_match_table[PrevMatchState::NUM_ENTRIES];
			//memcpy(prev_match_table, state.prev_match.entry[ctx], sizeof(prev_match_table));
			//for (uint32 op = p; op >= path_offs && op <= p; op = prev[op]) {
			//	if (incremental_state[op].prev_match_ctx == ctx) {
			//		for (int i = PrevMatchState::NUM_ENTRIES - 1; i > 0; i--) {
			//			prev_match_table[i] = prev_match_table[i - 1];
			//		}

			//		prev_match_table[0] = incremental_state[op].prev_match_pos;
			//	}
			//}

			//for (int i = 0; i < PrevMatchState::NUM_ENTRIES; i++) {
			//	if (prev_match_table[i] >= abs_p) {
			//		continue;
			//	}
			//	uint32 rp = prev_match_table[i];

			//	int max_len = try_match(window, rp, abs_p, block_end);
			//	if (max_len >= MAX_STATE_SIZE) { max_len = MAX_STATE_MASK; }
			//	if (p + max_len > path_len) { max_len = path_len - p; }

			//	int test_len = max_len;
			//	while (test_len >= MATCH_MIN_BASE) {
			//		ASSERT_MATCH(!memcmp(window + abs_p, window + rp, test_len));
			//		ASSERT_MATCH(rp + test_len <= opt_start_at + path_len);
			//		uint32 lv = test_len - MATCH_MIN_BASE;

			//		uint32 np = p + test_len;
			//		uint32 match_cost = state.PricePrevMatchCached(lv, i, pb, ctx);
			//		if (match_cost + table[p].carried_cost < table[np].carried_cost) {
			//			prev[np] = p;
			//			table[np].cmd = CMD_PREV_MATCH;
			//			table[np].dict_match_pos = rp;
			//			table[np].rep_index = i;
			//			table[np].carried_cost = match_cost + table[p].carried_cost;

			//			carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
			//			carried_state[np & MAX_STATE_MASK].rep.Update(abs_p - rp);
			//			incremental_state[np].prev_match_ctx = -1;
			//			incremental_state[np].prev_match_pos = -1;
			//		}

			//		--test_len;
			//	}
			//}

			const MatchNode* opt = match_table + p;
			if (opt->max_len >= MATCH_MIN_BASE) {
				uint32 max_match_pos = opt->max_match_pos;
				int max_len = opt->max_len;

				if (max_len >= MAX_STATE_SIZE) { max_len = MAX_STATE_MASK; }
				if (p + max_len > path_len) { max_len = path_len - p; }

				//int test_step = (max_len - MATCH_MIN_BASE) >> 3;
				//test_step += test_step == 0;

				int test_len = max_len;
				while (test_len >= MATCH_MIN_BASE) {
					int node_slot = test_len - MATCH_MIN_BASE;

					uint32 match_pos = node_slot < TABLE_MATCH_SLOTS ? opt->match_pos[node_slot] : max_match_pos;
					uint32 dv = abs_p - match_pos - 1;

					ASSERT_MATCH(!memcmp(window + abs_p, window + match_pos, test_len));
					ASSERT_MATCH(match_pos + test_len <= opt_start_at + path_len);

					if (test_len < get_match_min(dv)) {
						--test_len;
						continue;
					}

					uint32 lv = test_len - get_match_min(dv);
					ASSERT(lv < 0x7FFFFF);

					uint32 np = p + test_len;
					uint32 dict_cost = state.PriceMatchCachedCmd(lv, dv, pb, ctx,
						carried_state[p & MAX_STATE_MASK].cmd[carried_state[p & MAX_STATE_MASK].match_hist].Price(CMD_DICT_MATCH));
					if (dict_cost + table[p].carried_cost < table[np].carried_cost) {
						prev[np] = p;
						table[np].cmd = CMD_DICT_MATCH;
						table[np].dict_match_pos = match_pos;
						table[np].carried_cost = dict_cost + table[p].carried_cost;

						carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
						carried_state[np & MAX_STATE_MASK].rep.Update(abs_p - match_pos);
						carried_state[np & MAX_STATE_MASK].cmd[carried_state[np & MAX_STATE_MASK].match_hist].Update(CMD_DICT_MATCH);
						carried_state[np & MAX_STATE_MASK].match_hist = ((carried_state[np & MAX_STATE_MASK].match_hist << 1) + 1) & 3;
						//incremental_state[np].prev_match_ctx = ctx;
						//incremental_state[np].prev_match_pos = abs_p;
					}

					--test_len;
				}
			}
		}

		for (uint32 p = path_len; p > path_offs; p = prev[p]) {
			next[prev[p]] = p;
		}

		for (uint32 p = path_offs; p < path_len; p = next[p]) {
			uint32 np = next[p];

			ASSERT(np > p);
			ASSERT(np <= path_len);
			ASSERT(table[p].carried_cost < table[np].carried_cost);

			uint32 local_cost = table[np].carried_cost - table[p].carried_cost;

			table[p] = table[np];
			table[p].match_len = np - p;
			table[p].local_cost = local_cost;
		}
	}
	void opt_parse_path_alt(const byte* window, ParsePath* table, const MatchNode* match_table, State& state, int opt_current_p, int opt_start_at, int opt_next_at, int block_end) {
	const int MAX_STATE_SIZE = 0x100;
	const int MAX_STATE_MASK = MAX_STATE_SIZE - 1;

	struct CarriedState {
	byte match_hist;
	BitNibble2 cmd[4];

	RepState rep;
	};

	struct IncrementalState {
	//byte prev_match_ctx;
	//uint32 prev_match_pos;
	};

	uint32 prev[OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1];
	uint32 next[OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1];

	CarriedState carried_state[MAX_STATE_SIZE];
	IncrementalState incremental_state[OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1];
	const int stack_mem = (sizeof(next) + sizeof(prev) + sizeof(carried_state) + sizeof(incremental_state) + 1023) >> 10;

	memset(prev, -1, sizeof(prev));
	memset(next, -1, sizeof(next));
	memset(incremental_state, -1, sizeof(incremental_state));

	uint32 path_len = block_end - opt_start_at;
	uint32 path_offs = opt_current_p - opt_start_at;

	ASSERT(path_len <= OPT_BLOCK_SIZE + OPT_BLOCK_PEEK + 1);

	table[path_len].cmd = CMD_LITERAL;
	table[path_len].carried_cost = 0;
	table[path_len].local_cost = 0;

	for (uint32 op = 0; op < path_offs; op++) {
	table[op].carried_cost = -1;
	prev[op] = 0;
	}

	for (int i = 0; i < 4; i++) {
	carried_state[path_offs & MAX_STATE_MASK].cmd[i] = state.cmd[i][0];
	}
	carried_state[path_offs & MAX_STATE_MASK].match_hist = state.match_hist;
	carried_state[path_offs & MAX_STATE_MASK].rep = state.rep;
	//incremental_state[path_offs].prev_match_ctx = -1;
	//incremental_state[path_offs].prev_match_pos = -1;

	table[path_offs].carried_cost = 0;
	prev[path_offs] = -1;

	for (uint32 p = path_offs + 1; p <= path_len; p++) {
	table[p].carried_cost = -1;
	}

	for (uint32 p = path_offs; p < path_len; p++) {
	uint32 abs_p = p + opt_start_at;

	int pb = state.pos_align_bits ? (p & state.pos_align_bits) : 0;
	int ctx = abs_p > 0 ? window[abs_p - 1] : 0;

	uint32 literal_cost = state.PriceLiteralCachedCmd(window[abs_p], pb, ctx,
	carried_state[p & MAX_STATE_MASK].cmd[carried_state[p & MAX_STATE_MASK].match_hist].Price(CMD_LITERAL));
	uint32 np = 1 + p;
	if (literal_cost + table[p].carried_cost < table[np].carried_cost) {
	prev[np] = p;
	table[np].cmd = CMD_LITERAL;
	table[np].carried_cost = literal_cost + table[p].carried_cost;

	carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
	carried_state[np & MAX_STATE_MASK].cmd[carried_state[np & MAX_STATE_MASK].match_hist].Update(CMD_LITERAL);
	carried_state[np & MAX_STATE_MASK].match_hist = ((carried_state[np & MAX_STATE_MASK].match_hist << 1) + 0) & 3;
	//incremental_state[np].prev_match_ctx = -1;
	//incremental_state[np].prev_match_pos = -1;
	}

	for (uint32 rep_index = 0; rep_index < 4; rep_index++) {
	uint32 rp = abs_p - carried_state[p & MAX_STATE_MASK].rep.entry[rep_index];
	if (rp >= abs_p) {
	continue;
	}

	int max_len = try_match(window, rp, abs_p, block_end);
	if (max_len >= MAX_STATE_SIZE) { max_len = MAX_STATE_MASK; }
	if (p + max_len > path_len) { max_len = path_len - p; }

	if (max_len < MATCH_MIN_BASE) {
	continue;
	}

	//int test_step = (max_len - MATCH_MIN_BASE) >> 3;
	//test_step += test_step == 0;

	int test_len = max_len;
	while (test_len >= MATCH_MIN_BASE) {
	ASSERT_MATCH(!memcmp(window + abs_p, window + rp, test_len));
	ASSERT_MATCH(rp + test_len <= opt_start_at + path_len);
	uint32 lv = test_len - MATCH_MIN_BASE;

	uint32 np = p + test_len;
	uint32 rep_cost = state.PriceRepCachedCmd(lv, rep_index, pb, ctx,
	carried_state[p & MAX_STATE_MASK].cmd[carried_state[p & MAX_STATE_MASK].match_hist].Price(CMD_REP4));
	if (rep_cost + table[p].carried_cost < table[np].carried_cost) {
	prev[np] = p;
	table[np].cmd = CMD_REP4;
	table[np].rep_index = rep_index;
	table[np].dict_match_pos = rp;
	table[np].carried_cost = rep_cost + table[p].carried_cost;

	carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
	carried_state[np & MAX_STATE_MASK].rep.Update(abs_p - rp);
	carried_state[np & MAX_STATE_MASK].cmd[carried_state[np & MAX_STATE_MASK].match_hist].Update(CMD_REP4);
	carried_state[np & MAX_STATE_MASK].match_hist = ((carried_state[np & MAX_STATE_MASK].match_hist << 1) + 1) & 3;
	//incremental_state[np].prev_match_ctx = ctx;
	//incremental_state[np].prev_match_pos = abs_p;
	}

	--test_len;
	}
	}

	//uint32 prev_match_table[PrevMatchState::NUM_ENTRIES];
	//memcpy(prev_match_table, state.prev_match.entry[ctx], sizeof(prev_match_table));
	//for (uint32 op = p; op >= path_offs && op <= p; op = prev[op]) {
	// if (incremental_state[op].prev_match_ctx == ctx) {
	// for (int i = PrevMatchState::NUM_ENTRIES - 1; i > 0; i--) {
	// prev_match_table[i] = prev_match_table[i - 1];
	// }

	// prev_match_table[0] = incremental_state[op].prev_match_pos;
	// }
	//}

	//for (int i = 0; i < PrevMatchState::NUM_ENTRIES; i++) {
	// if (prev_match_table[i] >= abs_p) {
	// continue;
	// }
	// uint32 rp = prev_match_table[i];

	// int max_len = try_match(window, rp, abs_p, block_end);
	// if (max_len >= MAX_STATE_SIZE) { max_len = MAX_STATE_MASK; }
	// if (p + max_len > path_len) { max_len = path_len - p; }

	// int test_len = max_len;
	// while (test_len >= MATCH_MIN_BASE) {
	// ASSERT_MATCH(!memcmp(window + abs_p, window + rp, test_len));
	// ASSERT_MATCH(rp + test_len <= opt_start_at + path_len);
	// uint32 lv = test_len - MATCH_MIN_BASE;

	// uint32 np = p + test_len;
	// uint32 match_cost = state.PricePrevMatchCached(lv, i, pb, ctx);
	// if (match_cost + table[p].carried_cost < table[np].carried_cost) {
	// prev[np] = p;
	// table[np].cmd = CMD_PREV_MATCH;
	// table[np].dict_match_pos = rp;
	// table[np].rep_index = i;
	// table[np].carried_cost = match_cost + table[p].carried_cost;

	// carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
	// carried_state[np & MAX_STATE_MASK].rep.Update(abs_p - rp);
	// incremental_state[np].prev_match_ctx = -1;
	// incremental_state[np].prev_match_pos = -1;
	// }

	// --test_len;
	// }
	//}

	const MatchNode* opt = match_table + p;
	if (opt->max_len >= MATCH_MIN_BASE) {
	uint32 max_match_pos = opt->max_match_pos;
	int max_len = opt->max_len;

	if (max_len >= MAX_STATE_SIZE) { max_len = MAX_STATE_MASK; }
	if (p + max_len > path_len) { max_len = path_len - p; }

	//int test_step = (max_len - MATCH_MIN_BASE) >> 3;
	//test_step += test_step == 0;

	int test_len = max_len;
	while (test_len >= MATCH_MIN_BASE) {
	int node_slot = test_len - MATCH_MIN_BASE;

	uint32 match_pos = node_slot < TABLE_MATCH_SLOTS ? opt->match_pos[node_slot] : max_match_pos;
	uint32 dv = abs_p - match_pos - 1;

	ASSERT_MATCH(!memcmp(window + abs_p, window + match_pos, test_len));
	ASSERT_MATCH(match_pos + test_len <= opt_start_at + path_len);

	if (test_len < get_match_min(dv)) {
	--test_len;
	continue;
	}

	uint32 lv = test_len - get_match_min(dv);
	ASSERT(lv < 0x7FFFFF);

	uint32 np = p + test_len;
	uint32 dict_cost = state.PriceMatchCachedCmd(lv, dv, pb, ctx,
	carried_state[p & MAX_STATE_MASK].cmd[carried_state[p & MAX_STATE_MASK].match_hist].Price(CMD_DICT_MATCH));
	if (dict_cost + table[p].carried_cost < table[np].carried_cost) {
	prev[np] = p;
	table[np].cmd = CMD_DICT_MATCH;
	table[np].dict_match_pos = match_pos;
	table[np].carried_cost = dict_cost + table[p].carried_cost;

	carried_state[np & MAX_STATE_MASK] = carried_state[p & MAX_STATE_MASK];
	carried_state[np & MAX_STATE_MASK].rep.Update(abs_p - match_pos);
	carried_state[np & MAX_STATE_MASK].cmd[carried_state[np & MAX_STATE_MASK].match_hist].Update(CMD_DICT_MATCH);
	carried_state[np & MAX_STATE_MASK].match_hist = ((carried_state[np & MAX_STATE_MASK].match_hist << 1) + 1) & 3;
	//incremental_state[np].prev_match_ctx = ctx;
	//incremental_state[np].prev_match_pos = abs_p;
	}

	--test_len;
	}
	}
	}

	for (uint32 p = path_len; p > path_offs; p = prev[p]) {
	next[prev[p]] = p;
	}

	for (uint32 p = path_offs; p < path_len; p = next[p]) {
	uint32 np = next[p];

	ASSERT(np > p);
	ASSERT(np <= path_len);
	ASSERT(table[p].carried_cost < table[np].carried_cost);

	uint32 local_cost = table[np].carried_cost - table[p].carried_cost;

	table[p] = table[np];
	table[p].match_len = np - p;
	table[p].local_cost = local_cost;
	}
	}