Skip to content

Instantly share code, notes, and snippets.

@Yawning
Created August 14, 2024 09:38
Show Gist options
  • Save Yawning/2f7826f10a5d7af813d9b9b7121eb117 to your computer and use it in GitHub Desktop.
Save Yawning/2f7826f10a5d7af813d9b9b7121eb117 to your computer and use it in GitHub Desktop.
index_byte: Alternative approach
// Constants for the 256-bit scan path, compiled in only when targeting
// amd64 with AVX2 available at compile time.
when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
// Per-lane indices 0..31.  For lanes that matched, the index is
// selected; reduce_min over the selection then yields the lowest
// (first) matching lane.
@(private)
SCANNER_INDICES_256 : simd.u8x32 : {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
}
// Sentinel for non-matching lanes: 0xff is greater than any valid
// lane index (max 31), so it never wins the reduce_min.
@(private)
SCANNER_SENTINEL_256: simd.u8x32 : u8(0xff)
}
// Per-lane indices 0..15 for the 128-bit scan path.  For lanes that
// matched, the index is selected; reduce_min over the selection then
// yields the lowest (first) matching lane.
@(private)
SCANNER_INDICES_128 : simd.u8x16 : {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
}
// Sentinel for non-matching lanes: 0xff is greater than any valid
// lane index (max 15), so it never wins the reduce_min.
@(private)
SCANNER_SENTINEL: simd.u8x16 : u8(0xff)
/*
Scan a slice of bytes for a specific byte.
This procedure safely handles slices of any length, including empty slices.
Inputs:
- data: A slice of bytes.
- c: The byte to search for.
Returns:
- index: The index of the byte `c`, or -1 if it was not found.
*/
index_byte :: proc(s: []byte, c: byte) -> (index: int) #no_bounds_check {
i, l := 0, len(s)
// Don't bother with SIMD if the slice is smaller than a SIMD register.
if l < 16 {
for /**/; i < l; i += 1 {
if s[i] == c {
return i
}
}
return -1
}
// Splat `c` across all 16 lanes.  lanes_eq against this produces an
// all-ones byte in every lane equal to `c` and zero elsewhere.
scanner: simd.u8x16 = c
when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
if l > 64 {
// Widen the scanner to 256 bits by duplicating the low 16 lanes.
scanner_256 := simd.swizzle(
scanner,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
)
// Scan 128-byte chunks, using 256-bit SIMD.
for nr_blocks := l / 128; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i:]))
s1 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+32:]))
s2 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+64:]))
s3 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+96:]))
c0 := simd.lanes_eq(s0, scanner_256)
c1 := simd.lanes_eq(s1, scanner_256)
c2 := simd.lanes_eq(s2, scanner_256)
c3 := simd.lanes_eq(s3, scanner_256)
// reduce_or is non-zero iff any lane in that vector matched.
m0 := simd.reduce_or(c0)
m1 := simd.reduce_or(c1)
m2 := simd.reduce_or(c2)
m3 := simd.reduce_or(c3)
if m0 | m1 | m2 | m3 > 0 {
if m0 > 0 {
// Select the lane index for matching lanes (0xff sentinel for
// the rest); the minimum is the first match within the vector.
sel := simd.select(c0, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + int(off)
}
if m1 > 0 {
sel := simd.select(c1, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 32 + int(off)
}
if m2 > 0 {
sel := simd.select(c2, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 64 + int(off)
}
if m3 > 0 {
sel := simd.select(c3, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 96 + int(off)
}
}
i += 128
}
// Scan 64-byte chunks, using 256-bit SIMD.
// After the 128-byte loop at most 127 bytes remain, so this runs
// at most once.
for nr_blocks := (l - i) / 64; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i:]))
s1 := intrinsics.unaligned_load(transmute(^simd.u8x32)raw_data(s[i+32:]))
c0 := simd.lanes_eq(s0, scanner_256)
c1 := simd.lanes_eq(s1, scanner_256)
m0 := simd.reduce_or(c0)
m1 := simd.reduce_or(c1)
if m0 | m1 > 0 {
if m0 > 0 {
sel := simd.select(c0, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + int(off)
}
if m1 > 0 {
sel := simd.select(c1, SCANNER_INDICES_256, SCANNER_SENTINEL_256)
off := simd.reduce_min(sel)
return i + 32 + int(off)
}
}
i += 64
}
}
} else {
// Note: ARM can probably do 128 or 256-bytes at a time.
// Scan 64-byte chunks, using 128-bit SIMD.
for nr_blocks := (l - i) / 64; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i:]))
s1 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i+16:]))
s2 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i+32:]))
s3 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i+48:]))
c0 := simd.lanes_eq(s0, scanner)
c1 := simd.lanes_eq(s1, scanner)
c2 := simd.lanes_eq(s2, scanner)
c3 := simd.lanes_eq(s3, scanner)
// reduce_or is non-zero iff any lane in that vector matched.
m0 := simd.reduce_or(c0)
m1 := simd.reduce_or(c1)
m2 := simd.reduce_or(c2)
m3 := simd.reduce_or(c3)
if m0 | m1 | m2 | m3 > 0 {
if m0 > 0 {
// Select the lane index for matching lanes (0xff sentinel for
// the rest); the minimum is the first match within the vector.
sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + int(off)
}
if m1 > 0 {
sel := simd.select(c1, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + 16 + int(off)
}
if m2 > 0 {
sel := simd.select(c2, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + 32 + int(off)
}
if m3 > 0 {
sel := simd.select(c3, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + 48 + int(off)
}
}
i += 64
}
}
// Scan 16-byte chunks, using 128-bit SIMD.
// Common tail for both compile-time paths; fewer than 64 bytes remain
// here on the non-AVX2 path, fewer than 64 on the AVX2 path as well.
for nr_blocks := (l - i) / 16; nr_blocks > 0; nr_blocks -= 1 {
s0 := intrinsics.unaligned_load(transmute(^simd.u8x16)raw_data(s[i:]))
c0 := simd.lanes_eq(s0, scanner)
if simd.reduce_or(c0) > 0 {
sel := simd.select(c0, SCANNER_INDICES_128, SCANNER_SENTINEL)
off := simd.reduce_min(sel)
return i + int(off)
}
i += 16
}
// Iterate as a scalar over the remaining non-SIMD register sized
// portion.
for /**/; i < l; i += 1 {
if s[i] == c {
return i
}
}
return -1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment