Skip to content

Instantly share code, notes, and snippets.

@klauspost
Last active November 21, 2018 05:30
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klauspost/64b36e9904d76d6fc122 to your computer and use it in GitHub Desktop.
Save klauspost/64b36e9904d76d6fc122 to your computer and use it in GitHub Desktop.
// Copyright 2011 The Go Authors. All rights reserved.
// Copyright 2013 Klaus Post
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
import (
"github.com/klauspost/intrinsics/x86/sse2"
"github.com/klauspost/intrinsics/x86"
"github.com/klauspost/intrinsics/x86/pclmulqdq"
"github.com/klauspost/intrinsics/x86/sse4"
)
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC and the PCLMULQDQ (carry-less multiplication) version of the IEEE CRC.
// haveSSE41, haveSSE42 and haveCLMUL are defined in crc_amd64.s and use
// CPUID to test for SSE 4.1, SSE 4.2 and CLMUL support respectively.
// They have no Go body; each reports true when the feature is available.
func haveSSE41() bool
func haveSSE42() bool
func haveCLMUL() bool
// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
// instruction. It takes the running checksum crc and the bytes p and returns
// the updated checksum. updateCastagnoli only calls it when sse42 is true.
func castagnoliSSE42(crc uint32, p []byte) uint32
// CPU feature flags, evaluated once when the package is initialized.
var (
	// sse42 is true when haveSSE42 reports SSE 4.2 support; it gates the
	// assembly Castagnoli implementation.
	sse42 = haveSSE42()

	// useFastIEEE is true when both CLMUL and SSE 4.1 are reported; it gates
	// the carry-less-multiplication IEEE implementation.
	useFastIEEE = haveCLMUL() && haveSSE41()
)
// updateCastagnoli returns crc updated with the bytes of p for the Castagnoli
// polynomial. Without SSE 4.2 support it falls back to the table-driven
// update; otherwise it hands the work to the assembly routine.
func updateCastagnoli(crc uint32, p []byte) uint32 {
	if !sse42 {
		return update(crc, castagnoliTable, p)
	}
	return castagnoliSSE42(crc, p)
}
// updateIEEE returns crc updated with the bytes of p for the IEEE polynomial.
//
// Three strategies are tried in order:
//   - CLMUL folding when the CPU supports it and p holds at least 64 bytes;
//   - slicing-by-8 for inputs of 4KB or more;
//   - the plain table-driven update otherwise.
func updateIEEE(crc uint32, p []byte) uint32 {
	switch n := len(p); {
	case useFastIEEE && n >= 64:
		// ieeeCLMUL only accepts a multiple of 16 bytes, so split off the
		// trailing 0..15 bytes and finish them with the table.
		tail := n & 15
		head := n - tail
		// ieeeCLMUL works on the inverted checksum state.
		sum := ^ieeeCLMUL(^crc, p[:head])
		if tail == 0 {
			return sum
		}
		return update(sum, IEEETable, p[head:])

	case n >= 4096:
		// only use slicing-by-8 when input is >= 4KB; the table is built
		// lazily on first use.
		iEEETable8Once.Do(func() {
			iEEETable8 = makeTable8(IEEE)
		})
		return updateSlicingBy8(crc, iEEETable8, p)

	default:
		return update(crc, IEEETable, p)
	}
}
// ieeeCLMUL updates an IEEE crc32 checksum over p using carry-less
// multiplication folding. Based on
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
// crc is the checksum state as passed by updateIEEE, which inverts the value
// before the call and inverts the returned value again.
//
// len(p) must be at least 64, and must be a multiple of 16.
func ieeeCLMUL(crc uint32, p []byte) uint32 {
	// View the input as a sequence of 16-byte words.
	in := x86.BytesToM128i(p)

	// Load the first 64 bytes into four accumulators and XOR the incoming
	// crc into the first one.
	crci := sse2.SetEpi64x(0, int64(crc))
	p1 := sse2.XorSi128(in[0], crci)
	p2 := in[1]
	p3 := in[2]
	p4 := in[3]
	// The first 64 bytes are now held in p1..p4. Step past them so that the
	// loop below folds the accumulators with the NEXT 64 bytes; without this
	// the first block would be folded into itself.
	in = in[4:]

	// Fold constant for the 64-byte stride; loop-invariant, so built once.
	r1p2 := sse2.SetEpi64x(0x1c6e41596, 0x154442bd4)
	for len(in) >= 4 {
		// Selector 0 multiplies the low 64-bit halves, selector 0x11 the
		// high 64-bit halves.
		t1 := pclmulqdq.Clmulepi64Si128(r1p2, p1, 0)
		t2 := pclmulqdq.Clmulepi64Si128(r1p2, p2, 0)
		t3 := pclmulqdq.Clmulepi64Si128(r1p2, p3, 0)
		t4 := pclmulqdq.Clmulepi64Si128(r1p2, p4, 0)
		t5 := pclmulqdq.Clmulepi64Si128(r1p2, p1, 0x11)
		t6 := pclmulqdq.Clmulepi64Si128(r1p2, p2, 0x11)
		t7 := pclmulqdq.Clmulepi64Si128(r1p2, p3, 0x11)
		t8 := pclmulqdq.Clmulepi64Si128(r1p2, p4, 0x11)

		// Combine both products of each accumulator.
		t1 = sse2.XorSi128(t1, t5)
		t2 = sse2.XorSi128(t2, t6)
		t3 = sse2.XorSi128(t3, t7)
		t4 = sse2.XorSi128(t4, t8)

		// Mix in the next 64 bytes of input.
		p1 = sse2.XorSi128(t1, in[0])
		p2 = sse2.XorSi128(t2, in[1])
		p3 = sse2.XorSi128(t3, in[2])
		p4 = sse2.XorSi128(t4, in[3])
		in = in[4:]
	}

	// Fold result into a single register (p1), using the 16-byte stride
	// constant.
	r4r3 := sse2.SetEpi64x(0x0ccaa009e, 0x1751997d0)

	// Merge p2
	t1 := pclmulqdq.Clmulepi64Si128(r4r3, p1, 0)
	t2 := pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11)
	p1 = sse2.XorSi128(t1, t2)
	p1 = sse2.XorSi128(p1, p2)

	// Merge p3
	t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0)
	t2 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11)
	p1 = sse2.XorSi128(t1, t2)
	p1 = sse2.XorSi128(p1, p3)

	// Merge p4
	t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0)
	t2 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11)
	p1 = sse2.XorSi128(t1, t2)
	p1 = sse2.XorSi128(p1, p4)

	// Encode remaining in 16 byte blocks (at most three, since the loop
	// above consumed every full 64-byte group).
	for len(in) > 0 {
		t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0)
		t2 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0x11)
		p1 = sse2.XorSi128(t1, t2)
		p1 = sse2.XorSi128(p1, in[0])
		in = in[1:]
	}

	// Merge result: reduce the 128-bit accumulator down to 32 bits.
	//
	// NOTE(review): the reference amd64 assembly in Go's hash/crc32 appears
	// to use selector $1 for this multiply (high half of r4r3 times low half
	// of p1). Selector 0 is kept from the original here -- confirm against
	// the intrinsics package's operand/selector convention.
	t1 = pclmulqdq.Clmulepi64Si128(r4r3, p1, 0)
	p1 = sse2.XorSi128(sse2.SrliSi128(p1, 8), t1)

	// Comparing a register with itself yields all ones; shifting right by 32
	// presumably leaves a mask of the low 32 bits of each 64-bit lane.
	mask := sse2.SrlEpi64(sse2.CmpeqEpi8(p1, p1), 32)

	r5 := sse2.SetEpi64x(0, 0x163cd6124)
	t1 = sse2.SrliSi128(p1, 4)
	p1 = sse2.AndSi128(p1, mask)
	p1 = pclmulqdq.Clmulepi64Si128(r5, p1, 0)
	p1 = sse2.XorSi128(p1, t1)

	// Final reduction using two more carry-less multiplications.
	//
	// NOTE(review): the reference assembly appears to keep 0x1db710641 in the
	// low half and 0x1f7011641 in the high half of this constant; confirm the
	// lane order expected by the 0x10 / 0x0 selectors below.
	rupoly := sse2.SetEpi64x(0x1db710641, 0x1f7011641)
	t1 = sse2.AndSi128(p1, mask)
	t1 = pclmulqdq.Clmulepi64Si128(rupoly, t1, 0x10)
	t1 = sse2.AndSi128(t1, mask)
	t1 = pclmulqdq.Clmulepi64Si128(rupoly, t1, 0x0)
	p1 = sse2.XorSi128(p1, t1)

	// The checksum is taken from 32-bit element 1 of the result.
	return sse4.ExtractEpi32(p1, 1)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment