Skip to content

Instantly share code, notes, and snippets.

@darkarnium
Last active January 27, 2022 13:56
Show Gist options
  • Save darkarnium/56c078c30bb359d8e013e8f56af80c3d to your computer and use it in GitHub Desktop.
Save darkarnium/56c078c30bb359d8e013e8f56af80c3d to your computer and use it in GitHub Desktop.
Go vs Python - SHA1 and MD5
package hasher
import (
"crypto/md5"
"crypto/sha1"
"encoding/hex"
"io"
"os"
)
// HashSHA1 returns the hex-encoded SHA-1 digest of the file
// "nexus_latest.tar" in the current working directory, reading it in
// pieces of chunk bytes.
//
// It panics if chunk is not positive, if the file cannot be opened, or if a
// read fails with anything other than io.EOF.
func HashSHA1(chunk int) string {
	// A zero-length buffer makes Read return (0, nil) forever, so reject it
	// up front instead of spinning.
	if chunk <= 0 {
		panic("hasher: chunk size must be positive")
	}

	file, err := os.Open("nexus_latest.tar")
	if err != nil {
		panic(err)
	}
	defer file.Close()

	hash := sha1.New()
	buffer := make([]byte, chunk)
	for {
		n, err := file.Read(buffer)
		// Per the io.Reader contract, consume the n bytes before looking at
		// err. hash.Hash.Write never returns an error.
		hash.Write(buffer[:n])
		if err == io.EOF {
			break
		}
		if err != nil {
			// Any non-EOF error would otherwise repeat on every iteration.
			panic(err)
		}
	}
	return hex.EncodeToString(hash.Sum(nil))
}
// HashMD5 returns the hex-encoded MD5 digest of the file "nexus_latest.tar"
// in the current working directory, reading it in pieces of chunk bytes.
//
// It panics if chunk is not positive, if the file cannot be opened, or if a
// read fails with anything other than io.EOF.
func HashMD5(chunk int) string {
	// A zero-length buffer makes Read return (0, nil) forever, so reject it
	// up front instead of spinning.
	if chunk <= 0 {
		panic("hasher: chunk size must be positive")
	}

	file, err := os.Open("nexus_latest.tar")
	if err != nil {
		panic(err)
	}
	defer file.Close()

	hash := md5.New()
	buffer := make([]byte, chunk)
	for {
		n, err := file.Read(buffer)
		// Per the io.Reader contract, consume the n bytes before looking at
		// err. hash.Hash.Write never returns an error.
		hash.Write(buffer[:n])
		if err == io.EOF {
			break
		}
		if err != nil {
			// Any non-EOF error would otherwise repeat on every iteration.
			panic(err)
		}
	}
	return hex.EncodeToString(hash.Sum(nil))
}
// HashSHA1Copy returns the hex-encoded SHA-1 digest of the file
// "nexus_latest.tar" in the current working directory, streaming it into the
// hash with io.Copy.
//
// It panics if the file cannot be opened or if the copy fails.
func HashSHA1Copy() string {
	file, err := os.Open("nexus_latest.tar")
	if err != nil {
		panic(err)
	}
	defer file.Close()

	hash := sha1.New()
	// A failed copy would otherwise yield the digest of a truncated input.
	if _, err := io.Copy(hash, file); err != nil {
		panic(err)
	}
	return hex.EncodeToString(hash.Sum(nil))
}
// HashMD5Copy returns the hex-encoded MD5 digest of the file
// "nexus_latest.tar" in the current working directory, streaming it into the
// hash with io.Copy.
//
// It panics if the file cannot be opened or if the copy fails.
func HashMD5Copy() string {
	file, err := os.Open("nexus_latest.tar")
	if err != nil {
		panic(err)
	}
	defer file.Close()

	hash := md5.New()
	// A failed copy would otherwise yield the digest of a truncated input.
	if _, err := io.Copy(hash, file); err != nil {
		panic(err)
	}
	return hex.EncodeToString(hash.Sum(nil))
}
"""Compare hash rates of MD5 and SHA1 over N rounds and X chunk size."""
import sys
import timeit
import hashlib
def hash(sz=10240, func=hashlib.sha1):
    """Return the hex digest of 'nexus_latest.tar', read in chunks of `sz` bytes.

    sz: number of bytes requested per read() call.
    func: hashlib-style constructor for the digest object (default SHA-1).

    Note: this intentionally shadows the builtin `hash` within this module.
    """
    h = func()
    with open('nexus_latest.tar', "rb") as fin:
        # read() returns b"" at end of file, which ends the loop.
        while chunk := fin.read(sz):
            h.update(chunk)
    return h.hexdigest()
def benchmark_md5_chunk_8(rounds: int):
    """Time `rounds` calls of hash() using MD5 with 8 KiB reads and print the total.

    The printed line has the form "ok<TAB>hasher<TAB><seconds>s".
    """
    md5 = timeit.Timer(lambda: hash(sz=8 * 1024, func=hashlib.md5)).timeit(number=rounds)
    print(f"ok\thasher\t{md5}s")
def benchmark_sha1_chunk_8(rounds: int):
    """Time `rounds` calls of hash() using SHA-1 with 8 KiB reads and print the total.

    The printed line has the form "ok<TAB>hasher<TAB><seconds>s".
    """
    sha1 = timeit.Timer(lambda: hash(sz=8 * 1024, func=hashlib.sha1)).timeit(number=rounds)
    print(f"ok\thasher\t{sha1}s")
if __name__ == "__main__":
    # This is amazingly gross, but we're a benchmark.
    # Usage: hasher.py <case> <rounds>, where <case> is the name of a
    # benchmark function defined in this module.
    if len(sys.argv) < 3:
        print("Usage: hasher.py <case> <rounds>")
        # Exit non-zero so callers can tell that nothing was run.
        sys.exit(1)
    # Look the benchmark function up by name on this module.
    case = getattr(sys.modules[__name__], sys.argv[1])
    count = int(sys.argv[2])
    case(count)
package hasher
import (
"testing"
)
var result string
// BenchmarkMD5Chunk8 measures HashMD5 using 8 KiB reads.
func BenchmarkMD5Chunk8(b *testing.B) {
	const chunkSize = 8 * 1024

	var digest string
	for n := 0; n < b.N; n++ {
		digest = HashMD5(chunkSize)
	}
	// Publish to the package-level sink so the calls cannot be optimised away.
	result = digest
}
// BenchmarkSHA1Chunk8 measures HashSHA1 using 8 KiB reads.
func BenchmarkSHA1Chunk8(b *testing.B) {
	const chunkSize = 8 * 1024

	var digest string
	for n := 0; n < b.N; n++ {
		digest = HashSHA1(chunkSize)
	}
	// Publish to the package-level sink so the calls cannot be optimised away.
	result = digest
}
// BenchmarkMD5Copy measures HashMD5Copy, the io.Copy-based MD5 variant.
func BenchmarkMD5Copy(b *testing.B) {
	var digest string
	for n := 0; n < b.N; n++ {
		digest = HashMD5Copy()
	}
	// Publish to the package-level sink so the calls cannot be optimised away.
	result = digest
}
// BenchmarkSHA1Copy measures HashSHA1Copy, the io.Copy-based SHA-1 variant.
func BenchmarkSHA1Copy(b *testing.B) {
	var digest string
	for n := 0; n < b.N; n++ {
		digest = HashSHA1Copy()
	}
	// Publish to the package-level sink so the calls cannot be optimised away.
	result = digest
}
@darkarnium
Copy link
Author

Replacing hex.EncodeToString with fmt.Sprintf:

return fmt.Sprintf("%x", hash.Sum(nil))

Result:

$ go test -bench=BenchmarkMD5Chunk8 -count 10 | grep -iE ^ok
ok      hasher  25.986s
$ go test -bench=BenchmarkSHA1Chunk8 -count 10 | grep -iE ^ok
ok      hasher  19.190s

@darkarnium
Copy link
Author

Input file:

$ time md5sum nexus_latest.tar 
1d4d172f2d0d70aa4f6a803cb7f01d6d  nexus_latest.tar

real    0m0.901s
user    0m0.851s
sys     0m0.050s

$ time shasum -a 1 nexus_latest.tar 
3edf9ad45cb1e7043f5c8b41c75512224489295c  nexus_latest.tar

real    0m0.967s
user    0m0.877s
sys     0m0.081s

$ ls -lah nexus_latest.tar 
-rw------- 1 darkarnium darkarnium 645M Jan 26 21:00 nexus_latest.tar

@darkarnium
Copy link
Author

Running with io.Copy() in Go and open(...).read() in Python (no chunking):

ubuntu@ip-172-31-18-118:~/hasher$ python3.9 hasher.py benchmark_sha1_copy 10
ok      hasher  14.152461315999972s
ubuntu@ip-172-31-18-118:~/hasher$ python3.9 hasher.py benchmark_md5_copy 10
ok      hasher  14.06667994999998s
ubuntu@ip-172-31-18-118:~/hasher$ python3.10 hasher.py benchmark_sha1_copy 10
ok      hasher  14.014313474000005s
ubuntu@ip-172-31-18-118:~/hasher$ python3.10 hasher.py benchmark_md5_copy 10
ok      hasher  14.069881003000091s
ubuntu@ip-172-31-18-118:~/hasher$ go test -bench=BenchmarkMD5Copy -count 10 | grep -iE ^ok
ok      hasher  12.171s
ubuntu@ip-172-31-18-118:~/hasher$ go test -bench=BenchmarkSHA1Copy -count 10 | grep -iE ^ok
ok      hasher  24.296s

Versions:

ubuntu@ip-172-31-18-118:~/hasher$ go version
go version go1.16.2 linux/amd64

ubuntu@ip-172-31-18-118:~/hasher$ uname -a
Linux ip-172-31-18-118 5.11.0-1022-aws #23~20.04.1-Ubuntu SMP Mon Nov 15 14:03:19 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux

Memory:

ubuntu@ip-172-31-18-118:~/hasher$ free -h
              total        used        free      shared  buff/cache   available
Mem:           15Gi       226Mi        13Gi       0.0Ki       1.4Gi        14Gi
Swap:            0B          0B          0B

CPU:

ubuntu@ip-172-31-18-118:~/hasher$ grep -Eic ^processor /proc/cpuinfo
4
ubuntu@ip-172-31-18-118:~/hasher$ grep -Ei -m1 ^flags /proc/cpuinfo
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single pti fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves ida arat pku ospke

@darkarnium
Copy link
Author

darkarnium commented Jan 26, 2022

Running with io.Copy() in Go and open(...).read() in Python (no chunking) on macOS (Intel / Native):

$ go test -bench=BenchmarkSHA1Copy -count 10 | grep -iE ^ok
ok  	hasher	0.100s

$ go test -bench=BenchmarkMD5Copy -count 10 | grep -iE ^ok
ok  	hasher	0.101s
$ python3.9 hasher.py benchmark_sha1_chunk_copy 10
ok	hasher	12.468418374s

$ python3.9 hasher.py benchmark_md5_chunk_copy 10
ok	hasher	12.408970059000001s

@darkarnium
Copy link
Author

"""Compare hash rates of MD5 and SHA1 over N rounds and X chunk size."""
import sys
import timeit
import hashlib

def hash_copy(func=hashlib.md5):
    """Return the hex digest of 'nexus_latest.tar', read in a single read() call.

    func: hashlib-style constructor for the digest object (default MD5).
    """
    h = func()
    # Third argument 0 disables Python-level buffering on the file object.
    # The context manager closes the handle instead of leaving it to the GC.
    with open('nexus_latest.tar', "rb", 0) as fin:
        h.update(fin.read())
    return h.hexdigest()

def hash(sz=10240, func=hashlib.sha1):
    """Return the hex digest of 'nexus_latest.tar', read `sz` bytes at a time.

    sz: number of bytes requested per read() call.
    func: hashlib-style constructor for the digest object (default SHA-1).
    """
    digest = func()

    with open('nexus_latest.tar', "rb") as stream:
        # Keep reading until read() hands back the empty-bytes EOF marker.
        for block in iter(lambda: stream.read(sz), b""):
            digest.update(block)

    return digest.hexdigest()


def benchmark_md5_chunk_copy(rounds: int):
    """Time `rounds` calls of hash_copy() using MD5 and print the total seconds."""
    timer = timeit.Timer(lambda: hash_copy(func=hashlib.md5))
    elapsed = timer.timeit(number=rounds)
    print(f"ok\thasher\t{elapsed}s")

def benchmark_md5_chunk_8(rounds: int):
    """Time `rounds` calls of hash() using MD5 with 8 KiB reads and print the total seconds."""
    timer = timeit.Timer(lambda: hash(sz=8 * 1024, func=hashlib.md5))
    elapsed = timer.timeit(number=rounds)
    print(f"ok\thasher\t{elapsed}s")


def benchmark_sha1_chunk_copy(rounds: int):
    """Time `rounds` calls of hash_copy() using SHA-1 and print the total seconds.

    The printed line has the form "ok<TAB>hasher<TAB><seconds>s".
    """
    # Must pass hashlib.sha1 here; passing hashlib.md5 would time MD5 under a SHA-1 label.
    sha1 = timeit.Timer(lambda: hash_copy(func=hashlib.sha1)).timeit(number = rounds)
    print(f"ok\thasher\t{sha1}s")

def benchmark_sha1_chunk_8(rounds: int):
    """Time `rounds` calls of hash() using SHA-1 with 8 KiB reads and print the total seconds."""
    timer = timeit.Timer(lambda: hash(sz=8 * 1024, func=hashlib.sha1))
    elapsed = timer.timeit(number=rounds)
    print(f"ok\thasher\t{elapsed}s")


if __name__ == "__main__":
    # This is amazingly gross, but we're a benchmark.
    # Usage: hasher.py <case> <rounds>, where <case> is the name of a
    # benchmark function defined in this module.
    if len(sys.argv) < 3:
        print("Usage: hasher.py <case> <rounds>")
        # Exit non-zero so callers can tell that nothing was run.
        sys.exit(1)

    # Look the benchmark function up by name on this module.
    case = getattr(sys.modules[__name__], sys.argv[1])
    count = int(sys.argv[2])
    case(count)

@darkarnium
Copy link
Author

Running on FreeBSD with io.Copy() and open(...).read() as well as 8K chunks in Python:

% go test -bench=BenchmarkSHA1Copy -count 10 | grep -iE ^ok
ok      hasher  12.108s

% python3.8 hasher.py benchmark_sha1_chunk_8 10
ok      hasher  1.1660769821610302s

% python3.8 hasher.py benchmark_sha1_chunk_copy 10
ok      hasher  1.603817748837173s

Version:

% go version
go version go1.17.5 freebsd/amd64

@darkarnium
Copy link
Author

darkarnium commented Jan 27, 2022

Python buffering disabled on open() on macOS:

Via open(..., 0).read():

$ egrep -i 'def hash_copy\(' -A 4 hasher.py | grep -i open
    h.update(open('nexus_latest.tar', "rb", 0).read())

$ python3.9 hasher.py benchmark_md5_chunk_copy 10
ok	hasher	12.137283342s

$ python3.9 hasher.py benchmark_sha1_chunk_copy 10
ok	hasher	12.106864423000001s

Chunked read with open(..., 0):

$ egrep -i 'def hash\(' -A 4 hasher.py | grep -i open
    with open('nexus_latest.tar', "rb", 0) as fin:

$ python3.9 hasher.py benchmark_sha1_chunk_8 10
ok	hasher	7.7224440329999995s

$ python3.9 hasher.py benchmark_md5_chunk_8 10
ok	hasher	10.369061641s

Versions:

$ uname -a
Darwin Callisto.local 20.6.0 Darwin Kernel Version 20.6.0: Tue Oct 12 18:33:42 PDT 2021; root:xnu-7195.141.8~1/RELEASE_X86_64 x86_64

@darkarnium
Copy link
Author

Full output requested when run with -benchtime 10x and -count 10:

MD5:

$ go test -bench=BenchmarkMD5Chunk8 -benchtime 10x
goos: darwin
goarch: amd64
pkg: hasher
cpu: Intel(R) Core(TM) i5-8259U CPU @ 2.30GHz
BenchmarkMD5Chunk8-8   	      10	1005807555 ns/op
PASS
ok  	hasher	11.403s

$ go test -bench=BenchmarkMD5Chunk8 -count 10
goos: darwin
goarch: amd64
pkg: hasher
cpu: Intel(R) Core(TM) i5-8259U CPU @ 2.30GHz
BenchmarkMD5Chunk8-8   	       1	1054308178 ns/op
BenchmarkMD5Chunk8-8   	       1	1040079144 ns/op
BenchmarkMD5Chunk8-8   	       1	1020537304 ns/op
BenchmarkMD5Chunk8-8   	       1	1021756455 ns/op
BenchmarkMD5Chunk8-8   	       1	1080426179 ns/op
BenchmarkMD5Chunk8-8   	       1	1012884916 ns/op
BenchmarkMD5Chunk8-8   	       1	1006444572 ns/op
BenchmarkMD5Chunk8-8   	       1	1003369154 ns/op
BenchmarkMD5Chunk8-8   	       1	1009236434 ns/op
BenchmarkMD5Chunk8-8   	       2	1001233669 ns/op
PASS
ok  	hasher	12.353s

SHA1:

$ go test -bench=BenchmarkSHA1Chunk8 -benchtime 10x
goos: darwin
goarch: amd64
pkg: hasher
cpu: Intel(R) Core(TM) i5-8259U CPU @ 2.30GHz
BenchmarkSHA1Chunk8-8   	      10	 772829689 ns/op
PASS
ok  	hasher	8.672s

$ go test -bench=BenchmarkSHA1Chunk8 -count 10
goos: darwin
goarch: amd64
pkg: hasher
cpu: Intel(R) Core(TM) i5-8259U CPU @ 2.30GHz
BenchmarkSHA1Chunk8-8   	       2	 758764051 ns/op
BenchmarkSHA1Chunk8-8   	       2	 756892006 ns/op
BenchmarkSHA1Chunk8-8   	       2	 758007766 ns/op
BenchmarkSHA1Chunk8-8   	       2	 756452965 ns/op
BenchmarkSHA1Chunk8-8   	       2	 760778676 ns/op
BenchmarkSHA1Chunk8-8   	       2	 753645592 ns/op
BenchmarkSHA1Chunk8-8   	       2	 753705684 ns/op
BenchmarkSHA1Chunk8-8   	       2	 753714502 ns/op
BenchmarkSHA1Chunk8-8   	       2	 757031118 ns/op
BenchmarkSHA1Chunk8-8   	       2	 751179043 ns/op
PASS
ok  	hasher	22.982s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment