Skip to content

Instantly share code, notes, and snippets.

@Transfusion
Created October 26, 2019 10:40
Show Gist options
  • Save Transfusion/be47f38522f0a22e6dce6af95243082b to your computer and use it in GitHub Desktop.
Save Transfusion/be47f38522f0a22e6dce6af95243082b to your computer and use it in GitHub Desktop.
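Memory-profiling output comparing chardet's incremental (chunked, UniversalDetector) encoding detection with one-shot chardet.detect on websites; see https://github.com/aio-libs/aiohttp/issues/4112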
{'encoding': 'SHIFT_JIS', 'confidence': 0.5227341162300512, 'language': 'Japanese'}
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
13 21.5 MiB 21.5 MiB @profile
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024):
15 21.5 MiB 0.0 MiB detector = chardet.UniversalDetector()
16 21.5 MiB 0.0 MiB _body_memoryview = memoryview(_bytes)
17 # print(len(_bytes) == len(_body_memoryview))
18 21.8 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1):
19 21.8 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size]
20 21.8 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes()
21 21.8 MiB 0.3 MiB detector.feed(_chunk_bytes)
22 21.8 MiB 0.0 MiB del _chunk_bytes
23 21.8 MiB 0.0 MiB if detector.done:
24 print("chunk " + str(i) + " reached")
25 break
26
27 21.8 MiB 0.0 MiB detector.close()
28 21.8 MiB 0.0 MiB print(detector.result)
SHIFT_JIS
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
31 21.8 MiB 21.8 MiB @profile
32 def without_chunking():
33 22.5 MiB 0.7 MiB print(chardet.detect(_bytes)['encoding'])
{'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
13 22.5 MiB 22.5 MiB @profile
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024):
15 22.5 MiB 0.0 MiB detector = chardet.UniversalDetector()
16 22.5 MiB 0.0 MiB _body_memoryview = memoryview(_bytes)
17 # print(len(_bytes) == len(_body_memoryview))
18 22.5 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1):
19 22.5 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size]
20 22.5 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes()
21 22.5 MiB 0.0 MiB detector.feed(_chunk_bytes)
22 22.5 MiB 0.0 MiB del _chunk_bytes
23 22.5 MiB 0.0 MiB if detector.done:
24 print("chunk " + str(i) + " reached")
25 break
26
27 22.5 MiB 0.0 MiB detector.close()
28 22.5 MiB 0.0 MiB print(detector.result)
GB2312
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
31 22.5 MiB 22.5 MiB @profile
32 def without_chunking():
33 23.8 MiB 1.3 MiB print(chardet.detect(_bytes)['encoding'])
{'encoding': 'KOI8-R', 'confidence': 0.9411772590471068, 'language': 'Russian'}
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
13 23.8 MiB 23.8 MiB @profile
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024):
15 23.8 MiB 0.0 MiB detector = chardet.UniversalDetector()
16 23.8 MiB 0.0 MiB _body_memoryview = memoryview(_bytes)
17 # print(len(_bytes) == len(_body_memoryview))
18 23.8 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1):
19 23.8 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size]
20 23.8 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes()
21 23.8 MiB 0.0 MiB detector.feed(_chunk_bytes)
22 23.8 MiB 0.0 MiB del _chunk_bytes
23 23.8 MiB 0.0 MiB if detector.done:
24 print("chunk " + str(i) + " reached")
25 break
26
27 23.8 MiB 0.0 MiB detector.close()
28 23.8 MiB 0.0 MiB print(detector.result)
KOI8-R
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
31 23.8 MiB 23.8 MiB @profile
32 def without_chunking():
33 23.8 MiB 0.0 MiB print(chardet.detect(_bytes)['encoding'])
{'encoding': 'EUC-KR', 'confidence': 0.99, 'language': 'Korean'}
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
13 23.8 MiB 23.8 MiB @profile
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024):
15 23.8 MiB 0.0 MiB detector = chardet.UniversalDetector()
16 23.8 MiB 0.0 MiB _body_memoryview = memoryview(_bytes)
17 # print(len(_bytes) == len(_body_memoryview))
18 23.8 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1):
19 23.8 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size]
20 23.8 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes()
21 23.8 MiB 0.0 MiB detector.feed(_chunk_bytes)
22 23.8 MiB 0.0 MiB del _chunk_bytes
23 23.8 MiB 0.0 MiB if detector.done:
24 print("chunk " + str(i) + " reached")
25 break
26
27 23.8 MiB 0.0 MiB detector.close()
28 23.8 MiB 0.0 MiB print(detector.result)
EUC-KR
Filename: memory_profiling_4112.py
Line # Mem usage Increment Line Contents
================================================
31 23.8 MiB 23.8 MiB @profile
32 def without_chunking():
33 23.8 MiB 0.0 MiB print(chardet.detect(_bytes)['encoding'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment