Created
October 26, 2019 10:40
-
-
Save Transfusion/be47f38522f0a22e6dce6af95243082b to your computer and use it in GitHub Desktop.
Memory profiling of chardet incremental (chunked) detection versus whole-body detection on website content, for https://github.com/aio-libs/aiohttp/issues/4112
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{'encoding': 'SHIFT_JIS', 'confidence': 0.5227341162300512, 'language': 'Japanese'} | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
13 21.5 MiB 21.5 MiB @profile | |
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024): | |
15 21.5 MiB 0.0 MiB detector = chardet.UniversalDetector() | |
16 21.5 MiB 0.0 MiB _body_memoryview = memoryview(_bytes) | |
17 # print(len(_bytes) == len(_body_memoryview)) | |
18 21.8 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1): | |
19 21.8 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size] | |
20 21.8 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes() | |
21 21.8 MiB 0.3 MiB detector.feed(_chunk_bytes) | |
22 21.8 MiB 0.0 MiB del _chunk_bytes | |
23 21.8 MiB 0.0 MiB if detector.done: | |
24 print("chunk " + str(i) + " reached") | |
25 break | |
26 | |
27 21.8 MiB 0.0 MiB detector.close() | |
28 21.8 MiB 0.0 MiB print(detector.result) | |
SHIFT_JIS | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
31 21.8 MiB 21.8 MiB @profile | |
32 def without_chunking(): | |
33 22.5 MiB 0.7 MiB print(chardet.detect(_bytes)['encoding']) | |
{'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'} | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
13 22.5 MiB 22.5 MiB @profile | |
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024): | |
15 22.5 MiB 0.0 MiB detector = chardet.UniversalDetector() | |
16 22.5 MiB 0.0 MiB _body_memoryview = memoryview(_bytes) | |
17 # print(len(_bytes) == len(_body_memoryview)) | |
18 22.5 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1): | |
19 22.5 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size] | |
20 22.5 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes() | |
21 22.5 MiB 0.0 MiB detector.feed(_chunk_bytes) | |
22 22.5 MiB 0.0 MiB del _chunk_bytes | |
23 22.5 MiB 0.0 MiB if detector.done: | |
24 print("chunk " + str(i) + " reached") | |
25 break | |
26 | |
27 22.5 MiB 0.0 MiB detector.close() | |
28 22.5 MiB 0.0 MiB print(detector.result) | |
GB2312 | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
31 22.5 MiB 22.5 MiB @profile | |
32 def without_chunking(): | |
33 23.8 MiB 1.3 MiB print(chardet.detect(_bytes)['encoding']) | |
{'encoding': 'KOI8-R', 'confidence': 0.9411772590471068, 'language': 'Russian'} | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
13 23.8 MiB 23.8 MiB @profile | |
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024): | |
15 23.8 MiB 0.0 MiB detector = chardet.UniversalDetector() | |
16 23.8 MiB 0.0 MiB _body_memoryview = memoryview(_bytes) | |
17 # print(len(_bytes) == len(_body_memoryview)) | |
18 23.8 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1): | |
19 23.8 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size] | |
20 23.8 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes() | |
21 23.8 MiB 0.0 MiB detector.feed(_chunk_bytes) | |
22 23.8 MiB 0.0 MiB del _chunk_bytes | |
23 23.8 MiB 0.0 MiB if detector.done: | |
24 print("chunk " + str(i) + " reached") | |
25 break | |
26 | |
27 23.8 MiB 0.0 MiB detector.close() | |
28 23.8 MiB 0.0 MiB print(detector.result) | |
KOI8-R | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
31 23.8 MiB 23.8 MiB @profile | |
32 def without_chunking(): | |
33 23.8 MiB 0.0 MiB print(chardet.detect(_bytes)['encoding']) | |
{'encoding': 'EUC-KR', 'confidence': 0.99, 'language': 'Korean'} | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
13 23.8 MiB 23.8 MiB @profile | |
14 def chunked_with_memoryview(chunk_size: int = 2 * 1024): | |
15 23.8 MiB 0.0 MiB detector = chardet.UniversalDetector() | |
16 23.8 MiB 0.0 MiB _body_memoryview = memoryview(_bytes) | |
17 # print(len(_bytes) == len(_body_memoryview)) | |
18 23.8 MiB 0.0 MiB for i in range((len(_body_memoryview) // chunk_size) + 1): | |
19 23.8 MiB 0.0 MiB _chunk = _body_memoryview[i * chunk_size: (i + 1) * chunk_size] | |
20 23.8 MiB 0.0 MiB _chunk_bytes = _chunk.tobytes() | |
21 23.8 MiB 0.0 MiB detector.feed(_chunk_bytes) | |
22 23.8 MiB 0.0 MiB del _chunk_bytes | |
23 23.8 MiB 0.0 MiB if detector.done: | |
24 print("chunk " + str(i) + " reached") | |
25 break | |
26 | |
27 23.8 MiB 0.0 MiB detector.close() | |
28 23.8 MiB 0.0 MiB print(detector.result) | |
EUC-KR | |
Filename: memory_profiling_4112.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
31 23.8 MiB 23.8 MiB @profile | |
32 def without_chunking(): | |
33 23.8 MiB 0.0 MiB print(chardet.detect(_bytes)['encoding']) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment