Created
August 31, 2022 03:00
-
-
Save mertcangokgoz/aa45d6da9b3e1d5dfd9a7add60217d43 to your computer and use it in GitHub Desktop.
character encoding detection via chardet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
from collections import namedtuple | |
from chardet import UniversalDetector # pip install chardet | |
logger = logging.getLogger(__name__) | |
Guess = namedtuple("Guess", ["encoding", "confidence"]) | |
def detect_encodings(data: bytes) -> dict:
    """Map candidate encoding names to chardet confidence scores.

    Args:
        data: Raw bytes whose character encoding is unknown.

    Returns:
        A dict of ``{encoding_name: confidence}``. Empty input yields
        ``{"ascii": 1.0}``; when chardet reports no encoding at all the
        result is ``{"utf-8": 1.0}``. Otherwise the dict holds chardet's
        best guess plus whatever the detector's individual probers report.
    """
    if not data:
        return {"ascii": 1.0}

    detector = UniversalDetector()
    detector.reset()
    detector.feed(data)
    result = detector.close()
    if not result or not result["encoding"]:
        return {"utf-8": 1.0}

    encodings = {result["encoding"]: result["confidence"]}

    # ``_charset_probers`` is a private chardet attribute, so tolerate it
    # being absent or None rather than crashing after a successful detection.
    probers = getattr(detector, "_charset_probers", None) or ()  # noqa
    for prober in probers:
        if not prober:
            continue
        # NOTE(review): chardet appears to expose the prober's name as a
        # ``charset_name`` property in newer releases and as a
        # ``get_charset_name()`` method in older ones; accept either.
        name = getattr(prober, "charset_name", None)
        if name is None and hasattr(prober, "get_charset_name"):
            name = prober.get_charset_name()
        # A prober that has no guess reports no name; a None key would be
        # useless to callers that pass the name to bytes.decode().
        if name:
            encodings[name] = prober.get_confidence()
    return encodings
def guess_encodings(data: bytes) -> list[Guess]:
    """Return candidate encodings for *data*, most likely first.

    The candidates come from ``detect_encodings``. If "utf-8" is among them
    with a non-zero confidence, its score is pulled towards 1.0 (averaged
    with two perfect scores) so that UTF-8 is preferred over similarly
    ranked alternatives.

    Args:
        data: Raw bytes whose character encoding is unknown.

    Returns:
        A list of ``Guess(encoding, confidence)`` tuples sorted by
        descending confidence.
    """
    scores = detect_encodings(data)

    utf8_confidence = scores.get("utf-8", 0.0)
    if utf8_confidence > 0.0:
        scores["utf-8"] = (utf8_confidence + 2.0) / 3.0

    guesses = [
        Guess(encoding, confidence)
        for encoding, confidence in scores.items()
    ]
    sorted_encodings = sorted(
        guesses, key=lambda guess: guess.confidence, reverse=True
    )
    # Pass the list as a logging argument so the message is only formatted
    # when DEBUG logging is actually enabled.
    logger.debug("Possible encodings: %s", sorted_encodings)
    return sorted_encodings
def guess_encoding(data: bytes) -> str:
    """Return the most likely encoding that can actually decode *data*.

    Candidates from ``guess_encodings`` are tried in order of decreasing
    confidence; the first one that decodes the bytes without error wins.

    Args:
        data: Raw bytes whose character encoding is unknown.

    Returns:
        The name of the first candidate encoding that decodes *data*.

    Raises:
        ValueError: If none of the candidates can decode *data*.
    """
    for guess in guess_encodings(data):
        logger.debug("Trying encoding: %s", guess)
        try:
            data.decode(guess.encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            # UnicodeDecodeError: the bytes are not valid in this encoding.
            # LookupError: Python has no codec registered under this name.
            # TypeError: the candidate name is not a string (e.g. None).
            # In every case, fall through to the next candidate.
            continue
        return guess.encoding
    raise ValueError("No encoding could be guessed for this data")
def decode(data: bytes) -> str:
    """Decode *data* to text using the encoding picked by ``guess_encoding``.

    Any exception raised by ``guess_encoding`` propagates to the caller.
    """
    return data.decode(guess_encoding(data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment