Skip to content

Instantly share code, notes, and snippets.

@mertcangokgoz
Created August 31, 2022 03:00
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mertcangokgoz/aa45d6da9b3e1d5dfd9a7add60217d43 to your computer and use it in GitHub Desktop.
Save mertcangokgoz/aa45d6da9b3e1d5dfd9a7add60217d43 to your computer and use it in GitHub Desktop.
character encoding detection via chardet
import logging
from collections import namedtuple
from chardet import UniversalDetector # pip install chardet
# Lightweight record pairing a candidate encoding name with its confidence.
Guess = namedtuple("Guess", "encoding confidence")

# Module-level logger used for the debug traces emitted below.
logger = logging.getLogger(__name__)
def detect_encodings(data: bytes) -> dict:
    """Map candidate encoding names to chardet confidence scores for ``data``.

    The detector's final verdict is always included; the best guess of each
    individual prober the detector ran is added on top of it.

    Args:
        data: Raw bytes to analyse.

    Returns:
        A dict of ``{encoding_name: confidence}``. Empty input yields
        ``{"ascii": 1.0}``; when the detector reaches no verdict the result
        is ``{"utf-8": 1.0}``.
    """
    if not data:
        return {"ascii": 1.0}

    detector = UniversalDetector()
    detector.reset()
    detector.feed(data)
    result = detector.close()
    if not result or not result["encoding"]:
        return {"utf-8": 1.0}

    encodings = {result["encoding"]: result["confidence"]}
    # `_charset_probers` is private detector state, so tolerate its absence
    # instead of failing the whole detection.
    for prober in getattr(detector, "_charset_probers", ()):  # noqa
        if not prober:
            continue
        # chardet >= 3 exposes the name as the `charset_name` property;
        # older releases used a `get_charset_name()` method instead.
        name = getattr(prober, "charset_name", None)
        if name is None and hasattr(prober, "get_charset_name"):
            name = prober.get_charset_name()
        # A prober without a best guess reports no name; skip it rather
        # than storing a None key that can never be decoded with.
        if name:
            encodings[name] = prober.get_confidence()
    return encodings
def guess_encodings(data: bytes) -> list[Guess]:
    """Return candidate encodings for ``data``, most confident first.

    Candidates come from ``detect_encodings``. A "utf-8" candidate with a
    non-zero confidence is replaced by ``(confidence + 2) / 3``, which pulls
    its score toward 1.0 before ranking.

    Args:
        data: Raw bytes to analyse.

    Returns:
        ``Guess(encoding, confidence)`` tuples sorted by descending
        confidence.
    """
    confidences = detect_encodings(data)
    utf8_confidence = confidences.get("utf-8", 0.0)
    if utf8_confidence > 0.0:
        confidences["utf-8"] = (utf8_confidence + 2.0) / 3.0

    guesses = sorted(
        (Guess(name, confidence) for name, confidence in confidences.items()),
        key=lambda guess: guess.confidence,
        reverse=True,
    )
    # Pass the list as a logging argument so it is only formatted when
    # debug logging is actually enabled.
    logger.debug("Possible encodings: %s", guesses)
    return guesses
def guess_encoding(data: bytes) -> str:
    """Return the highest-ranked encoding that successfully decodes ``data``.

    Candidates from ``guess_encodings`` are tried in order; the first one
    that decodes the data without error wins.

    Args:
        data: Raw bytes to analyse.

    Returns:
        The name of the first candidate encoding that decodes ``data``.

    Raises:
        ValueError: If none of the candidates can decode ``data``.
    """
    for guess in guess_encodings(data):
        logger.debug("Trying encoding: %s", guess)
        try:
            data.decode(guess.encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            # UnicodeDecodeError: the guess does not fit the bytes.
            # LookupError: the detector named an encoding Python has no
            # codec for, so move on to the next candidate.
            # TypeError: the candidate name is not a string (e.g. None).
            continue
        return guess.encoding
    raise ValueError("No encoding could be guessed for this data")
def decode(data: bytes) -> str:
    """Decode ``data`` to text using the encoding picked by ``guess_encoding``.

    Any exception raised by ``guess_encoding`` propagates to the caller.
    """
    return data.decode(guess_encoding(data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment