Skip to content

Instantly share code, notes, and snippets.

@TaoK
Last active July 21, 2023 08:56
Show Gist options
  • Star 35 You must be signed in to star a gist
  • Fork 19 You must be signed in to fork a gist
  • Save TaoK/945127 to your computer and use it in GitHub Desktop.
Save TaoK/945127 to your computer and use it in GitHub Desktop.
Simple class to automatically detect text file encoding, with English-biased "best guess" heuristic based on byte patterns in the absence of BOM.
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
namespace KlerksSoft
{
public static class TextFileEncodingDetector
{
/*
* Simple class to handle text file encoding woes (in a primarily English-speaking tech
* world).
*
* - This code is fully managed, no shady calls to MLang (the unmanaged codepage
* detection library originally developed for Internet Explorer).
*
* - This class does NOT try to detect arbitrary codepages/charsets, it really only
* aims to differentiate between some of the most common variants of Unicode
* encoding, and a "default" (western / ascii-based) encoding alternative provided
* by the caller.
*
* - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and
* Windows-1252 (in .Net, also incorrectly called "ASCII") encodings, we use a
* heuristic - so the more of the file we can sample the better the guess. If you
* are going to read the whole file into memory at some point, then best to pass
* in the whole byte array directly. Otherwise, decide how to trade off
* reliability against performance / memory usage.
*
* - The UTF-8 detection heuristic only works for western text, as it relies on
* the presence of UTF-8 encoded accented and other characters found in the upper
* ranges of the Latin-1 and (particularly) Windows-1252 codepages.
*
* - For more general detection routines, see existing projects / resources:
* - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)
* - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx
* - CharDet - Mozilla browser's detection routines
* - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/
* - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse
*
* Copyright Tao Klerks, Jan 2010, tao@klerks.biz
*
* Permission is hereby granted to use, modify and distribute this code freely, provided
* the original copyright attribution remains.
*
*/
// Number of bytes sampled for the heuristics when the caller does not choose a size
// (0x10000 = 64 KiB). The value is arbitrary and is likely too large when many files
// must be scanned or when speed matters.
const long _defaultHeuristicSampleSize = 0x10000;

/// <summary>
/// Opens the named file for reading and guesses its text encoding, sampling at most
/// the default heuristic sample size from the start of the file.
/// </summary>
/// <param name="InputFilename">Path of the file to inspect.</param>
/// <param name="DefaultEncoding">Encoding returned when no Unicode encoding is recognised.</param>
/// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)
{
    // The stream is only needed for the duration of the detection; dispose it right after.
    using (FileStream sourceStream = File.OpenRead(InputFilename))
    {
        Encoding detected = DetectTextFileEncoding(sourceStream, DefaultEncoding, _defaultHeuristicSampleSize);
        return detected;
    }
}
/// <summary>
/// Guesses the text encoding of an open file stream: first by looking for a byte order
/// mark in the first (up to) four bytes, then by running the byte-pattern heuristics on
/// a sample taken from the start of the file.
/// </summary>
/// <param name="InputFileStream">
/// Readable, seekable stream. Its position is saved on entry and restored before the
/// method returns, including when it exits with an exception.
/// </param>
/// <param name="DefaultEncoding">Encoding returned when no Unicode encoding is recognised.</param>
/// <param name="HeuristicSampleSize">
/// Maximum number of bytes (from the start of the file) handed to the heuristics.
/// BOM detection always inspects up to four bytes regardless of this value.
/// </param>
/// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="InputFileStream"/> is null.</exception>
/// <exception cref="ArgumentException">The stream cannot be read or cannot seek.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="HeuristicSampleSize"/> is negative.</exception>
public static Encoding DetectTextFileEncoding(FileStream InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)
{
    // Note the argument order: ArgumentNullException takes (paramName, message),
    // whereas ArgumentException takes (message, paramName).
    if (InputFileStream == null)
        throw new ArgumentNullException("InputFileStream", "Must provide a valid Filestream!");
    if (!InputFileStream.CanRead)
        throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");
    if (!InputFileStream.CanSeek)
        throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");
    if (HeuristicSampleSize < 0)
        throw new ArgumentOutOfRangeException("HeuristicSampleSize", "Heuristic sample size cannot be negative!");

    long originalPos = InputFileStream.Position;
    try
    {
        InputFileStream.Position = 0;
        long fileLength = InputFileStream.Length;

        // First read only what we need for BOM detection.
        byte[] bomBytes = new byte[Math.Min(fileLength, 4)];
        int bomBytesRead = ReadUntilFull(InputFileStream, bomBytes, 0, bomBytes.Length);
        if (bomBytesRead < bomBytes.Length)
            Array.Resize(ref bomBytes, bomBytesRead); // file shrank while we were reading

        Encoding encodingFound = DetectBOMBytes(bomBytes);
        if (encodingFound != null)
            return encodingFound;

        // BOM detection failed, going for heuristics now.
        // The sample may legitimately be smaller than the BOM buffer (tiny sample size),
        // so never copy more than the sample can hold.
        byte[] sampleBytes = new byte[Math.Min(fileLength, HeuristicSampleSize)];
        int sampleBytesFilled = Math.Min(bomBytes.Length, sampleBytes.Length);
        Array.Copy(bomBytes, sampleBytes, sampleBytesFilled);

        // When more room is left, the stream is positioned right after the BOM bytes,
        // which is exactly where the remainder of the sample starts.
        if (sampleBytes.Length > sampleBytesFilled)
            sampleBytesFilled += ReadUntilFull(InputFileStream, sampleBytes, sampleBytesFilled, sampleBytes.Length - sampleBytesFilled);
        if (sampleBytesFilled < sampleBytes.Length)
            Array.Resize(ref sampleBytes, sampleBytesFilled); // do not feed unread zero bytes to the heuristics

        // Test byte array content.
        encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);
        return encodingFound ?? DefaultEncoding;
    }
    finally
    {
        // Leave the caller's stream where we found it, whatever happened above.
        InputFileStream.Position = originalPos;
    }
}

// Reads from the stream until 'count' bytes have been stored or the stream ends, because a
// single Stream.Read call may return fewer bytes than requested. Returns the bytes stored.
private static int ReadUntilFull(Stream source, byte[] buffer, int offset, int count)
{
    int total = 0;
    while (total < count)
    {
        int read = source.Read(buffer, offset + total, count - total);
        if (read <= 0)
            break;
        total += read;
    }
    return total;
}
/// <summary>
/// Guesses the text encoding of an in-memory byte array: a byte order mark wins if one
/// is present, otherwise the byte-pattern heuristics are applied to the whole array.
/// </summary>
/// <param name="TextData">Raw bytes of the text to inspect.</param>
/// <param name="DefaultEncoding">Encoding returned when no Unicode encoding is recognised.</param>
/// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="TextData"/> is null.</exception>
public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)
{
    // ArgumentNullException takes (paramName, message) - the reverse of ArgumentException.
    if (TextData == null)
        throw new ArgumentNullException("TextData", "Must provide a valid text data byte array!");

    Encoding encodingFound = DetectBOMBytes(TextData);
    if (encodingFound == null)
    {
        // No BOM: test byte array content.
        encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);
    }
    return encodingFound ?? DefaultEncoding;
}
/// <summary>
/// Identifies a Unicode encoding from a byte order mark at the start of the given bytes.
/// </summary>
/// <param name="BOMBytes">
/// The leading bytes of the data. Only the first four are inspected; shorter arrays are
/// accepted and simply rule out the longer marks.
/// </param>
/// <returns>
/// The encoding announced by the BOM (UTF-16 LE/BE, UTF-8, UTF-7, UTF-32 LE/BE), or
/// null when no BOM is recognised.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="BOMBytes"/> is null.</exception>
public static Encoding DetectBOMBytes(byte[] BOMBytes)
{
    // ArgumentNullException takes (paramName, message) - the reverse of ArgumentException.
    if (BOMBytes == null)
        throw new ArgumentNullException("BOMBytes", "Must provide a valid BOM byte array!");

    if (BOMBytes.Length < 2)
        return null;

    // FF FE is UTF-16 LE, unless it continues with 00 00, which makes it the UTF-32 LE mark.
    if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && (BOMBytes.Length < 4 || BOMBytes[2] != 0 || BOMBytes[3] != 0))
        return Encoding.Unicode;
    if (BOMBytes[0] == 0xfe && BOMBytes[1] == 0xff)
        return Encoding.BigEndianUnicode;

    if (BOMBytes.Length < 3)
        return null;

    if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)
        return Encoding.UTF8;

    // The UTF-7 mark is "+/v" followed by one of '8', '9', '+' or '/'. When a fourth byte
    // is available it must be one of those, otherwise ordinary text that merely starts
    // with "+/v" would be reported as UTF-7.
    if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76
        && (BOMBytes.Length < 4
            || BOMBytes[3] == 0x38 || BOMBytes[3] == 0x39 || BOMBytes[3] == 0x2b || BOMBytes[3] == 0x2f))
        return Encoding.UTF7;

    if (BOMBytes.Length < 4)
        return null;

    if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)
        return Encoding.UTF32;
    if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)
        return Encoding.GetEncoding(12001); // code page 12001 = UTF-32 big endian

    return null;
}
/// <summary>
/// Guesses, from byte statistics alone (no BOM), whether a sample of western text is
/// UTF-16 LE, UTF-16 BE or UTF-8.
/// </summary>
/// <param name="SampleBytes">Bytes taken from the start of the text; larger samples give better guesses.</param>
/// <returns>
/// <see cref="Encoding.Unicode"/>, <see cref="Encoding.BigEndianUnicode"/> or
/// <see cref="Encoding.UTF8"/> when the sample looks like that encoding; null when none
/// of the heuristics fire (the caller is expected to fall back to its default).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="SampleBytes"/> is null.</exception>
public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
{
    if (SampleBytes == null)
        throw new ArgumentNullException("SampleBytes", "Must provide a valid sample byte array!");
    if (SampleBytes.Length == 0)
        return null; // nothing to measure; avoids dividing by a zero length below

    long oddBinaryNullsInSample = 0;
    long evenBinaryNullsInSample = 0;
    long suspiciousUTF8SequenceCount = 0;
    long suspiciousUTF8BytesTotal = 0;
    long likelyUSASCIIBytesInSample = 0;

    // Cycle through, keeping count of binary null positions, possible UTF-8
    // sequences from upper ranges of Windows-1252, and probable US-ASCII
    // character counts.
    int skipUTF8Bytes = 0;
    for (long currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
    {
        byte current = SampleBytes[currentPos];

        // binary null distribution
        if (current == 0)
        {
            if (currentPos % 2 == 0)
                evenBinaryNullsInSample++;
            else
                oddBinaryNullsInSample++;
        }

        // likely US-ASCII characters
        if (IsCommonUSASCIIByte(current))
            likelyUSASCIIBytesInSample++;

        // suspicious sequences (look like UTF-8); bytes already counted as the tail of a
        // sequence are skipped so that they cannot start a new one.
        if (skipUTF8Bytes == 0)
        {
            int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
            if (lengthFound > 0)
            {
                suspiciousUTF8SequenceCount++;
                suspiciousUTF8BytesTotal += lengthFound;
                skipUTF8Bytes = lengthFound - 1;
            }
        }
        else
        {
            skipUTF8Bytes--;
        }
    }

    // Each ratio is "nulls at that parity / bytes at that parity" (half the sample).
    double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
    double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

    // 1: UTF-16 LE - in english / european environments, this is usually characterized by a
    //    high proportion of odd binary nulls (starting at 0), with (as this is text) a low
    //    proportion of even binary nulls.
    //    The thresholds used (less than 20% nulls where you expect non-nulls, and more than
    //    60% nulls where you do expect nulls) are completely arbitrary.
    if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
        return Encoding.Unicode;

    // 2: UTF-16 BE - the mirror image: many even binary nulls, few odd ones.
    if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
        return Encoding.BigEndianUnicode;

    // 3: UTF-8 - the sample must first be able to BE UTF-8 at all. The byte-level check
    //    follows the pattern Martin Duerst gives in the w3c.org unicode FAQ entry
    //    http://www.w3.org/International/questions/qa-forms-utf-8
    //    It works on the raw bytes: decoding them as ASCII first would turn every byte
    //    above 0x7F into '?', hiding exactly the bytes that have to be validated.
    if (!CouldBeUTF8(SampleBytes))
        return null;

    // Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
    // If all the characters are in the 0-127 range, no harm done, most western charsets are
    // the same as UTF-8 in these ranges. If some of the characters were in the upper range
    // (western accented characters), however, they would likely be mangled to 2-byte by the
    // UTF-8 encoding process. So, we need to play stats.
    //
    // The "random" likelihood of any pair of randomly generated characters being one
    // of these "suspicious" character sequences is: 128 / (256 * 256) = 0.2%.
    //
    // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
    // character range, so we assume that more than 1 in 500,000 of these character
    // sequences indicates UTF-8. The number 500,000 is completely arbitrary.
    //
    // We can only assume these character sequences will be rare if we ALSO assume that this
    // IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
    // not already suspicious sequences) should be plain US-ASCII bytes. This threshold is
    // arbitrarily set at 80% (a random distribution, eg binary data, would yield approx 40%,
    // so the chances of hitting it by accident in random data are VERY low).
    bool enoughSuspiciousSequences = suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1;
    long bytesOutsideSuspiciousSequences = SampleBytes.Length - suspiciousUTF8BytesTotal;
    if (enoughSuspiciousSequences
        && (bytesOutsideSuspiciousSequences == 0 // all suspicious, so cannot evaluate proportion of US-ASCII
            || likelyUSASCIIBytesInSample * 1.0 / bytesOutsideSuspiciousSequences >= 0.8))
        return Encoding.UTF8;

    return null;
}

// Returns true when every byte of the sample is either printable ASCII / TAB / LF / CR or part
// of a well-formed UTF-8 multi-byte sequence. A sequence cut off by the end of the sample is
// tolerated, because samples are often a fixed-size prefix of a larger file.
private static bool CouldBeUTF8(byte[] bytes)
{
    int pos = 0;
    while (pos < bytes.Length)
    {
        byte lead = bytes[pos];
        if (lead == 0x09 || lead == 0x0A || lead == 0x0D || (lead >= 0x20 && lead <= 0x7E))
        {
            pos++;
            continue;
        }

        // Number of continuation bytes, and the allowed range for the FIRST of them
        // (later continuation bytes are always 0x80-0xBF).
        int trailCount;
        int firstTrailMin = 0x80;
        int firstTrailMax = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF)
        {
            trailCount = 1;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            trailCount = 2;
            if (lead == 0xE0)
                firstTrailMin = 0xA0; // excludes overlong encodings
            else if (lead == 0xED)
                firstTrailMax = 0x9F; // excludes UTF-16 surrogates
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            trailCount = 3;
            if (lead == 0xF0)
                firstTrailMin = 0x90; // excludes overlong encodings
            else if (lead == 0xF4)
                firstTrailMax = 0x8F; // nothing above U+10FFFF
        }
        else
        {
            // Other control characters, 0x7F, a stray continuation byte, or a byte that
            // can never start a UTF-8 sequence (0xC0, 0xC1, 0xF5-0xFF).
            return false;
        }

        for (int i = 1; i <= trailCount; i++)
        {
            if (pos + i >= bytes.Length)
                return true; // sample ends mid-sequence; everything before it was valid

            int trail = bytes[pos + i];
            int min = (i == 1) ? firstTrailMin : 0x80;
            int max = (i == 1) ? firstTrailMax : 0xBF;
            if (trail < min || trail > max)
                return false;
        }
        pos += trailCount + 1;
    }
    return true;
}
/// <summary>
/// Tells whether a byte is one that plain US-ASCII text is normally made of: tab, line
/// feed, carriage return, or any printable character from space (0x20) to tilde (0x7E).
/// </summary>
/// <param name="testByte">The byte to classify.</param>
/// <returns>true for the bytes listed above; false for everything else.</returns>
private static bool IsCommonUSASCIIByte(byte testByte)
{
    switch (testByte)
    {
        case 0x09: // tab
        case 0x0A: // lf
        case 0x0D: // cr
            return true;
        default:
            // Punctuation, digits, capital and lowercase letters together form the
            // contiguous range 0x20-0x7E.
            return testByte >= 0x20 && testByte <= 0x7E;
    }
}
/// <summary>
/// Checks whether the bytes starting at <paramref name="currentPos"/> form the UTF-8
/// encoding of one of the characters found in the upper half (0x80-0xFF) of the
/// Windows-1252 code page - the sequences expected when western text is stored as UTF-8.
/// </summary>
/// <param name="SampleBytes">The sample being scanned.</param>
/// <param name="currentPos">Index of the candidate lead byte.</param>
/// <returns>
/// The length of the sequence (2 or 3) when one is recognised; 0 otherwise, including
/// when too few bytes remain in the sample for the sequence to be complete.
/// </returns>
private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
{
    // Every sequence of interest needs at least two bytes. Checking what is actually left
    // keeps a lead byte at the very end of the sample from indexing past the array.
    long bytesAvailable = SampleBytes.Length - currentPos;
    if (bytesAvailable < 2)
        return 0;

    byte second = SampleBytes[currentPos + 1];
    switch (SampleBytes[currentPos])
    {
        case 0xC2:
            // U+0081/8D/8F/90/9D (the code points Windows-1252 leaves undefined)
            // and U+00A0-U+00BF (no-break space through inverted question mark).
            if (second == 0x81 || second == 0x8D || second == 0x8F
                || second == 0x90 || second == 0x9D
                || (second >= 0xA0 && second <= 0xBF))
                return 2;
            return 0;

        case 0xC3:
            // U+00C0-U+00FF: the accented Latin-1 letters.
            return (second >= 0x80 && second <= 0xBF) ? 2 : 0;

        case 0xC5:
            // OE ligatures, S/s with caron, Y with diaeresis, Z/z with caron.
            switch (second)
            {
                case 0x92:
                case 0x93:
                case 0xA0:
                case 0xA1:
                case 0xB8:
                case 0xBD:
                case 0xBE:
                    return 2;
                default:
                    return 0;
            }

        case 0xC6:
            // f with hook (florin).
            return second == 0x92 ? 2 : 0;

        case 0xCB:
            // modifier circumflex and small tilde.
            return (second == 0x86 || second == 0x9C) ? 2 : 0;

        case 0xE2:
        {
            // Three-byte sequences: make sure the third byte exists before reading it.
            if (bytesAvailable < 3)
                return 0;
            byte third = SampleBytes[currentPos + 2];
            switch (second)
            {
                case 0x80:
                    // Dashes, curly and low quotes, daggers, bullet, ellipsis,
                    // per mille sign and single angle quotes.
                    switch (third)
                    {
                        case 0x93:
                        case 0x94:
                        case 0x98:
                        case 0x99:
                        case 0x9A:
                        case 0x9C:
                        case 0x9D:
                        case 0x9E:
                        case 0xA0:
                        case 0xA1:
                        case 0xA2:
                        case 0xA6:
                        case 0xB0:
                        case 0xB9:
                        case 0xBA:
                            return 3;
                        default:
                            return 0;
                    }
                case 0x82:
                    return third == 0xAC ? 3 : 0; // euro sign
                case 0x84:
                    return third == 0xA2 ? 3 : 0; // trade mark sign
                default:
                    return 0;
            }
        }

        default:
            return 0;
    }
}
}
}
@X3MBoy
Copy link

X3MBoy commented Apr 4, 2016

It works very well, but it will help if you specify in the description that is C# code.

@maxiwheat
Copy link

See my revision, it addresses a bug when the last byte from the SampleBytes array is the first byte from a multi-byte UTF-8 sequence. It used to throw an Exception: "Index was outside the bounds of the array". My revision fixes it.

@baharh
Copy link

baharh commented Sep 4, 2016

hi thank for your attention,how can use this code ?

@sumanthjain1994
Copy link

Please anyone help me out to get same code in JAVA??

@ElektroStudios
Copy link

Unfortunately it doesn't work to detect the encoding of a file with encoding: Windows-1252 a.k.a Latin-1. Probably could be more encodings that this solution can't detect.

@prantzos
Copy link

prantzos commented Sep 9, 2020

I was testing it with a UTF8 encoded byte[] and it did not detect it as UTF8 encoded string.

@nextb123
Copy link

Please how can I use this code in my application, I am new to c#. Thanks.

@nextb123
Copy link

Hello everyone.
Would like me to try this code in my app.
But I don't know how to call this class in my application, I don't know how to use this class...
For example, if my file is located in the path: C:\TestFolder\Test.txt, how can I detect code page of this file Test.txt.
He is asking for your help. Thanks in advance.

for example something like:

private void Button1_Click(object sender, EventArgs e)
{
string path = @"C:\TestFolder\Test.txt";
TextFileEncodingDetector.DetectTextFileEncoding(path);
//.......
}

@houssem11957
Copy link

Hello everyone. Would like me to try this code in my app. But I don't know how to call this class in my application, I don't know how to use this class... For example, if my file is located in the path: C:\TestFolder\Test.txt, how can I detect code page of this file Test.txt. He is asking for your help. Thanks in advance.

for example something like:

private void Button1_Click(object sender, EventArgs e) { string path = @"C:\TestFolder\Test.txt"; TextFileEncodingDetector.DetectTextFileEncoding(path); //....... }

you have to read the content from the file before passing it to the methods ...
you can use :
var text = System.IO.File.ReadAllText(path)
and then pass the text to the encoding detector ...

@jochenwezel
Copy link

Please note/fix following bug: the argument HeuristicSampleSize is not used in following method:

public static Encoding DetectTextFileEncoding(FileStream InputFileStream, long HeuristicSampleSize)
{
            bool uselessBool = false;
            return DetectTextFileEncoding(InputFileStream, _defaultHeuristicSampleSize, out uselessBool);
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment