Created
May 10, 2022 10:15
-
-
Save klinkby/26de88bf36e84bb832edeca9aa361509 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Samples up to <paramref name="bufferSize"/> bytes from <paramref name="s"/> and guesses
/// the text encoding from them via <c>GetEncodingFromBuffer</c>.
/// </summary>
/// <param name="s">
/// Stream to sample. Sampling starts at the stream's current position; after a non-empty
/// read the stream is repositioned to offset 0.
/// </param>
/// <param name="bufferSize">Maximum number of bytes to sample.</param>
/// <param name="ct">Token used to cancel the pending reads.</param>
/// <returns>
/// The detected encoding, or <see cref="Encoding.Default"/> when the stream cannot both
/// seek and read, or when no bytes could be read.
/// </returns>
static async Task<Encoding> GetEncodingFromStream(Stream s, int bufferSize, CancellationToken ct)
{
    if (!s.CanSeek || !s.CanRead)
    {
        // can't touch this: the sampled bytes could not be handed back to the caller
        return Encoding.Default;
    }

    byte[] buffer = new byte[bufferSize];
    int bytesRead = 0;

    // A single ReadAsync call is allowed to return fewer bytes than requested even when
    // more data follows, so keep reading until the buffer is full or the stream ends.
    while (bytesRead < buffer.Length)
    {
        int chunk = await s.ReadAsync(buffer, bytesRead, buffer.Length - bytesRead, ct);
        if (0 == chunk)
        {
            // end of stream
            break;
        }

        bytesRead += chunk;
    }

    if (0 == bytesRead)
    {
        // file is empty (nothing was consumed, so there is nothing to rewind)
        return Encoding.Default;
    }

    s.Seek(0, SeekOrigin.Begin);
    return GetEncodingFromBuffer(buffer, bytesRead);
}
/// <summary>
/// Guesses the text encoding of the first <paramref name="count"/> bytes of
/// <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Buffer holding the sampled bytes.</param>
/// <param name="count">Number of valid bytes at the start of <paramref name="buffer"/>.</param>
/// <returns>
/// The encoding whose byte order mark starts the data; otherwise UTF-8 or Latin-1 when the
/// data contains one of the characters "æøåÆØÅ" encoded that way; otherwise
/// <see cref="Encoding.Default"/>.
/// </returns>
static Encoding GetEncodingFromBuffer(byte[] buffer, int count)
{
    // Strategy 1: Does it start with one of the byte order marks (bom)?
    // Longer preambles are tried first so that a mark which merely begins with a shorter
    // one (e.g. UTF-32 LE "FF FE 00 00" vs. UTF-16 LE "FF FE") is not shadowed by it.
    Encoding? theEncoding = Encoding.GetEncodings()
        .Select(x => x.GetEncoding())
        .Where(x => 0 != x.Preamble.Length)
        .OrderByDescending(x => x.Preamble.Length)
        // Compare against the valid bytes only: anything past `count` is stale/zero filler
        // and must not take part in the match, and a sample shorter than the preamble
        // simply does not match instead of throwing.
        .FirstOrDefault(x => new ReadOnlySpan<byte>(buffer, 0, count).StartsWith(x.Preamble));
    if (theEncoding is not null)
    {
        // detected from bom
        return theEncoding;
    }

    // Strategy 2: Detect from national chars
    var eightBitEncodings = new[] { Encoding.UTF8, Encoding.Latin1 }; // DOS code pages not supported in netcore
    const string NationalChars = "æøåÆØÅ";
    // One (byte sequence, encoding) pair per character and encoding; UTF-8 samples come
    // first because eightBitEncodings lists UTF-8 first.
    var encodingSamples = eightBitEncodings.SelectMany(x => NationalChars.Select(y => (Sample: x.GetBytes(y.ToString()), Encoding: x)));
    foreach (var es in encodingSamples)
    {
        // look for encoding sample
        if (-1 != IndexOf(buffer, es.Sample, count))
        {
            // detected from national chars
            return es.Encoding;
        }
    }

    // Strategy 3: fall back
    return Encoding.Default;
}
/// <summary>
/// Finds the first occurrence of <paramref name="pattern"/> within the first
/// <paramref name="count"/> bytes of <paramref name="array"/>.
/// </summary>
/// <param name="array">Bytes to search.</param>
/// <param name="pattern">Byte sequence to look for.</param>
/// <param name="count">Number of leading bytes of <paramref name="array"/> to search.</param>
/// <returns>
/// The zero-based index at which <paramref name="pattern"/> starts, or -1 when it does not
/// occur within the searched range.
/// </returns>
static int IndexOf(byte[] array, byte[] pattern, int count)
{
    // The span search restarts correctly after a partial match, so a pattern that overlaps
    // its own prefix (e.g. {1, 2} inside {1, 1, 2}) is still found.
    ReadOnlySpan<byte> haystack = new ReadOnlySpan<byte>(array, 0, count);
    ReadOnlySpan<byte> needle = pattern;
    return haystack.IndexOf(needle);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment