Skip to content

Instantly share code, notes, and snippets.

@mjs3339
Last active August 5, 2018 18:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjs3339/399df63693fcbaf0b61bcd51d79a5bf1 to your computer and use it in GitHub Desktop.
Save mjs3339/399df63693fcbaf0b61bcd51d79a5bf1 to your computer and use it in GitHub Desktop.
C#: Convert Unicode (UTF-16) and UTF-32 to a UTF-8 Byte Array, Plus Determine Whether a File Is Binary or Text
/// <summary>
/// Heuristically classifies files and byte blocks as text or binary, detects the
/// encoding from a leading byte-order mark (BOM), and reads files as byte arrays
/// (converting UTF-16 LE / UTF-32 LE content to UTF-8).
/// </summary>
/// <remarks>
/// Classification results are accumulated in <see cref="BinaryFiles"/>,
/// <see cref="TextFiles"/> and <see cref="EncodingFiles"/>, keyed by file path.
/// Classifying or reading the same path again overwrites the earlier entry.
/// </remarks>
public class TextBinaryFileId
{
    // Searches for four consecutive 0x00 bytes; a result other than -1 is treated as "found".
    private readonly BoyerMoore _boyerMoore4Null = new BoyerMoore(new byte[] {0, 0, 0, 0});

    /// <summary>Paths classified as binary by <see cref="IsText"/>, mapped to the reason.</summary>
    public Dictionary<string, string> BinaryFiles = new Dictionary<string, string>();

    /// <summary>Paths read by <see cref="ReadBytes"/>, mapped to the detected encoding.</summary>
    public Dictionary<string, Encoding> EncodingFiles = new Dictionary<string, Encoding>();

    /// <summary>Paths classified as text by <see cref="IsText"/>, mapped to the percentage of non-text bytes found.</summary>
    public Dictionary<string, double> TextFiles = new Dictionary<string, double>();

    /// <summary>
    /// Decides whether an in-memory block looks like text.
    /// </summary>
    /// <param name="BinData">The bytes to examine.</param>
    /// <param name="ConfidenceThreshold">
    /// Maximum percentage (0-100) of non-text bytes the block may contain and still count as text.
    /// </param>
    /// <returns>
    /// <c>false</c> for a null or empty block, for a block containing four consecutive
    /// null bytes, or when the non-text percentage exceeds the threshold; otherwise <c>true</c>.
    /// </returns>
    public bool IsBlockText(byte[] BinData, double ConfidenceThreshold = 25)
    {
        // A null block is treated like an empty one instead of raising NullReferenceException.
        if (BinData == null || BinData.Length == 0)
        {
            return false;
        }

        if (_boyerMoore4Null.Search(BinData) != -1)
        {
            return false;
        }

        var enc = GetEncoding(BinData);
        var asp = InvalidBytePercentage(BinData, enc);
        return !(asp > ConfidenceThreshold);
    }

    /// <summary>
    /// Reads a whole file and records its BOM-detected encoding in <see cref="EncodingFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>
    /// For UTF-16 LE and UTF-32 LE files: the content transcoded to UTF-8 with the BOM removed.
    /// For every other detected encoding: the raw file bytes, unchanged.
    /// <c>null</c> if any exception occurred (the exception is passed to <c>ExceptionLog.ExLog</c>).
    /// </returns>
    public byte[] ReadBytes(string path)
    {
        try
        {
            var ba = File.ReadAllBytes(path);
            var enc = GetEncoding(ba);

            // Indexer assignment: Add() would throw for a path that was already read,
            // and the catch below would then turn a perfectly good read into null.
            EncodingFiles[path] = enc;

            if (Equals(enc, Encoding.Unicode) || Equals(enc, Encoding.UTF32))
            {
                // Transcode properly instead of keeping only the low byte of every code
                // unit, which corrupted every character outside the ASCII range.
                var bomLength = enc.GetPreamble().Length;
                return Encoding.Convert(enc, Encoding.UTF8, ba, bomLength, ba.Length - bomLength);
            }

            // NOTE(review): big-endian UTF-16 is detected but still returned raw (BOM included).
            return ba;
        }
        catch (Exception ex)
        {
            ExceptionLog.ExLog(ex, "ReadBytes", "ReadBytes");
        }

        return null;
    }

    /// <summary>
    /// Decides whether a file looks like text and records the verdict in
    /// <see cref="TextFiles"/> or <see cref="BinaryFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to examine.</param>
    /// <param name="EnsureConfidence">
    /// When <c>true</c> the whole file is examined and additionally decoded and re-encoded
    /// to verify it round-trips; when <c>false</c> only the first 512 bytes are examined.
    /// </param>
    /// <param name="ConfidenceThreshold">
    /// Maximum percentage (0-100) of non-text bytes allowed for the file to count as text.
    /// </param>
    /// <returns><c>true</c> if the file is classified as text; otherwise <c>false</c>.</returns>
    public bool IsText(string path, bool EnsureConfidence = false, double ConfidenceThreshold = 100)
    {
        var Reason = "None";
        var isText = true;
        double asp = 0;
        using (var fileStream = File.OpenRead(path))
        {
            if (fileStream.Length == 0)
            {
                BinaryFiles[path] = "Zero Length File.";
                return false;
            }

            long WindowSize = EnsureConfidence ? fileStream.Length : Math.Min(512L, fileStream.Length);
            var BinData = new byte[WindowSize];

            // Stream.Read may return fewer bytes than requested, so keep reading until
            // the window is full or the stream ends.
            var rawLength = 0;
            while (rawLength < BinData.Length)
            {
                var read = fileStream.Read(BinData, rawLength, BinData.Length - rawLength);
                if (read == 0)
                {
                    break;
                }

                rawLength += read;
            }

            fileStream.Seek(0, SeekOrigin.Begin);

            // Files shorter than four bytes must consist solely of printable text bytes.
            // Such files are only recorded when they fail the check.
            if (fileStream.Length < 4)
            {
                var r = BinData.All(b => IsValidTextByte(b));
                if (!r)
                {
                    BinaryFiles[path] = "Length 4 file Contains invalid Characters.";
                }

                return r;
            }

            if (_boyerMoore4Null.Search(BinData) != -1)
            {
                Reason = "4 Sequential Nulls Found within File.";
                isText = false;
            }

            var enc = GetEncoding(BinData);
            if (EnsureConfidence && isText)
            {
                // Round-trip check: decode the file with StreamReader, re-encode it with the
                // detected encoding and require the result to reproduce the original bytes.
                var TextData = new char[WindowSize];
                using (var streamReader = new StreamReader(fileStream))
                {
                    streamReader.Read(TextData, 0, TextData.Length);
                }

                using (var memoryStream = new MemoryStream())
                {
                    using (var streamWriter = new StreamWriter(memoryStream, enc))
                    {
                        streamWriter.Write(TextData);
                        streamWriter.Flush();
                        var memoryBuffer = memoryStream.GetBuffer();
                        for (var i = 0; i < rawLength && isText; i++)
                        {
                            isText = BinData[i] == memoryBuffer[i];
                            if (!isText)
                            {
                                Reason = $"Encoding Mismatch Found at position: {i}";
                            }
                        }
                    }
                }
            }

            if (isText)
            {
                asp = InvalidBytePercentage(BinData, enc);
                if (asp > ConfidenceThreshold)
                {
                    Reason = $"Confidence threshold {ConfidenceThreshold:0.0} Exceeded: {asp:0.0}";
                    isText = false;
                }
            }
        }

        // Indexer assignment so that classifying the same path twice does not throw.
        if (isText)
        {
            TextFiles[path] = asp.TruncateToDecimalPlace(1);
        }
        else
        {
            BinaryFiles[path] = Reason;
        }

        return isText;
    }

    /// <summary>
    /// Computes the percentage (0-100) of bytes in <paramref name="data"/> that
    /// <see cref="IsValidTextByte"/> rejects. Null bytes are accepted unless the encoding
    /// is the UTF8, UTF7 or ASCII instance exposed by <see cref="Encoding"/>.
    /// </summary>
    private static double InvalidBytePercentage(byte[] data, Encoding enc)
    {
        var includeNull = !(enc == Encoding.UTF8 || enc == Encoding.UTF7 || enc == Encoding.ASCII);
        var invalid = 0;

        // A plain loop gives the same count as the former PLINQ query without its per-call overhead.
        foreach (var b in data)
        {
            if (!IsValidTextByte(b, includeNull))
            {
                invalid++;
            }
        }

        return (double) invalid / data.Length * 100d;
    }

    /// <summary>
    /// Detects the encoding of <paramref name="Data"/> from its leading byte-order mark.
    /// </summary>
    /// <param name="Data">The bytes to inspect; must not be null.</param>
    /// <returns>
    /// UTF32, Unicode, BigEndianUnicode, UTF8 or UTF7 when the matching BOM is present;
    /// <see cref="Encoding.Default"/> otherwise.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data"/> is null.</exception>
    private static Encoding GetEncoding(byte[] Data)
    {
        if (Data == null)
        {
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        }

        // The UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE), so it has
        // to be tested first; otherwise UTF-32 could never be detected.
        if (Data.Length >= 4 && Data[0] == 0xff && Data[1] == 0xfe && Data[2] == 0 && Data[3] == 0)
        {
            return Encoding.UTF32;
        }

        if (Data.Length < 2)
        {
            return Encoding.Default;
        }

        if (Data[0] == 0xff && Data[1] == 0xfe)
        {
            return Encoding.Unicode;
        }

        if (Data[0] == 0xfe && Data[1] == 0xff)
        {
            return Encoding.BigEndianUnicode;
        }

        if (Data.Length < 3)
        {
            return Encoding.Default;
        }

        if (Data[0] == 0xef && Data[1] == 0xbb && Data[2] == 0xbf)
        {
            return Encoding.UTF8;
        }

        if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
        {
            return Encoding.UTF7;
        }

        return Encoding.Default;
    }

    /// <summary>
    /// Tells whether a byte is LF, CR, TAB or a printable ASCII character (0x20-0x7E).
    /// </summary>
    /// <param name="_byte">The byte to test.</param>
    /// <param name="IncludeNull">When <c>true</c>, 0x00 is accepted as well.</param>
    private static bool IsValidTextByte(byte _byte, bool IncludeNull = false)
    {
        if (IncludeNull && _byte == 0x00)
        {
            return true;
        }

        return _byte == 0x0A
               || _byte == 0x0D
               || _byte == 0x09
               || (_byte >= 0x20 && _byte <= 0x7E);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment