Last active
August 5, 2018 18:48
-
-
Save mjs3339/399df63693fcbaf0b61bcd51d79a5bf1 to your computer and use it in GitHub Desktop.
C# Convert Unicode, and UTF32 to UTF8 Byte Array, Plus Determine File Type Binary or Text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Classifies files and in-memory byte blocks as text or binary.
/// The heuristics are: byte-order-mark sniffing, a search for four consecutive
/// zero bytes, and the percentage of bytes that fall outside tab/CR/LF and
/// printable ASCII. Results of the file-based checks are recorded in the public
/// dictionaries, keyed by the path that was passed in.
/// </summary>
public class TextBinaryFileId
{
    // Searcher for a run of four zero bytes; any hit is treated as "binary".
    private readonly BoyerMoore _boyerMoore4Null = new BoyerMoore(new byte[] {0, 0, 0, 0});

    /// <summary>Paths classified as binary by <see cref="IsText"/>, with the reason.</summary>
    public Dictionary<string, string> BinaryFiles = new Dictionary<string, string>();

    /// <summary>Encoding detected for each path read through <see cref="ReadBytes"/>.</summary>
    public Dictionary<string, Encoding> EncodingFiles = new Dictionary<string, Encoding>();

    /// <summary>Paths classified as text by <see cref="IsText"/>, with the percentage of non-text bytes.</summary>
    public Dictionary<string, double> TextFiles = new Dictionary<string, double>();

    /// <summary>
    /// Decides whether a block of bytes looks like text.
    /// </summary>
    /// <param name="BinData">The bytes to inspect.</param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of non-text bytes.</param>
    /// <returns>
    /// <c>false</c> for an empty block, a block containing four consecutive zero bytes,
    /// or a block whose non-text percentage exceeds the threshold; otherwise <c>true</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="BinData"/> is null.</exception>
    public bool IsBlockText(byte[] BinData, double ConfidenceThreshold = 25)
    {
        if (BinData == null)
        {
            throw new ArgumentNullException(nameof(BinData));
        }

        if (BinData.Length == 0)
        {
            return false;
        }

        if (_boyerMoore4Null.Search(BinData) != -1)
        {
            return false;
        }

        var enc = GetEncoding(BinData);
        var asp = (double) CountNonTextBytes(BinData, AllowsNullBytes(enc)) / BinData.Length * 100d;
        return !(asp > ConfidenceThreshold);
    }

    /// <summary>
    /// Reads a file and returns its content as bytes. Content that starts with a
    /// UTF-16 LE or UTF-32 LE byte-order mark is converted to UTF-8 (without a BOM);
    /// everything else is returned exactly as read. The detected encoding is stored
    /// in <see cref="EncodingFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The (possibly converted) bytes, or <c>null</c> if an exception was logged.</returns>
    public byte[] ReadBytes(string path)
    {
        try
        {
            var ba = File.ReadAllBytes(path);
            var enc = GetEncoding(ba);

            // Indexer instead of Add: Add throws on a path that was read before, and
            // that exception used to be swallowed below, turning a re-read into null.
            EncodingFiles[path] = enc;

            if (Equals(enc, Encoding.Unicode) || Equals(enc, Encoding.UTF32))
            {
                // Real transcoding rather than keeping the low byte of each code unit,
                // which corrupted every character outside ASCII.
                var bomLength = enc.GetPreamble().Length;
                return Encoding.Convert(enc, Encoding.UTF8, ba, bomLength, ba.Length - bomLength);
            }

            return ba;
        }
        catch (Exception ex)
        {
            ExceptionLog.ExLog(ex, "ReadBytes", "ReadBytes");
        }

        return null;
    }

    /// <summary>
    /// Decides whether a file looks like text and records the verdict in
    /// <see cref="TextFiles"/> or <see cref="BinaryFiles"/>.
    /// </summary>
    /// <param name="path">Path of the file to inspect.</param>
    /// <param name="EnsureConfidence">
    /// When <c>true</c> the whole file is inspected and additionally round-tripped through
    /// a reader/writer pair to confirm the detected encoding; when <c>false</c> only the
    /// first 512 bytes (or fewer, for smaller files) are inspected.
    /// </param>
    /// <param name="ConfidenceThreshold">Maximum tolerated percentage (0-100) of non-text bytes.</param>
    /// <returns><c>true</c> if the file is considered text, otherwise <c>false</c>.</returns>
    /// <remarks>
    /// Files shorter than four bytes are judged on their bytes alone; when such a file
    /// passes, it is reported as text but not added to <see cref="TextFiles"/>.
    /// </remarks>
    public bool IsText(string path, bool EnsureConfidence = false, double ConfidenceThreshold = 100)
    {
        var Reason = "None";
        var isText = true;
        double asp = 0;

        using (var fileStream = File.OpenRead(path))
        {
            if (fileStream.Length == 0)
            {
                BinaryFiles[path] = "Zero Length File.";
                return false;
            }

            long windowSize = EnsureConfidence ? fileStream.Length : Math.Min(512L, fileStream.Length);

            var BinData = new byte[windowSize];
            var rawLength = fileStream.Read(BinData, 0, BinData.Length);
            // Rewind so the StreamReader below starts at the beginning (and sees the BOM).
            fileStream.Seek(0, SeekOrigin.Begin);

            if (fileStream.Length < 4)
            {
                var r = CountNonTextBytes(BinData, false) == 0;
                if (!r)
                {
                    BinaryFiles[path] = "Length 4 file Contains invalid Characters.";
                }

                return r;
            }

            if (_boyerMoore4Null.Search(BinData) != -1)
            {
                Reason = "4 Sequential Nulls Found within File.";
                isText = false;
            }

            var enc = GetEncoding(BinData);

            if (EnsureConfidence && isText)
            {
                // Decode the file as text, re-encode it with the detected encoding and
                // require the result to reproduce the raw bytes.
                var TextData = new char[windowSize];
                using (var streamReader = new StreamReader(fileStream))
                {
                    streamReader.Read(TextData, 0, TextData.Length);
                }

                using (var memoryStream = new MemoryStream())
                using (var streamWriter = new StreamWriter(memoryStream, enc))
                {
                    streamWriter.Write(TextData);
                    streamWriter.Flush();

                    var memoryBuffer = memoryStream.GetBuffer();
                    // GetBuffer exposes spare capacity too; only the first Length bytes
                    // were actually produced by the round trip.
                    var written = memoryStream.Length;

                    for (var i = 0; i < rawLength && isText; i++)
                    {
                        isText = i < written && BinData[i] == memoryBuffer[i];
                        if (!isText)
                        {
                            Reason = $"Encoding Mismatch Found at position: {i}";
                        }
                    }
                }
            }

            if (isText)
            {
                double CcCount = CountNonTextBytes(BinData, AllowsNullBytes(enc));
                asp = CcCount / BinData.Length * 100d;
                if (asp > ConfidenceThreshold)
                {
                    Reason = $"Confidence threshold {ConfidenceThreshold:0.0} Exceeded: {asp:0.0}";
                    isText = false;
                }
            }
        }

        // Indexer instead of Add so that re-checking a path overwrites the old verdict
        // instead of throwing ArgumentException.
        if (isText)
        {
            TextFiles[path] = asp.TruncateToDecimalPlace(1);
        }
        else
        {
            BinaryFiles[path] = Reason;
        }

        return isText;
    }

    /// <summary>
    /// Detects an encoding from a leading byte-order mark.
    /// </summary>
    /// <param name="Data">The bytes to inspect.</param>
    /// <returns>
    /// The encoding matching the BOM (UTF-32 LE, UTF-16 LE/BE, UTF-8, or the UTF-7
    /// signature 2B 2F 76), or <see cref="Encoding.Default"/> when none matches.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="Data"/> is null.</exception>
    private static Encoding GetEncoding(byte[] Data)
    {
        if (Data == null)
        {
            throw new ArgumentNullException(nameof(Data), "Array cannot be null.");
        }

        if (Data.Length < 2)
        {
            return Encoding.Default;
        }

        // The UTF-32 LE BOM (FF FE 00 00) starts with the UTF-16 LE BOM (FF FE),
        // so the longer signature has to be tested first or it can never match.
        if (Data.Length >= 4 && Data[0] == 0xff && Data[1] == 0xfe && Data[2] == 0 && Data[3] == 0)
        {
            return Encoding.UTF32;
        }

        if (Data[0] == 0xff && Data[1] == 0xfe)
        {
            return Encoding.Unicode;
        }

        if (Data[0] == 0xfe && Data[1] == 0xff)
        {
            return Encoding.BigEndianUnicode;
        }

        if (Data.Length < 3)
        {
            return Encoding.Default;
        }

        if (Data[0] == 0xef && Data[1] == 0xbb && Data[2] == 0xbf)
        {
            return Encoding.UTF8;
        }

        if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
        {
            return Encoding.UTF7;
        }

        return Encoding.Default;
    }

    /// <summary>
    /// Tells whether zero bytes count as valid text for the given encoding: they do
    /// for every encoding except the UTF-8, UTF-7 and ASCII instances.
    /// </summary>
    private static bool AllowsNullBytes(Encoding enc)
    {
        return !(enc == Encoding.UTF8 || enc == Encoding.UTF7 || enc == Encoding.ASCII);
    }

    /// <summary>
    /// Counts the bytes in <paramref name="data"/> that <see cref="IsValidTextByte"/> rejects.
    /// A plain loop is used because the per-byte check is trivial.
    /// </summary>
    private static int CountNonTextBytes(byte[] data, bool allowNull)
    {
        var count = 0;
        foreach (var b in data)
        {
            if (!IsValidTextByte(b, allowNull))
            {
                count++;
            }
        }

        return count;
    }

    /// <summary>
    /// Tells whether a byte is tab, line feed, carriage return or printable ASCII
    /// (0x20-0x7E); a zero byte is also accepted when <paramref name="IncludeNull"/> is set.
    /// </summary>
    private static bool IsValidTextByte(byte _byte, bool IncludeNull = false)
    {
        if (IncludeNull && _byte == 0x00)
        {
            return true;
        }

        return _byte == 0x09
               || _byte == 0x0A
               || _byte == 0x0D
               || (_byte >= 0x20 && _byte <= 0x7E);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment