Skip to content

Instantly share code, notes, and snippets.

@indented-automation
Last active November 1, 2022 19:56
Show Gist options
  • Save indented-automation/8e603144167c7acca4dd8f653d47441e to your computer and use it in GitHub Desktop.
Save indented-automation/8e603144167c7acca4dd8f653d47441e to your computer and use it in GitHub Desktop.
Signature-based encoding detection
using namespace System.Collections.Generic; using namespace System.Linq
function Get-FileEncoding {
    <#
    .SYNOPSIS
        Attempt to determine a file type based on a BOM or file header.

    .DESCRIPTION
        This script attempts to determine file types based on a byte sequence at the beginning of the file.

        If an identifiable byte sequence is not present the file type cannot be determined using this method.

        The order signatures appear in is critical where signatures overlap. For example, UTF32-LE must be
        evaluated before UTF16-LE.

        Only the bytes actually present in the file are compared. A signature that is longer than the file
        is never considered a match.

    .EXAMPLE
        Get-FileEncoding -Path .\file.txt

        Reports the encoding of file.txt based on its byte order mark. Encoding is empty if no signature matches.

    .EXAMPLE
        Get-ChildItem -File | Get-FileEncoding -IncludeBinary

        Tests each file in the current directory against both the text and the binary signature sets.

    .OUTPUTS
        EncodingInfo. A custom object with the properties Name, Extension, Encoding, and Path.

    .LINK
        https://en.wikipedia.org/wiki/Byte_order_mark#cite_note-b-15
        https://filesignatures.net
    #>

    [CmdletBinding()]
    [OutputType('EncodingInfo')]
    param (
        # The path to a file to analyze.
        [Parameter(Mandatory, Position = 1, ValueFromPipeline, ValueFromPipelineByPropertyName)]
        [ValidateScript( { Test-Path $_ -PathType Leaf } )]
        [Alias('FullName')]
        [String]$Path,

        # Test the file against a small set of signature definitions for binary file types.
        #
        # Identification should be treated as tentative. Several file formats cannot be identified using the sequence at the start alone.
        [Switch]$IncludeBinary
    )

    begin {
        # Ordered: where one signature is a prefix of another the longer one must be listed first.
        $signatures = [Ordered]@{
            'UTF32-LE'   = 'FF-FE-00-00'
            'UTF32-BE'   = '00-00-FE-FF'
            'UTF8'       = 'EF-BB-BF'
            'UTF16-LE'   = 'FF-FE'
            'UTF16-BE'   = 'FE-FF'
            'UTF7'       = '2B-2F-76-38', '2B-2F-76-39', '2B-2F-76-2B', '2B-2F-76-2F'
            'UTF1'       = 'F7-64-4C'
            'UTF-EBCDIC' = 'DD-73-66-73'
            'SCSU'       = '0E-FE-FF'
            'BOCU-1'     = 'FB-EE-28'
            'GB-18030'   = '84-31-95-33'
        }

        if ($IncludeBinary) {
            $signatures += [Ordered]@{
                'LNK'      = '4C-00-00-00-01-14-02-00'
                'MSEXCEL'  = '50-4B-03-04-14-00-06-00'
                'PNG'      = '89-50-4E-47-0D-0A-1A-0A'
                'MSOFFICE' = 'D0-CF-11-E0-A1-B1-1A-E1'
                '7ZIP'     = '37-7A-BC-AF-27-1C'
                'RTF'      = '7B-5C-72-74-66-31'
                'GIF'      = '47-49-46-38'
                'REGPOL'   = '50-52-65-67'
                'GZIP'     = '1F-8B'
                'JPEG'     = 'FF-D8'
                'MSEXE'    = '4D-5A'
                'ZIP'      = '50-4B'
            }
        }

        # Convert sequence strings to byte arrays. Intended to simplify signature maintenance.
        # The keys are copied first because the dictionary is modified inside the loop.
        [String[]]$keys = $signatures.Keys
        foreach ($name in $keys) {
            [List[List[Byte]]]$values = foreach ($value in $signatures[$name]) {
                [List[Byte]]$signatureBytes = foreach ($byte in $value.Split('-')) {
                    [Convert]::ToByte($byte, 16)
                }
                # The unary comma stops PowerShell unrolling the inner list into individual bytes.
                , $signatureBytes
            }
            $signatures[$name] = $values
        }
    }

    process {
        try {
            $Path = $pscmdlet.GetUnresolvedProviderPathFromPSPath($Path)

            # The longest signature is 8 bytes, nothing beyond that needs to be read.
            $buffer = [Byte[]]::new(8)
            $bytesRead = 0

            $stream = [System.IO.File]::OpenRead($Path)
            try {
                # Read is permitted to return fewer bytes than requested, keep reading until the
                # buffer is full or the end of the file is reached.
                do {
                    $count = $stream.Read($buffer, $bytesRead, $buffer.Length - $bytesRead)
                    $bytesRead += $count
                } while ($count -gt 0 -and $bytesRead -lt $buffer.Length)
            } finally {
                # Release the file handle even if Read throws.
                $stream.Dispose()
            }

            # Keep only the bytes which exist in the file. Comparing against the zero padding of the
            # buffer would, for example, report a two byte file containing FF-FE as UTF32-LE.
            $header = ([List[Byte]]$buffer).GetRange(0, $bytesRead)

            $encoding = foreach ($name in $signatures.Keys) {
                $sampleEncoding = foreach ($sequence in $signatures[$name]) {
                    if ($sequence.Count -gt $bytesRead) {
                        # The file is shorter than this signature, it cannot match.
                        continue
                    }

                    $sample = $header.GetRange(0, $sequence.Count)
                    if ([System.Linq.Enumerable]::SequenceEqual($sample, $sequence)) {
                        $name
                        break
                    }
                }
                if ($sampleEncoding) {
                    # First match wins, signature order determines precedence.
                    $sampleEncoding
                    break
                }
            }

            [PSCustomObject]@{
                Name       = Split-Path $Path -Leaf
                Extension  = [System.IO.Path]::GetExtension($Path)
                Encoding   = $encoding
                Path       = $Path
                PSTypeName = 'EncodingInfo'
            }
        } catch {
            # Report as a non-terminating error so remaining pipeline input is still processed.
            $pscmdlet.WriteError($_)
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment