Skip to content

Instantly share code, notes, and snippets.

@taco-shellcode
Created February 27, 2018 15:54
Show Gist options
  • Save taco-shellcode/f6647f88361dc0dabdb1932cebd40f53 to your computer and use it in GitHub Desktop.
Save taco-shellcode/f6647f88361dc0dabdb1932cebd40f53 to your computer and use it in GitHub Desktop.
# Applies a .NET regular expression to a string and returns the first
# System.Text.RegularExpressions.Match object.
#   $string  - text to search
#   $pattern - regular expression pattern
# The returned Match has an empty .Value when nothing matched.
function Get-RegexMatch {
    param($string, $pattern)

    $firstMatch = [regex]::Match($string, $pattern)
    return $firstMatch
}
# Replaces every match of a .NET regular expression in a string.
#   $string      - text to process
#   $pattern     - regular expression pattern
#   $replacement - replacement text (regex substitutions such as $& are
#                  honoured); defaults to the empty string, i.e. deletion
# Returns the rewritten string.
function Invoke-RegexReplace {
    param($string, $pattern, $replacement = '')

    $rewritten = [regex]::Replace($string, $pattern, $replacement)
    return $rewritten
}
# Converts one raw MHTML part (a list of lines) into a hashtable describing it.
#   $value - the lines of the part; the first three lines are read by position
#            as Content-Location, Content-Transfer-Encoding and Content-Type.
# Returns a hashtable with the keys:
#   Location       - content id derived from the Content-Location value
#   Encoding       - the Content-Transfer-Encoding value
#   MimeType       - the Content-Type value
#   Content        - decoded body (string, or a MemoryStream for binary parts)
#   IsHtmlDocument - $true when MimeType contains 'text/html'
#   IsStream       - $true for base64 parts whose MimeType is not text
function ConvertFrom-RawMhtmlBoundary {
    param($value)

    # Header values are obtained by stripping the expected header name.
    $rawLocation = $value[0].Replace('Content-Location: ', '')
    $transferEncoding = $value[1].Replace('Content-Transfer-Encoding: ', '')
    $contentType = $value[2].Replace('Content-Type: ', '')

    $section = @{
        Location       = Resolve-ContentIdFromRawLocation $rawLocation
        Encoding       = $transferEncoding
        MimeType       = $contentType
        Content        = ConvertFrom-RawMhtmlBoundaryContent $value $transferEncoding $contentType
        IsHtmlDocument = $false
        IsStream       = $false
    }

    if ($section.Encoding -eq 'quoted-printable') {
        $section.Content = Resolve-MhtmlQuotedPrintableString $section.Content
    }

    $isHtml = $section.MimeType.IndexOf('text/html', [StringComparison]::OrdinalIgnoreCase) -ge 0
    if ($isHtml) {
        # HTML parts get their links rewritten and the Outlook markup cleaned.
        $html = $section.Content
        $html = Resolve-ContentIdFromRelativeLocation $html
        $html = Clear-MhtmlOutlookHeaders $html
        $html = Repair-OutlookParagraphStyle $html
        $section.Content = $html
        $section.IsHtmlDocument = $true
    }

    # -and short-circuits, so the mime type is only inspected for base64 parts.
    if (($transferEncoding -eq 'base64') -and -not (Assert-MimeTypeIsText $contentType)) {
        $section.IsStream = $true
    }

    return $section
}
# Tells whether a Content-Type value denotes textual content.
#   $type - Content-Type value, e.g. 'text/html; charset="utf8"'
# Returns $true when 'text/' occurs anywhere in $type (case-insensitive),
# otherwise $false.
function Assert-MimeTypeIsText {
    param($type)

    $position = $type.IndexOf('text/', [StringComparison]::OrdinalIgnoreCase)
    return ($position -gt -1)
}
# Extracts and decodes the body of one raw MHTML part.
#   $value    - the lines of the part
#   $encoding - Content-Transfer-Encoding value of the part
#   $type     - Content-Type value of the part
# Returns:
#   - for non-base64 parts: the body lines joined with newlines
#   - for base64 text parts: the decoded string (charset taken from $type)
#   - for any other base64 part: an IO.MemoryStream over the decoded bytes
function ConvertFrom-RawMhtmlBoundaryContent {
    param($value, $encoding, $type)

    if ($encoding -ne 'base64') {
        return ConvertFrom-RawMhtmlBoundaryContentAsString $value
    }

    # Base64 bodies are joined without line breaks before decoding.
    $joined = ConvertFrom-RawMhtmlBoundaryContentAsString $value ''
    $padded = Repair-Base64StringPadding $joined
    $decoded = [Convert]::FromBase64String($padded)

    if (-not (Assert-MimeTypeIsText $type)) {
        # generic stream
        return New-Object IO.MemoryStream ($decoded, 0, $decoded.count)
    }

    # text (e.g. html) that was encoded as a base64 string
    $charsetName = Get-CharsetValue $type
    return ConvertFrom-ByteArrayToString $decoded $charsetName
}
# Decodes a byte array into a string using the named character set.
#   $bytes   - byte array to decode
#   $charset - charset name; matching ignores case and surrounding whitespace
# Recognised directly: utf32/utf-32, unicode/utf16/utf-16/utf-16le,
# utf8/utf-8 and utf7/utf-7. Any other non-empty name is looked up with
# [System.Text.Encoding]::GetEncoding. An empty, missing or unknown name
# falls back to ASCII.
function ConvertFrom-ByteArrayToString {
    param($bytes, $charset)

    $name = "$charset".Trim().ToLowerInvariant()

    # MIME headers use the hyphenated spellings (utf-8), so both forms are accepted.
    switch -Regex ($name) {
        '^utf-?32$' {
            return [System.Text.Encoding]::UTF32.GetString($bytes)
        }
        '^(unicode|utf-?16(le)?)$' {
            return [System.Text.Encoding]::Unicode.GetString($bytes)
        }
        '^utf-?8$' {
            return [System.Text.Encoding]::UTF8.GetString($bytes)
        }
        '^utf-?7$' {
            return [System.Text.Encoding]::UTF7.GetString($bytes)
        }
    }

    if ($name.Length -gt 0) {
        try {
            return [System.Text.Encoding]::GetEncoding($name).GetString($bytes)
        } catch {
            # Name not known to this runtime: use the ASCII fallback below.
        }
    }

    return [System.Text.Encoding]::ASCII.GetString($bytes)
}
# Extracts the charset parameter from a Content-Type value.
#   $type - Content-Type value, e.g. 'text/html; charset="utf-8"'
# Returns the charset name without quotes, or an empty string when no
# charset parameter is present. The value may be quoted or unquoted and may
# contain hyphens (utf-8, iso-8859-1); the parameter name is matched
# case-insensitively.
function Get-CharsetValue {
    param($type)

    $ignoreCase = [System.Text.RegularExpressions.RegexOptions]::IgnoreCase
    $found = [regex]::Match("$type", 'charset="?([\w.:+-]+)', $ignoreCase)
    if ($found.Success) {
        return $found.Groups[1].Value
    }
    return ''
}
# Joins the body lines of one raw MHTML part into a single trimmed string.
#   $value      - list of the part's lines (must support GetRange/Count)
#   $lineEnding - text placed between lines; defaults to a newline
# The first four lines and the last line of $value are not part of the
# result; everything in between is joined and trimmed.
function ConvertFrom-RawMhtmlBoundaryContentAsString {
    param($value, $lineEnding = "`n")

    $firstBodyLine = 4
    $bodyLineCount = $value.Count - 5
    $bodyLines = $value.GetRange($firstBodyLine, $bodyLineCount).ToArray()
    return [System.String]::Join($lineEnding, $bodyLines).Trim()
}
# Derives a content id from a raw Content-Location value by deleting a
# leading '<scheme>:///<drive letter>:/<segment>/<segment>/' prefix.
#   $value - the Content-Location value
# Returns $value with every match of that prefix pattern removed.
function Resolve-ContentIdFromRawLocation {
    param($value)

    # Alternative pattern kept from the original for reference: '(?<=/)[^/]*$'
    $prefixPattern = '\w*:///\w:/[^/]*/[^/]*/'
    return Invoke-RegexReplace -string $value -pattern $prefixPattern -replacement ''
}
# Rewrites relative src attributes of the form src="<folder>/<file>" to
# src="cid:<file>".
#   $value - HTML text
# Returns the rewritten HTML.
# The folder name may not contain a quote or a colon. This keeps the match
# inside a single attribute value and leaves URLs that carry a scheme
# (http:, https:, cid:) untouched.
# An earlier version also ran a generic '..._files/...' replacement first,
# but its result was overwritten and never reached the output, so it is gone.
function Resolve-ContentIdFromRelativeLocation {
    param($value)

    return [regex]::Replace($value, 'src="[^/":]*/', 'src="cid:')
}
# Removes the block that sits between the opening WordSection1 div and the
# first empty MsoNormal paragraph that follows it (that paragraph included).
#   $value - HTML text
# Returns the HTML without that block, or $value unchanged when either the
# div or a following empty paragraph cannot be found.
function Clear-MhtmlOutlookHeaders {
    param($value)

    $header = '<div class=WordSection1>'
    $footer = '<p class=MsoNormal><o:p>&nbsp;</o:p></p>'
    $ordinal = [StringComparison]::Ordinal

    $headerStart = $value.IndexOf($header, $ordinal)
    if ($headerStart -lt 0) {
        return $value
    }
    $headerEnd = $headerStart + $header.Length

    # Search for the footer only after the header: a footer located earlier
    # in the document would otherwise make the two kept slices overlap and
    # duplicate content.
    $footerStart = $value.IndexOf($footer, $headerEnd, $ordinal)
    if ($footerStart -lt 0) {
        return $value
    }

    $kept = $value.Substring(0, $headerEnd) + $value.Substring($footerStart + $footer.Length)
    return $kept
}
# Normalises the '=' padding of a base64 string.
#   $value - base64 text, with or without (possibly wrong) padding
# All '=' characters are removed; if the remaining length is not a multiple
# of four, '=' characters are appended until it is. Returns the result.
function Repair-Base64StringPadding {
    param($value)

    $stripped = $value.Replace('=', '')
    $remainder = $stripped.Length % 4
    if ($remainder -eq 0) {
        return $stripped
    }
    return $stripped + ('=' * (4 - $remainder))
}
# Rewrites Outlook MsoNormal paragraph markup and drops empty <o:p> tags.
#   $value - HTML text
# Returns the rewritten HTML. The replacements run in the listed order: the
# styled form must be handled before the generic '<p class=MsoNormal' prefix,
# because the generic prefix is also the start of the styled form.
function Repair-OutlookParagraphStyle {
    param($value)

    $rewrites = @(
        @('<p class=MsoNormal style=''', '<p style=''margin:0;'),
        @('<p class=MsoNormal', '<p style="margin:0"'),
        @('<o:p></o:p>', '')
    )

    $html = $value
    foreach ($pair in $rewrites) {
        $html = $html.Replace($pair[0], $pair[1])
    }
    return $html
}
# Undoes part of the quoted-printable encoding of a string.
#   $value - quoted-printable text
# Removes '=' + line break pairs that are preceded by at least 69 characters
# on the same line, then turns every '=3D' into '='. Returns the result.
# Other '=XX' escapes are left as they are.
function Resolve-MhtmlQuotedPrintableString {
    param($value)

    $softBreakPattern = '(?<=.{69,})(=\r\n|=\n)'
    $unwrapped = [regex]::Replace($value, $softBreakPattern, '')
    return $unwrapped.Replace('=3D', '=')
}
# Splits the lines of an MHTML document into its parts.
#   $raw       - list of lines (must support IndexOf/GetRange/RemoveRange);
#                the consumed lines are removed from this list
#   $separator - boundary string without the leading '--'
# A part ends at, and includes, the next line that is exactly "--$separator",
# or "--$separator--" when no such line is left. The first chunk (the
# document header) is dropped.
# Returns a System.Collections.ArrayList whose elements are ArrayLists of
# lines. The result is empty when no boundary line exists.
function Get-MhtmlBoundaries {
    param($raw, $separator)

    $boundaries = New-Object System.Collections.ArrayList
    $openMarker = "--$separator"
    $closeMarker = "--$separator--"

    while ($true) {
        $index = $raw.IndexOf($openMarker)
        if ($index -lt 0) {
            $index = $raw.IndexOf($closeMarker)
        }
        if ($index -lt 0) {
            break
        }

        # Copy the chunk in one call instead of element by element.
        $newBoundary = New-Object System.Collections.ArrayList
        $newBoundary.AddRange($raw.GetRange(0, $index + 1))
        $raw.RemoveRange(0, $index + 1)
        $null = $boundaries.Add($newBoundary)
    }

    if ($boundaries.Count -gt 0) {
        $boundaries.RemoveAt(0) #remove the header
    }

    # The unary comma stops PowerShell from unrolling the list on return.
    # Without it a document with exactly one part would hand the caller that
    # part's lines instead of a one-element list of parts.
    return ,$boundaries
}
# Finds the content type of an MHTML document.
#   $raw - the lines of the document
# Returns the value of the first 'Content-Type: ' header found, up to (not
# including) the first ';'. The header name is matched case-insensitively.
# Throws when no line carries a Content-Type header.
function Get-MhtmlContentType {
    param($raw)

    # Stop at the first hit; a capture group yields the value directly, so no
    # second regex pass or prefix stripping is needed.
    foreach ($line in $raw) {
        if ("$line" -match 'Content-Type: ([^;]*)') {
            return $Matches[1]
        }
    }
    throw 'No content type could be detected, not a valid mhtml file'
}
# Finds the multipart boundary string of an MHTML document.
#   $raw - the lines of the document
# Returns the text between the quotes of the first boundary="..." parameter
# found. The parameter name is matched case-insensitively.
# Throws when no line carries a quoted boundary parameter.
function Get-MhtmlBoundarySeparator {
    param($raw)

    # Stop at the first hit; a capture group yields the value directly, so no
    # second regex pass or quote stripping is needed.
    foreach ($line in $raw) {
        if ("$line" -match 'boundary="([^"]*)"') {
            return $Matches[1]
        }
    }
    throw 'No boundary separator detected, not a valid mhtml file'
}
# Verifies that a document starts with the MIME version header.
#   $raw - the lines of the document
# Throws when $raw is empty or its first line is not 'MIME-Version: 1.0';
# otherwise returns nothing.
function Assert-MhtmlIsValidMimeVersion {
    param($raw)

    $firstLine = $null
    if ($null -ne $raw -and $raw.Count -gt 0) {
        $firstLine = $raw[0]
    }

    # Use -ne here: '-not $x -eq $y' parses as '(-not $x) -eq $y' because the
    # unary -not binds tighter than -eq, so a wrong first line was accepted.
    if ($firstLine -ne 'MIME-Version: 1.0') {
        throw 'Not a valid mthml file. Expected first line to be MIME-Version: 1.0'
    }
}
# Converts every raw part of an MHTML object into a section.
#   $hash - MHTML object; reads $hash.RawBoundaries, appends to $hash.Sections
# Each raw part is passed to ConvertFrom-RawMhtmlBoundary; results that are
# not $null are appended to $hash.Sections in order.
function Invoke-AssembleSections {
    param([hashtable]$hash)

    foreach ($rawBoundary in $hash.RawBoundaries) {
        $section = ConvertFrom-RawMhtmlBoundary $rawBoundary
        if ($null -ne $section) {
            $hash.Sections += $section
        }
    }
}
# Sorts the sections of an MHTML object into HTML documents and other content.
#   $hash - MHTML object; reads $hash.Sections
# Sections whose IsHtmlDocument is true are appended to $hash.HtmlDocuments,
# all others to $hash.ContentSections. Order is preserved.
function Invoke-AssignSections {
    param([hashtable]$hash)

    foreach ($section in $hash.Sections) {
        $bucket = if ($section.IsHtmlDocument) { 'HtmlDocuments' } else { 'ContentSections' }
        $hash[$bucket] += $section
    }
}
# Indexes the stream sections of an MHTML object by their location.
#   $hash - MHTML object; reads $hash.ContentSections, writes to
#           $hash.InlineContentHash
# For every content section with IsStream set, except those whose MimeType
# is 'application/vnd.ms-officetheme', stores @(Content, MimeType) under the
# section's Location key.
function Invoke-AssembleInlineContent {
    param([hashtable]$hash)

    foreach ($section in $hash.ContentSections) {
        if (-not $section.IsStream) {
            continue
        }
        if ($section.MimeType -eq 'application/vnd.ms-officetheme') {
            continue
        }
        $hash.InlineContentHash[$section.Location] = @($section.Content, $section.MimeType)
    }
}
# Creates the hashtable that represents a parsed MHTML document.
#   $mime       - stored as MimeType
#   $separator  - stored as SegmentSeparator
#   $boundaries - stored as RawBoundaries
#   $raw        - stored as Raw
# Sections, HtmlDocuments and ContentSections start as empty arrays and
# InlineContentHash as an empty hashtable. Returns the hashtable.
function New-MhtmlObject {
    param($mime, $separator, $boundaries, $raw)

    $mhtml = @{}
    $mhtml.MimeType = $mime
    $mhtml.SegmentSeparator = $separator
    $mhtml.Raw = $raw
    $mhtml.RawBoundaries = $boundaries
    $mhtml.Sections = @()
    $mhtml.HtmlDocuments = @()
    $mhtml.ContentSections = @()
    $mhtml.InlineContentHash = @{}
    return $mhtml
}
# Copies the given values, converted to strings, into a generic string list.
#   $values - a collection of values, a single value, or $null
# Returns the List[string]. PowerShell enumerates it on output, which is why
# Import-OutlookMhtml casts the result back to a list. $null entries are
# skipped.
function ConvertFrom-ArrayToArrayList {
    param($values)

    $out = New-Object 'System.Collections.Generic.List[string]'

    # @() turns a scalar into a one-element array. Indexing a lone string
    # (e.g. Get-Content on a one-line file) would otherwise yield its first
    # character instead of the whole line.
    foreach ($item in @($values)) {
        if ($null -ne $item) {
            $out.Add($item.ToString())
        }
    }
    return $out
}
# Parses an MHTML document into an MHTML object (see New-MhtmlObject).
#   $path  - file to read with Get-Content (optional)
#   $value - document text (optional); when both are given the text from
#            $value is the one that gets parsed
# Returns the hashtable built by New-MhtmlObject after its Sections,
# HtmlDocuments, ContentSections and InlineContentHash have been filled.
function Import-OutlookMhtml {
    param($path = $null, $value = $null)

    if ($null -ne $path) {
        $raw = Get-Content $path
    }
    if ($null -ne $value) {
        # Normalise line endings, then split the text into lines.
        $raw = $value.replace("`r", '').split("`n")
    }

    $lines = [System.Collections.Generic.List[string]] (ConvertFrom-ArrayToArrayList $raw)
    Assert-MhtmlIsValidMimeVersion $lines

    $contentType = Get-MhtmlContentType $lines
    $boundaryName = Get-MhtmlBoundarySeparator $lines
    $rawParts = Get-MhtmlBoundaries $lines $boundaryName
    $document = New-MhtmlObject $contentType $boundaryName $rawParts $raw

    # Each step receives the document and fills in part of it, in this order.
    foreach ($step in 'Invoke-AssembleSections', 'Invoke-AssignSections', 'Invoke-AssembleInlineContent') {
        & $step $document
    }

    return $document
}
# Public surface of the module: only these two functions are exported.
Export-ModuleMember -Function @('Import-OutlookMhtml', 'ConvertFrom-ByteArrayToString')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment