Skip to content

Instantly share code, notes, and snippets.

@sualeh
Last active June 30, 2024 12:37
Show Gist options
  • Save sualeh/8e6b30e8b0fc1f2076c2fdf2240a6a67 to your computer and use it in GitHub Desktop.
Save sualeh/8e6b30e8b0fc1f2076c2fdf2240a6a67 to your computer and use it in GitHub Desktop.
Extracts raw text from Amazon Textract zip files
# Load the .NET ZIP support that provides [IO.Compression.ZipFile].
Add-Type -AssemblyName System.IO.Compression.FileSystem

# Folder that is searched (recursively) for ZIP files.
$ZipFilesPath = "."

# Folder reserved for extracted output. Kept for compatibility; the extraction
# loop below resolves its own output directory.
$TempPath = "."

# Collect every *.zip file below $ZipFilesPath.
# -File skips directories that merely have a name ending in ".zip", which
# could not be opened as archives.
# (The unused Shell.Application COM object was removed: nothing read it, and
# it tied the script to Windows.)
$ZipFiles = Get-ChildItem -Path $ZipFilesPath -Recurse -Include *.zip -File
# For each ZIP archive, copy its 'rawText.txt' entry to "<zip stem>.txt" in the
# current directory. Archives without that entry are skipped.

# Name of the archive entry to extract.
$RawTextFile = 'rawText.txt'

# .NET file APIs resolve relative paths against the process working directory,
# not the PowerShell location, so resolve "." to an absolute path once up front.
$ExtractDir = Convert-Path -LiteralPath "."

foreach ($ZipFile in $ZipFiles) {
    $ZipFileStem = [IO.Path]::GetFileNameWithoutExtension($ZipFile.FullName)
    Write-Host "Extracting and renaming: $ZipFile (stem $ZipFileStem)"
    $ArchivePath = $ZipFile.FullName

    # Reset the handles on every iteration so that `finally` never touches a
    # stream left over from a previous archive (or calls a method on $null).
    $Archive = $null
    $EntryStream = $null
    $FileStream = $null

    try {
        $Archive = [IO.Compression.ZipFile]::OpenRead($ArchivePath)
        $VarType = $Archive.GetType().Name
        Write-Host "Found: $Archive ($VarType)"

        # Locate the desired file in the ZIP archive.
        # Replace $_.FullName by $_.Name if the file is in any subdirectory.
        $ArchiveFile = $Archive.Entries.Where({ $_.FullName -eq $RawTextFile }, 'First')
        if ($ArchiveFile) {
            $OutputPath = Join-Path -Path $ExtractDir -ChildPath "$ZipFileStem.txt"
            # .Where(..., 'First') returns a collection; take its single element.
            $EntryStream = $ArchiveFile[0].Open()
            $FileStream = [System.IO.File]::Create($OutputPath)
            $EntryStream.CopyTo($FileStream)
        }
    }
    finally {
        # Release only what was actually opened, innermost resource first.
        if ($null -ne $FileStream) { $FileStream.Dispose() }
        if ($null -ne $EntryStream) { $EntryStream.Dispose() }
        if ($null -ne $Archive) { $Archive.Dispose() }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment