Last active
June 30, 2024 12:37
-
-
Save sualeh/8e6b30e8b0fc1f2076c2fdf2240a6a67 to your computer and use it in GitHub Desktop.
Extracts raw text from Amazon Textract ZIP files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extracts the 'rawText.txt' entry from every ZIP archive found under
# $ZipFilesPath (searched recursively) and writes it to $TempPath as
# "<zip file name without extension>.txt".
#
# NOTE: the output name is derived from the archive's file name only, so two
# archives with the same name in different sub-folders write to the same file
# and the later one overwrites the earlier one.

Add-Type -Assembly System.IO.Compression.FileSystem

# Folder that is searched (recursively) for ZIP files
$ZipFilesPath = "."

# Folder that receives the extracted text files
$TempPath = "."

# Name of the entry to pull out of each archive.
# Compared against the entry's FullName, i.e. the entry must sit at the
# archive root; compare against $_.Name instead to match it in any sub-folder.
$RawTextFile = 'rawText.txt'

# Resolve the output folder once, to an absolute path, before handing it to
# the .NET file APIs.
$ExtractDir = Convert-Path -LiteralPath $TempPath

# -File keeps directories that merely happen to be named "*.zip" out of the list
$ZipFiles = Get-ChildItem $ZipFilesPath -Recurse -Include *.zip -File

foreach ($ZipFile in $ZipFiles) {
    $ZipFileStem = [IO.Path]::GetFileNameWithoutExtension($ZipFile.FullName)
    Write-Host "Extracting and renaming: $ZipFile (stem $ZipFileStem)"

    $ArchivePath = $ZipFile.FullName

    # Reset per iteration so the finally block never touches a handle that
    # belongs to a previous archive.
    $Archive = $null
    $EntryStream = $null
    $FileStream = $null

    try {
        $Archive = [IO.Compression.ZipFile]::OpenRead($ArchivePath)
        $VarType = $Archive.GetType().Name
        Write-Host "Found: $Archive ($VarType)"

        # Where(..., 'First') returns a collection holding zero or one entry
        $ArchiveFile = $Archive.Entries.Where({ $_.FullName -eq $RawTextFile }, 'First')

        if ($ArchiveFile) {
            $OutputPath = [IO.Path]::Combine($ExtractDir, "$ZipFileStem.txt")
            $EntryStream = $ArchiveFile[0].Open()
            $FileStream = [System.IO.File]::Create($OutputPath)
            $EntryStream.CopyTo($FileStream)
        }
        else {
            Write-Warning "No '$RawTextFile' entry found in '$ArchivePath'"
        }
    }
    catch {
        # Report the failure and carry on with the remaining archives
        Write-Warning "Failed to extract from '$ArchivePath': $($_.Exception.Message)"
    }
    finally {
        # Release only what was actually opened, in reverse order of dependency:
        # the streams first, then the archive that owns the entry stream.
        if ($null -ne $FileStream) { $FileStream.Dispose() }
        if ($null -ne $EntryStream) { $EntryStream.Dispose() }
        if ($null -ne $Archive) { $Archive.Dispose() }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment