Created
April 12, 2023 12:44
-
-
Save alirobe/18a4a2c7cefa6478120c9e82cb230e6f to your computer and use it in GitHub Desktop.
Convert-DocumentsToMachineReadable.ps1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Path to poppler's pdftotext executable (relative to the current location).
$popplerPath = ".\poppler\bin\pdftotext.exe"
# Upper bound on the number of files picked up from the input folder in one run.
$maxDocs = 1000
# Folder that is scanned for documents.
$inputPath = ".\source"
# Folder that receives the converted output.
$outputPath = ".\dest"
function Convert-FilesToMarkdown {
    <#
    .SYNOPSIS
    Mirrors a folder tree into an output folder, converting supported documents on the way.

    .DESCRIPTION
    Recursively enumerates the files under -inputFolderPath (at most $maxDocs of them,
    taken before the junk filter is applied), skips names matching the junk pattern,
    recreates each file's sub-folder under -outputFolderPath and writes
    "<original file name>.md" into it:
      .docx / .doc  -> pandoc
      .xlsx         -> one CSV per sheet plus a Markdown index file (Excel COM automation)
      .pdf          -> the pdftotext executable at $popplerPath
    A file is converted only when its output is missing or older than the source. After a
    conversion that produced an output file, the output's LastWriteTime is set to the
    source's so the next run can skip it.

    Reads $maxDocs and $popplerPath from the calling scope.

    .PARAMETER inputFolderPath
    Folder to scan. May be relative to the current location.

    .PARAMETER outputFolderPath
    Folder to write into. Created when it does not exist.
    #>
    param(
        [string]$inputFolderPath,
        [string]$outputFolderPath
    )

    # Converts a Word document by handing it to pandoc; pandoc picks the output
    # format from the extension of $outputFile.
    # NOTE(review): pandoc documents docx as an input format but not legacy .doc -
    # confirm that the ".doc" dispatch below can actually succeed.
    function Convert-WordDocument {
        param(
            [string]$inputFile,
            [string]$outputFile
        )
        pandoc $inputFile -o $outputFile
    }

    # Extracts the text of a PDF with pdftotext. Any previous output is removed first so
    # that "file exists afterwards" can serve as the success signal; on failure a
    # placeholder file containing "Conversion Failed" is written instead.
    function Convert-PDFDocument {
        param(
            [string]$inputFile,
            [string]$outputFile
        )
        if (Test-Path -LiteralPath $outputFile) {
            Remove-Item -LiteralPath $outputFile -Force
        }
        & $popplerPath -enc UTF-8 $inputFile $outputFile
        if (-not (Test-Path -LiteralPath $outputFile)) {
            Write-Host "PDF conversion failed for $outputFile" -ForegroundColor Red
            Set-Content -LiteralPath $outputFile -Value "Conversion Failed"
        }
    }

    # Saves every sheet of a workbook as "<output without .md>_<sheet>.csv" and writes a
    # Markdown index ($outputFile) that links to those CSV files. When the conversion
    # fails, no index file is left behind.
    function Convert-ExcelSheets {
        param(
            [string]$inputFile,
            [string]$outputFile
        )
        # Excel resolves relative paths against its own working directory rather than
        # PowerShell's current location, so only absolute paths are handed to it.
        $pathTools = $ExecutionContext.SessionState.Path
        $workbookPath = $pathTools.GetUnresolvedProviderPathFromPSPath($inputFile)
        $indexPath = $pathTools.GetUnresolvedProviderPathFromPSPath($outputFile)
        # Strip only the trailing ".md"; a ".md" elsewhere in the path must stay intact.
        $csvStem = $indexPath -replace '\.md$', ''

        # Drop a stale index so a failed run cannot be mistaken for a fresh result.
        if (Test-Path -LiteralPath $indexPath) {
            Remove-Item -LiteralPath $indexPath -Force
        }

        $excel = $null
        $workbook = $null
        try {
            $excel = New-Object -ComObject Excel.Application
            $excel.Visible = $false
            # Without this, saving a multi-sheet workbook as CSV can stop on a
            # confirmation dialog that nobody can see.
            $excel.DisplayAlerts = $false
            $workbook = $excel.Workbooks.Open($workbookPath)

            $indexContent = "# Index`n`n"
            for ($i = 1; $i -le $workbook.Sheets.Count; $i++) {
                $sheet = $workbook.Sheets.Item($i)
                $sheetName = $sheet.Name

                # Sheet names may contain characters that are not legal in file names.
                $safeSheetName = $sheetName
                foreach ($badChar in [System.IO.Path]::GetInvalidFileNameChars()) {
                    $safeSheetName = $safeSheetName.Replace([string]$badChar, '_')
                }

                $csvFile = "{0}_{1}.csv" -f $csvStem, $safeSheetName
                if (Test-Path -LiteralPath $csvFile) {
                    Remove-Item -LiteralPath $csvFile -Force
                }

                # The CSV sits next to the index, so the link target is just its file name.
                $csvLeaf = Split-Path -Path $csvFile -Leaf
                $indexContent += "- [$sheetName]($csvLeaf)`n"
                $sheet.SaveAs($csvFile, 6) # 6 = xlCSV format
            }

            # UTF-8 keeps non-ASCII sheet names readable in the index.
            Set-Content -LiteralPath $indexPath -Value $indexContent -Encoding UTF8
        }
        catch {
            Write-Host "Excel conversion failed for $inputFile : $($_.Exception.Message)" -ForegroundColor Red
        }
        finally {
            # Always shut the Excel session down, even when a COM call threw.
            if ($null -ne $workbook) {
                $workbook.Close($false)
                [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($workbook)
            }
            if ($null -ne $excel) {
                $excel.Quit()
                [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($excel)
            }
        }
    }

    if (-not (Test-Path -LiteralPath $outputFolderPath)) {
        New-Item -ItemType Directory -Path $outputFolderPath | Out-Null
    }
    if (-not (Test-Path -LiteralPath $inputFolderPath -PathType Container)) {
        Write-Host "Input folder $inputFolderPath was not found." -ForegroundColor Red
        return
    }

    # Work with absolute roots: FileInfo.DirectoryName is always absolute, so the
    # relative sub-folder can only be derived from an absolute input root.
    $separators = [char[]]'\/'
    $sourceRoot = (Resolve-Path -LiteralPath $inputFolderPath).ProviderPath
    $destinationRoot = (Resolve-Path -LiteralPath $outputFolderPath).ProviderPath
    $sourcePrefixLength = $sourceRoot.TrimEnd($separators).Length

    $files = @(Get-ChildItem -LiteralPath $sourceRoot -Recurse -File | Select-Object -First $maxDocs)
    $processedFiles = 0
    Write-Host "Converting $($files.Count) files..."

    # Names starting with "~$" or ".", or ending in "_backup", are never converted.
    $junkFiles = '^(~\$|\.|.*_backup$)'

    foreach ($file in $files) {
        if ($file.Name -match $junkFiles) {
            Write-Host "-- Skipped $file due to junk file pattern" -ForegroundColor Yellow
            continue
        }

        # Recreate the file's sub-folder below the output root.
        $relativePath = $file.DirectoryName.Substring($sourcePrefixLength).TrimStart($separators)
        if ($relativePath) {
            $currentOutputFolderPath = Join-Path $destinationRoot $relativePath
        }
        else {
            $currentOutputFolderPath = $destinationRoot
        }
        if (-not (Test-Path -LiteralPath $currentOutputFolderPath)) {
            New-Item -ItemType Directory -Path $currentOutputFolderPath | Out-Null
        }

        $outputFilePath = Join-Path $currentOutputFolderPath ($file.Name + ".md")

        # Convert only when there is no output yet or the source changed since.
        $isStale = (-not (Test-Path -LiteralPath $outputFilePath)) -or
            ($file.LastWriteTimeUtc -gt (Get-Item -LiteralPath $outputFilePath).LastWriteTimeUtc)
        if (-not $isStale) {
            continue
        }

        # Extension matching is case-insensitive; unsupported extensions yield nothing.
        $converter = switch ($file.Extension) {
            ".docx" { 'Convert-WordDocument' }
            ".doc"  { 'Convert-WordDocument' }
            ".xlsx" { 'Convert-ExcelSheets' }
            ".pdf"  { 'Convert-PDFDocument' }
        }
        if (-not $converter) {
            continue
        }

        & $converter -inputFile $file.FullName -outputFile $outputFilePath

        if (Test-Path -LiteralPath $outputFilePath) {
            Write-Host "-- Processed $file" -ForegroundColor Yellow
            # Stamp the output with the source's time so the staleness check skips it next run.
            (Get-Item -LiteralPath $outputFilePath).LastWriteTime = $file.LastWriteTime
            $processedFiles += 1
        }
        else {
            # The converter ran but left nothing behind; do not count it as processed.
            Write-Host "-- Failed $file (no output produced)" -ForegroundColor Red
        }
    }

    Write-Host "Conversion completed. Of $($files.Count), $processedFiles were procesed."
}
# Run the conversion from the configured input folder into the configured output folder.
Convert-FilesToMarkdown -inputFolderPath $inputPath -outputFolderPath $outputPath
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Please note this requires binaries for poppler, as well as an installed copy of pandoc (
choco install -y pandoc
) and MS Excel. This is just an initial little POC and isn't really prod grade. The following tools may do a better job:
https://tika.apache.org/
https://github.com/dbashford/textract (node)
https://github.com/deanmalmgren/textract (linux)