Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@alirobe
Created April 12, 2023 12:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alirobe/18a4a2c7cefa6478120c9e82cb230e6f to your computer and use it in GitHub Desktop.
Save alirobe/18a4a2c7cefa6478120c9e82cb230e6f to your computer and use it in GitHub Desktop.
Convert-DocumentsToMachineReadable.ps1
# --- Script configuration -------------------------------------------------
# Folder that is scanned (recursively) for documents, and the folder that
# receives the converted output. Both are handed to Convert-FilesToMarkdown.
$inputPath = '.\source'
$outputPath = '.\dest'

# Upper bound on how many files a single run will pick up.
$maxDocs = 1000

# Executable invoked for PDF conversion (called with "-enc UTF-8 <in> <out>").
$popplerPath = '.\poppler\bin\pdftotext.exe'
function Convert-FilesToMarkdown {
    <#
    .SYNOPSIS
    Mirrors a folder tree into an output folder, converting supported documents
    into text-based files.

    .DESCRIPTION
    Every file found under the input folder (up to $maxDocs files) is mapped to
    "<same relative folder>\<original file name>.md" under the output folder.
    .docx/.doc files are handed to pandoc, .pdf files to the executable in
    $popplerPath, and .xlsx files are exported sheet-by-sheet to CSV through
    Excel COM automation with a Markdown index of the sheets.
    A file is (re)converted only when its output is missing or older than the
    source; after a successful conversion the output's LastWriteTime is set to
    the source's so the next run can skip it.

    .PARAMETER inputFolderPath
    Folder to scan recursively. May be relative to the current location.

    .PARAMETER outputFolderPath
    Folder that receives the converted files. Created when missing.

    .NOTES
    Reads the script-level variables $popplerPath and $maxDocs.
    #>
    param(
        [string]$inputFolderPath,
        [string]$outputFolderPath
    )

    # Converts a Word document by calling pandoc; the output format is left
    # for pandoc to infer from the output file name.
    function Convert-WordDocument {
        param(
            [string]$inputFile,
            [string]$outputFile
        )
        pandoc $inputFile -o $outputFile
        if ($LASTEXITCODE -ne 0) {
            Write-Host "Word conversion failed for $inputFile (pandoc exit code $LASTEXITCODE)" -ForegroundColor Red
        }
    }

    # Extracts the text of a PDF with the tool in $popplerPath. When the tool
    # produces no file, a placeholder is written so the failure stays visible
    # in the output tree.
    function Convert-PDFDocument {
        param(
            [string]$inputFile,
            [string]$outputFile
        )
        # Remove stale output first so that "file exists" afterwards really
        # means this run produced it.
        if (Test-Path -LiteralPath $outputFile) {
            Remove-Item -LiteralPath $outputFile -Force
        }
        & $popplerPath -enc UTF-8 $inputFile $outputFile
        if (-not (Test-Path -LiteralPath $outputFile)) {
            Write-Host "PDF conversion failed for $outputFile" -ForegroundColor Red
            Set-Content -LiteralPath $outputFile -Value "Conversion Failed"
        }
    }

    # Saves every sheet of a workbook as "<output minus .md>_<sheet>.csv" and
    # writes a Markdown index linking to those CSV files into $outputFile.
    # On failure nothing is written to $outputFile and the error is logged.
    function Convert-ExcelSheets {
        param(
            [string]$inputFile,
            [string]$outputFile
        )
        # Excel is a separate process with its own working directory, so every
        # path handed to it must be absolute.
        $workbookPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($inputFile)
        $indexPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($outputFile)

        # Strip only the trailing ".md"; a plain Replace would also rewrite
        # any ".md" that happens to occur earlier in the path.
        if ($indexPath.EndsWith(".md", [System.StringComparison]::Ordinal)) {
            $csvStem = $indexPath.Substring(0, $indexPath.Length - 3)
        }
        else {
            $csvStem = $indexPath
        }

        $excel = $null
        $workbook = $null
        try {
            $excel = New-Object -ComObject Excel.Application
            $excel.Visible = $false
            # A hidden instance must never block on a confirmation dialog.
            $excel.DisplayAlerts = $false
            $workbook = $excel.Workbooks.Open($workbookPath)

            $indexContent = "# Index`n`n"
            for ($i = 1; $i -le $workbook.Sheets.Count; $i++) {
                $sheet = $workbook.Sheets.Item($i)
                $sheetName = $sheet.Name
                $csvFile = "${csvStem}_$sheetName.csv"
                if (Test-Path -LiteralPath $csvFile) {
                    Remove-Item -LiteralPath $csvFile -Force
                }
                # The CSV sits next to the index file, so link by file name.
                $csvLink = [System.IO.Path]::GetFileName($csvFile)
                $indexContent += "- [$sheetName]($csvLink)`n"
                $sheet.SaveAs($csvFile, 6) # 6 = xlCSV format
            }
            Set-Content -LiteralPath $indexPath -Value $indexContent
        }
        catch {
            Write-Host "Excel conversion failed for ${inputFile}: $($_.Exception.Message)" -ForegroundColor Red
        }
        finally {
            # Always shut Excel down, otherwise a hidden EXCEL.EXE stays
            # behind for every workbook that failed to convert.
            if ($null -ne $workbook) {
                $workbook.Close($false)
            }
            if ($null -ne $excel) {
                $excel.Quit()
                [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($excel)
            }
        }
    }

    if (-not (Test-Path -LiteralPath $inputFolderPath -PathType Container)) {
        Write-Host "Input folder not found: $inputFolderPath" -ForegroundColor Red
        return
    }
    if (-not (Test-Path -LiteralPath $outputFolderPath)) {
        New-Item -ItemType Directory -Path $outputFolderPath | Out-Null
    }

    # Work with absolute roots: FileInfo.DirectoryName is always absolute, so
    # the relative sub-folder can only be cut out correctly when the input
    # root is absolute as well.
    $separators = [char[]]'\/'
    $inputRoot = (Resolve-Path -LiteralPath $inputFolderPath).ProviderPath.TrimEnd($separators)
    $outputRoot = (Resolve-Path -LiteralPath $outputFolderPath).ProviderPath

    $files = @(Get-ChildItem -LiteralPath $inputRoot -Recurse -File | Select-Object -First $maxDocs)
    $processedFiles = 0
    Write-Host "Converting $($files.Count) files..."

    # Names starting with "~$" or ".", or ending in "_backup", are skipped.
    $junkFiles = '^(~\$|\.|.*_backup$)'

    foreach ($file in $files) {
        if ($file.Name -match $junkFiles) {
            Write-Host "-- Skipped $file due to junk file pattern" -ForegroundColor Yellow
            continue
        }

        $relativePath = $file.DirectoryName.Substring($inputRoot.Length).TrimStart($separators)
        if ($relativePath) {
            $currentOutputFolderPath = Join-Path $outputRoot $relativePath
        }
        else {
            $currentOutputFolderPath = $outputRoot
        }
        if (-not (Test-Path -LiteralPath $currentOutputFolderPath)) {
            New-Item -ItemType Directory -Path $currentOutputFolderPath | Out-Null
        }

        $outputFileName = $file.Name + ".md"
        $outputFilePath = Join-Path $currentOutputFolderPath $outputFileName
        $fileWritten = $false

        # Convert only when there is no output yet or the source is newer.
        if (-not (Test-Path -LiteralPath $outputFilePath) -or ($file.LastWriteTime.ToFileTime() -gt (Get-Item -LiteralPath $outputFilePath).LastWriteTime.ToFileTime())) {
            switch ($file.Extension) {
                ".docx" { Convert-WordDocument -inputFile $file.FullName -outputFile $outputFilePath; $fileWritten = $true }
                # NOTE(review): pandoc lists docx, not legacy .doc, as an input
                # format - confirm that .doc files actually convert.
                ".doc" { Convert-WordDocument -inputFile $file.FullName -outputFile $outputFilePath; $fileWritten = $true }
                ".xlsx" { Convert-ExcelSheets -inputFile $file.FullName -outputFile $outputFilePath; $fileWritten = $true }
                ".pdf" { Convert-PDFDocument -inputFile $file.FullName -outputFile $outputFilePath; $fileWritten = $true }
            }

            if ($fileWritten) {
                # A converter may have failed without producing a file; only
                # stamp and count outputs that really exist.
                if (Test-Path -LiteralPath $outputFilePath) {
                    Write-Host "-- Processed $file" -ForegroundColor Yellow
                    # Copy the source timestamp so the next run can tell the
                    # output is up to date.
                    (Get-Item -LiteralPath $outputFilePath).LastWriteTime = $file.LastWriteTime
                    $processedFiles += 1
                }
                else {
                    Write-Host "-- Failed to convert $file" -ForegroundColor Red
                }
            }
        }
    }

    Write-Host "Conversion completed. Of $($files.Count), $processedFiles were processed."
}
# Kick off the conversion with the configured source and destination folders.
$conversionArgs = @{
    inputFolderPath  = $inputPath
    outputFolderPath = $outputPath
}
Convert-FilesToMarkdown @conversionArgs
@alirobe
Copy link
Author

alirobe commented Apr 12, 2023

Please note this requires binaries for poppler, as well as an installed copy of pandoc (choco install -y pandoc) and MS Excel. This is just an initial little POC and isn't really prod grade.

The following tools may do a better job:
https://tika.apache.org/
https://github.com/dbashford/textract (node)
https://github.com/deanmalmgren/textract (linux)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment