Skip to content

Instantly share code, notes, and snippets.

@mavaddat
Last active May 22, 2022 15:18
Show Gist options
  • Save mavaddat/7584836f14e8960f264f2163c631b6e3 to your computer and use it in GitHub Desktop.
Save mavaddat/7584836f14e8960f264f2163c631b6e3 to your computer and use it in GitHub Desktop.
This script finds invalid books in the Calibre library by looking for missing covers, and then searches the Downloads folder for the original book file.
# Builds (or reuses) the list of library books that carry an ISBN but have no
# format attached, and for which no other entry with the same title/author
# already has a format. The result is cached in .\formatLessBooks.xml as a
# hashtable with two keys:
#   Hash  - fingerprint of the ID list the cache was built from
#   Books - the OPF metadata ([xml]) of every book that still needs a file
Set-Location C:\CalibrePortable\Calibre\
$calibreNS = @{
    'dc'      = 'http://purl.org/dc/elements/1.1/'
    'calibre' = 'http://calibre.kovidgoyal.net/2009/metadata'
}
$cachePath = '.\formatLessBooks.xml'

# calibredb prints the matching IDs as a comma-separated list; blank tokens
# (empty result) are dropped so they are not cast to a bogus ID 0.
$formatLessBookIds = [int[]]@(
    &.\calibredb.exe search 'formats:false and (identifiers:"=isbn:")' --library-path="..\Calibre Library\" |
        ForEach-Object { $_ -split ',' } |
        Where-Object { -not [string]::IsNullOrWhiteSpace($_) }
)

# Array.GetHashCode() is an identity hash that changes on every run, so it can
# never match a value stored by a previous run. The joined ID list is stable
# across processes and is used as the cache fingerprint instead.
$idFingerprint = $formatLessBookIds -join ','

# Only read the cache when it exists; Import-Clixml errors on a missing file.
$formatLessBooks = $null
if (Test-Path -Path $cachePath -PathType Leaf) {
    $formatLessBooks = Import-Clixml -Path $cachePath
}

# Compare as strings so caches written with the old integer hash are simply
# treated as stale and rebuilt.
$cacheIsStale = ($null -eq $formatLessBooks) -or
                ($null -eq $formatLessBooks.Hash) -or
                ([string]$formatLessBooks.Hash -cne $idFingerprint)

if ($cacheIsStale)
{
    $books = foreach ($bookId in $formatLessBookIds) {
        $opf = [xml](&.\calibredb.exe show_metadata --as-opf $bookId --library-path="..\Calibre Library\")
        $title = Select-Xml -Xml $opf -Namespace $calibreNS -XPath '//dc:title' | ForEach-Object { $_.Node.InnerXml }
        $authors = (Select-Xml -Xml $opf -Namespace $calibreNS -XPath '//dc:creator' | ForEach-Object { $_.Node.InnerXml }) -join ' & '

        # Keep the book only when no entry with the same title/author already has a format.
        $withFormat = &.\calibredb.exe search formats:true title:`"$title`" author:`"$authors`" --library-path="..\Calibre Library\" 2>$null | ForEach-Object { $_ -split ',' }
        if ([string]::IsNullOrWhiteSpace($withFormat)) {
            $opf
        }
    }
    $formatLessBooks = @{ Hash = $idFingerprint; Books = @($books) }
    Export-Clixml -InputObject $formatLessBooks -Path $cachePath
}
# For every format-less book, look its ISBN up on libgen, download one file per
# distinct format into $env:TEMP and attach it to the book with
# `calibredb add_format`. Lookups/downloads run in parallel; the calibredb call
# is serialized through a named mutex.
$mutex = New-Object System.Threading.Mutex -ArgumentList @($false, 'calibre-portable-update-metadata')
try {
    $formatLessBooks.Books | ForEach-Object -ThrottleLimit 16 -Parallel {
        $mutex = $using:mutex
        $calibreNS = $using:calibreNS
        Set-Location C:\CalibrePortable\Calibre\

        $title = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:title' | ForEach-Object { $_.Node.InnerXml }
        $isbn = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:identifier' | Where-Object -FilterScript { $_.Node.scheme -eq 'ISBN' } | ForEach-Object { $_.Node.'#text' }
        $id = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:identifier' | Where-Object -FilterScript { $_.Node.scheme -eq 'Calibre' } | ForEach-Object { $_.Node.'#text' }

        if (-not [String]::IsNullOrWhiteSpace( $isbn ) -and -not [String]::IsNullOrWhiteSpace( $id ))
        {
            $sleepTime = 3
            $maxDownloadAttempts = 5

            # Search by ISBN; retry with exponential back-off until the page loads.
            $uri = "http://libgen.is/search.php?req=$isbn&open=0&res=25&view=simple&phrase=1&column=identifier&sort=year&sortmode=DESC"
            $request = Invoke-WebRequest -Uri $uri 2>$null
            while ($request.StatusCode -ne 200)
            {
                Start-Sleep -Seconds ($sleepTime *= 2)
                $request = Invoke-WebRequest -Uri $uri 2>$null
            }

            # Load the AngleSharp HTML parser from the installed package.
            Get-Package -Name 'AngleSharp' | ForEach-Object { Split-Path $_.Source } | Get-ChildItem -Filter '*.dll' -Recurse | Where-Object { $_ -Like '*standard*' } | Select-Object -Last 1 | ForEach-Object { Add-Type -Path $_ -ErrorAction SilentlyContinue -Verbose }
            $parser = [AngleSharp.Html.Parser.HtmlParser]::new()
            $parsedContent = $parser.ParseDocument($request.Content)

            # One download per distinct format; HashSet.Add returns $false for repeats.
            $formats = New-Object -TypeName 'System.Collections.Generic.HashSet[string]'
            $parsedContent.QuerySelectorAll('body > table.c > tbody > tr') | Select-Object -Skip 1 | ForEach-Object {
                $format = $_.QuerySelector('td:nth-child(9)').InnerHtml
                if (-not([String]::IsNullOrWhiteSpace($format)) -and $formats.Add($format))
                {
                    $requestDL = Invoke-WebRequest -Uri ($_.QuerySelector('td:nth-child(10) > a').GetAttribute('href'))
                    $parsedDLContent = $parser.ParseDocument($requestDL.Content)
                    $downloadUri = [uri]::new(($parsedDLContent.QuerySelector('#download > h2 > a').href))
                    $md5 = $downloadUri.segments[3] -replace '/', ''
                    $filename = Join-Path $env:TEMP "$id.$($downloadUri.segments[-1] -split '\.' | Select-Object -Last 1 )"

                    $needsDownload = -not (Test-Path $filename -PathType Leaf) -or
                                     ($md5.ToUpperInvariant() -ne (Get-FileHash -Path $filename -Algorithm MD5).Hash)
                    if ($needsDownload)
                    {
                        # Invoke-WebRequest -OutFile emits no response object, so the old
                        # "StatusCode -ne 200" loop spun forever after a *successful*
                        # download. Success is now "no error and the file exists", with a
                        # bounded number of retries.
                        $downloaded = $false
                        for ($attempt = 1; $attempt -le $maxDownloadAttempts -and -not $downloaded; $attempt++)
                        {
                            try {
                                Invoke-WebRequest -Uri $downloadUri -OutFile $filename -Resume -ErrorAction Stop
                                $downloaded = Test-Path $filename -PathType Leaf
                            } catch {
                                Write-Warning "Download attempt $attempt of $maxDownloadAttempts failed for $downloadUri : $($_.Exception.Message)"
                                Start-Sleep -Seconds ($sleepTime *= 2)
                            }
                        }
                        if (-not $downloaded) {
                            # Skip this format; `return` only ends the current pipeline item.
                            return
                        }
                    }

                    # Acquire outside the try so a failed acquire never triggers a release.
                    $mutex.WaitOne() | Out-Null
                    try {
                        $ebookFile = Get-Item $filename
                        &.\calibredb.exe add_format $id "$ebookFile" --library-path="..\Calibre Library\"
                        @{Id=$id; Ebook=$ebookFile; Title=$title; Filename=$filename} | Format-Table -AutoSize -Wrap | Write-Output
                    } finally {
                        $mutex.ReleaseMutex() | Out-Null
                    }
                }
            }
        }
    }
} finally {
    $mutex.Dispose()
}
$calibreLibraryPath = "$env:USERPROFILE\Calibre Library"

# The log file and Write-Log are set up first: the prerequisite checks below
# log through them, and a script function must be defined before it is called.
$log = New-TemporaryFile
Function Write-Log {
    <#
    .SYNOPSIS
    Writes one timestamped, tab-separated log line.
    .PARAMETER Message
    Text of the log entry (mandatory, must not be empty).
    .PARAMETER Level
    Severity tag: INFO, WARN, ERROR, FATAL or DEBUG. Defaults to INFO.
    .PARAMETER logfile
    File to append the line to. When omitted, the line is written to the
    output stream instead.
    #>
    [CmdletBinding()]
    Param(
        [Parameter(Mandatory=$True)]
        [string]
        $Message,
        [Parameter(Mandatory=$False)]
        [ValidateSet("INFO","WARN","ERROR","FATAL","DEBUG")]
        [String]
        $Level = "INFO",
        [Parameter(Mandatory=$False)]
        [string]
        $logfile
    )
    $Stamp = (Get-Date).toString("yyyy/MM/dd HH:mm:ss")
    $Line = "$Stamp`t$Level`t$Message"
    If($logfile) {
        Add-Content $logfile -Value $Line
    }
    Else {
        Write-Output $Line
    }
}

#Install pre-requisite module Communary.PASM for approximate string matching if necessary
#See https://github.com/gravejester/Communary.PASM/tree/6fe3b5ea01e8f49aeccbde0db0dc777079b9e9cd
if (-not (Get-Module -ListAvailable -Name Communary.PASM)) {
    $m="Installing missing approximate string matching module Communary.PASM"
    Write-Host $m
    Write-Log -Message $m -Level "WARN" -logfile $log

    # Identity and role used for the elevation check (previously never defined,
    # which made the IsInRole call fail on a null object).
    $myWindowsPrincipal = [System.Security.Principal.WindowsPrincipal]::new([System.Security.Principal.WindowsIdentity]::GetCurrent())
    $adminRole = [System.Security.Principal.WindowsBuiltInRole]::Administrator

    # Check to see if we are currently running "as Administrator"
    if ($myWindowsPrincipal.IsInRole($adminRole))
    {
        # We are running "as Administrator" - so change the title and background color to indicate this
        $Host.UI.RawUI.WindowTitle = $myInvocation.MyCommand.Definition + "(Elevated)"
        $Host.UI.RawUI.BackgroundColor = "DarkBlue"
        clear-host
        Install-Module Communary.PASM
    }
    else
    {
        # We are not running "as Administrator" - so relaunch as administrator
        # Create a new process object that starts PowerShell
        $newProcess = new-object System.Diagnostics.ProcessStartInfo "PowerShell"
        # Pass the current script path, quoted so paths containing spaces survive
        $newProcess.Arguments = "-File `"$($myInvocation.MyCommand.Definition)`""
        # The "runas" verb is only honored when the shell launches the process
        $newProcess.UseShellExecute = $true
        # Indicate that the process should be elevated
        $newProcess.Verb = "runas"
        # Start the new process
        [System.Diagnostics.Process]::Start($newProcess) | Out-Null
        # Exit from the current, unelevated, process
        # exit
    }
}

#Install pre-requisite pdfinfo by XpdfReader if necessary
#See https://www.xpdfreader.com/pdfinfo-man.html
$pdfInfoCommand = Get-Command -Name pdfinfo.exe -CommandType Application -ErrorAction SilentlyContinue | Select-Object -First 1
if (-not $pdfInfoCommand) {
    $m="Installing missing pdfinfo program by XpdfReader"
    Write-Host $m
    Write-Log -Message $m -Level "WARN" -logfile $log

    # Find the Windows tools archive on the download page. The Links collection
    # is used because ParsedHtml does not exist in PowerShell 7.
    $downloadPage = 'https://www.xpdfreader.com/download.html'
    $zipHref = (Invoke-WebRequest -Uri $downloadPage -Method Get).Links.href |
        Where-Object { $_ -match 'xpdf-tools-win.*\.zip$' } |
        Select-Object -First 1
    if (-not $zipHref) {
        throw "Could not find an xpdf-tools-win zip link on $downloadPage"
    }
    # Resolve against the page URL in case the link is relative.
    $pdfinfoZipDl = [uri]::new([uri]$downloadPage, [string]$zipHref).AbsoluteUri

    $zipPath = Join-Path $env:TEMP 'xpdf-tools-win.zip'
    Invoke-WebRequest -Uri $pdfinfoZipDl -OutFile $zipPath
    Expand-Archive -Path $zipPath -DestinationPath $env:TEMP -Force

    # The archive unpacks into a versioned xpdf-tools-win* folder; take the newest by name.
    $toolsDir = Get-ChildItem -Path $env:TEMP -Filter 'xpdf-tools-win*' -Directory | Sort-Object Name | Select-Object -Last 1
    $pdfInfoLoc = Join-Path $toolsDir.FullName 'bin64\pdfinfo.exe'
}
else {
    $pdfInfoLoc = $pdfInfoCommand.Source
}
# For every book without a cover, find the file in Downloads whose name is most
# similar to "<title> - <author>", validate it with pdfinfo, and either attach
# it to the book or remove the book from Calibre.
$confidence = 0.90
$calibreNS = @{ "dc" = "http://purl.org/dc/elements/1.1/"; "calibre" = "http://calibre.kovidgoyal.net/2009/metadata" };
$webNS = @{'xmlns'="http://www.w3.org/1999/xhtml"}
$files = @(Get-ChildItem -Path $env:USERPROFILE\Downloads -Recurse -File -Include *.AZW, *.AZW3, *.AZW4, *.CBZ, *.CBR, *.CBC, *.CHM, *.DJVU, *.DOCX, *.EPUB, *.FB2, *.FBZ, *.HTML, *.HTMLZ, *.LIT, *.LRF, *.MOBI, *.ODT, *.PDF, *.PRC, *.PDB, *.PML, *.RB, *.RTF, *.SNB, *.TCR, *.TXT, *.TXTZ)
# Blank tokens are dropped so an empty search result does not yield one bogus ID.
$ids = @(([string](calibredb search cover:False)).Split(",") | ForEach-Object { $_.Trim() } | Where-Object { $_ })
# Scores below this value are reported as low confidence.
# NOTE(review): with $confidence = 0.90 this is 0.10; confirm the intended threshold is not $confidence itself.
$lowConfidenceLimit = 1 - $confidence

$idIndex = 0
foreach ($id in $ids) {
    Write-Progress -Activity "Trying to find good copies of broken books" -Status "$([int16](100*$idIndex/$ids.Count))% Complete:" -PercentComplete (100*$idIndex/$ids.Count);
    $idIndex++

    # Similarity score -> file. Scores are unique keys; ties are logged and dropped.
    $candidates = New-Object 'System.Collections.Generic.Dictionary[Float,System.IO.FileSystemInfo]'
    $err = New-TemporaryFile
    $out = New-TemporaryFile

    $calibreBookFilename = Get-ChildItem -Path $calibreLibraryPath -Include "*($id)" -Recurse
    $book = [xml](calibredb.exe show_metadata --as-opf $id)
    $author = Select-Xml -Xml $book -Namespace $calibreNS -XPath "//dc:creator" | ForEach-Object { $_.Node.Innerxml }
    $title = Select-Xml -Xml $book -Namespace $calibreNS -XPath "//dc:title" | ForEach-Object { $_.Node.Innerxml }

    # "<title> - <author>"; fall back to the library folder name for untitled
    # books and omit the author part when it is unknown. (The title branch used
    # to repeat the author instead of using the title.)
    $titlePart = if ("$title" -ceq "untitled") { $calibreBookFilename.Name } else { "$title" }
    $authorPart = if ("$author" -ceq "Unknown") { "" } else { " - $author" }
    $compositeName = $titlePart + $authorPart
    #if($title -ieq 'untitled' -and $author -ieq 'Unknown' -and $calibreBookFilename.Name -ieq 'untitled - Unknown')
    #{
    $fileIndex = 0
    foreach ($file in $files) {
        Write-Progress -Activity "Looking for $title by $author" -Status "$([int16](100*$fileIndex/$files.Count))% Complete:" -PercentComplete (100*$fileIndex/$files.Count);
        $fileIndex++

        $baseName = $file.Name.Split(".")[0]
        # Files such as ".name" have an empty base name and cannot be compared.
        if ($baseName.Length -eq 0) {
            continue
        }

        # 1 - Jaccard distance = similarity: higher means a closer match.
        $compar = 1 - $(Get-JaccardDistance -a $compositeName -b $baseName -CaseSensitive)
        if ($candidates.ContainsKey($compar)) {
            $m="'" + $baseName + "' had the same confidence as '" + $candidates[$compar].Name.Split('.')[0] + "'"
            #Write-Host $m
            Write-Log -Message $m -Level "DEBUG" -logfile $log
        }
        else {
            $candidates.Add($compar,$file) | Out-Null
        }
    }
    $m="Looking for $title by $author"
    Write-Host $m
    Write-Log -Message $m -Level "INFO" -logfile $log

    # Highest similarity wins (an ascending sort used to select the worst match).
    $bestGuess = $candidates.GetEnumerator() | Sort-Object -Property Key -Descending | Select-Object -First 1
    $replacement = $null
    $isValid = $false
    $metadataText = $null

    if ($null -ne $bestGuess) {
        $compar = $bestGuess.Key
        $replacement = $bestGuess.Value;
        $m="Found $($replacement.Name) at $compar% confidence"
        Write-Host $m -NoNewLine -ForegroundColor DarkGreen
        Write-Log -Message $m -Level "INFO" -logfile $log
        if($compar -lt $lowConfidenceLimit){
            $m=" having low confidence `($compar`)"
            Write-Host $m -ForegroundColor DarkYellow
            Write-Log -Message $m -Level "INFO" -logfile $log
        }

        # -Wait: the redirected output must be complete before it is inspected.
        # NOTE(review): pdfinfo is run on every candidate regardless of file type; confirm that is intended for non-PDF formats.
        Start-Process -FilePath $pdfInfoLoc -ArgumentList "-meta `"$($replacement.FullName)`"" -NoNewWindow -Wait -RedirectStandardError $err.FullName -RedirectStandardOutput $out.FullName
        $hasProblems = $null -ne (Select-String -Path $err.FullName -Pattern "(Error|Warning)")
        $metadataText = @(Get-Content $out.FullName) -join "`n"
        $isValid = (-not $hasProblems) -and (-not [string]::IsNullOrWhiteSpace($metadataText))
    }

    if ($isValid) {
        $m="`nReplacing old file with new one"
        Write-Host $m -ForegroundColor Green
        Write-Log -Message $m -Level "INFO" -logfile $log
        # Show and log what pdfinfo reported (not the temp file's path).
        Write-Host $metadataText -ForegroundColor Green
        Write-Log -Message $metadataText -Level "INFO" -logfile $log
        calibredb add_format $id "$($replacement.FullName)"
    } else {
        if ($null -ne $replacement) {
            $m="`nThe file $($replacement.FullName) was corrupt.`nNo valid file found for $title by $author.`nRemoving book with ID $id from Calibre"
        } else {
            $m="`nNo candidate file found for $title by $author.`nRemoving book with ID $id from Calibre"
        }
        Write-Host $m -ForegroundColor DarkRed -BackgroundColor Black
        Write-Log -Message $m -Level "WARN" -logfile $log
        calibredb.exe remove $id
    }
    #}
    #else {
    #$m="`nThe book with ID $id was corrupt, had no title or author. Its filename is $($calibreBookFilename.Name). It is thus impossible to replace.`nRemoving book with ID $id from Calibre"
    #Write-Host $m -ForegroundColor DarkRed -BackgroundColor Black
    #Write-Log -Message $m -Level "INFO" -logfile $log
    #calibredb.exe remove $id
    #}
    Remove-Item $out, $err -Force
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment