@Krazybug
Last active February 17, 2024 23:24
#!/usr/bin/env python3
'''
calisuck: index, smartly filter and download ebooks from Calibre open directories

Installation:

You need Python 3.6 or later installed (the script uses f-strings).

Download the file as a zip, unzip it and enter the directory,
OR:
> git clone https://gist.github.com/b7e814d7189db9ee1d6b9c1d1a1de95c.git
> mv b7e814d7189db9ee1d6b9c1d1a1de95c calisuck
> cd calisuck

THEN:
> python3 -m venv .
> . bin/activate
> pip install requests fire humanize langid iso639 beautifultable
> python calisuck.py --help
> python calisuck.py index-ebooks --help
> python calisuck.py download-ebooks --help
> python calisuck.py download-covers --help
'''
'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.
'''
import sys
import os
import time
import re
import shutil
import requests
import json
import fire
from humanize import naturalsize as hsize
from langid.langid import LanguageIdentifier, model
import iso639
from requests.adapters import HTTPAdapter
import urllib.parse
import urllib3
from beautifultable import BeautifulTable
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
def load_metadata(path, uuid):
    filepath=path+'/'+uuid+'/metadata.json'
    # print (filepath)
    if os.path.isfile(filepath):
        try:
            with open(filepath, 'r') as fd:
                return json.load(fd)
        except:
            print ("Error loading metadata for:", uuid, "from path:", path)
            return 0
    else:
        # print ("Metadata not found for:", uuid, "from path:", path)
        return 0

def save_metadata(path, book):
    filepath=path+'/'+book['uuid']+'/metadata.json'
    # print("Saving book metadata for:", book['uuid'], "to:", filepath)
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    # Write to a .tmp file first, then move it into place once it is closed
    with open(filepath+".tmp", 'w') as fd:
        json.dump(book, fd, indent=4, separators=(',', ': '))
    try:
        shutil.move(filepath+".tmp", filepath)
        # print("Saved to:", filepath)
    except:
        print("Unable to rename .tmp file:", filepath+".tmp")

def get_cover_path(path, uuid):
    filepath=path+'/'+uuid+'/cover.jpg'
    if os.path.isfile(filepath): return filepath
    else: return 0

def get_file_path(path, uuid, fileformat):
    files=os.listdir(path+'/'+uuid)
    if files:
        for f in files:
            fname, ext=os.path.splitext(f)
            if ext =='.'+fileformat:
                return path+'/'+uuid+'/'+f
        else: return 0
    else: return 0

def get_cover(path, book, map):
    url=book['source']['cover']
    if map:
        pu=urllib.parse.urlparse(url)
        pu=(pu[0], map, *pu[2:])
        print(pu)
        url=urllib.parse.urlunparse(pu)
    print("Downloading cover from:", url)
    r=requests.get(url, timeout=(20, 3), verify=False)
    r.raise_for_status()
    filepath=path+'/'+book['uuid']+'/cover.jpg'
    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    # Move the .tmp file only after it is closed (required on Windows)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def download_covers(dir='my_books', server='', map=""):
    """ Download the cover for each book """
    for root, dirs, files in os.walk(dir, topdown=True):
        for d in dirs:
            # print()
            # print("-->", d)
            book = load_metadata(root, d)
            if book:
                # if book['source']['status'] != "ignored":
                if True:
                    if not get_cover_path(root, book['uuid']):
                        print()
                        print("-->", d)
                        print(book['uuid'])
                        try:
                            get_cover(root, book, map)
                        except:
                            print ("Unable to get cover", book['uuid'])
                    else:
                        pass
                        # print ("Cover already present:", book['uuid'])
                else:
                    print ('book {} in status {}: ignored'.format(book['uuid'], book['source']['status']))
            else:
                print ("No ebook metadata found in:", root)

def get_file_size(url):
    print("Downloading size:", url)
    r = requests.head(url, verify=False)
    r.raise_for_status()
    size=r.headers['Content-Length']
    print("Size received="+ hsize(size))
    return int(size)

def get_file(path, book, format, session, map, map_lib):
    uuid = book['uuid']
    url=book['source']['formats'][format]['url']
    if map:
        pu=urllib.parse.urlparse(url)
        pu=(pu[0], map, *pu[2:])
        print(pu)
        url=urllib.parse.urlunparse(pu)
    if map_lib:
        # pu=urllib.parse.urlparse(url)
        # print(pu)
        url_s=url.split("/")
        # print(url_s)
        url_s=url_s[:-1]+[map_lib]
        # print('/'.join(url_s))
        url='/'.join(url_s)

    print()
    print("Downloading ebook:", url)
    print("Size expected (estimation):", hsize(book['source']['formats'][format]['size']))
    r = session.get(url, timeout=(25,15), verify=False)
    # headers = {"Range": "bytes=0-1023"}
    # r = requests.get(url, headers=headers)
    r.raise_for_status()
    # print(r.headers)
    if 'Content-Length' in r.headers:
        print("Size received="+hsize(r.headers['Content-Length']))
    else:
        print("Size received: unknown (no Content-Length header)")

    filename=re.findall(r'filename="(.*)"', r.headers['Content-Disposition'])
    # print(filename)
    if len(filename):
        filepath=path+'/'+uuid+'/'+filename[0]
    else:
        filepath=path+'/'+uuid+'/'+uuid+"."+format

    os.makedirs(os.path.dirname(filepath+".tmp"), exist_ok=True)
    with open(filepath+".tmp", 'wb') as fd:
        fd.write(r.content)
    # Move the .tmp file only after it is closed (required on Windows)
    shutil.move(filepath+".tmp", filepath)
    print("Saved to:", filepath)

def set_status(uuid, status, dir='.'):
    book = load_metadata(dir, uuid)
    if book:
        if book['source']['status'] != status:
            book['source']['status'] = status
            save_metadata(dir, book)
            print("Status changed to", status+":", book['uuid'], "(", book['title'], ")")
        else:
            print("Status unchanged:", status, "-", book['uuid'])
    else:
        print ("No ebook metadata found for:", uuid)

def remove_book(uuid, path='.'):
    print(os.getcwd())
    bookpath=path+'/'+uuid
    if os.path.isdir(bookpath):
        try:
            shutil.rmtree(bookpath)
            print(uuid, "removed")
        except:
            print("Problem removing:", bookpath)
    else:
        print(uuid, "not found")

def update_done_status(book):
    source=book['source']
    if source['status']!='ignored':
        if set(source['formats'].keys()) == set(book['formats']) & set(source['formats'].keys()):
            book['source']['status']="done"
        else:
            book['source']['status']="todo"

def index_ebooks(site, library="", start=0, stop=0, dir="my_books", inc=1000, force_refresh=False):
    """
    Index a remote Calibre library.

    You will get in your <dir> all the metadata (title, authors, isbn, ...) for each book.
    They're stored as simple JSON files (metadata.json) so that you can easily visualize them or process them with the 'jq' program.
    They are stored in subdirectories named after each book's UUID. These directories match the different books and allow you to group
    all the different formats of the same book, and possibly the cover file.
    You can mix books from different sites without any (theoretical) collisions.

    Params:
    --site=<string>    : URL of the site to index (ex: http://123.123.123.123/)
    --library=<string> : Id of the library to index. The script indexes the default library by default.
                         The id is the string following '&library_id=' in the URL.
    --force-refresh (default=False) : Force a refresh of the metadata. By default all the metadata
                                      already gathered is ignored.
    --start=<int> (default=0)
    --stop=<int>  (default=0)       : Restrict indexing to a range of ebooks.
    --inc=<int>   (default=1000)    : Set the number of ebooks requested from the server per call.
    """
    os.makedirs(dir, exist_ok=True)

    offset= 0 if not start else start-1
    num=min(1000,inc)
    server=site.rstrip('/')
    api=server+'/ajax/'
    library= '/'+library if library else library

    print("Server:", server)
    url=api+'search'+library+'?num=0'
    print()
    print("Getting ebooks count:", server)
    try:
        r = requests.get(url,verify=False)
        r.raise_for_status()
    except:
        print("Unable to open site:", url)
        sys.exit(1)
    print("Total count=",r.json()["total_num"])

    total_num=int(r.json()["total_num"])
    total_num= total_num if not stop else stop

    print()
    print("Start indexing")
    range=offset+1
    while offset < total_num:
        remaining_num = min(num, total_num - offset)
        # print()
        # print("Downloading ids: offset="+str(offset), "num="+str(remaining_num))
        url=api+'search'+library+'?num='+str(remaining_num)+'&offset='+str(offset)+'&sort=timestamp&sort_order=desc'
        # print("->", url)
        r=requests.get(url, verify=False)
        # print("Ids received from:"+str(offset), "to:"+str(offset+remaining_num-1))
        # print()
        # print("\rDownloading metadata from", str(offset+1), "to", str(offset+remaining_num),end='')
        books_s=",".join(str(i) for i in r.json()['book_ids'])
        url=api+'books'+library+'?ids='+books_s
        # print("->", url)
        r=requests.get(url, verify=False)
        # print(len(r.json()), "received")
        for id, r_book in r.json().items():
            uuid=r_book['uuid']
            if not uuid:
                print ("No uuid for ebook: ignored")
                continue
            if r_book['authors']:
                desc= f"uuid={uuid} ({r_book['title']} / {r_book['authors'][0]})"
            else:
                desc= f"uuid={uuid} ({r_book['title']})"
            s=f"\r--> {range}/{total_num} - {desc}"
            s='{:140.140}'.format(s)
            print (s, end='')
            if not force_refresh:
                try:
                    book = load_metadata(dir, uuid)
                except:
                    print()
                    print("Unable to get metadata from:", uuid)
                    range+=1
                    continue
                if book:
                    # print("Metadata already present for:", uuid)
                    range+=1
                    continue
            if not r_book['formats']:
                print()
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            book={}
            url=api+'book/'+id
            book['title']=r_book['title']
            book['authors']=r_book['authors']
            book['series']=r_book['series']
            book['series_index']=r_book['series_index']
            book['edition']=0
            book['uuid']=r_book['uuid']
            book['identifiers']=r_book['identifiers']
            book['comments']=r_book['comments']
            book['pubdate']=r_book['pubdate']
            book['publisher']=r_book['publisher']
            languages=r_book['languages']
            if not languages:
            # if True:
                if book['comments']:
                    text=book['comments']
                else:
                    text=book['title']
                s_language, prob=identifier.classify(text)
                if prob >= 0.85:
                    language = iso639.to_iso639_2(s_language)
                    book['languages']=[language]
                else:
                    book['languages']=[]
            else:
                book['languages']=[]
                for l in languages:
                    book['languages'].append(iso639.to_iso639_2(l))
            book['tags']=r_book['tags']
            book['formats']=[]
            book['metadata_version']=0.1
            source={}
            source['url']=url+library
            source['id']=id
            try:
                tmpbook = load_metadata(dir, uuid)
            except:
                print("Unable to get metadata from:", uuid)
                range+=1
                continue
            if tmpbook and tmpbook['source']['status']=="ignored":
                source['status']="ignored"
            else:
                source['status']="todo"
            source['cover']=server+r_book['cover']
            source['timestamp']=r_book['timestamp']
            format_sources={}
            formats=r_book['formats']
            for f in formats:
                s={}
                url=''
                if f in r_book['main_format']:
                    url=r_book['main_format'][f]
                else:
                    url=r_book['other_formats'][f]
                s['url']=server+url
                if 'size' in r_book['format_metadata'][f]:
                    s['size']=int(r_book['format_metadata'][f]['size'])
                else:
                    print()
                    print("Size not found for format '{}' : {}".format(f, uuid))
                    print("Trying to get size online: {}".format(s['url']))
                    try:
                        s['size']=get_file_size(s['url'])
                    except:
                        print("Unable to access format '{}' : {} skipped".format(f, uuid))
                        continue
                s['status']='todo'
                format_sources[f]=s
            source['formats']=format_sources
            book['source']=source
            if not source['formats']:
                print("No format found for {}".format(r_book['uuid']))
                range+=1
                continue
            update_done_status(book)
            # print("Saving metadata for:", uuid)
            try:
                save_metadata(dir, book)
            except:
                print()
                print("Unable to save book metadata", book['uuid'])
            range+=1
        offset=offset+num
    print()
    print("Done")

def has_languages(book, languages=[], ignore_empty_language=False):
    # print("Accepted languages", languages)
    if not ignore_empty_language:
        # print("Unknown language accepted")
        pass
    # quick fix ("rustine"): older metadata may not have the 'languages' key
    if not 'languages' in book:
        book['languages']=[]
    # print("Book languages", book['languages'])
    if ignore_empty_language and not book['languages']:
        # print ("'{}' ignored: language is empty".format(book['uuid']))
        return False
    if not ignore_empty_language and not book['languages']:
        # print ("'{}' todo: language is empty".format(book['uuid']))
        return True
    expected_languages=list(set(book['languages']) & set(languages))
    if languages and not expected_languages:
        # print ("'{}' ignored: language {} not in {}".format(book['uuid'], book['languages'],languages))
        return False
    # print ("'{}' todo: expected languages {}".format(book['uuid'], expected_languages))
    return True

def has_identifiers(book, identifiers=[], ignore_empty_identifiers=False):
    # print("Accepted identifiers", identifiers)
    if not ignore_empty_identifiers:
        # print("Unknown identifiers accepted")
        pass
    # print("Book identifiers", book['identifiers'].keys())
    if ignore_empty_identifiers and not book['identifiers']:
        # print ("'{}' ignored: identifier is empty".format(book['uuid']))
        return False
    if not ignore_empty_identifiers and not book['identifiers']:
        # print ("'{}' todo: identifiers is empty".format(book['uuid']))
        return True
    expected_identifiers=list(set(book['identifiers'].keys()) & set(identifiers))
    if identifiers and not expected_identifiers:
        # print ("'{}' ignored: identifiers {} not in {}".format(book['uuid'], book['identifiers'].keys(), identifiers))
        return False
    # print ("'{}' todo: expected identifiers {}".format(book['uuid'], expected_identifiers))
    return True

def download_ebooks(dir= 'my_books', formats=[], single_format=False, ignored_formats=[], languages=[], identifiers=[], min_size=0, max_size=0, ignore_empty_language=False, ignore_empty_identifiers=False, dry_run=False, timer=0, map="", map_lib=""):
    '''
    Download the ebooks into matching subdirectories.

    The different formats of the same book are grouped in the same UUID-named directory,
    next to the metadata file (metadata.json).
    The status of each format of a book, and its global status, are initially set to 'todo'.
    They move to 'done' after their download. This allows you to rerun the download and progressively collect books.

    You can use different options to filter the formats to download
    by language, size, format and identifiers (isbn, ...).
    A report of the download is displayed at the end of the process.
    You can run this command in dry mode (--dry-run) with different settings
    to only display the report and prepare your effective download.

    Params:
    --min-size=<int> (default=0)
    --max-size=<int> (default=infinity)     : Delimit the size in MB of the accepted formats
    --dry-run (default=False)               : Simulate the download and only display the report
    --languages=<string>                    : Restrict the download to a list of specific languages
                                              (Ex: --languages='["eng","ita"]')
    --ignore-empty-language (default=False) : Ignore books with an unidentified language
    --formats=<string>                      : Restrict the download to a list of specific formats
                                              (Ex: --formats='["epub", "mobi", "pdf"]')
    --ignored-formats=<string>              : Ignore a list of specific formats. Can be combined with --formats.
                                              (Ex: --ignored-formats='["mp3", "rar", "zip"]')
    --single-format (default=False)         : Limit the download to 1 format per book, with this preference order:
                                              'azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub',
                                              'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar',
                                              'rtf', 'txt', 'zip', 'fb2'
    --identifiers=<string>                  : Restrict the download to a list of specific identifiers
                                              (Ex: --identifiers='["isbn","asin"]')
    --ignore-empty-identifiers (default=False) : Ignore books without identifiers (often OCR scans)
    '''
    # all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip']
    print()
    if single_format: my_formats = formats if formats else all_ordered_formats
    else: my_formats=formats
    # print("formats=", my_formats)
    min_size=int(min_size)*1024*1024
    max_size=int(max_size)*1024*1024
    print ("Format expected between {} and {}".format(hsize(min_size), hsize(max_size) if max_size else "infinity"))

    total_size=0
    total_size_by_format={}
    total_ebook_count=0
    total_format_count=0
    total_count_by_format={}
    size_max=0
    size_min=0
    language_count={}
    identifiers_count={}

    s = requests.Session()
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":
                    if not has_languages(book, languages=languages, ignore_empty_language=ignore_empty_language):
                        continue
                    if not has_identifiers(book, identifiers=identifiers, ignore_empty_identifiers=ignore_empty_identifiers):
                        continue
                    source=book['source']
                    download_formats = get_formats_to_download(book, accepted_formats=my_formats, single_format=single_format, ignored_formats=ignored_formats, max_size=max_size, min_size=min_size)
                    if not len(download_formats):
                        # print ("'{}' ignored: no more format available in formats expected {}".format(uuid, download_formats))
                        # print()
                        pass
                    else:
                        ebook_kept=False
                        for f in download_formats:
                            url = source['formats'][f]['url']
                            # if map:
                            #     pu=urllib.parse.urlparse(url)
                            #     pu=(pu[0], map, *pu[2:])
                            #     print(pu)
                            #     print(urllib.parse.urlunparse(pu))
                            if url:
                                # It shouldn't occur: Need to download again
                                if get_file_path(dir, uuid, f):
                                    # print ("Format '{}' already present for {}: Retrying".format(f, uuid))
                                    # print()
                                    # continue
                                    # print("Format '{}': size expected={}".format(f, hsize(source['formats'][f]['size'])))
                                    pass
                                # print(f"--> format '{f}' for ({book['title']} / {book['authors'][0]} / {str(book['series'])})")
                                if not dry_run:
                                    try:
                                        get_file(dir, book, f, s, map, map_lib)
                                        book['formats'].append(f)
                                        book['source']['formats'][f]['status']="done"
                                        if timer:
                                            print(f"Waiting {timer} seconds")
                                            time.sleep(timer)
                                    except Exception as msg:
                                        print("Unable to get book:", url)
                                        print(msg)
                                        time.sleep(5)
                                        continue
                                    save_metadata(dir, book)
                                ebook_kept=True
                                size=source['formats'][f]['size']
                                total_size += size
                                size_max = size if size>size_max else size_max
                                if not size_min:
                                    size_min = size
                                else:
                                    size_min = size if size<size_min else size_min
                                if not f in total_size_by_format:
                                    total_size_by_format[f] = size
                                else: total_size_by_format[f] +=size
                                if not f in total_count_by_format:
                                    total_count_by_format[f] = 1
                                else:
                                    total_count_by_format[f]+=1
                                total_format_count +=1
                            else:
                                # print ("Format '{}' ignored for {} ({}): No url)".format(f, uuid, book['title']))
                                # print()
                                pass
                        if ebook_kept:
                            total_ebook_count+=1
                            if not book['languages']:
                                if not '<unknown>' in language_count:
                                    language_count['<unknown>'] = 1
                                else:
                                    language_count['<unknown>']+=1
                            else:
                                for l in book['languages']:
                                    if not l in language_count:
                                        language_count[l] = 1
                                    else:
                                        language_count[l]+=1
                            if not book['identifiers']:
                                if not '<unknown>' in identifiers_count:
                                    identifiers_count['<unknown>'] = 1
                                else:
                                    identifiers_count['<unknown>']+=1
                            else:
                                for l in book['identifiers'].keys():
                                    if not l in identifiers_count:
                                        identifiers_count[l] = 1
                                    else:
                                        identifiers_count[l]+=1
                            if not dry_run:
                                update_done_status(book)
                                if book['source']['status']=="done":
                                    save_metadata(dir, book)
                                    print("Book done:", book['uuid'])
                                    print()
                            # total_ebook_count+=1
                else:
                    # print()
                    # print("-->", uuid, "("+book['title']+")")
                    # print ('{} in status "{}": skipped'.format(book['uuid'], status))
                    # print(f"--> {uuid} ({book['title']}) in status {status}: skipped", end="\r")
                    # print(f"--> {uuid} ({book['title']})", end="\r")
                    print(f'--> {counter} books handled', end="\r")
    print()
    print("Reporting ...")

    table_l = BeautifulTable()
    table_l.column_headers = ["Language", "Ebooks count"]
    for l, c in language_count.items():
        table_l.append_row([l, c])
    table_l.sort("Ebooks count", reverse=True)
    table_l=table_l[0:10]

    table_i = BeautifulTable()
    table_i.column_headers = ["Identifier", "Ebooks count"]
    for i, c in identifiers_count.items():
        table_i.append_row([i, c])
    table_i.sort("Ebooks count", reverse=True)
    table_i=table_i[0:10]

    print()
    print("Top 10 ebooks by language/identifier:")
    table = BeautifulTable()
    table.column_headers = ["Languages", "Identifiers"]
    table.append_row([table_l, table_i])
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)

    print()
    print("Total count of ebooks by format:")
    table = BeautifulTable()
    table.column_headers = ["Format", "Size", "Ebooks count"]
    for f in total_count_by_format.keys():
        table.append_row([f, hsize(total_size_by_format[f]),total_count_by_format[f]])
    table.sort("Ebooks count", reverse=True)
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)

    table_c = BeautifulTable()
    table_c.column_headers = ["", "Total count"]
    table_c.append_row(["Formats", total_format_count])
    table_c.append_row(["Ebooks", total_ebook_count])

    table_s = BeautifulTable()
    table_s.column_headers = ["", "Size"]
    # table.append_row(["Min", hsize(size_min)])
    table_s.append_row(["Biggest File", hsize(size_max)])
    table_s.append_row(["Total", hsize(total_size)])

    print()
    print("Summary:")
    table = BeautifulTable()
    table.column_headers = ["Total Count", "Total Size"]
    table.append_row([table_c, table_s])
    # table.set_style(BeautifulTable.STYLE_MARKDOWN)
    print(table)
    print()

def get_formats_to_download(book, accepted_formats=[], ignored_formats=[], single_format=False, min_size=0, max_size=0):
    # print("Accepted formats", accepted_formats)
    source=book['source']
    # print("Formats available in source: {}".format(list(source['formats'].keys())))
    my_formats=[]
    for f,v in source['formats'].items():
        if v['status']=='todo':
            my_formats.append(f)
    # print("Formats in 'todo': {}".format(my_formats))
    formats=[]
    if single_format:
        if accepted_formats:
            for f in accepted_formats:
                if f in my_formats:
                    formats=[f]
                    break
        else:
            print("need at least 1 format for ordering")
    else:
        if accepted_formats:
            formats=list(set(accepted_formats) & set(my_formats))
        elif ignored_formats:
            formats = list(set(my_formats) - set(ignored_formats))
        else:
            formats=my_formats
    # print("Formats expected: {}".format(formats))
    download_formats=formats[:]
    for f in formats:
        if not 'size' in source['formats'][f] and max_size:
            # print ("Format '{}' ignored for {}: Size unknown".format(f, book['uuid']))
            download_formats.remove(f)
        else:
            size = source['formats'][f]['size']
            if size < min_size or (max_size and size > max_size):
                download_formats.remove(f)
                # print ("Format '{}' ignored for {}: size={} but expected between {} and {}".format(f, book['uuid'], hsize(size), hsize(min_size), hsize(max_size) if max_size else "infinity"))
    return download_formats

def update_format_statuses(book,refresh_ignored):
    formats=book['source']['formats']
    for f, v in formats.items():
        if v['status']=='ignored' and not refresh_ignored:
            # print ("Format '{}' ignored: {} ({}))".format(f, book['uuid'], book['title']))
            pass
        else:
            # print ("Format '{}' todo: {} ({}))".format(f, book['uuid'], book['title']))
            book['source']['formats'][f]['status']='todo'

import glob
def check_ebooks(dir= 'my_books', dry_run=True):
    '''
    Check ebooks: mark formats already present on disk as 'done'.
    '''
    print("Checking ...")
    for root, dirs, files in os.walk(dir, topdown=True):
        for counter, uuid in enumerate(dirs):
            book = load_metadata(root, uuid)
            if book:
                status=book['source']['status']
                if status=="todo":
                    print(status)
                    source=book['source']
                    update=False
                    for f, v in source["formats"].items():
                        print(uuid, f, v['status'])
                        if v['status']=="todo":
                            formats= glob.glob(root+"/"+uuid+"/*."+f)
                            print(formats)
                            if formats:
                                print(book['uuid'], formats[0])
                                book['source']['formats'][f]['status']="done"
                                update=True
                    if not dry_run and update:
                        update_done_status(book)
                        save_metadata(dir, book)
                        print("Book done", book['uuid'])
                        print()
    print()

if __name__ == "__main__":
    fire.Fire({
        "index_ebooks": index_ebooks,
        "download_ebooks": download_ebooks,
        "download_covers": download_covers,
        "set_status": set_status,
        "check_ebooks": check_ebooks
    })
@Krazybug

You're welcome

@deedledeedle

Could this be turned into a Calibre plugin? It would be awesome to run it directly from Calibre with a basic GUI.

@Krazybug

Krazybug commented Dec 28, 2019

I'm working on a new, industrialized version. The idea is to allow automated book sorting, tagging and so on, inspired by https://github.com/na--/ebook-tools
I want it to stay flexible, and Calibre doesn't scale well.

One of the first features provided, though, is the ability to rebuild Calibre libraries with symlinks. This way you can maintain several Calibre libraries in parallel which share the same books.

Maybe in the future a plugin could be envisaged, but it's not on my priority list.

Thanks for your interest

@Krazybug

Krazybug commented Dec 28, 2019

RamiferousReader, I'm glad you enjoy it!

I'm not sure I clearly understand your request, but I think this is not implemented as such.
Here is my personal cookbook:
Rather than using the "set-status" command like
find . -name metadata.json | xargs jq -r '.| select (.title | contains ("Python"))| select(.source.status=="todo")|.uuid' | xargs -n 1 -I {} python calisuck.py set-status --uuid={} --status="ignored"

I now prefer to just move the books I don't need to another dir. It's more efficient. I'm working on storing this info in a cache to improve it in a future version, but it's not in place for now. So if you decide to eliminate some books, here is a way.

Imagine I don't want to download files with the tag "Fiction":

cd your-dir-with-only-uuids
mkdir ../trash
ls -1  | xargs -n 1 -I {} jq -r '.| select (.tags[] | test("Fiction")) | .uuid' {}/metadata.json | xargs -n 1 -I {} echo mv -v {} ../trash

Adjust it with your own regexp or rerun it with different tags. Don't forget to remove the "echo" in the last pipe when you're ready.

Then you just need to run the download as usual with the download-ebooks command.

And to list all the tags:
ls -1 | xargs -n 1 -I {} jq -r '.tags[]' {}/metadata.json

Piping the output to sort -u to deduplicate it might be slow.
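For example, to get the deduplicated list of tags in one go (just the previous command piped to sort -u; it can be slow on big collections):

ls -1 | xargs -n 1 -I {} jq -r '.tags[]' {}/metadata.json | sort -u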

PS: I'm on a Mac. You may need to adapt some xargs arguments to run it on Linux.

@RamiferousReader

Krazybug, yes! I am enjoying your coding very much. :)
I deleted my original comment because I later thought it was kind of crazy and possibly unappreciative, requesting such parameters outside your code.

Thanks very much for your informative response. I have read through it and look forward to trying it out and testing it.
I will look through the xargs usage and see how it maps between both OSes.

@sclint13

Hey Krazybug, this script is awesome. I've been able to index a site, and the dry-run gives me that list of information like you said, but I can't really figure out how to get a list of the books. I downloaded jq (through Homebrew, if that matters) but I'm just not sure how to invoke the code you listed. I've tried running

jq find . -name metadata.json | xargs jq -r '. | select(.source.status=="todo")|.uuid+ " | "+.title'+ " | "+.authors[0]' > ebooks.txt

but that just gives me a

pipe quote>

prompt, which I don't even know what to make of. I've tried running it without the

jq

in front (so basically just copy-pasted from your comment above) with the same results. I know I'm missing something here, and I'm hoping you could help me out. I'm still pretty green when it comes to the command line, so I appreciate any advice you can spare! Thanks!

@luc-pitipuis

Hi Krazybug, do you think using the OPDS feed would make the first run faster?

@RamiferousReader

Krazybug, thanks for the help with the sorting arguments. It worked fine. I'm using Linux on a Mac, so perhaps that helped :)

I found a slow site, replaced 'trash' with 'want', then moved your command to that location and ran it that way.
Since the site was slow, I thought this would be more beneficial to all parties.

@Krazybug

@luc-pitipuis
I've found the reason why the indexing phase was so slow: I was always searching by key in a big dict rather than keeping references. I will publish a fix.
I'm also working on a brand new version of this script which stores this data in a SQLite DB. It will boost performance, and you can run some cool SQL or full-text queries on it to prepare your download. It will also be able to handle several sites in one process, as demeter does, rather than launching a process for each one.

@Krazybug

Krazybug commented Dec 29, 2019

RamiferousReader, thanks for the tip! I also use the wishlist for slow sites.

Also, I usually download incrementally, progressively raising the --max-size option to optimize downloads.
I start with
calisuck.py download-ebooks --max-size=1 --dry-run
then run it for real:
calisuck.py download-ebooks --max-size=1
Then I increase the size for a new run.
When I'm around 20 MB I start to filter things out, as the bigger files are often just OCR scans, magazines or audio.
I move them to an 'ignored' dir with some jq 'test' filters and 'ls -1'.

This is one of the reasons I'm turning this script into a project that lets you record exclusion rules for your sites.

@Krazybug

Krazybug commented Dec 29, 2019

sclint13,
I think you're trying to run "find" inside the jq command. I'm not sure that exists.
Just retry without the jq at the beginning:
find . -name metadata.json | xargs jq -r '. | select(.source.status=="todo")|.uuid+ " | "+.title'+ " | "+.authors[0]' > ebooks.txt

EDIT 1: Sorry, I didn't read the end of your comment.
I'm investigating.

EDIT 2:
Found it! There was an extra quote in the command just after .title.

Here is the good one:
find . -name metadata.json | xargs jq -r '. | select(.source.status=="todo")|.uuid+ " | "+.title+ " | "+.authors[0]'
I've updated the first comment.

@RamiferousReader

Krazybug, yes, limiting the size initially is ideal. Your setting of the 'status' in the JSON to either 'done' or 'todo' is quite helpful.

Your new ideas look exciting. Right now I create a special category folder and run the download directly there.

@TLaborde

Instead of using the ajax endpoint, you can use OPDS to quickly get the list of all books. You don't get as many fields, but for a first quick run it could be enough. Please see a simple example here:

import feedparser
import re
import datetime

class OpdsScrapper:

    def __init__(self, opdsUrl):
        self.catalogs = self.downloadOpdsRootCatalog(opdsUrl)
        self.downloadOpdsCatalog(self.catalogs['By Newest'])

    def downloadOpdsRootCatalog(self, opdsUrl):
        feed = feedparser.parse(opdsUrl)
        if 'bozo_exception' in feed:
            exception = feed['bozo_exception']
            message = 'Failed opening the OPDS URL ' + opdsUrl + ': '
            reason = ''
            if hasattr(exception, 'reason'):
                reason = str(exception.reason)
            print(message + reason)
            return {}
        catalogEntries = {}
        firstTitle = None
        for entry in feed.entries:
            title = entry.get('title', 'No title')
            if firstTitle is None:
                firstTitle = title
            links = entry.get('links', [])
            firstLink = next(iter(links), None)
            if firstLink is not None:
                catalogEntries[title] = firstLink.href
        return (catalogEntries)

    def downloadOpdsCatalog(self, opdsCatalogUrl):
        print("downloading catalog: %s" % opdsCatalogUrl)
        opdsCatalogFeed = feedparser.parse(opdsCatalogUrl)
        self.books = self.makeMetadataFromParsedOpds(opdsCatalogFeed.entries)
        nextUrl = self.findNextUrl(opdsCatalogFeed.feed)
        while nextUrl is not None:
            nextFeed = feedparser.parse(nextUrl)
            print("downloading catalog: %s" % nextUrl)
            self.books = self.books + \
                self.makeMetadataFromParsedOpds(nextFeed.entries)
            nextUrl = self.findNextUrl(nextFeed.feed)

    def makeMetadataFromParsedOpds(self, books):
        metadatalist = []
        for book in books:
            metadata = self.opdsToMetadata(book)
            metadatalist.append(metadata)
        return metadatalist

    def opdsToMetadata(self, opdsBookStructure):
        book = {}
        book['title'] = opdsBookStructure.title
        book['authors'] = opdsBookStructure.author.replace(u'& ', u'&')
        book['edition'] = 0
        book['uuid'] = opdsBookStructure.id.replace('urn:uuid:', '', 1)
        rawTimestamp = opdsBookStructure.updated
        parsableTimestamp = re.sub(r'((\.[0-9]+)?\+00:00|Z)$', '', rawTimestamp)
        book['timestamp'] = datetime.datetime.strptime(
            parsableTimestamp, '%Y-%m-%dT%H:%M:%S')
        tags = []
        summary = opdsBookStructure.get(u'summary', u'')
        summarylines = summary.splitlines()
        for summaryline in summarylines:
            if summaryline.startswith(u'TAGS: '):
                tagsline = summaryline.replace(u'TAGS: ', u'')
                tagsline = tagsline.replace(u'<br />', u'')
                tagsline = tagsline.replace(u', ', u',')
                tags = tagsline.split(u',')
        book['tags'] = tags
        bookDownloadUrls = []
        links = opdsBookStructure.get('links', [])
        for link in links:
            url = link.get('href', '')
            bookType = link.get('type', '')
            # Skip covers and thumbnails
            if not bookType.startswith('image/'):
                if bookType == 'application/epub+zip':
                    # EPUB books are preferred and always put at the head of the list if found
                    bookDownloadUrls.insert(0, url)
                else:
                    # Formats other than EPUB (eg. AZW), are appended as they are found
                    bookDownloadUrls.append(url)
        book['links'] = bookDownloadUrls
        return book

    def findNextUrl(self, feed):
        for link in feed.links:
            if link.rel == u'next':
                return link.href
        return None


test = OpdsScrapper('http://1.1.1.1:8080/opds')
print(test.books)

@Krazybug

Krazybug commented Dec 30, 2019

Dear all,
Here is a new version with some enhancements:
revision 8

  • The main improvement: indexing now runs in turbo mode. I can index 10000 ebooks in less than one minute (against half an hour before)
  • I've polished the UI to be less verbose during indexing and download, in dry or real mode
  • The download report is now prettier thanks to BeautifulTable (install it with pip if you want to upgrade)
  • I've added decent help for the commands

What is on my todo list:

  • Adding a progress bar for each download
  • Polishing the reporting
  • Adding a README and a requirements.txt
  • Adding 2 new options for the reporting:
    • a raw list to use in combination with dry-run mode, allowing you to paste the list into your favorite download manager (but you lose the metadata and the grouping of formats)
    • a markdown output so the report can be pasted on Reddit

After that I will freeze the features and focus on turning it into a real project with many new features and ... maybe ... some contributors.

EDIT: I've released this version on Reddit.
Don't hesitate to ask your questions in that post.

@Krazybug

Krazybug commented Dec 30, 2019

@TLaborde
Thank you for your insights. Please upgrade to the fresh new version and you will get a performance boost. The REST API is perfectly fine; the issues were in my code, and I will also make it asynchronous to scale across different servers simultaneously.
I'll probably have a look at OPDS for a Calibre-Web parser in the future, as Calibre-Web does not expose a REST API.

Regarding the use of OOP, I agree that my script is dirty and could be improved ... but it was quick to write (thanks to the Fire lib). I'll probably redesign it in my project, adding some tests, using Poetry to publish it on PyPI ...

And thanks for your snippet. I will study it later.

@sclint13

sclint13 commented Jan 4, 2020

@Krazybug
Yes! That worked. Thank you so much. I've also taken the new code for a spin and it works great! Much faster than the previous version. I feel like I should send you a bill for the new HDDs I'll be needing XD

But seriously, thank you!

@Krazybug

Krazybug commented Jan 4, 2020

@sclint13 you made my day. Enjoy!

@ImmortalDreamer

@Krazybug
Is it possible to add authentication to the script and download books from a server whose credentials I am aware of?

@Krazybug

Krazybug commented Aug 21, 2020

@Imm0rt4lDr3am3r
You can try entering the URL directly with the login and password in the following format, as Calibre uses basic auth:
http://login:password@111.111.111.111:8080

If it fails, we can add it easily; we just need some parameters and a bit of code.
If needed, I'll release a new version with this feature.

If you're a bit familiar with Python you can do it yourself:

  1. Create 2 variables, login and password.
  2. Change the lines that call requests.get to pass the auth parameter, like:
     r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
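For example, here is a minimal sketch of that change (the URL and credentials are placeholders, and HTTPBasicAuth is just the explicit form of the auth tuple shown above):

    import requests
    from requests.auth import HTTPBasicAuth

    login = "user"            # placeholder: your Calibre server login
    password = "p@ss!word"    # placeholder: special characters are fine here, no URL-encoding needed

    # placeholder server, same /ajax/search endpoint the script queries for the book count
    url = "http://111.111.111.111:8080/ajax/search?num=0"
    r = requests.get(url, auth=HTTPBasicAuth(login, password), verify=False, timeout=30)
    r.raise_for_status()
    print(r.json()["total_num"])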

@ImmortalDreamer

ImmortalDreamer commented Aug 23, 2020


I've tried the
http://login:password@111.111.111.111:8080
format, but it fails due to a special character '!' in the password.
I am a bit familiar with Python, so I did try to create the variables and add the auth parameter to the requests, but the script still failed to open the site. I'm not sure whether the self-signed certificate used by the site plays a role.
If possible, I hope you can provide a new version with this feature in case I did something wrong somewhere.
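A possible workaround for the '!' (a sketch, not something the script handles today) is to percent-encode the password before building the user:password@host URL:

    import urllib.parse

    password = "secret!pass"                           # placeholder password with a special character
    encoded = urllib.parse.quote(password, safe="")    # -> 'secret%21pass'
    url = "http://login:" + encoded + "@111.111.111.111:8080"
    print(url)

Whether this form is accepted end to end depends on the server setup, so the explicit auth parameter shown above is probably the more robust route.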

@ImmortalDreamer

@Krazybug
I achieved my purpose (of backing up a remote server with known credentials) by tinkering around with the calibredb commands, but I will keep trying to make authentication work in this script. My end goal is to run the script regularly so that the backup is in sync with the remote server at all times.
Hoping to see a new version if you have the time to update the current script later on.
Thanks for the help.

@Krazybug

You're welcome. I will work on this feature very soon, then.

@Lambik

Lambik commented Sep 20, 2020

Hi,
Thanks for the script! Is there a way to say "I'd like the epub version of this book, but if that doesn't exist, then the mobi, and so on"? Or does it always download all versions of the book?

@Krazybug

Krazybug commented Sep 20, 2020

Hi, @Lambik

Yes, you can use the --single-format option which has this priority order:
all_ordered_formats=['azw', 'azw3', 'cbr', 'chm', 'djvu', 'doc', 'docx', 'epub', 'kepub', 'lit', 'lrf', 'mobi', 'original_epub', 'pdf', 'ppt', 'prc', 'rar', 'rtf', 'txt', 'zip', 'fb2']
You can edit this line in your code if it doesn't fit your needs.

And you can combine it with the --ignored-formats option.
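For example, a typical invocation combining both options might look like this (the ignored formats here are just an illustration):

python calisuck.py download-ebooks --single-format --ignored-formats='["rar", "zip"]' --dry-run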

@Lambik

Lambik commented Sep 20, 2020

Cool, thanks!

@Lambik

Lambik commented Sep 20, 2020

This might be something to look into too:

/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.column_headers' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTColumnCollection.header' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.append_row' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.append' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.sort' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BTRowCollection.sort' instead.
  warnings.warn(message, FutureWarning)
/home/xx/.local/lib/python3.6/site-packages/beautifultable/utils.py:113: FutureWarning: 'BeautifulTable.__getitem__' has been deprecated in 'v1.0.0' and will be removed in 'v1.2.0'. Use 'BeautifulTable.{columns|rows}[key]' instead.
  warnings.warn(message, FutureWarning)
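For reference, a minimal sketch of the replacement calls named in these warnings (assuming beautifultable >= 1.0; the sample values are made up):

    from beautifultable import BeautifulTable

    table = BeautifulTable()
    table.columns.header = ["Language", "Ebooks count"]   # instead of table.column_headers = ...
    table.rows.append(["eng", 42])                        # instead of table.append_row(...)
    table.rows.sort("Ebooks count", reverse=True)         # instead of table.sort(...)
    print(table)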

@Krazybug

Yes, I'm working on a new version, a true project rather than a gist, and I will probably release this fix there. Is it blocking for this version? If so, I'll take a look.

@Lambik

Lambik commented Sep 20, 2020

Oh no, not blocking at all, it still prints the reports just fine. It's just a warning that, in the near future, when the next people do a pip install, they might get into trouble. So not that urgent, just a FYI :-)

@Lambik

Lambik commented Jul 15, 2021

Hi, any update on the new version status? I just happened to rediscover this project and saw the last comment was mine :D

@razorsoup

razorsoup commented Jul 29, 2021

I'm getting two copies of the ebooks downloaded, one with the correct format extension (eg .epub) and one with a tmp extension (eg .epub.tmp). In the console, I'm seeing the following:

Downloading ebook: ***the ebook url***
Size expected (estimation): 1.0 MB
Size received=1.0 MB
Unable to get book: ***the ebook url***
[WinError 32] The process cannot access the file because it is being used by another process: '***the path to the ebook that it did actually download***.tmp'

This is on Windows 10 using Python 3.6

edit: If I change the indentation on lines 211 & 212, it seems to resolve the issue. The tmp file needs to be closed before it can be moved and deleted.
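In code, the change being described is just dedenting the move out of the with block in get_file so the temporary file is closed before the rename (a sketch of the relevant lines):

    with open(filepath + ".tmp", 'wb') as fd:
        fd.write(r.content)
    # the file is closed here, so Windows allows the move
    shutil.move(filepath + ".tmp", filepath)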
