Skip to content

Instantly share code, notes, and snippets.

@Krazybug
Created June 25, 2018 22:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Krazybug/906033c42042909e67706a61d998f98f to your computer and use it in GitHub Desktop.
Save Krazybug/906033c42042909e67706a61d998f98f to your computer and use it in GitHub Desktop.
Calibre Downloader
import requests
import json
import os
import time
# TODO:
# - handle formats by storing the file hash
# - catch exceptions
# - add a CLI
# - allow 3 modes: update metadata, update file, and append filename
# - a query for searching, e.g. http://localhost:8080/ajax/search?sort=id&sort_order=desc
# - buffer the files
# - start/stop index
# - store an index
# - id = timestamp + process id
# - cover and json with the same name
# - factor out shared code
# - json None
# - search engine
# - generation of a clickable html page
# - set a size limit
# - password-protected access
# - debug mode
def get_file(url, path, id, format):
    """Download a book file from `url` and save it under `path`.

    The filename is taken from the server's Content-Disposition header
    when present; otherwise it falls back to "<id>.<format>".

    NOTE(review): `id` and `format` shadow builtins — kept as-is to
    preserve the public signature for existing callers.
    """
    print(url)
    r1 = requests.get(url)
    try:
        # Prefer the filename the server supplies.
        f_name = path + r1.headers['Content-Disposition'].split('filename=')[1].strip('"')
        print(f_name)
    except (KeyError, IndexError):
        # Header absent or malformed: build a name from the book id and format.
        f_name = path + id + "." + format
    os.makedirs(os.path.dirname(f_name), exist_ok=True)
    with open(f_name, 'wb') as fd:
        fd.write(r1.content)
def get_cover(url, path):
    """Fetch the cover image at `url` and store it as cover.jpg inside `path`."""
    response = requests.get(url)
    target = path + "cover.jpg"
    print(target)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, 'wb') as out:
        out.write(response.content)
# Crawl a Calibre content server's AJAX API: page through the library,
# download each book's formats and cover, and write a metadata.json per book.
max_size = 1000 * 1024 * 1024  # skip any single format larger than ~1 GB
offset = 0
num = 25  # page size for the search endpoint

server = "http://188.96.212.209:8080"
url = server + '/ajax/search/Zeitschrift_XXX?num=0'
print(url)
r = requests.get(url)
total_num = int(r.json()["total_num"])

my_formats = ['azw3', 'epub', 'pdf', 'mobi', 'doc', 'zip', 'txt', 'chm']

# 1-based position of the current book (renamed from `range`, which
# shadowed the builtin).
rank = offset + 1
while offset < total_num:
    print("offset=", str(offset))
    url = server + '/ajax/search/Zeitschrift_XXX?num=' + str(num) + '&offset=' + str(offset)
    print(url)
    r = requests.get(url)
    # Cache the parsed body: r.json() re-parses the response on every call.
    page = r.json()
    print("from: ", str(offset), " to: ", str(offset + int(page['num'])))
    books_s = ",".join(str(i) for i in page['book_ids'])
    url = server + '/ajax/books/Zeitschrift_XXX?ids=' + books_s
    r = requests.get(url)
    print(url)
    books = r.json()
    print(len(books))
    for book_id in books.keys():
        print('-> range={}/{}'.format(str(rank), str(total_num)))
        meta = books[book_id]
        book = {}
        book['formats'] = list(set(meta['formats']) & set(my_formats))
        book['title'] = meta['title']
        print('--> {}: {}'.format(book_id, book['title']))
        # Iterate over a COPY: the original iterated the same list it was
        # removing from, which silently skips the element after each removal.
        for f in list(book['formats']):
            fmt_meta = meta['format_metadata'][f]
            if 'size' not in fmt_meta or max_size < int(fmt_meta['size']):
                book['formats'].remove(f)
                print("format {} ignored for {}:'{}' too large)".format(f, book_id, book['title']))
        f_path = 'import/' + book_id + '/'
        if not len(book['formats']):
            print("'{}' ignored: no more format available in {})".format(book['title'], (meta['formats'])))
        else:
            for f in book['formats']:
                # Each format's download path lives in either main_format
                # or other_formats.
                if f in meta['main_format']:
                    url_path = meta['main_format'][f]
                else:
                    url_path = meta['other_formats'][f]
                print("--->", url_path)
                url = server + url_path
                get_file(url, f_path, book_id, f)
        url_path = meta['cover']
        url = server + url_path
        print("---->", url_path)
        get_cover(url, f_path)
        book['id'] = book_id
        book['source'] = server + '/calibre/ajax/book/' + book_id
        # Copy the remaining metadata fields verbatim.
        for key in ('authors', 'uuid', 'identifiers', 'pubdate', 'publisher',
                    'languages', 'comments', 'series', 'tags'):
            book[key] = meta[key]
        print(book)
        filename = f_path + 'metadata.json'
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w') as fd:
            json.dump(book, fd)
        rank = rank + 1
    offset = offset + num
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment