FUCK LA BNF.py
from pyquery import PyQuery as S
from json import loads
from os import path, makedirs
from time import sleep
from requests import get

WAIT = .5  # seconds between requests; nudged up/down below depending on server health
""" | |
Admire the doc made by these fucking twats | |
http://www.bnf.fr/fr/professionnels/donnees_bnf_recuperables/a.donnees_rdf.html#SHDC__Attribute_BlocArticle7BnF | |
INPUT CHANGE YEAR RANGE L20 This magazine was discontinued | |
""" | |
ROOT_DIR = path.join(path.expanduser("~"), "Docs", "Assiette_au_Beurre")
### create the output directory; an existing one is fine
try:
    makedirs(ROOT_DIR)
except OSError as e:
    print(repr(e))

### list the issues of each year
for year in range(1902, 1937):
    list_year = "http://gallica.bnf.fr/ark:/12148/cb327033728/date%s.liste.json" % year
    ### What THE FUCKING FUCK, a JSON wrapped in <html><body><p>? DIE!!! You fucking PIG
    # grrr = lambda b_url: loads(S(S("body > P", S(b_url).html())).html())  # old workaround, superseded below
    grrr = lambda b_url: loads(S(b_url).text())
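    # grrr fetches a URL with PyQuery and parses the text of the returned
    # document as JSON: Gallica serves these .json endpoints wrapped in HTML,
    # so a plain requests.get(...).json() would choke on them.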
    data = grrr(list_year)
    ### extract ALL the issues of a year
    #### OMGWTFBBQ English & French mixed up and the relevant data at nesting level 9!!!!!!!!
    #### 8 MB downloaded to extract 1 kB of information, YOU ARE MAD!!!!
    list_link = ""
    try:
        list_link = data["PeriodicalPageFragment"]["contenu"]["SearchResultsFragment"]["contenu"]["ResultsFragment"]["contenu"][1:]
    except KeyError:
        continue
    url = lambda cont: cont["title"]["url"]
    desc = lambda cont: cont["title"]["description"]
    target = [(desc(c), url(c).replace("?", ".json?")) for c in list_link]
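    # target is a list of (issue description, issue URL) pairs; inserting
    # ".json" before the query string makes Gallica return the viewer
    # metadata as JSON instead of an HTML page (observed behaviour).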
    for alb_ind, (album, base_url) in enumerate(target):
        ## fetch one issue
        print(album)
        print(base_url)
        sanit = lambda _str: _str.replace("/", "_").replace(".", "").replace(" ", "_")
        alb_dir = sanit("%03d_%s" % (alb_ind, album))
        print("ALBDIR %s" % alb_dir)
        im_page = loads(S(base_url).text())
        #### And now a level-4 indirection?!
        im_list = im_page["ViewerFragment"]["contenu"]["PaginationViewerModel"]["url"]
        #### And avoiding a broken, bloated linked-list schema by using an ajax URL // serendipity
        list_image = grrr(im_list[:-len("image")] + "vertical.json")
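        # Swapping the trailing "image" for "vertical.json" returns the whole
        # page list in a single response, instead of walking the paginated
        # linked list one link at a time (again observed, not documented).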
        #### I have to fucking re-translate into a picture URL without a correct name?!!!
        def reformat(_str, index):
            prefix = _str.replace("services/ajax/pagination/page/SINGLE/", "")[:-len("vertical")]
            if prefix[-1] != "f":
                prefix += "f"
            return prefix + "%d.highres" % index
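        # reformat rebuilds a direct image URL from an ajax pagination URL:
        # drop the services/ajax/... path segment, strip the trailing
        # "vertical", make sure the page marker "f" is present, and append
        # "<index>.highres". Note that the page number comes from our own
        # enumerate() counter below, not from the source URL.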
        for index, im_url in enumerate([el["url"] for el in list_image["fragment"]["contenu"]]):
            print("getting %s" % reformat(im_url, index))
            base_dir = path.join(ROOT_DIR, str(year), alb_dir)
            try:
                makedirs(base_dir)
            except OSError:
                pass
            dst = path.join(base_dir, "%03d_%s.jpeg" % (index, alb_dir))
            print(dst)
            def is_jpeg(fn):
                try:
                    with open(fn, "rb") as f:
                        return list(f.read(3)) == [255, 216, 255]
                except Exception:
                    return False
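            # A JPEG file always starts with the three magic bytes
            # 0xFF 0xD8 0xFF (255, 216, 255); anything else here is most
            # likely an error page from an overloaded server, hence the
            # retry loop below.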
            if not path.exists(dst) or not is_jpeg(dst):
                got_jpeg = False
                RETRY = 0
                while not got_jpeg and RETRY <= 3:
                    content = get(reformat(im_url, index)).content
                    # check the magic number
                    got_jpeg = list(content[:3]) == [255, 216, 255]
                    sleep(WAIT)
                    if not got_jpeg:
                        RETRY += 1
                        print("not a JPEG file, the BNF site probably fucked up under load")
                        WAIT += 1
                        print("WAIT ++ %.1f" % WAIT)
                    else:
                        WAIT = max(.5, WAIT - 1)
                        print("WAIT -- %.1f" % WAIT)
                if got_jpeg:
                    with open(dst, "wb") as f:
                        f.write(content)
                if is_jpeg(dst):
                    print("%s is a JPEG, CHECKED" % dst)
### If you code at this fucking place, I will throw the whole universal encyclopedia in your face, motherfucker
''' To quickly generate an HTML reader:
cd ~/Docs/Assiette_au_Beurre
# restore a bit of gamma and fix the contrast
for file in */*/*.jpeg; do convert -contrast -gamma 1.6 "$file" "${file%.jpeg}_new.jpeg"; done
# generate the HTML; browsers are very tolerant anyway
for d in */*; do
    echo "<a href=./$d/index.html >$d</a><br/>" >> index.html
    pushd $d
    for i in *_new.jpeg; do echo "<img src=$i ><br/>"; done > index.html
    popd
done
'''