FUCK LA BNF.py
from pyquery import PyQuery as S
from json import loads
from os import path, makedirs
from time import sleep
from requests import get

WAIT = .5  # seconds between requests; nudged up/down below depending on server health
""" | |
Admire the doc made by these fucking twats | |
http://www.bnf.fr/fr/professionnels/donnees_bnf_recuperables/a.donnees_rdf.html#SHDC__Attribute_BlocArticle7BnF | |
INPUT CHANGE YEAR RANGE L20 This magazine was discontinued | |
""" | |
ROOT_DIR = path.join(path.expanduser("~"), "Docs", "Assiette_au_Beurre")
### create the output directory; an existing one is fine
try:
    makedirs(ROOT_DIR)
except OSError as e:
    print(repr(e))

### list the issues of each year
for year in range(1902, 1937):
    list_year = "http://gallica.bnf.fr/ark:/12148/cb327033728/date%s.liste.json" % year
    ### What THE FUCKING FUCK, a JSON wrapped in <html><body><p>? DIE!!! You fucking PIG
    # grrr = lambda b_url: loads(S(S("body > P", S(b_url).html())).html())  # old workaround, superseded below
    grrr = lambda b_url: loads(S(b_url).text())
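    # grrr fetches a URL with PyQuery and parses the text of the returned
    # document as JSON: Gallica serves these .json endpoints wrapped in HTML,
    # so a plain requests.get(...).json() would choke on them.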
    data = grrr(list_year)
    ### extract ALL the issues of a year
    #### OMGWTFBBQ English & French mixed up and the relevant data at nesting level 9!!!!!!!!
    #### 8 MB downloaded to extract 1 kB of information, YOU ARE MAD!!!!
    list_link = ""
    try:
        list_link = data["PeriodicalPageFragment"]["contenu"]["SearchResultsFragment"]["contenu"]["ResultsFragment"]["contenu"][1:]
    except KeyError:
        continue
    url = lambda cont: cont["title"]["url"]
    desc = lambda cont: cont["title"]["description"]
    target = [(desc(c), url(c).replace("?", ".json?")) for c in list_link]
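    # target is a list of (issue description, issue URL) pairs; inserting
    # ".json" before the query string makes Gallica return the viewer
    # metadata as JSON instead of an HTML page (observed behaviour).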
    for alb_ind, (album, base_url) in enumerate(target):
        ## fetch one issue
        print(album)
        print(base_url)
        sanit = lambda _str: _str.replace("/", "_").replace(".", "").replace(" ", "_")
        alb_dir = sanit("%03d_%s" % (alb_ind, album))
        print("ALBDIR %s" % alb_dir)
        im_page = loads(S(base_url).text())
        #### And now a level-4 indirection?!
        im_list = im_page["ViewerFragment"]["contenu"]["PaginationViewerModel"]["url"]
        #### And avoiding a broken, bloated linked-list schema by using an ajax URL // serendipity
        list_image = grrr(im_list[:-len("image")] + "vertical.json")
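        # Swapping the trailing "image" for "vertical.json" returns the whole
        # page list in a single response, instead of walking the paginated
        # linked list one link at a time (again observed, not documented).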
        #### I have to fucking re-translate into a picture URL without a correct name?!!!
        def reformat(_str, index):
            prefix = _str.replace("services/ajax/pagination/page/SINGLE/", "")[:-len("vertical")]
            if prefix[-1] != "f":
                prefix += "f"
            return prefix + "%d.highres" % index
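        # reformat rebuilds a direct image URL from an ajax pagination URL:
        # drop the services/ajax/... path segment, strip the trailing
        # "vertical", make sure the page marker "f" is present, and append
        # "<index>.highres". Note that the page number comes from our own
        # enumerate() counter below, not from the source URL.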
        for index, im_url in enumerate([el["url"] for el in list_image["fragment"]["contenu"]]):
            print("getting %s" % reformat(im_url, index))
            base_dir = path.join(ROOT_DIR, str(year), alb_dir)
            try:
                makedirs(base_dir)
            except OSError:
                pass
            dst = path.join(base_dir, "%03d_%s.jpeg" % (index, alb_dir))
            print(dst)
            def is_jpeg(fn):
                try:
                    with open(fn, "rb") as f:
                        return list(f.read(3)) == [255, 216, 255]
                except Exception:
                    return False
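            # A JPEG file always starts with the three magic bytes
            # 0xFF 0xD8 0xFF (255, 216, 255); anything else here is most
            # likely an error page from an overloaded server, hence the
            # retry loop below.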
            if not path.exists(dst) or not is_jpeg(dst):
                got_jpeg = False
                RETRY = 0
                while not got_jpeg and RETRY <= 3:
                    content = get(reformat(im_url, index)).content
                    # check the magic number
                    got_jpeg = list(content[:3]) == [255, 216, 255]
                    sleep(WAIT)
                    if not got_jpeg:
                        RETRY += 1
                        print("not a JPEG file, the BNF site probably fucked up under load")
                        WAIT += 1
                        print("WAIT ++ %.1f" % WAIT)
                    else:
                        WAIT = max(.5, WAIT - 1)
                        print("WAIT -- %.1f" % WAIT)
                if got_jpeg:
                    with open(dst, "wb") as f:
                        f.write(content)
                if is_jpeg(dst):
                    print("%s is a JPEG, CHECKED" % dst)
### If you code at this fucking place, I will throw the whole universal encyclopedia in your face, motherfucker
''' To quickly generate an HTML reader:
cd ~/Docs/Assiette_au_Beurre
# restore a bit of gamma and fix the contrast
for file in */*/*.jpeg; do convert -contrast -gamma 1.6 "$file" "${file%.jpeg}_new.jpeg"; done
# generate the HTML; browsers are very tolerant anyway
for d in */*; do
    echo "<a href=./$d/index.html >$d</a><br/>" >> index.html
    pushd $d
    for i in *_new.jpeg; do echo "<img src=$i ><br/>"; done > index.html
    popd
done
'''