Skip to content

Instantly share code, notes, and snippets.

@jul
Last active May 1, 2022 09:01
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jul/4e04503b4c6739d7b4be692b56ad0ac5 to your computer and use it in GitHub Desktop.
Save jul/4e04503b4c6739d7b4be692b56ad0ac5 to your computer and use it in GitHub Desktop.
FUCK LA BNF.py
from pyquery import PyQuery as S
import urllib
from json import loads, dumps
from os import path, makedirs
from requests import get
# Pause, in seconds, observed after each image request. The download loop
# further down raises it when the server answers with something that is not
# a JPEG and lowers it again (never below .5) after a successful download.
WAIT = .5

# Documentation of the BnF data services used by this script:
# http://www.bnf.fr/fr/professionnels/donnees_bnf_recuperables/a.donnees_rdf.html#SHDC__Attribute_BlocArticle7BnF
#
# INPUT: to fetch other years, change the year range of the main loop.
# The magazine was discontinued, so the range is bounded.

# Every downloaded issue is stored below this directory.
ROOT_DIR = path.join(path.expanduser("~"), "Docs", "Assiette_au_Beurre")

# Create the download root. An already existing directory is the normal case
# on a re-run and is not an error; any other OS-level failure is reported but
# does not abort here (best effort, as before).
try:
    makedirs(ROOT_DIR, exist_ok=True)
except OSError as e:
    print(repr(e))
from time import sleep
# First three bytes of every JPEG file (magic number).
_JPEG_MAGIC = b"\xff\xd8\xff"
# Lower bound, in seconds, for the pause between two image requests.
_MIN_WAIT = .5
# Number of *additional* attempts after a first failed image download.
_MAX_RETRY = 3
# Seconds after which a stalled image request is abandoned.
_HTTP_TIMEOUT = 60


def grrr(b_url):
    """Load *b_url* through PyQuery and decode the page text as JSON.

    The service delivers its JSON wrapped in an HTML document, which is why
    the payload is read back through PyQuery's ``.text()``.
    """
    return loads(S(b_url).text())


def url(cont):
    """Return the link of a result entry (``cont["title"]["url"]``)."""
    return cont["title"]["url"]


def desc(cont):
    """Return the description of a result entry."""
    return cont["title"]["description"]


def sanit(_str):
    """Turn *_str* into a directory-friendly name.

    ``/`` and spaces become ``_``; dots are removed.
    """
    return _str.replace("/", "_").replace(".", "").replace(" ", "_")


def reformat(page_url, index):
    """Translate a pagination (ajax) URL into the high-resolution image URL.

    The ajax path segment is removed, the trailing ``vertical`` suffix is cut
    off, an ``f`` is appended when the remainder does not already end with
    one, and ``<index>.highres`` is added.
    """
    prefix = page_url.replace("services/ajax/pagination/page/SINGLE/", "")[:-len("vertical")]
    # endswith() also copes with an empty prefix, where [-1] would raise.
    if not prefix.endswith("f"):
        prefix += "f"
    return prefix + "%d.highres" % index


def is_jpeg(fn):
    """Return True when file *fn* is readable and starts with the JPEG magic.

    A missing or unreadable file counts as "not a JPEG".
    """
    try:
        with open(fn, "rb") as f:
            return f.read(3) == _JPEG_MAGIC
    except OSError:
        return False


def _list_issues(year):
    """Return ``(description, json_url)`` for every issue listed for *year*."""
    list_year = """http://gallica.bnf.fr/ark:/12148/cb327033728/date%s.liste.json""" % year
    data = grrr(list_year)
    # The relevant entries are buried several "contenu" levels deep; a year
    # without that structure has nothing to download.
    try:
        list_link = data["PeriodicalPageFragment"]["contenu"]["SearchResultsFragment"]["contenu"]["ResultsFragment"]["contenu"][1:]
    except KeyError:
        return []
    return [(desc(c), url(c).replace("?", ".json?")) for c in list_link]


def _list_pages(base_url):
    """Return the pagination URL of every page of the issue at *base_url*."""
    im_page = grrr(base_url)
    # One more indirection: the viewer model only holds the pagination URL.
    im_list = im_page["ViewerFragment"]["contenu"]["PaginationViewerModel"]["url"]
    # The "vertical" ajax variant returns all pages at once, which avoids
    # walking the page-by-page linked list.
    list_image = grrr(im_list[:-len("image")] + "vertical.json")
    return [el["url"] for el in list_image["fragment"]["contenu"]]


def _download_jpeg(image_url, wait):
    """Fetch *image_url*, retrying while the answer is not a JPEG.

    Returns ``(content, wait)``: *content* is the image bytes, or None when
    every attempt failed; *wait* is the updated pause between requests
    (one second longer after each failure, one second shorter, down to
    the minimum, after a success).
    """
    for _attempt in range(_MAX_RETRY + 1):
        try:
            content = get(image_url, timeout=_HTTP_TIMEOUT).content
        except OSError as e:
            # requests' exceptions derive from IOError/OSError: a timeout or
            # connection failure is handled like any other failed attempt.
            print(repr(e))
            content = b""
        # check magic number
        got_jpeg = content[:3] == _JPEG_MAGIC
        sleep(wait)
        if got_jpeg:
            wait = max(_MIN_WAIT, wait - 1)
            print("WAIT -- %.1f" % wait)
            return content, wait
        print("not a JPEG file BNF site probably fucked up under load")
        wait += 1
        print("WAIT ++ %.1f" % wait)
    return None, wait


def main(years=range(1902, 1937)):
    """Download every page of every issue published in *years*.

    Images are written to ``ROOT_DIR/<year>/<issue dir>/<page>_<issue dir>.jpeg``.
    Files that already exist and carry the JPEG magic number are skipped, so
    an interrupted run can simply be restarted.
    """
    wait = WAIT
    for year in years:
        ### extract ALL the issues of the year
        for alb_ind, (album, base_url) in enumerate(_list_issues(year)):
            print(album)
            print(base_url)
            alb_dir = sanit("%03d_%s" % (alb_ind, album))
            print("ALBDIR %s" % alb_dir)
            base_dir = path.join(ROOT_DIR, str(year), alb_dir)
            for index, im_url in enumerate(_list_pages(base_url)):
                image_url = reformat(im_url, index)
                print("getting %s" % image_url)
                makedirs(base_dir, exist_ok=True)
                # base_dir already contains ROOT_DIR; do not join it twice.
                dst = path.join(base_dir, "%03d_%s.jpeg" % (index, alb_dir))
                print(dst)
                # is_jpeg() is False for a missing file as well.
                if not is_jpeg(dst):
                    content, wait = _download_jpeg(image_url, wait)
                    if content is not None:
                        with open(dst, "wb") as f:
                            f.write(content)
                if is_jpeg(dst):
                    print("%s is a JpEG, CHECKED" % dst)


if __name__ == "__main__":
    main()
''' pour générer rapidement une liseuse HTML
cd ~/Docs/Assiette_au_Beurre
# remettre un peu gamma et corriger le contrast
for file in */*/*.jpeg; do convert -contrast -gamma 1.6 $file "${file%.jpeg}_new.jpg"; done
# generer le HTML en mode de toute facon les brouteurs sont tres tolerants
for d in */*; do
echo "<a href=./$d/index.html >$d</a><br/>" >> index.html
pushd $d
for i in *jpg; do echo "<img src=$i ><br/>" ; done > index.html;
popd
done
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment