Last active
August 2, 2022 15:17
-
-
Save entwanne/655200d86052bc18b85ee79935b4ca7e to your computer and use it in GitHub Desktop.
ZdS clean gallery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import json | |
import re | |
import sys | |
from zipfile import ZipFile | |
# Matches a Markdown inline image ``![alt](target)`` in raw bytes; group 1 is
# everything found between the parentheses.
img_regex = re.compile(rb'!\[[^\]]*\]\(([^)]+)\)')


def find_images_file(file):
    """Yield the target of every Markdown image referenced in *file*.

    *file* is any iterable producing ``bytes`` lines (for example a binary
    file object). Each target is decoded with ``bytes.decode()`` defaults
    before being yielded, in order of appearance.

    Every match on a line is reported: using ``search`` here would only see
    the first image of a line and silently ignore the following ones.
    """
    for line in file:
        for match in img_regex.finditer(line):
            yield match.group(1).decode()
def find_images_archive(archive):
    """Yield the image targets found in every ``.md`` member of *archive*.

    *archive* must provide ``namelist()`` and ``open(name)``; members whose
    name does not end with ``.md`` are ignored.
    """
    markdown_members = (
        member for member in archive.namelist() if member.endswith('.md')
    )
    for member in markdown_members:
        with archive.open(member) as handle:
            yield from find_images_file(handle)
def find_images(*archive_names):
    """Yield the image targets found in each of the given zip archives.

    Archives are opened read-only one after the other, in argument order,
    and each one is closed once its images have been yielded.
    """
    for zip_path in archive_names:
        with ZipFile(zip_path, 'r') as opened:
            yield from find_images_archive(opened)
# Command-line entry point: print every image target found in the archives
# given as arguments, one per line.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(f'Usage: {sys.argv[0]} archive [archives]', file=sys.stderr)
        sys.exit(1)

    archive_names = sys.argv[1:]
    for img in find_images(*archive_names):
        print(img)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import tempfile | |
from urllib.parse import urljoin | |
import lxml.html | |
import requests | |
from archive_list_images import find_images | |
from list_gallery import filter_images | |
def get_archive_url(url, doc=None):
    """Return the absolute URL of the "Archive" link of a page.

    *url* is the page address; relative links are resolved against it.
    *doc* is the already-parsed page; when it is ``None`` the page is
    downloaded from *url* and parsed first.

    Raises ``ValueError`` unless exactly one link whose text starts with
    "Archive" is present.
    """
    if doc is None:
        # NOTE: `cookies` is not a parameter; it is looked up as a
        # module-level global when this line runs.
        response = requests.get(url, cookies=cookies)
        doc = lxml.html.fromstring(response.text)

    hrefs = doc.xpath('//a[starts-with(normalize-space(), "Archive")]/@href')
    (href,) = hrefs
    return urljoin(url, href)
def get_archives(content_url, doc=None):
    """Map local archive file names to the archive URLs of a content.

    The result always holds ``'current.zip'`` (the archive link of
    *content_url* itself). ``'beta.zip'`` and ``'public.zip'`` are added when
    the page carries exactly one "Voir la version en bêta" / "Voir la version
    en ligne" link; the archive URL is then read from the linked page.

    *doc* is the already-parsed page at *content_url*. When it is ``None`` the
    page is downloaded here: it is needed below to look for the beta/public
    links, and calling ``doc.xpath`` on ``None`` would raise
    ``AttributeError``.
    """
    if doc is None:
        # NOTE: `cookies` is looked up as a module-level global, like the
        # other download helpers of this script do.
        req = requests.get(content_url, cookies=cookies)
        doc = lxml.html.fromstring(req.text)

    archives = {
        'current.zip': get_archive_url(content_url, doc),
    }

    # Archive file name -> XPath selecting the link to that version's page.
    version_links = {
        'beta.zip': '//a[normalize-space()="Voir la version en bêta"]/@href',
        'public.zip': '//a[normalize-space()="Voir la version en ligne"]/@href',
    }
    for archive_name, link_xpath in version_links.items():
        try:
            version_url, = doc.xpath(link_xpath)
        except ValueError:
            # No such link (or an ambiguous match): this version is skipped.
            continue
        version_url = urljoin(content_url, version_url)
        archives[archive_name] = get_archive_url(version_url)
    return archives
def download_archives(archives, dirname):
    """Download every archive into *dirname* and yield each saved file path.

    *archives* maps a file name to the URL to download. This is a generator:
    each download happens only when the corresponding path is requested.

    Raises ``requests.HTTPError`` when a download answers with an error
    status, instead of saving the error page under a ``.zip`` name.
    """
    for name, url in archives.items():
        # NOTE: `cookies` is not a parameter; it is looked up as a
        # module-level global when this line runs.
        req = requests.get(url, allow_redirects=True, cookies=cookies)
        req.raise_for_status()
        path = f'{dirname}/{name}'
        with open(path, 'wb') as f:
            f.write(req.content)
        yield path
def list_all_public_images(content_url, doc=None):
    """Return the set of image targets referenced by the content's archives.

    The archives listed by ``get_archives`` are downloaded into a temporary
    directory, scanned, and the directory is removed before returning.
    """
    archive_urls = get_archives(content_url, doc)
    with tempfile.TemporaryDirectory() as workdir:
        zip_paths = download_archives(archive_urls, workdir)
        # The set must be fully built while the temporary files still exist.
        referenced = set(find_images(*zip_paths))
    return referenced
def delete_image(edit_link, cookies):
    """Delete a gallery image through its edit page and report it on stdout.

    The edit page at *edit_link* is fetched with *cookies*, its
    ``form-delete-image`` form is read, and that form is submitted with a
    POST request. Raises ``ValueError`` when an expected element is not
    found exactly once, and ``requests.HTTPError`` when the POST answers
    with an error status.
    """
    edit_page = requests.get(edit_link, cookies=cookies)
    tree = lxml.html.fromstring(edit_page.text)

    # Title and full-size link are only used in the final message.
    (title,) = tree.xpath('//input[@name="title"]/@value')
    (full_href,) = tree.xpath('//div[@class="gallery-col-image"]/a[img][1]/@href')
    full_src = urljoin(edit_link, full_href)

    (delete_form,) = tree.xpath('//form[@id="form-delete-image"]')
    target = urljoin(edit_link, delete_form.get('action'))
    (image_id,) = delete_form.xpath('.//input[@name="image"]/@value')
    (token,) = delete_form.xpath('.//input[@name="csrfmiddlewaretoken"]/@value')

    # The CSRF token is sent both as a form field and as a cookie.
    payload = {
        'image': image_id,
        'csrfmiddlewaretoken': token,
        'delete': '',
    }
    response = requests.post(
        target,
        data=payload,
        cookies=cookies | {'csrftoken': token},
        headers={'Referer': target},
    )
    response.raise_for_status()
    print(f'Image {title} ({full_src}) deleted!')
def clean_content_gallery(content_url, cookies):
    """Interactively delete gallery images that the content does not reference.

    The gallery link is read from the content page, the images referenced by
    the content's archives are collected, and for every other gallery image
    the user is asked on stdin whether to delete it. Deletions are performed
    only after all questions have been asked.
    """
    response = requests.get(content_url, cookies=cookies)
    page = lxml.html.fromstring(response.text)
    (gallery_href,) = page.xpath('//aside//a[starts-with(@href, "/galerie/")]/@href')
    gallery_url = urljoin(content_url, gallery_href)

    referenced = list_all_public_images(content_url, page)

    selected = set()
    try:
        for img_src, img_name, edit_link in filter_images(gallery_url, cookies, exclude=referenced):
            print(f'Do you want to delete {img_name} ({img_src})? (y/n)')
            answer = input('> ')
            if answer.strip().lower() == 'y':
                selected.add(edit_link)
    except EOFError:
        # End of input stops the questions; images already approved are
        # still deleted below.
        pass

    for edit_link in selected:
        delete_image(edit_link, cookies)
# Command-line entry point: clean the gallery of the content whose URL is
# given, authenticating with the given session id.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        # Usage errors go to stderr, like the URL error just below.
        print(f'Usage: {sys.argv[0]} content_url session_id', file=sys.stderr)
        sys.exit(1)
    _, content_url, session_id, *_ = sys.argv
    if not content_url.startswith('https://zestedesavoir.com/contenus/'):
        print('Malformatted content URL', file=sys.stderr)
        sys.exit(1)
    # Must remain a module-level name: get_archive_url and download_archives
    # read `cookies` as a global rather than receiving it as an argument.
    cookies = {'sessionid': session_id}
    clean_content_gallery(content_url, cookies)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
from urllib.parse import urljoin | |
import lxml.html | |
import requests | |
def find_images_page(page_url, cookies):
    """Collect the images listed on one gallery page.

    Returns ``(next_url, images)``. ``images`` maps each image URL to a
    ``(name, edit_link)`` tuple; ``next_url`` is the absolute URL of the
    first ``rel="next"`` link, or ``None`` when there is none.

    Raises ``ValueError`` when a gallery link does not hold exactly one
    image and one title, or when the image source has fewer than two dots.
    """
    response = requests.get(page_url, cookies=cookies)
    tree = lxml.html.fromstring(response.text)

    found = {}
    for anchor in tree.xpath('//div[contains(@class, "gallery-item")]//a'):
        (thumb_src,) = anchor.xpath('.//img/@src')
        # Drop the last two dot-separated parts of the displayed source
        # (presumably a thumbnail suffix and its extension; confirm).
        stem, _, _ = thumb_src.rsplit('.', 2)
        image_url = urljoin(page_url, stem)
        (title,) = anchor.xpath('.//span[@class="topic-title"]/text()')
        edit_url = urljoin(page_url, anchor.get('href'))
        found[image_url] = (title, edit_url)

    try:
        (next_href,) = tree.xpath('(//a[@rel="next"]/@href)[1]')
        following = urljoin(page_url, next_href)
    except ValueError:
        following = None
    return following, found
def iter_gallery_images(gallery_url, cookies):
    """Yield ``(img_src, img_name, edit_link)`` for every gallery image.

    Pages are fetched one at a time, starting at *gallery_url* and following
    the next-page URL returned by ``find_images_page`` until it is ``None``.
    """
    page_url = gallery_url
    while page_url is not None:
        page_url, page_images = find_images_page(page_url, cookies)
        for src, details in page_images.items():
            name, edit_link = details
            yield src, name, edit_link
def filter_images(gallery_url, cookies, exclude):
    """Yield the gallery images whose source is not contained in *exclude*.

    Items have the same ``(img_src, img_name, edit_link)`` shape as those of
    ``iter_gallery_images``.
    """
    for src, name, edit_link in iter_gallery_images(gallery_url, cookies):
        if src in exclude:
            continue
        yield src, name, edit_link
# Command-line entry point: print every image of the gallery whose URL is
# given, authenticating with the given session id.
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(f'Usage: {sys.argv[0]} gallery_url session_id', file=sys.stderr)
        sys.exit(1)

    gallery_url, session_id = sys.argv[1:3]
    if not gallery_url.startswith('https://zestedesavoir.com/galerie/'):
        print('Malformatted gallery URL', file=sys.stderr)
        sys.exit(1)

    cookies = {'sessionid': session_id}
    for img in iter_gallery_images(gallery_url, cookies):
        print(img)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment