Skip to content

Instantly share code, notes, and snippets.

@nobkd
Last active May 16, 2024 17:11
Show Gist options
  • Save nobkd/322e96fe741cafcbc9babcf2f75a0139 to your computer and use it in GitHub Desktop.
Save nobkd/322e96fe741cafcbc9babcf2f75a0139 to your computer and use it in GitHub Desktop.
Python ^3.10: Download all xkcd comics and comic data
from get_info import save_infos
from get_imgs import save_imgs
if __name__ == '__main__':
    # Metadata goes first: save_imgs works from the JSON files that
    # save_infos writes into the same directory.
    target_dir = 'xkcd_save'
    save_infos(save_path=target_dir)
    print()  # blank line between the two phases' console output
    save_imgs(save_dir=target_dir)
import requests
from os import path, listdir
from multiprocessing import Pool
import json
import re
def load_infos(save_dir: str) -> list[str]:
    """Return the names of the JSON metadata files found in *save_dir*.

    Args:
        save_dir: Directory to scan. Only its direct entries are examined.

    Returns:
        The bare entry names (no directory component) that end in
        ``.json``, sorted so that the result does not depend on the
        order in which the operating system lists the directory.
    """
    # A plain suffix test replaces the former regex, which used an invalid
    # ``\.`` escape in a non-raw string and let ``$`` match before a
    # trailing newline.
    return sorted(name for name in listdir(save_dir) if name.endswith('.json'))
def save_img(save_dir: str, file_path: str, *, timeout: float = 30.0) -> None:
    """Download the image described by one metadata file, unless already saved.

    The metadata file is parsed as JSON and its ``num`` and ``img`` entries
    are read. The image is stored in *save_dir* as ``<num>.<ext>``, where
    ``<ext>`` is whatever follows the last ``.`` in the ``img`` URL.

    Args:
        save_dir: Directory holding the metadata file; the image is written
            to the same directory.
        file_path: Name of the metadata file, relative to *save_dir*.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker forever.

    Raises:
        OSError: If the metadata file cannot be read or the image cannot be
            written.
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Function-scope import: the module only imports `path` and `listdir`.
    import os

    info_path = path.join(save_dir, file_path)
    with open(info_path, 'rt', encoding='utf-8') as fp:
        json_data = json.load(fp)

    num = json_data["num"]
    img = json_data["img"]

    save_path = path.join(save_dir, f'{num}.{img.split(".")[-1]}')
    if path.exists(save_path):
        #print('Exists:', num)
        return

    # Download into a side file and rename it at the end, so an interrupted
    # transfer never leaves a truncated image that the existence check above
    # would then skip on every later run.
    part_path = save_path + '.part'

    # Using the response as a context manager releases the streamed
    # connection even on the early return or on an error.
    with requests.get(img, stream=True, timeout=timeout) as data:
        if not data.ok:
            print('Not OK:', num)
            return

        print('Saving:', num)
        try:
            with open(part_path, 'wb') as f:
                for chunk in data.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            # Remove the incomplete side file, then let the error propagate.
            if path.exists(part_path):
                os.remove(part_path)
            raise

    os.replace(part_path, save_path)
    print('Done:', num)
def save_imgs(save_dir: str) -> None:
    """Download the images for every metadata file in *save_dir*.

    One ``save_img`` task per metadata file is submitted to a process pool.
    The call returns once all tasks have finished.

    Args:
        save_dir: Directory that holds the ``.json`` metadata files and
            receives the downloaded images.
    """
    print('Checking Images...')

    imgs_to_save = load_infos(save_dir)

    # The context manager terminates the workers if scheduling is
    # interrupted; on the normal path close() + join() below have already
    # let every task run to completion before the block exits.
    with Pool() as pool:
        pending = []
        for img_info in imgs_to_save:
            pending.append(
                (img_info, pool.apply_async(save_img, (save_dir, img_info)))
            )

        pool.close()
        pool.join()

        # apply_async stores a task's exception in its result object; if the
        # result is never queried, the failure vanishes without a trace.
        for img_info, result in pending:
            try:
                result.get()
            except Exception as err:
                print('Failed:', img_info, err)
if __name__ == '__main__':
    # Run directly: fetch images for the metadata already present in the
    # default directory.
    target_dir = 'xkcd_save'
    save_imgs(save_dir=target_dir)
import requests
from os import mkdir, path
from multiprocessing import Pool
import json
# File name of the JSON metadata document; appended to BASE_URL, optionally
# after a "<num>/" path segment, to build a request URL.
JSON_URL = 'info.0.json'
# Site root that every request URL in this module starts with.
BASE_URL = 'https://xkcd.com/'
def last_comic_num(*, timeout: float = 30.0) -> int:
    """Return the ``num`` field of the metadata served at the site root.

    The document is fetched from ``BASE_URL + JSON_URL``; callers in this
    module treat the returned number as the highest comic number to fetch.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under the ``num`` key of the JSON document.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(BASE_URL + JSON_URL, timeout=timeout)
    # Fail with a clear HTTP error rather than a confusing JSON decode error
    # when the server answers with an error page.
    response.raise_for_status()
    return json.loads(response.content)['num']
def save_info(save_path: str, num: int, *, timeout: float = 30.0) -> None:
    """Download the metadata of comic *num* into *save_path*, unless present.

    The document is requested from ``BASE_URL + "<num>/" + JSON_URL`` and
    stored unmodified as ``<num>.json``.

    Args:
        save_path: Existing directory that receives the file.
        num: Comic number to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker forever.

    Raises:
        OSError: If the file cannot be written.
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Function-scope import: the module only imports `mkdir` and `path`.
    import os

    pth = path.join(save_path, f'{num}.json')
    if path.exists(pth):
        #print('Exists:', num)
        return

    # Download into a side file and rename it at the end, so an interrupted
    # transfer never leaves a truncated JSON file that the existence check
    # above would then skip on every later run.
    part_path = pth + '.part'

    # Using the response as a context manager releases the streamed
    # connection even on the early return or on an error.
    with requests.get(
        f'{BASE_URL}{num}/{JSON_URL}', stream=True, timeout=timeout
    ) as data:
        if not data.ok:
            print('Not OK:', num)
            return

        print('Saving:', num)
        try:
            with open(part_path, 'wb') as f:
                for chunk in data.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            # Remove the incomplete side file, then let the error propagate.
            if path.exists(part_path):
                os.remove(part_path)
            raise

    os.replace(part_path, pth)
    print('Done:', num)
def save_infos(save_path: str) -> None:
    """Download the metadata of every comic from 1 up to the latest one.

    The directory is created if needed, the upper bound is obtained from
    ``last_comic_num()``, and one ``save_info`` task per comic number is
    submitted to a process pool. The call returns once all tasks have
    finished.

    Args:
        save_path: Directory that receives the ``<num>.json`` files.
    """
    # Function-scope import: the module only imports `mkdir` and `path`.
    from os import makedirs

    print('Checking Info...')

    # exist_ok avoids the check-then-create race of exists() + mkdir() and
    # also creates missing parent directories.
    makedirs(save_path, exist_ok=True)

    last_num = last_comic_num()
    print('Latest Comic:', last_num)

    # The context manager terminates the workers if scheduling is
    # interrupted; on the normal path close() + join() below have already
    # let every task run to completion before the block exits.
    with Pool() as pool:
        pending = []
        for i in range(1, last_num + 1):
            pending.append((i, pool.apply_async(save_info, (save_path, i))))

        pool.close()
        pool.join()

        # apply_async stores a task's exception in its result object; if the
        # result is never queried, the failure vanishes without a trace.
        for i, result in pending:
            try:
                result.get()
            except Exception as err:
                print('Failed:', i, err)
if __name__ == '__main__':
    # Run directly: fetch only the comic metadata into the default directory.
    target_dir = 'xkcd_save'
    save_infos(save_path=target_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment