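# Scrape images from Ameba (ameblo.jp) blogs listed in a genre ranking.
# For each blogger returned by the blogger.ameba.jp rankings API, the script
# walks the blog's entry list, collects image URLs from each entry, and saves
# any image it has not downloaded yet under <genre_code>/<ameba_id>/.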
import urllib.request
import urllib.error
import json
import os
import sys
import tqdm
import glob
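
# Crawl parameters: target genre, ranking page size, and per-blog stop limits
# for queued images and checked entries.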
genre_code = "portrait"
rankings_limit = 20
image_count_stop = 100
entries_count_stop = 100
blog_entries_limit = 20

headers = {
    "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "Referer": "https://ameblo.jp/",
    "Referrer-Policy": "origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
}
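
# A single opener is reused for every request.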
opener = urllib.request.build_opener()

def http_get(url: str) -> bytes:
    # GET `url` with the browser-like headers defined above and return the raw body.
    req = urllib.request.Request(url, headers=headers)
    with opener.open(req) as res:
        body: bytes = res.read()
    return body

def remove_prefix(input: str, prefix: str):
    assert input.startswith(prefix)
    return input[len(prefix):]
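
# Main loop: page through the genre ranking and handle each blogger at most once.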
rankings_offset = 0
ameba_id_set = set()

while True:
    sys.stderr.write(f"debug: rankings {rankings_offset}\n")
    rankings = json.loads(http_get(f"https://blogger.ameba.jp/api/blogs/rankings?genreCode={genre_code}&limit={rankings_limit}&offsetAmebaId=shashinsouko&offsetScore={rankings_offset}&includeAmebaBlog=false&includeAmebaBlogRecentEntries=false"))
    rankings_offset += rankings_limit
    for rankings_idx, user in enumerate(rankings["data"]):
        ameba_id = user["amebaId"]
        if ameba_id in ameba_id_set:
            sys.stderr.write(f"debug: skip {ameba_id=}\n")
            continue
            # if rankings_idx>0: raise Exception(f"{ameba_id=} is already checked.")
            # else: continue
        sys.stderr.write(f"debug: {ameba_id=}\n")
        ameba_id_set.add(ameba_id)
        # Existing downloads for this blogger (only used by the commented-out early-skip check below).
        glob_result = glob.glob(ameba_id + os.path.sep + "*.jpg")
        # if len(glob_result) >= image_count_stop: continue
        sys.stderr.write("debug: top page\n")
        body = http_get(f"https://ameblo.jp/{ameba_id}/")
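        # The blogId required by the entry-list API is embedded in the top page
        # as window.INIT_DATA.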
        def get_init_data(body: bytes) -> dict:
            # Extract the JSON object assigned to window.INIT_DATA in the page HTML.
            str_swide = b'<script>window.INIT_DATA='
            str_pscs = b'};'
            pos_swide = body.find(str_swide)
            if pos_swide == -1: raise Exception("window.INIT_DATA not found")
            pos_pscs = body.find(str_pscs, pos_swide)
            if pos_pscs == -1: raise Exception("end of window.INIT_DATA not found")
            # print(f"{pos_swide=} {pos_pscs=}")
            data = body[pos_swide + len(str_swide):pos_pscs + 1]
            # print(data)
            obj = json.loads(data)
            return obj
        init_data = get_init_data(body)
        blog_id = tuple(init_data["bloggerState"]["blogMap"].keys())[0]
        sys.stderr.write(f"debug: {blog_id=}\n")
        image_count = 0
        entries_count = 0
        image_list = []
        blog_entries_offset = 0
        blog_entries_page = 1
        entry_id_set = set()
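        # Page through this blog's entry list, queueing image URLs until one of
        # the stop limits is hit or the last page is reached.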
        while True:
            sys.stderr.write(f"debug: blogEntries {blog_entries_page}\n")
            body = http_get(f"https://ameblo.jp/_api/blogEntries;amebaId={ameba_id};blogId={blog_id};limit={blog_entries_limit};offset={blog_entries_offset};page={blog_entries_page}?returnMeta=false")
            obj = json.loads(body)
            blog_entries = obj
            blogpage_map = tuple(blog_entries["data"]["entities"]["blogPageMap"].values())[0]
            entry_id_list = blogpage_map["data"]
            max_page = blogpage_map["paging"]["max_page"]
            current_page = blogpage_map["paging"]["current_page"]
            sys.stderr.write(f"debug: {current_page=} {max_page=}\n")
            assert current_page == blog_entries_page
            blog_entries_page += 1
            blog_entries_offset += blog_entries_limit
            # entry_map = blog_entries["data"]["entities"]["entryMap"]
            entry_id_cur_set = set(entry_id_list)
            entry_id_set_intersection = entry_id_set.intersection(entry_id_cur_set)
            if len(entry_id_set_intersection) > 0:
                raise Exception(f"following entry ids are already checked. {entry_id_set_intersection}")
            entry_id_set.update(entry_id_cur_set)
            # for entry_id, info in entry_map.items():
            for entry_id in tqdm.tqdm(entry_id_list, desc="entry", leave=False):
                # entry_id = info["entry_id"]
                # sys.stderr.write(f"debug: {entry_id=}\n")
                # sys.stderr.write(f"debug: get image list\n")
                # Skip entries whose image list cannot be fetched.
                try: body = http_get(f"https://blogimgapi.ameba.jp/blog/{ameba_id}/entries/{entry_id}/images")
                except urllib.error.HTTPError: continue
                obj = json.loads(body)
                images = obj
                for image_info in images["data"]:
                    # sys.stderr.write(f"debug: {num}\n")
                    image_url = "https://stat.ameba.jp" + image_info["imgUrl"] + "?caw=512"
                    # image_basename = remove_prefix(os.path.splitext(os.path.basename(image_info["entryUrl"]))[0], "entry-")
                    # Save as <genre_code>/<ameba_id>/<entry_id>-<nnn>.jpg
                    image_save_path = genre_code + os.path.sep + ameba_id + os.path.sep + str(entry_id) + "-" + "%03d" % image_info["nthInEntry"] + ".jpg"
                    image_list.append((image_url, image_save_path))
                    image_count += 1
                    if image_count >= image_count_stop: break
                if image_count >= image_count_stop: break
            entries_count += len(entry_id_list)
            if entries_count >= entries_count_stop: break
            if image_count >= image_count_stop: break
            if current_page == max_page: break
            # if len(entry_id_list) < blog_entries_limit: break
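        # Keep only images that are not already on disk, then download them.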
        sys.stderr.write(f"debug: checked {entries_count} entries.\n")
        image_list_prev = image_list
        image_list = []
        for (image_url, image_save_path) in image_list_prev:
            if os.path.exists(image_save_path): continue
            image_list.append((image_url, image_save_path))
        sys.stderr.write(f"debug: images: {len(image_list_prev)}, new images: {len(image_list)}\n")
        if len(image_list) == 0: continue
        os.makedirs(genre_code + os.path.sep + ameba_id, exist_ok=True)
        for (image_url, image_save_path) in tqdm.tqdm(image_list, desc="image", leave=False):
            try: body = http_get(image_url)
            except urllib.error.HTTPError: continue
            with open(image_save_path, "wb") as fp:
                fp.write(body)
        sys.stderr.write(f"debug: downloaded {len(image_list)} images\n")