Sg4Dylan/pixiv-file-rename.py

## pixiv-file-rename.py
#!/usr/bin/env python
#coding:utf-8
# Author:  Sg4Dylan --<sg4dylan#gmail.com>
# Created: 12/31/2018

# ==========================================
#            Pixiv 图片重命名
# ==========================================
# 使用方法： 把待处理的「文件夹」拖放到脚本上

import os
import re
import sys
import requests
from lxml.html import fromstring
from tqdm import tqdm
import multiprocessing

# Route every request through the local proxy below when True.
I_AM_CHINESE = True
# Folder to process: the first command-line argument, or '' when none is given.
target_root = sys.argv[1] if len(sys.argv) > 1 else ''
# Local proxy endpoint, identical for both URL schemes.
proxies = dict.fromkeys(('http', 'https'), 'http://127.0.0.1:2080')
# Browser-like User-Agent header passed along with each page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
}
# HTTP session used for all page fetches; proxied only when enabled above.
session = requests.Session()
if I_AM_CHINESE:
    session.proxies.update(proxies)


# Seconds to wait on the network before giving up on a single page request.
_REQUEST_TIMEOUT = 30

# Characters that Windows forbids in file names, mapped to look-alike replacements.
_WINDOWS_UNSAFE_CHARS = str.maketrans({
    '\\': '╲', '/': '／', ':': '：',
    '*': '⚝', '?': '？', '"': "''",
    '<': '‹', '>': '›', '|': '｜',
})


def _build_title_part(raw_title):
    """Convert the artwork page ``<title>`` text into the ``「…」-「…」`` name prefix."""
    # Remove the fixed suffix at the end of the page title.
    name = raw_title.replace('のイラスト - pixiv', '')
    # Remove the tag segments: each '#' up to and including the next space.
    name = re.sub(r'(#.*? )', '', name)
    name = name.translate(_WINDOWS_UNSAFE_CHARS)
    # Split on the LAST hyphen only, so hyphens inside the first part survive.
    # NOTE(review): the two parts are presumably artwork title and artist name.
    front, _, back = name.rpartition('-')
    return f'「{front}」-「{back}」'


def _with_conflict_suffix(name, conflict_id):
    """Return ``name`` altered so that it differs for every ``conflict_id``.

    A trailing ``_p<digits>`` right before the extension is replaced by
    ``_p<conflict_id>``; names without such a part get ``_<conflict_id>``
    appended to the stem, so a new candidate is always produced.
    """
    stem, ext = os.path.splitext(name)
    new_stem, replaced = re.subn(r'_p\d+$', f'_p{conflict_id}', stem)
    if not replaced:
        new_stem = f'{stem}_{conflict_id}'
    return new_stem + ext


def proc_core(target_path):
    """Rename one pixiv image file using the title of its artwork page.

    Files whose base name does not start with at least four digits (taken as
    the illust id) are ignored. For the others the artwork page is fetched,
    its ``<title>`` is turned into a ``「…」-「…」`` prefix, and that prefix
    replaces the leading id (and optional underscore) of the file name. The
    file stays in its directory.

    Args:
        target_path: Path of the file to examine and possibly rename.

    Returns:
        None. Failures to fetch or parse the page are printed and the file
        is left untouched.
    """
    illust_name = os.path.basename(target_path)
    illust_dir = os.path.dirname(target_path)
    # Only names that begin with an illust id (4+ digits) are handled.
    id_match = re.match(r'(\d{4,})_?', illust_name)
    if id_match is None:
        return
    illust_id = id_match.group(1)

    # Fetch the artwork page; a network failure must not kill the worker.
    url = f'https://www.pixiv.net/artworks/{illust_id}'
    try:
        r = session.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'ERROR: {exc} in {illust_name}')
        return
    if r.status_code != 200:
        print(f'ERROR: {r.status_code} in {illust_name}')
        return

    # Parse the page and pull out its <title>.
    tree = fromstring(r.content.decode('UTF-8'))
    illust_raw_title = tree.findtext('.//title')
    if illust_raw_title is None:
        print(f'ERROR: no title in {illust_name}')
        return
    illust_fin_name = _build_title_part(illust_raw_title)

    # Keep everything after the id (page number, extension) from the old name.
    # Plain slicing avoids treating the title as a regex replacement template.
    proced_name = f'{illust_fin_name}_{illust_name[id_match.end():]}'
    print(illust_fin_name, proced_name)

    # On a name clash, try successive conflict ids until a free name is found.
    conflict_id = 0
    while True:
        if conflict_id == 0:
            candidate = proced_name
        else:
            candidate = _with_conflict_suffix(proced_name, conflict_id)
        destination = os.path.join(illust_dir, candidate)
        # Explicit check: on POSIX os.rename would silently overwrite the file.
        if os.path.exists(destination):
            conflict_id += 1
            continue
        try:
            os.rename(target_path, destination)
        except FileExistsError:
            # The name was taken between the check and the rename.
            conflict_id += 1
        else:
            return


def get_file_list():
    """Collect the path of every file found beneath ``target_root``.

    Returns:
        A list of paths (directory joined with file name) in the order
        ``os.walk`` yields them; sub-directories are descended into.
    """
    collected = []
    for folder, _subdirs, filenames in os.walk(target_root):
        collected.extend(os.path.join(folder, filename) for filename in filenames)
    return collected


if __name__ == '__main__':
    # Needed when the script is frozen into a Windows executable.
    multiprocessing.freeze_support()
    file_path_list = get_file_list()
    if not file_path_list:
        print('No files found; pass the folder to process as the first argument.')
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as exec_pool:
        # One progress bar, advanced once per finished file (in completion order).
        with tqdm(total=len(file_path_list), ascii=True) as pbar:
            for _ in exec_pool.imap_unordered(proc_core, file_path_list):
                pbar.update()
    # 'pause' is a Windows shell command; skip it elsewhere.
    if os.name == 'nt':
        os.system('pause')
	#!/usr/bin/env python
	#coding:utf-8
	# Author: Sg4Dylan --<sg4dylan#gmail.com>
	# Created: 12/31/2018

	# ==========================================
	# Pixiv 图片重命名
	# ==========================================
	# 使用方法：把待处理的「文件夹」拖放到脚本上

	import os
	import re
	import sys
	import requests
	from lxml.html import fromstring
	from tqdm import tqdm
	import multiprocessing

	# Route every request through the local proxy below when True.
	I_AM_CHINESE = True
	# Folder to process: the first command-line argument, or '' when none is given.
	target_root = ''
	if len(sys.argv) > 1:
	    target_root = sys.argv[1]
	# Local proxy endpoint, identical for both URL schemes.
	proxies = {
	    'http': 'http://127.0.0.1:2080',
	    'https': 'http://127.0.0.1:2080'
	}
	# Browser-like User-Agent header passed along with each page request.
	headers = {
	    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.193 Safari/537.36'
	}
	# HTTP session used for all page fetches; proxied only when enabled above.
	session = requests.Session()
	if I_AM_CHINESE:
	    session.proxies.update(proxies)


	# Seconds to wait on the network before giving up on a single page request.
	_REQUEST_TIMEOUT = 30

	# Characters that Windows forbids in file names, mapped to look-alike replacements.
	# The pipe key is a bare '|' (the pasted text had a stray backslash before it).
	_WINDOWS_UNSAFE_CHARS = str.maketrans({
	    '\\': '╲', '/': '／', ':': '：',
	    '*': '⚝', '?': '？', '"': "''",
	    '<': '‹', '>': '›', '|': '｜',
	})


	def _build_title_part(raw_title):
	    """Convert the artwork page ``<title>`` text into the ``「…」-「…」`` name prefix."""
	    # Remove the fixed suffix at the end of the page title.
	    name = raw_title.replace('のイラスト - pixiv', '')
	    # Remove the tag segments: each '#' up to and including the next space.
	    name = re.sub(r'(#.*? )', '', name)
	    name = name.translate(_WINDOWS_UNSAFE_CHARS)
	    # Split on the LAST hyphen only, so hyphens inside the first part survive.
	    # NOTE(review): the two parts are presumably artwork title and artist name.
	    front, _, back = name.rpartition('-')
	    return f'「{front}」-「{back}」'


	def _with_conflict_suffix(name, conflict_id):
	    """Return ``name`` altered so that it differs for every ``conflict_id``.

	    A trailing ``_p<digits>`` right before the extension is replaced by
	    ``_p<conflict_id>``; names without such a part get ``_<conflict_id>``
	    appended to the stem, so a new candidate is always produced.
	    """
	    stem, ext = os.path.splitext(name)
	    new_stem, replaced = re.subn(r'_p\d+$', f'_p{conflict_id}', stem)
	    if not replaced:
	        new_stem = f'{stem}_{conflict_id}'
	    return new_stem + ext


	def proc_core(target_path):
	    """Rename one pixiv image file using the title of its artwork page.

	    Files whose base name does not start with at least four digits (taken as
	    the illust id) are ignored. For the others the artwork page is fetched,
	    its ``<title>`` is turned into a ``「…」-「…」`` prefix, and that prefix
	    replaces the leading id (and optional underscore) of the file name. The
	    file stays in its directory.

	    Args:
	        target_path: Path of the file to examine and possibly rename.

	    Returns:
	        None. Failures to fetch or parse the page are printed and the file
	        is left untouched.
	    """
	    illust_name = os.path.basename(target_path)
	    illust_dir = os.path.dirname(target_path)
	    # Only names that begin with an illust id (4+ digits) are handled.
	    id_match = re.match(r'(\d{4,})_?', illust_name)
	    if id_match is None:
	        return
	    illust_id = id_match.group(1)

	    # Fetch the artwork page; a network failure must not kill the worker.
	    url = f'https://www.pixiv.net/artworks/{illust_id}'
	    try:
	        r = session.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
	    except requests.RequestException as exc:
	        print(f'ERROR: {exc} in {illust_name}')
	        return
	    if r.status_code != 200:
	        print(f'ERROR: {r.status_code} in {illust_name}')
	        return

	    # Parse the page and pull out its <title>.
	    tree = fromstring(r.content.decode('UTF-8'))
	    illust_raw_title = tree.findtext('.//title')
	    if illust_raw_title is None:
	        print(f'ERROR: no title in {illust_name}')
	        return
	    illust_fin_name = _build_title_part(illust_raw_title)

	    # Keep everything after the id (page number, extension) from the old name.
	    # Plain slicing avoids treating the title as a regex replacement template.
	    proced_name = f'{illust_fin_name}_{illust_name[id_match.end():]}'
	    print(illust_fin_name, proced_name)

	    # On a name clash, try successive conflict ids until a free name is found.
	    conflict_id = 0
	    while True:
	        if conflict_id == 0:
	            candidate = proced_name
	        else:
	            candidate = _with_conflict_suffix(proced_name, conflict_id)
	        destination = os.path.join(illust_dir, candidate)
	        # Explicit check: on POSIX os.rename would silently overwrite the file.
	        if os.path.exists(destination):
	            conflict_id += 1
	            continue
	        try:
	            os.rename(target_path, destination)
	        except FileExistsError:
	            # The name was taken between the check and the rename.
	            conflict_id += 1
	        else:
	            return


	def get_file_list():
	    """Collect the path of every file found beneath ``target_root``.

	    Returns:
	        A list of paths (directory joined with file name) in the order
	        ``os.walk`` yields them; sub-directories are descended into.
	    """
	    file_path_list = []
	    for root, _dirs, files in os.walk(target_root):
	        for name in files:
	            file_path_list.append(os.path.join(root, name))
	    return file_path_list


	if __name__ == '__main__':
	    # Needed when the script is frozen into a Windows executable.
	    multiprocessing.freeze_support()
	    file_path_list = get_file_list()
	    if not file_path_list:
	        print('No files found; pass the folder to process as the first argument.')
	    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as exec_pool:
	        # One progress bar, advanced once per finished file (in completion order).
	        with tqdm(total=len(file_path_list), ascii=True) as pbar:
	            for _ in exec_pool.imap_unordered(proc_core, file_path_list):
	                pbar.update()
	    # 'pause' is a Windows shell command; skip it elsewhere.
	    if os.name == 'nt':
	        os.system('pause')