# y-ookuma/AmazonReceiptDivRename.py

## AmazonReceiptDivRename.py
#-----------------------------------------------------------
# 20230901
# Amazonの領収書の出力を電子帳簿保存法に対応しました。
#
# (1)Google Chromeの拡張機能　アマゾン注文履歴フィルタ　で領収書を一括出力します.
#  アマゾン注文履歴フィルタ　で出力したファイル名は
#  'Amazon.pdf'とすること
# (2)一括出力したAmazonの領収書のpdfのファイル名を"注文日付＿請求金額＿Amazon.pdf"とする.
#
#-----------------------------------------------------------

import PyPDF2
import glob,os,time,sys
from pdfminer.high_level import extract_text
from datetime import datetime

#  (1) 指定したファイルを "の領収書" という文字を検索し分割します。
def div_pdf(input_pdf_file):
    """Split a combined receipt PDF into one PDF file per receipt.

    A page whose extracted text contains "の領収書" starts a new receipt.
    Following pages without that marker are continuation pages and are
    appended to the same receipt.  Each receipt is written once, after all
    of its pages are known, to ``output_NNN.pdf`` in the current directory,
    where NNN is the zero-padded 1-based number of the receipt's first page.

    Because continuation pages are stored in the receipt's own file, no
    separate ``output_end.pdf`` is produced.  Pages that appear before the
    first marker page belong to no receipt and are skipped.

    Args:
        input_pdf_file: Path of the combined PDF to split.

    Returns:
        None.  Progress is printed, one line per input page.
    """
    # Base name of the generated files.
    output_pdf_base = 'output'
    receipt_marker = "の領収書"

    pdf_writer = None       # collects the pages of the receipt in progress
    output_pdf_file = None  # destination path of the receipt in progress

    def save_current():
        # Write the receipt collected so far; no-op before the first receipt.
        if pdf_writer is None:
            return
        with open(output_pdf_file, 'wb') as output_file:
            pdf_writer.write(output_file)

    # The input must stay open until every receipt has been written.
    with open(input_pdf_file, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num, page in enumerate(pdf_reader.pages):
            # Treat a page without extractable text as having no marker.
            page_text = page.extract_text() or ""

            if receipt_marker in page_text:
                # A new receipt begins: finish the previous one first.
                save_current()
                pagenum = str(page_num + 1).zfill(3)
                output_pdf_file = f'{output_pdf_base}_{pagenum}.pdf'
                pdf_writer = PyPDF2.PdfWriter()
            elif pdf_writer is None:
                # No receipt has started yet, so there is nothing to append to.
                print("処理件数：", page_num + 1, " ", "領収書の開始ページより前のためスキップ")
                continue

            pdf_writer.add_page(page)
            print("処理件数：", page_num + 1, " ", output_pdf_file)

        # Write the final receipt.
        save_current()

# 文字列抽出（検索文字列、開始文字、終了文字）
def extraction_string(input_string,start_string,end_string):
    """Return the text between the first start marker and the next end marker.

    The end marker is searched for only after the end of the start marker,
    so an end marker that also occurs inside the start marker is not
    mistaken for the terminator.

    Args:
        input_string: Text to search.
        start_string: Marker that precedes the wanted text.
        end_string: Marker that follows the wanted text.

    Returns:
        The text between the two markers (markers excluded), or None when
        either marker is not found.  The outcome is also printed.
    """
    start_index = input_string.find(start_string)
    if start_index == -1:
        print("指定された文字列が見つかりませんでした。")
        return None

    # First character after the start marker.
    content_start = start_index + len(start_string)
    end_index = input_string.find(end_string, content_start)
    if end_index == -1:
        print("指定された文字列が見つかりませんでした。")
        return None

    extracted_string = input_string[content_start:end_index]
    print("抽出された文字列:", extracted_string)
    return extracted_string

# (2) ファイル名を"日付_金額_amazon.pdf"に変更する
def file_rename(input_list):
    """Rename split receipt PDFs to "<YYYYMMDD>_<amount>_Amazon.pdf".

    Only paths containing "output" are processed; other entries are taken to
    be already renamed and are left alone.  The order date is the text after
    "注文日：" and the amount is the text after the first "￥", each up to the
    end of its line.  The amount is left-padded with zeros to 7 digits.

    A file is skipped with a message, instead of aborting the whole run,
    when the date or amount cannot be extracted or the date cannot be parsed.
    When the target name already exists, "_2", "_3", ... is inserted before
    ".pdf" so that receipts with the same date and amount do not overwrite
    each other.

    Args:
        input_list: Paths of candidate PDF files.

    Returns:
        None.  Files are renamed into the current directory.
    """
    total = len(input_list)
    for position, pdf in enumerate(input_list, start=1):
        if "output" not in pdf:
            continue
        input_string = extract_text(pdf)

        accounting_day = extraction_string(input_string, "注文日：", "\n")
        accounting_amount_money = extraction_string(input_string, "￥", "\n")
        if accounting_day is None or accounting_amount_money is None:
            print("注文日または金額を抽出できないためスキップします: " + pdf)
            continue

        # Drop every kind of whitespace, not only ASCII spaces.
        accounting_day = "".join(accounting_day.split())
        accounting_amount_money = "".join(accounting_amount_money.split())
        accounting_amount_money = accounting_amount_money.replace(",", "")
        accounting_amount_money = accounting_amount_money.zfill(7)  # pad up to millions

        try:
            date_object = datetime.strptime(accounting_day, "%Y年%m月%d日")
        except ValueError:
            print("注文日の形式が想定外のためスキップします: " + pdf)
            continue
        formatted_date = date_object.strftime("%Y%m%d")

        stem = formatted_date + "_" + accounting_amount_money + "_Amazon"
        new_filename = stem + ".pdf"
        # os.rename would replace (POSIX) or fail on (Windows) an existing file.
        duplicate = 1
        while os.path.exists(new_filename):
            duplicate += 1
            new_filename = f"{stem}_{duplicate}.pdf"
        print(new_filename)

        os.rename(pdf, new_filename)
        print("[" + str(position) + "/" + str(total) + "] " + new_filename)
        print("")

#-----------------------------------
# ここからメイン処理
#-----------------------------------
# 分割元のPDFファイル名
# Name of the combined PDF to split.
input_pdf_file = 'Amazon.pdf'
if os.path.isfile(input_pdf_file):
    print("ファイルを分割を開始します。")
    div_pdf(input_pdf_file)
    print("ファイルを分割しました。")
else:
    print("")
    print("Amazon.pdf が見つかりません。")
    print("Amazon.pdfを作成してください。")
    print("")
    # Exit with a non-zero status so callers can tell the run failed.
    sys.exit(1)


# Rename the split PDFs.
print("ファイル名を変更します。")
# Sorted so the processing order does not depend on the file system.
input_list = sorted(glob.glob("*.pdf"))
if input_pdf_file in input_list:
    input_list.remove(input_pdf_file)

# Rename every remaining candidate; file_rename ignores already-renamed files.
file_rename(input_list)
	#-----------------------------------------------------------
	# 20230901
	# Amazonの領収書の出力を電子帳簿保存法に対応しました。
	#
	# (1)Google Chromeの拡張機能　アマゾン注文履歴フィルタ　で領収書を一括出力します.
	# アマゾン注文履歴フィルタ　で出力したファイル名は
	# 'Amazon.pdf'とすること
	# (2)一括出力したAmazonの領収書のpdfのファイル名を"注文日付＿請求金額＿Amazon.pdf"とする.
	#
	#-----------------------------------------------------------

	import PyPDF2
	import glob,os,time,sys
	from pdfminer.high_level import extract_text
	from datetime import datetime

	# (1) 指定したファイルを "の領収書" という文字を検索し分割します。
	def div_pdf(input_pdf_file):
	    """Split a combined receipt PDF into one PDF file per receipt.

	    A page whose extracted text contains "の領収書" starts a new receipt.
	    Following pages without that marker are continuation pages and are
	    appended to the same receipt.  Each receipt is written once, after all
	    of its pages are known, to ``output_NNN.pdf`` in the current directory,
	    where NNN is the zero-padded 1-based number of the receipt's first page.

	    Because continuation pages are stored in the receipt's own file, no
	    separate ``output_end.pdf`` is produced.  Pages that appear before the
	    first marker page belong to no receipt and are skipped.

	    Args:
	        input_pdf_file: Path of the combined PDF to split.

	    Returns:
	        None.  Progress is printed, one line per input page.
	    """
	    # Base name of the generated files.
	    output_pdf_base = 'output'
	    receipt_marker = "の領収書"

	    pdf_writer = None       # collects the pages of the receipt in progress
	    output_pdf_file = None  # destination path of the receipt in progress

	    def save_current():
	        # Write the receipt collected so far; no-op before the first receipt.
	        if pdf_writer is None:
	            return
	        with open(output_pdf_file, 'wb') as output_file:
	            pdf_writer.write(output_file)

	    # The input must stay open until every receipt has been written.
	    with open(input_pdf_file, 'rb') as pdf_file:
	        pdf_reader = PyPDF2.PdfReader(pdf_file)

	        for page_num, page in enumerate(pdf_reader.pages):
	            # Treat a page without extractable text as having no marker.
	            page_text = page.extract_text() or ""

	            if receipt_marker in page_text:
	                # A new receipt begins: finish the previous one first.
	                save_current()
	                pagenum = str(page_num + 1).zfill(3)
	                output_pdf_file = f'{output_pdf_base}_{pagenum}.pdf'
	                pdf_writer = PyPDF2.PdfWriter()
	            elif pdf_writer is None:
	                # No receipt has started yet, so there is nothing to append to.
	                print("処理件数：", page_num + 1, " ", "領収書の開始ページより前のためスキップ")
	                continue

	            pdf_writer.add_page(page)
	            print("処理件数：", page_num + 1, " ", output_pdf_file)

	        # Write the final receipt.
	        save_current()

	# 文字列抽出（検索文字列、開始文字、終了文字）
	def extraction_string(input_string,start_string,end_string):
	    """Return the text between the first start marker and the next end marker.

	    The end marker is searched for only after the end of the start marker,
	    so an end marker that also occurs inside the start marker is not
	    mistaken for the terminator.

	    Args:
	        input_string: Text to search.
	        start_string: Marker that precedes the wanted text.
	        end_string: Marker that follows the wanted text.

	    Returns:
	        The text between the two markers (markers excluded), or None when
	        either marker is not found.  The outcome is also printed.
	    """
	    start_index = input_string.find(start_string)
	    if start_index == -1:
	        print("指定された文字列が見つかりませんでした。")
	        return None

	    # First character after the start marker.
	    content_start = start_index + len(start_string)
	    end_index = input_string.find(end_string, content_start)
	    if end_index == -1:
	        print("指定された文字列が見つかりませんでした。")
	        return None

	    extracted_string = input_string[content_start:end_index]
	    print("抽出された文字列:", extracted_string)
	    return extracted_string

	# (2) ファイル名を"日付_金額_amazon.pdf"に変更する
	def file_rename(input_list):
	    """Rename split receipt PDFs to "<YYYYMMDD>_<amount>_Amazon.pdf".

	    Only paths containing "output" are processed; other entries are taken to
	    be already renamed and are left alone.  The order date is the text after
	    "注文日：" and the amount is the text after the first "￥", each up to the
	    end of its line.  The amount is left-padded with zeros to 7 digits.

	    A file is skipped with a message, instead of aborting the whole run,
	    when the date or amount cannot be extracted or the date cannot be parsed.
	    When the target name already exists, "_2", "_3", ... is inserted before
	    ".pdf" so that receipts with the same date and amount do not overwrite
	    each other.

	    Args:
	        input_list: Paths of candidate PDF files.

	    Returns:
	        None.  Files are renamed into the current directory.
	    """
	    total = len(input_list)
	    for position, pdf in enumerate(input_list, start=1):
	        if "output" not in pdf:
	            continue
	        input_string = extract_text(pdf)

	        accounting_day = extraction_string(input_string, "注文日：", "\n")
	        accounting_amount_money = extraction_string(input_string, "￥", "\n")
	        if accounting_day is None or accounting_amount_money is None:
	            print("注文日または金額を抽出できないためスキップします: " + pdf)
	            continue

	        # Drop every kind of whitespace, not only ASCII spaces.
	        accounting_day = "".join(accounting_day.split())
	        accounting_amount_money = "".join(accounting_amount_money.split())
	        accounting_amount_money = accounting_amount_money.replace(",", "")
	        accounting_amount_money = accounting_amount_money.zfill(7)  # pad up to millions

	        try:
	            date_object = datetime.strptime(accounting_day, "%Y年%m月%d日")
	        except ValueError:
	            print("注文日の形式が想定外のためスキップします: " + pdf)
	            continue
	        formatted_date = date_object.strftime("%Y%m%d")

	        stem = formatted_date + "_" + accounting_amount_money + "_Amazon"
	        new_filename = stem + ".pdf"
	        # os.rename would replace (POSIX) or fail on (Windows) an existing file.
	        duplicate = 1
	        while os.path.exists(new_filename):
	            duplicate += 1
	            new_filename = f"{stem}_{duplicate}.pdf"
	        print(new_filename)

	        os.rename(pdf, new_filename)
	        print("[" + str(position) + "/" + str(total) + "] " + new_filename)
	        print("")

	#-----------------------------------
	# ここからメイン処理
	#-----------------------------------
	# 分割元のPDFファイル名
	# Name of the combined PDF to split.
	input_pdf_file = 'Amazon.pdf'
	if os.path.isfile(input_pdf_file):
	    print("ファイルを分割を開始します。")
	    div_pdf(input_pdf_file)
	    print("ファイルを分割しました。")
	else:
	    print("")
	    print("Amazon.pdf が見つかりません。")
	    print("Amazon.pdfを作成してください。")
	    print("")
	    # Exit with a non-zero status so callers can tell the run failed.
	    sys.exit(1)


	# Rename the split PDFs.
	print("ファイル名を変更します。")
	# Sorted so the processing order does not depend on the file system.
	input_list = sorted(glob.glob("*.pdf"))
	if input_pdf_file in input_list:
	    input_list.remove(input_pdf_file)

	# Rename every remaining candidate; file_rename ignores already-renamed files.
	file_rename(input_list)