Skip to content

Instantly share code, notes, and snippets.

@monchy-monchy
Created August 24, 2017 22:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monchy-monchy/2ea549131add9412739f5e5bc1751edd to your computer and use it in GitHub Desktop.
Save monchy-monchy/2ea549131add9412739f5e5bc1751edd to your computer and use it in GitHub Desktop.
PDF to TEXT
#! python3
# PdfToTextConverter.py
# PDFファイルの内容を読み込んで、textファイルとして出力
import os
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
# Source PDFs are looked up in the current working directory.
pdf_folder_path = os.getcwd()
# Extracted text is written to a "text_folder" subdirectory. os.path.join
# picks the platform's separator, so the path needs no manual editing on
# Windows.
text_folder_path = os.path.join(pdf_folder_path, 'text_folder')
os.makedirs(text_folder_path, exist_ok=True)
# Directory listing taken after text_folder has been created, so the listing
# also contains that folder's own entry.
pdf_file_name = os.listdir(pdf_folder_path)
# Adapted (with changes) from this post:
# http://qiita.com/korkewriya/items/72de38fc506ab37b4f2d
def pdf_checker(name):
    """Return True when *name* looks like a PDF file name, else False.

    A name qualifies when it ends with the ``.pdf`` extension (compared
    case-insensitively) and has at least one character before it.

    Args:
        name: File name to test; it is converted with ``str()`` first.

    Returns:
        bool: True for names such as ``report.pdf`` or ``REPORT.PDF``;
        False for ``report.pdf.txt``, ``.pdf`` or ``notes``.
    """
    text = str(name)
    suffix = '.pdf'
    # Anchor the check to the END of the name: an unanchored search would
    # also accept names that merely contain ".pdf" (e.g. "a.pdf.bak").
    # The length test keeps the requirement of a non-empty stem.
    return len(text) > len(suffix) and text.lower().endswith(suffix)
# Convert a PDF into a text file.
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of the PDF at *path* and write it to a text file.

    Args:
        path: Path of the PDF file to read (opened in binary mode).
        txtname: Output file name. Only used when *buf* is false.
        buf: When true (the default) the extracted text is collected in an
            in-memory buffer and then written to
            ``<text_folder_path>/<base name of path>.txt``. When false the
            text is written directly to *txtname*.

    Returns:
        None. The result is the file written to disk.
    """
    codec = 'utf-8'
    laparams = LAParams()
    # Ask the layout analysis to take vertical text into account as well
    # (per pdfminer's LAParams.detect_vertical).
    laparams.detect_vertical = True
    rsrcmgr = PDFResourceManager()

    if buf:
        outfp = StringIO()
    else:
        # The Python 2 builtin `file()` does not exist in Python 3; use
        # open() and pin the encoding to the declared codec.
        outfp = open(txtname, 'w', encoding=codec)

    # The `with`/`finally` blocks guarantee that the output buffer, the
    # device and the input file are released even if extraction fails.
    with outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()

        if buf:
            text = outfp.getvalue()
            # Use only the base name so a *path* containing directories
            # still lands directly inside text_folder_path.
            out_path = os.path.join(
                text_folder_path, os.path.basename(path) + '.txt')
            # Explicit encoding: the platform default may be unable to
            # represent the extracted (e.g. Japanese) text.
            with open(out_path, 'w', encoding=codec) as make_new_text_file:
                make_new_text_file.write(text)
# Walk the directory listing and convert every entry that pdf_checker
# accepts; entries that are not PDF files are simply skipped.
for name in filter(pdf_checker, pdf_file_name):
    convert_pdf_to_txt(name, name + '.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment