Created
August 24, 2017 22:49
-
-
Save monchy-monchy/2ea549131add9412739f5e5bc1751edd to your computer and use it in GitHub Desktop.
PDF to TEXT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3
# PdfToTextConverter.py
# Read the contents of PDF files and write them out as text files.
import os | |
import re | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
from io import StringIO | |
# Folder that is scanned for PDF files: the current working directory.
pdf_folder_path = os.getcwd()
# Output folder for the generated text files. os.path.join picks the correct
# separator for the running platform, so no manual edit is needed on Windows.
text_folder_path = os.path.join(pdf_folder_path, 'text_folder')
# Create the output folder up front; exist_ok avoids failing on re-runs.
os.makedirs(text_folder_path, exist_ok=True)
# Names of every entry in the PDF folder (not only PDFs; they are filtered
# with pdf_checker before conversion).
pdf_file_name = os.listdir(pdf_folder_path)
# Returns True when name is a PDF file (its name ends with .pdf), otherwise False.
# Quoted and partly modified from this post → http://qiita.com/korkewriya/items/72de38fc506ab37b4f2d
# Compiled once at import time instead of on every call. The pattern requires
# at least one character before the ".pdf" extension.
_PDF_NAME_RE = re.compile(r'.+\.pdf', re.IGNORECASE)


def pdf_checker(name):
    """Return True if *name* looks like a PDF file name, else False.

    The whole name must match, so the ".pdf" extension has to be the very end
    of the name ("report.pdf.txt" or "report.pdfx" are rejected). The
    extension is compared case-insensitively, so "REPORT.PDF" is accepted.

    Args:
        name: File name to test; it is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        bool: True when the name ends with a ".pdf" extension and has at
        least one character before it.
    """
    # fullmatch anchors the pattern at both ends; a plain search() would also
    # accept names that merely contain ".pdf" somewhere in the middle.
    return _PDF_NAME_RE.fullmatch(str(name)) is not None
# Convert a PDF into a text file.
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of one PDF file and save it as a UTF-8 text file.

    Args:
        path: Path of the PDF file to read.
        txtname: Path of the text file to write when ``buf`` is False. It is
            not used when ``buf`` is True.
        buf: When True (default), the extracted text is collected in memory
            and then written to ``<text_folder_path>/<basename of path>.txt``.
            When False, the converter writes directly to ``txtname``.

    Returns:
        None. The result is the text file written to disk.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Also detect vertically written text during layout analysis.
    laparams.detect_vertical = True

    if buf:
        outfp = StringIO()
    else:
        # The Python 2 builtin file() does not exist in Python 3; use open().
        outfp = open(txtname, 'w', encoding='utf-8')

    try:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager closes the PDF even if a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()

        if buf:
            # Only the file name part of `path` is used, so a PDF given with a
            # directory prefix still lands directly inside text_folder_path.
            out_path = os.path.join(
                text_folder_path, os.path.basename(path) + '.txt')
            # Explicit UTF-8 so the result does not depend on the locale.
            with open(out_path, 'w', encoding='utf-8') as out_file:
                out_file.write(outfp.getvalue())
    finally:
        outfp.close()
# Go through the file names listed from the folder and convert each PDF.
# Convert every directory entry that pdf_checker accepts; entries that are not
# PDF files are simply skipped.
for name in filter(pdf_checker, pdf_file_name):
    convert_pdf_to_txt(name, name + '.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment