# salamann/ocr_pdf.py

## ocr_pdf.py
from __future__ import print_function
import httplib2
import os
import io

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload

import PyPDF2

# Command-line flags for the OAuth flow are parsed once, at import time.
# When argparse cannot be imported, `flags` is None and get_credentials()
# falls back to the legacy tools.run() call.
try:
    import argparse
except ImportError:
    flags = None
else:
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()

# If modifying these scopes, delete your previously saved credentials.
# get_credentials() builds the storage path from CLIENT_SECRET_FILE, so they
# live in ./credentials.json (not in ~/.credentials/).
SCOPES = 'https://www.googleapis.com/auth/drive'
# NOTE(review): get_credentials() uses this one name both as the OAuth
# client-secrets file and as the credential storage file - confirm that sharing
# a single file is intended.
CLIENT_SECRET_FILE = 'credentials.json'
# Assigned to the OAuth flow's user_agent in get_credentials().
APPLICATION_NAME = 'Drive API Python Quickstart'


def get_credentials():
    """Return usable OAuth2 credentials, running the OAuth2 flow if required.

    Credentials are looked up in the storage file ``./<CLIENT_SECRET_FILE>``.
    If none are stored, or the stored ones are flagged invalid, the flow is
    built from ``CLIENT_SECRET_FILE`` with ``SCOPES`` and executed; the flow
    is handed the same store so it can persist the result.

    NOTE(review): the storage path and the client-secrets path resolve to the
    same file in the working directory - confirm this is intended.

    Returns:
        Credentials, the obtained credential.
    """
    storage_path = os.path.join("./", CLIENT_SECRET_FILE)
    storage = Storage(storage_path)
    stored = storage.get()

    # Fast path: whatever is already on disk is still good.
    if stored and not stored.invalid:
        return stored

    oauth_flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    oauth_flow.user_agent = APPLICATION_NAME
    if flags:
        fresh = tools.run_flow(oauth_flow, storage, flags)
    else:
        # Needed only for compatibility with Python 2.6
        fresh = tools.run(oauth_flow, storage)
    print('Storing credentials to ' + storage_path)
    return fresh


def main(imgfile):
    """Convert one file to text through Google Drive and save the result.

    The file is uploaded to Drive with the Google Docs MIME type as the target
    type (presumably this conversion is what performs the OCR - confirm
    against the Drive API documentation), the created document is exported as
    ``text/plain`` into ``<imgfile without extension>.txt``, and the temporary
    Drive document is deleted again.

    Args:
        imgfile: Path of the file to upload; also used as the Drive file name.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    txtfile = "{}.txt".format(os.path.splitext(imgfile)[0])

    mime = 'application/vnd.google-apps.document'
    res = service.files().create(
        body={
            'name': imgfile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
    ).execute()

    try:
        # The context manager guarantees the output handle is closed even if
        # a chunk download fails part-way through.
        with io.FileIO(txtfile, 'wb') as out:
            downloader = MediaIoBaseDownload(
                out,
                service.files().export_media(fileId=res['id'],
                                             mimeType="text/plain")
            )
            done = False
            while not done:
                _, done = downloader.next_chunk()
    finally:
        # Always remove the temporary Drive document, including when the
        # export failed, so aborted runs do not pile up files in Drive.
        service.files().delete(fileId=res['id']).execute()
    print("Done.")

def split_pdf(file_name, chunk_size=80):
    """Split a PDF into consecutive chunks of at most ``chunk_size`` pages.

    Chunk ``n`` (counting from 0) is written next to the input as
    ``<base>_0<n>.pdf``. Page ranges are 0-based and half-open, as
    ``PyPDF2.pagerange.PageRange`` expects, so every page of the input lands
    in exactly one chunk.

    Args:
        file_name: Path of the PDF to split.
        chunk_size: Maximum number of pages per output file (default 80).

    Returns:
        List of the written chunk file names, in page order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    base_name = os.path.splitext(file_name)[0]
    num_pages = PyPDF2.PdfFileReader(file_name).numPages

    output_files = []
    for num, first in enumerate(range(0, num_pages, chunk_size)):
        stop = min(first + chunk_size, num_pages)
        # Progress output: 1-based number of the chunk's first page.
        print(first + 1)
        merger = PyPDF2.PdfFileMerger()
        try:
            merger.append(
                file_name,
                pages=PyPDF2.pagerange.PageRange('{}:{}'.format(first, stop)))
            output_file = '{}_0{}.pdf'.format(base_name, num)
            merger.write(output_file)
        finally:
            merger.close()
        output_files.append(output_file)
    return output_files

def combine_file(files):
    """Concatenate several text files into one UTF-8 file.

    The inputs are read as UTF-8 (a leading byte-order mark, if present, is
    dropped) and their lines are written, in the given order, to
    ``<stem>.txt`` in the directory of the last input, where ``<stem>`` is
    the last input's file name without extension and without its final
    ``_...`` suffix (e.g. ``book_01.txt`` -> ``book.txt``).

    Args:
        files: Paths of the text files to join, in output order.

    Returns:
        Path of the combined file.

    Raises:
        ValueError: If ``files`` is empty.
    """
    files = list(files)
    if not files:
        raise ValueError("combine_file() needs at least one input file")

    lines = []
    for path in files:
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(path, encoding="utf-8-sig") as f:
            lines.extend(f.readlines())

    # Strip the chunk suffix from the file name only, so underscores in
    # directory names or in the base name itself are left alone.
    directory, stem = os.path.split(os.path.splitext(files[-1])[0])
    file_name = "{}.txt".format(
        os.path.join(directory, stem.rsplit("_", 1)[0]))
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return file_name

def refine_text(input_file):
    """Re-join hard-wrapped lines into one sentence-terminated line each.

    ASCII spaces are removed from every input line. Consecutive lines are
    glued together until a line ends with ``。`` or ``」``; the glued text is
    then written as a single output line. Blank lines contribute nothing, and
    any text left over after the last terminator is written as a final line
    instead of being dropped. The result goes to ``<input>_refined.txt``.

    Args:
        input_file: Path of the UTF-8 text file to refine.

    Returns:
        Path of the refined output file.
    """
    terminators = ("。", "」")
    paragraphs = []
    pending = []
    with open(input_file, encoding="utf-8-sig") as f:
        for raw in f:
            # rstrip only removes a newline that is really there, so a final
            # line without one keeps its last character.
            text = raw.rstrip("\n").replace(" ", "")
            pending.append(text)
            if text.endswith(terminators):
                paragraphs.append("".join(pending) + "\n")
                pending = []

    # Flush text that never reached a terminator.
    tail = "".join(pending)
    if tail:
        paragraphs.append(tail + "\n")

    output_file = "{}_refined.txt".format(os.path.splitext(input_file)[0])
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(paragraphs)
    return output_file

if __name__ == '__main__':
    # Pipeline: split the PDF into chunks, convert each chunk to text with
    # main(), merge the chunk texts, then re-join the wrapped lines.
    pdf_parts = split_pdf("なぜ私だけが苦しむのかトリム.pdf")
    for pdf_part in pdf_parts:
        main(pdf_part)
    text_parts = [
        "{}.txt".format(os.path.splitext(pdf_part)[0])
        for pdf_part in pdf_parts
    ]
    refine_text(combine_file(text_parts))
	from __future__ import print_function
	import httplib2
	import os
	import io

	from apiclient import discovery
	from oauth2client import client
	from oauth2client import tools
	from oauth2client.file import Storage
	from apiclient.http import MediaFileUpload, MediaIoBaseDownload

	import PyPDF2

	# Command-line flags for the OAuth flow are parsed once, at import time.
	# When argparse cannot be imported, `flags` is None and get_credentials()
	# falls back to the legacy tools.run() call.
	try:
	    import argparse
	except ImportError:
	    flags = None
	else:
	    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()

	# If modifying these scopes, delete your previously saved credentials.
	# get_credentials() builds the storage path from CLIENT_SECRET_FILE, so they
	# live in ./credentials.json (not in ~/.credentials/).
	SCOPES = 'https://www.googleapis.com/auth/drive'
	# NOTE(review): get_credentials() uses this one name both as the OAuth
	# client-secrets file and as the credential storage file - confirm that
	# sharing a single file is intended.
	CLIENT_SECRET_FILE = 'credentials.json'
	# Assigned to the OAuth flow's user_agent in get_credentials().
	APPLICATION_NAME = 'Drive API Python Quickstart'


	def get_credentials():
	    """Gets valid user credentials from storage.

	    If nothing has been stored, or if the stored credentials are invalid,
	    the OAuth2 flow is completed to obtain the new credentials. The flow
	    is handed the same store so it can persist the result.

	    NOTE(review): the storage path and the client-secrets path resolve to
	    the same file in the working directory - confirm this is intended.

	    Returns:
	        Credentials, the obtained credential.
	    """
	    credential_path = os.path.join("./", CLIENT_SECRET_FILE)
	    store = Storage(credential_path)
	    credentials = store.get()
	    if not credentials or credentials.invalid:
	        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
	        flow.user_agent = APPLICATION_NAME
	        if flags:
	            credentials = tools.run_flow(flow, store, flags)
	        else:  # Needed only for compatibility with Python 2.6
	            credentials = tools.run(flow, store)
	        print('Storing credentials to ' + credential_path)
	    return credentials


	def main(imgfile):
	    """Convert one file to text through Google Drive and save the result.

	    The file is uploaded to Drive with the Google Docs MIME type as the
	    target type (presumably this conversion is what performs the OCR -
	    confirm against the Drive API documentation), the created document is
	    exported as ``text/plain`` into ``<imgfile without extension>.txt``,
	    and the temporary Drive document is deleted again.

	    Args:
	        imgfile: Path of the file to upload; also used as the Drive file
	            name.
	    """
	    credentials = get_credentials()
	    http = credentials.authorize(httplib2.Http())
	    service = discovery.build('drive', 'v3', http=http)

	    txtfile = "{}.txt".format(os.path.splitext(imgfile)[0])

	    mime = 'application/vnd.google-apps.document'
	    res = service.files().create(
	        body={
	            'name': imgfile,
	            'mimeType': mime
	        },
	        media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
	    ).execute()

	    try:
	        # The context manager guarantees the output handle is closed even
	        # if a chunk download fails part-way through.
	        with io.FileIO(txtfile, 'wb') as out:
	            downloader = MediaIoBaseDownload(
	                out,
	                service.files().export_media(fileId=res['id'],
	                                             mimeType="text/plain")
	            )
	            done = False
	            while not done:
	                _, done = downloader.next_chunk()
	    finally:
	        # Always remove the temporary Drive document, including when the
	        # export failed, so aborted runs do not pile up files in Drive.
	        service.files().delete(fileId=res['id']).execute()
	    print("Done.")

	def split_pdf(file_name, chunk_size=80):
	    """Split a PDF into consecutive chunks of at most ``chunk_size`` pages.

	    Chunk ``n`` (counting from 0) is written next to the input as
	    ``<base>_0<n>.pdf``. Page ranges are 0-based and half-open, as
	    ``PyPDF2.pagerange.PageRange`` expects, so every page of the input
	    lands in exactly one chunk.

	    Args:
	        file_name: Path of the PDF to split.
	        chunk_size: Maximum number of pages per output file (default 80).

	    Returns:
	        List of the written chunk file names, in page order.

	    Raises:
	        ValueError: If ``chunk_size`` is smaller than 1.
	    """
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be at least 1")

	    base_name = os.path.splitext(file_name)[0]
	    num_pages = PyPDF2.PdfFileReader(file_name).numPages

	    output_files = []
	    for num, first in enumerate(range(0, num_pages, chunk_size)):
	        stop = min(first + chunk_size, num_pages)
	        # Progress output: 1-based number of the chunk's first page.
	        print(first + 1)
	        merger = PyPDF2.PdfFileMerger()
	        try:
	            merger.append(
	                file_name,
	                pages=PyPDF2.pagerange.PageRange(
	                    '{}:{}'.format(first, stop)))
	            output_file = '{}_0{}.pdf'.format(base_name, num)
	            merger.write(output_file)
	        finally:
	            merger.close()
	        output_files.append(output_file)
	    return output_files

	def combine_file(files):
	    """Concatenate several text files into one UTF-8 file.

	    The inputs are read as UTF-8 (a leading byte-order mark, if present,
	    is dropped) and their lines are written, in the given order, to
	    ``<stem>.txt`` in the directory of the last input, where ``<stem>`` is
	    the last input's file name without extension and without its final
	    ``_...`` suffix (e.g. ``book_01.txt`` -> ``book.txt``).

	    Args:
	        files: Paths of the text files to join, in output order.

	    Returns:
	        Path of the combined file.

	    Raises:
	        ValueError: If ``files`` is empty.
	    """
	    files = list(files)
	    if not files:
	        raise ValueError("combine_file() needs at least one input file")

	    lines = []
	    for path in files:
	        # Read explicitly as UTF-8 so the result does not depend on the
	        # platform's default locale encoding.
	        with open(path, encoding="utf-8-sig") as f:
	            lines.extend(f.readlines())

	    # Strip the chunk suffix from the file name only, so underscores in
	    # directory names or in the base name itself are left alone.
	    directory, stem = os.path.split(os.path.splitext(files[-1])[0])
	    file_name = "{}.txt".format(
	        os.path.join(directory, stem.rsplit("_", 1)[0]))
	    with open(file_name, "w", encoding="utf-8") as f:
	        f.writelines(lines)
	    return file_name

	def refine_text(input_file):
	    """Re-join hard-wrapped lines into one sentence-terminated line each.

	    ASCII spaces are removed from every input line. Consecutive lines are
	    glued together until a line ends with ``。`` or ``」``; the glued text
	    is then written as a single output line. Blank lines contribute
	    nothing, and any text left over after the last terminator is written
	    as a final line instead of being dropped. The result goes to
	    ``<input>_refined.txt``.

	    Args:
	        input_file: Path of the UTF-8 text file to refine.

	    Returns:
	        Path of the refined output file.
	    """
	    terminators = ("。", "」")
	    paragraphs = []
	    pending = []
	    with open(input_file, encoding="utf-8-sig") as f:
	        for raw in f:
	            # rstrip only removes a newline that is really there, so a
	            # final line without one keeps its last character.
	            text = raw.rstrip("\n").replace(" ", "")
	            pending.append(text)
	            if text.endswith(terminators):
	                paragraphs.append("".join(pending) + "\n")
	                pending = []

	    # Flush text that never reached a terminator.
	    tail = "".join(pending)
	    if tail:
	        paragraphs.append(tail + "\n")

	    output_file = "{}_refined.txt".format(os.path.splitext(input_file)[0])
	    with open(output_file, "w", encoding="utf-8") as f:
	        f.writelines(paragraphs)
	    return output_file

	if __name__ == '__main__':
	    # Pipeline: split the PDF into chunks, convert each chunk to text with
	    # main(), merge the chunk texts, then re-join the wrapped lines.
	    file_names = split_pdf("なぜ私だけが苦しむのかトリム.pdf")
	    for _ in file_names:
	        main(_)
	    file_names = ["{}.txt".format(os.path.splitext(_)[0]) for _ in file_names]
	    output_name = combine_file(file_names)
	    refine_text(output_name)