Skip to content

Instantly share code, notes, and snippets.

@salamann salamann/ocr_pdf.py
Created Oct 23, 2019

Embed
What would you like to do?
Convert a PDF file to refined text so that Google Play Books can read it aloud
from __future__ import print_function
import httplib2
import os
import io
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
import PyPDF2
# Parse the oauth2client command-line flags when argparse can be imported;
# otherwise leave ``flags`` as None so get_credentials() takes its fallback
# branch.
try:
    import argparse
except ImportError:
    flags = None
else:
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()

# OAuth scope requested for Drive.  If you modify it, delete the previously
# saved credentials so that the authorization flow runs again.
SCOPES = 'https://www.googleapis.com/auth/drive'
# Read by client.flow_from_clientsecrets() and also used as the path of the
# credential store in get_credentials().
CLIENT_SECRET_FILE = 'credentials.json'
# Sent as the user agent of the OAuth2 flow.
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
    """Return usable OAuth2 credentials, running the auth flow if needed.

    The credential store is the file ``CLIENT_SECRET_FILE`` joined onto
    ``"./"``.  When the store yields nothing, or yields credentials that are
    flagged invalid, an OAuth2 flow is built from ``CLIENT_SECRET_FILE`` and
    ``SCOPES`` and run to obtain new credentials.

    Returns:
        The credentials read from the store or produced by the flow.
    """
    # NOTE(review): the credential store and the client-secrets file are the
    # same path here -- confirm that sharing one file is intended.
    token_path = os.path.join("./", CLIENT_SECRET_FILE)
    token_store = Storage(token_path)
    creds = token_store.get()

    # Fast path: stored credentials exist and are still valid.
    if creds and not creds.invalid:
        return creds

    auth_flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    auth_flow.user_agent = APPLICATION_NAME
    if flags:
        creds = tools.run_flow(auth_flow, token_store, flags)
    else:  # Needed only for compatibility with Python 2.6
        creds = tools.run(auth_flow, token_store)
    print('Storing credentials to ' + token_path)
    return creds
def main(imgfile):
    """Convert one file to text through Google Drive and save it locally.

    The file is uploaded to Drive as a Google Docs document, that document is
    exported as ``text/plain`` into ``<imgfile without extension>.txt``, and
    the uploaded document is deleted from Drive afterwards.

    Args:
        imgfile: Path of the local file to upload; also used as the name of
            the temporary Drive document.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    txtfile = "{}.txt".format(os.path.splitext(imgfile)[0])
    mime = 'application/vnd.google-apps.document'
    res = service.files().create(
        body={
            'name': imgfile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
    ).execute()

    try:
        # The context manager closes the local file even when a chunk
        # download raises, so the handle is never leaked.
        with io.FileIO(txtfile, 'wb') as fh:
            downloader = MediaIoBaseDownload(
                fh,
                service.files().export_media(fileId=res['id'], mimeType="text/plain")
            )
            done = False
            while not done:
                _, done = downloader.next_chunk()
    finally:
        # Always remove the temporary Drive document, including on failure,
        # so aborted runs do not leave copies behind in the user's Drive.
        service.files().delete(fileId=res['id']).execute()
    print("Done.")
def split_pdf(file_name, pages_per_chunk=80):
    """Split a PDF into consecutive chunks of at most *pages_per_chunk* pages.

    Chunk ``n`` is written next to the input as ``<base>_0<n>.pdf``.

    Args:
        file_name: Path of the PDF to split.
        pages_per_chunk: Maximum number of pages per output file.

    Returns:
        List of the written chunk paths, in page order.

    Raises:
        ValueError: If *pages_per_chunk* is smaller than 1.
    """
    if pages_per_chunk < 1:
        raise ValueError("pages_per_chunk must be a positive integer")

    base_name = os.path.splitext(file_name)[0]
    num_pages = PyPDF2.PdfFileReader(file_name).numPages
    outputs = []
    # PageRange takes zero-based, end-exclusive slice indices, so chunks must
    # start at page 0 and each must end exactly where the next one begins;
    # otherwise the first page of every chunk is silently dropped.
    for num, first_page in enumerate(range(0, num_pages, pages_per_chunk)):
        print(first_page)
        end_page = min(first_page + pages_per_chunk, num_pages)
        output_file = '{}_0{}.pdf'.format(base_name, num)
        merger = PyPDF2.PdfFileMerger()
        try:
            merger.append(
                file_name,
                pages=PyPDF2.pagerange.PageRange('{}:{}'.format(first_page, end_page)))
            merger.write(output_file)
        finally:
            # Release the merger's file handles even if append/write fails.
            merger.close()
        outputs.append(output_file)
    return outputs
def combine_file(files):
    """Concatenate text files, in order, into a single UTF-8 file.

    The output name is derived from the last input: its extension and the
    final ``_suffix`` of its base name are removed and ``.txt`` is appended
    (``dir/my_book_01.txt`` -> ``dir/my_book.txt``).

    Args:
        files: Paths of the text files to concatenate.

    Returns:
        Path of the combined file.

    Raises:
        ValueError: If *files* is empty.
    """
    files = list(files)
    if not files:
        raise ValueError("combine_file() needs at least one input file")

    lines = []
    for path in files:
        # Read explicitly as UTF-8 (tolerating a leading BOM) so the result
        # does not depend on the platform's default encoding.
        with open(path, encoding="utf-8-sig") as f:
            lines.extend(f.readlines())

    # Strip only the trailing "_<suffix>" of the base name; splitting the
    # whole path on the first underscore would truncate names or directories
    # that contain underscores themselves.
    directory, stem = os.path.split(os.path.splitext(files[-1])[0])
    file_name = os.path.join(directory, "{}.txt".format(stem.rsplit("_", 1)[0]))
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return file_name
def refine_text(input_file, terminators=("。", "」")):
    """Join wrapped lines of *input_file* into one line per paragraph.

    ASCII spaces are removed from every line and blank lines are skipped.
    Consecutive lines are concatenated until a line ends with one of
    *terminators*; the joined text is then emitted as a single line.  Text
    remaining after the last terminator is emitted as a final line.  The
    result is written as UTF-8 to ``<input without extension>_refined.txt``.

    Args:
        input_file: Path of the text file to refine.
        terminators: Characters (or strings) that mark the end of a
            paragraph when they end a line.
            NOTE(review): the defaults are a best guess at the intended
            paragraph-ending characters -- confirm against real OCR output.

    Returns:
        Path of the refined file.
    """
    endings = tuple(terminators)
    output_file = "{}_refined.txt".format(os.path.splitext(input_file)[0])

    paragraphs = []
    pending = []
    # Read explicitly as UTF-8 (tolerating a leading BOM) to match the
    # encoding used for the output.
    with open(input_file, encoding="utf-8-sig") as f:
        for raw in f:
            # Strip only the newline so a final line without one keeps its
            # last character.
            line = raw.rstrip("\n").replace(" ", "")
            if not line:
                # Blank lines carry no text and have no last character to test.
                continue
            pending.append(line)
            if line.endswith(endings):
                paragraphs.append(''.join(pending) + '\n')
                pending = []
    if pending:
        # Keep text that follows the last terminator instead of dropping it.
        paragraphs.append(''.join(pending) + '\n')

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(paragraphs)
    return output_file
if __name__ == '__main__':
    # Pipeline: split the PDF into chunks, convert each chunk to text with
    # main(), merge the per-chunk text files, then refine the merged text.
    pdf_chunks = split_pdf("なぜ私だけが苦しむのかトリム.pdf")
    for pdf_chunk in pdf_chunks:
        main(pdf_chunk)
    text_chunks = [
        "{}.txt".format(os.path.splitext(pdf_chunk)[0])
        for pdf_chunk in pdf_chunks
    ]
    refine_text(combine_file(text_chunks))
@salamann

This comment has been minimized.

Copy link
Owner Author

salamann commented Oct 23, 2019

@salamann

This comment has been minimized.

Copy link
Owner Author

salamann commented Oct 23, 2019

Listening to an audiobook during a long drive is appealing. Cut the book apart, scan it, then convert it to .epub.

I wrote this code so that I can convert a PDF file to a text file using Google Drive OCR. It lets me listen, as audiobooks, to books that I have cut apart and scanned.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.