Skip to content

Instantly share code, notes, and snippets.

@salamann
Created October 23, 2019 00:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save salamann/8ca7a47995daa600a924598bb3a168a3 to your computer and use it in GitHub Desktop.
Save salamann/8ca7a47995daa600a924598bb3a168a3 to your computer and use it in GitHub Desktop.
Convert a PDF file to refined text so that Google Play Books can read it aloud
from __future__ import print_function
import httplib2
import os
import io
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
import PyPDF2
# Parse the command-line options understood by oauth2client's tools at import
# time; get_credentials() hands the result to tools.run_flow().
try:
    import argparse
    _parser = argparse.ArgumentParser(parents=[tools.argparser])
    flags = _parser.parse_args()
except ImportError:
    # No argparse available: get_credentials() then takes its tools.run() path.
    flags = None
# If modifying these scopes, delete your previously saved credentials.
# NOTE: get_credentials() in this file stores them at ./<CLIENT_SECRET_FILE>
# (i.e. ./credentials.json), not at the quickstart's
# ~/.credentials/drive-python-quickstart.json.
# OAuth scope requested when the authorization flow is run.
SCOPES = 'https://www.googleapis.com/auth/drive'
# File name passed to client.flow_from_clientsecrets(); get_credentials() also
# reuses this name for its credential Storage file.
CLIENT_SECRET_FILE = 'credentials.json'
# Assigned to the OAuth flow's user_agent.
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
    """Return usable OAuth2 credentials, running the OAuth2 flow if required.

    Credentials are looked up in a Storage file in the current directory.
    When none are stored, or the stored ones are marked invalid, the flow is
    run and the fresh credentials are saved to that same Storage file.

    Returns:
        Credentials, either loaded from storage or newly obtained.
    """
    # NOTE(review): the Storage path and the client-secrets file both resolve
    # to CLIENT_SECRET_FILE in the working directory -- confirm this sharing
    # of one file is intended.
    credential_path = os.path.join("./", CLIENT_SECRET_FILE)
    store = Storage(credential_path)
    credentials = store.get()

    # Fast path: stored credentials are present and still valid.
    if credentials and not credentials.invalid:
        return credentials

    flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    flow.user_agent = APPLICATION_NAME
    if flags:
        credentials = tools.run_flow(flow, store, flags)
    else:
        # Needed only for compatibility with Python 2.6
        credentials = tools.run(flow, store)
    print('Storing credentials to ' + credential_path)
    return credentials
def main(imgfile):
    """Convert one file to text through Google Drive and save it locally.

    The file is uploaded to Drive with the Google Docs MIME type, the
    resulting document is exported as ``text/plain`` into
    ``<imgfile without extension>.txt``, and the uploaded Drive document is
    deleted afterwards.

    Args:
        imgfile: Path of the local file (e.g. a PDF chunk) to convert.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    txtfile = "{}.txt".format(os.path.splitext(imgfile)[0])
    mime = 'application/vnd.google-apps.document'
    res = service.files().create(
        body={
            'name': imgfile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
    ).execute()

    try:
        # The context manager guarantees the local file handle is closed
        # (the original left the FileIO object open).
        with io.FileIO(txtfile, 'wb') as fh:
            downloader = MediaIoBaseDownload(
                fh,
                service.files().export_media(fileId=res['id'],
                                             mimeType="text/plain")
            )
            done = False
            while not done:
                _status, done = downloader.next_chunk()
    finally:
        # Always remove the temporary Drive document, even when the export
        # fails, so failed runs do not leave files behind in Drive.
        service.files().delete(fileId=res['id']).execute()
    print("Done.")
def split_pdf(file_name, pages_per_chunk=80):
    """Split a PDF into consecutive chunk files and return their paths.

    Chunks are written next to the input as ``<base>_0<index>.pdf``, where
    ``<base>`` is *file_name* without its extension and ``<index>`` counts
    from 0.

    Args:
        file_name: Path of the PDF to split.
        pages_per_chunk: Maximum number of pages per chunk (default 80).

    Returns:
        List of the chunk file paths, in page order.

    Raises:
        ValueError: If *pages_per_chunk* is smaller than 1.
    """
    if pages_per_chunk < 1:
        raise ValueError("pages_per_chunk must be a positive integer")

    base_name = os.path.splitext(file_name)[0]
    reader1 = PyPDF2.PdfFileReader(file_name)
    num_pages = reader1.numPages
    _output = []
    # PageRange uses Python slice semantics (0-based start, exclusive end).
    # Iterating from 0 with an exclusive end keeps every page; the previous
    # 1-based start with an "inclusive" end dropped the first page and one
    # page at every chunk boundary.
    for num, init_page in enumerate(range(0, num_pages, pages_per_chunk)):
        last_page = min(init_page + pages_per_chunk, num_pages)
        # Progress output: 1-based number of the first page in this chunk.
        print(init_page + 1)
        merger = PyPDF2.PdfFileMerger()
        try:
            merger.append(
                file_name,
                pages=PyPDF2.pagerange.PageRange(
                    '{}:{}'.format(init_page, last_page)))
            output_file = '{}_0{}.pdf'.format(base_name, num)
            merger.write(output_file)
        finally:
            merger.close()
        _output.append(output_file)
    return _output
def combine_file(files):
    """Concatenate text files, in order, into a single UTF-8 text file.

    The output name is derived from the last input file: the trailing
    ``_<suffix>`` of its base name is removed, so ``book_00.txt`` and
    ``book_01.txt`` are combined into ``book.txt`` in the same directory.

    Args:
        files: Non-empty sequence of text file paths to concatenate.

    Returns:
        Path of the combined text file.

    Raises:
        ValueError: If *files* is empty.
    """
    files = list(files)
    if not files:
        raise ValueError("combine_file() needs at least one input file")

    lines = []
    for path in files:
        # Read as UTF-8 explicitly (the output is written as UTF-8 too) rather
        # than in the platform default encoding; "utf-8-sig" also drops a
        # leading byte-order mark so none ends up in the middle of the text.
        with open(path, encoding="utf-8-sig") as f:
            lines += f.readlines()

    # Only strip the suffix from the file's own base name, so underscores in
    # directory names (or earlier in the base name) do not truncate the path.
    directory, stem = os.path.split(os.path.splitext(files[-1])[0])
    file_name = os.path.join(directory,
                             "{}.txt".format(stem.rsplit("_", 1)[0]))
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return file_name
def refine_text(input_file):
    """Rejoin hard-wrapped lines into one line per sentence-ending fragment.

    ASCII spaces are removed from every line, and consecutive lines are
    concatenated until one ends with "。" or "」"; each joined run is written
    as one line to ``<input_file without extension>_refined.txt`` (UTF-8).

    Blank lines are skipped, and text after the last "。"/"」" is kept as a
    final line instead of being discarded.

    Args:
        input_file: Path of the UTF-8 text file to refine.

    Returns:
        Path of the refined text file.
    """
    # Explicit encoding: the input is produced as UTF-8 by combine_file();
    # "utf-8-sig" additionally tolerates a leading byte-order mark.
    with open(input_file, encoding="utf-8-sig") as f:
        data = f.readlines()

    whole = []
    pending = []
    for raw in data:
        # Strip the line ending explicitly so a final line without a newline
        # does not lose its last character.
        line = raw.replace(" ", "").rstrip("\r\n")
        if not line:
            # Blank lines previously raised IndexError on the [-2] lookup.
            continue
        pending.append(line)
        if line.endswith(("。", "」")):
            whole.append(''.join(pending) + '\n')
            pending = []
    if pending:
        # Flush text that never reached a closing "。"/"」".
        whole.append(''.join(pending) + '\n')

    output_file = "{}_refined.txt".format(os.path.splitext(input_file)[0])
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(whole)
    return output_file
if __name__ == '__main__':
    # Pipeline: split the PDF, convert each chunk to text, merge the text
    # files, then rejoin the wrapped lines.
    pdf_chunks = split_pdf("なぜ私だけが苦しむのかトリム.pdf")
    text_chunks = []
    for pdf_chunk in pdf_chunks:
        main(pdf_chunk)
        # main() writes its result next to the chunk with a .txt extension.
        text_chunks.append("{}.txt".format(os.path.splitext(pdf_chunk)[0]))
    output_name = combine_file(text_chunks)
    refine_text(output_name)
@salamann
Copy link
Author

Listening to an audiobook during a long drive is appealing. Cut the book apart, scan it, then convert it to .epub.

I wrote this code to convert a PDF file to a text file using Google Drive OCR. It lets me listen, as audiobooks, to books that I have cut apart and scanned.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment