Created
October 23, 2019 00:38
-
-
Save salamann/8ca7a47995daa600a924598bb3a168a3 to your computer and use it in GitHub Desktop.
Convert a PDF file to refined text so that Google Play Books can read it aloud
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import httplib2 | |
import os | |
import io | |
from apiclient import discovery | |
from oauth2client import client | |
from oauth2client import tools | |
from oauth2client.file import Storage | |
from apiclient.http import MediaFileUpload, MediaIoBaseDownload | |
import PyPDF2 | |
# Parse the command-line flags understood by oauth2client's OAuth flow
# (tools.argparser). NOTE: this runs at import time and reads sys.argv.
try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    # argparse is unavailable; get_credentials() then falls back to
    # tools.run() instead of tools.run_flow().
    flags = None
# If modifying these scopes, delete your previously saved credentials.
# get_credentials() stores them at ./credentials.json, i.e. at the path
# named by CLIENT_SECRET_FILE below.
SCOPES = 'https://www.googleapis.com/auth/drive'
# Used both as the OAuth client-secret file and as the credential store path.
CLIENT_SECRET_FILE = 'credentials.json'
# Sent as the user agent of the OAuth flow.
APPLICATION_NAME = 'Drive API Python Quickstart'
def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    # NOTE(review): the credential store and the client-secret file resolve
    # to the same path (./credentials.json), so a completed flow appears to
    # overwrite the client secret with the user token. Confirm this is
    # intended before changing either path.
    credential_path = os.path.join("./", CLIENT_SECRET_FILE)

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials
def main(imgfile):
    """Run one file through Google Drive and save the result as plain text.

    The file is uploaded to Drive as a Google Docs document, exported as
    ``text/plain`` into ``<imgfile without extension>.txt`` and the temporary
    Drive document is deleted afterwards.

    Args:
        imgfile: Path of the local file to upload (the script passes PDF
            chunks produced by ``split_pdf``).
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    txtfile = "{}.txt".format(os.path.splitext(imgfile)[0])
    # Requesting the Google Docs MIME type makes Drive convert the upload.
    mime = 'application/vnd.google-apps.document'
    res = service.files().create(
        body={
            'name': imgfile,
            'mimeType': mime
        },
        media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
    ).execute()

    try:
        # The context manager closes the output file even if a chunk fails.
        with io.FileIO(txtfile, 'wb') as out:
            downloader = MediaIoBaseDownload(
                out,
                service.files().export_media(fileId=res['id'],
                                             mimeType="text/plain")
            )
            done = False
            while not done:
                _, done = downloader.next_chunk()
    finally:
        # Always remove the temporary Drive document, even when the export
        # failed, so that failed runs do not pile up files in Drive.
        service.files().delete(fileId=res['id']).execute()
    print("Done.")
def split_pdf(file_name, pages_per_chunk=80):
    """Split a PDF into consecutive chunks of at most ``pages_per_chunk`` pages.

    Chunks are written next to the input as ``<base>_0<index>.pdf``.

    Args:
        file_name: Path of the PDF to split.
        pages_per_chunk: Maximum number of pages per output file.

    Returns:
        List of the written chunk paths, in page order.

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    if pages_per_chunk < 1:
        raise ValueError("pages_per_chunk must be at least 1")

    base_name = os.path.splitext(file_name)[0]
    reader = PyPDF2.PdfFileReader(file_name)
    num_pages = reader.numPages

    output_files = []
    # PageRange uses slice semantics: 0-based start, exclusive end. Starting
    # at 0 and stepping by the chunk size therefore covers every page exactly
    # once (starting at 1 would drop the first page of every chunk).
    for num, first_page in enumerate(range(0, num_pages, pages_per_chunk)):
        print(first_page + 1)  # progress: 1-based number of the chunk's first page
        end_page = min(first_page + pages_per_chunk, num_pages)
        output_file = '{}_0{}.pdf'.format(base_name, num)
        merger = PyPDF2.PdfFileMerger()
        try:
            merger.append(
                file_name,
                pages=PyPDF2.pagerange.PageRange(
                    '{}:{}'.format(first_page, end_page)))
            merger.write(output_file)
        finally:
            merger.close()
        output_files.append(output_file)
    return output_files
def combine_file(files):
    """Concatenate text files, in the given order, into one UTF-8 file.

    The output name is derived from the last input by dropping its final
    ``_<suffix>`` part, e.g. ``book_00.txt``, ``book_01.txt`` -> ``book.txt``
    in the same directory.

    Args:
        files: Paths of the text files to merge.

    Returns:
        Path of the combined file.

    Raises:
        ValueError: If ``files`` is empty.
    """
    files = list(files)
    if not files:
        raise ValueError("combine_file() needs at least one input file")

    lines = []
    for path in files:
        # Read explicitly as UTF-8 instead of the locale default; "utf-8-sig"
        # also drops a leading byte-order mark if the file has one.
        with open(path, encoding="utf-8-sig") as f:
            lines.extend(f.readlines())

    # Strip the chunk suffix from the file name only, so underscores in the
    # directory or in the book title are left alone.
    directory, last_name = os.path.split(files[-1])
    stem = os.path.splitext(last_name)[0]
    file_name = os.path.join(
        directory, "{}.txt".format(stem.rsplit("_", 1)[0]))

    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(lines)
    return file_name
def refine_text(input_file):
    """Rejoin hard-wrapped lines into one line per sentence group.

    Half-width spaces are removed and consecutive lines are glued together
    until a line ends with ``。`` or ``」``; each joined group becomes one
    output line. The result is written to ``<input>_refined.txt`` as UTF-8.

    Args:
        input_file: Path of the UTF-8 text file to refine.

    Returns:
        Path of the refined file.
    """
    with open(input_file, encoding="utf-8-sig") as f:
        lines = f.readlines()

    paragraphs = []
    pending = []
    for line in lines:
        # Strip only the newline (the last line may not have one), then drop
        # spaces and any stray byte-order marks left from merged chunks.
        text = line.rstrip("\n").replace(" ", "").replace("\ufeff", "")
        if not text:
            # Blank lines contribute nothing to the joined text.
            continue
        pending.append(text)
        if text.endswith(("。", "」")):
            paragraphs.append(''.join(pending) + '\n')
            pending = []
    if pending:
        # Keep trailing text that does not end with a sentence terminator.
        paragraphs.append(''.join(pending) + '\n')

    output_file = "{}_refined.txt".format(os.path.splitext(input_file)[0])
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(paragraphs)
    return output_file
if __name__ == '__main__':
    # Pipeline: split the PDF into chunks, convert each chunk to text via
    # Google Drive, merge the text files, then rejoin the wrapped lines.
    file_names = split_pdf("なぜ私だけが苦しむのかトリム.pdf")
    for _ in file_names:
        main(_)
    # main() writes each chunk's text next to the chunk, with a .txt extension.
    file_names = ["{}.txt".format(os.path.splitext(_)[0]) for _ in file_names]
    output_name = combine_file(file_names)
    refine_text(output_name)
Listening to an audiobook during a long drive is appealing. Cut the book apart, scan it, then convert it to .epub.
I wrote this code to convert a PDF file to a text file using Google Drive OCR. It lets me listen, as an audiobook, to books that I have cut apart and scanned.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Use this code https://gist.github.com/tanaikech/8c808bf8c060455fe5401ecacad07b94