Skip to content

Instantly share code, notes, and snippets.

@tanaikech
Created June 15, 2017 05:09
Show Gist options
  • Save tanaikech/825f4b848a8cbff7018f71d33399e99b to your computer and use it in GitHub Desktop.
Save tanaikech/825f4b848a8cbff7018f71d33399e99b to your computer and use it in GitHub Desktop.
Converting PDF to TXT

Converting PDF to TXT

This is a sample script for converting a PDF file to a TXT file. The conversion takes two steps:

  1. Upload a PDF file as a Google Document
  2. Download a Google Document as a TXT file

This sample is based on the Drive API Python Quickstart. For details, see https://developers.google.com/drive/v3/web/quickstart/python. Before running the script, please complete "Step 1: Turn on the Drive API" and "Step 2: Install the Google Client Library" from that guide.

from __future__ import print_function
import httplib2
import os
import io

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload

# Parse the oauth2client command-line flags once, at import time. The result
# is read by get_credentials() to choose which OAuth flow runner to call.
try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    # argparse could not be imported; get_credentials() then takes its
    # tools.run() fallback branch instead of tools.run_flow().
    flags = None

# OAuth scope passed to client.flow_from_clientsecrets() in get_credentials().
SCOPES = 'https://www.googleapis.com/auth/drive'
# Path of the OAuth client secrets file, passed to the same call.
CLIENT_SECRET_FILE = 'client_secret.json'
# Assigned to the flow's user_agent attribute before the flow is run.
APPLICATION_NAME = 'Drive API Python Quickstart'


def get_credentials():
    """Return OAuth credentials for the Drive API.

    Credentials are looked up in ``./drive-python-quickstart.json``. When
    that store yields nothing, or yields credentials flagged as invalid, an
    OAuth flow built from ``CLIENT_SECRET_FILE`` and ``SCOPES`` is run and
    its result is returned instead.

    Returns:
        The stored credentials when usable, otherwise the credentials
        produced by the OAuth flow.
    """
    token_file = os.path.join("./", 'drive-python-quickstart.json')
    token_store = Storage(token_file)
    cached = token_store.get()

    # Fast path: previously stored credentials are still usable.
    if cached and not cached.invalid:
        return cached

    oauth_flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
    oauth_flow.user_agent = APPLICATION_NAME
    if flags:
        fresh = tools.run_flow(oauth_flow, token_store, flags)
    else:
        # Needed only for compatibility with Python 2.6
        fresh = tools.run(oauth_flow, token_store)
    print('Storing credentials to ' + token_file)
    return fresh


def main():
    """Convert ``sample.pdf`` to ``sample.txt`` by round-tripping through Drive.

    The PDF is uploaded with a Google Docs target MIME type so that Drive
    converts it on import, and the resulting document is then exported as
    plain text into ``sample.txt``.

    NOTE: the converted document created in Drive is not removed afterwards;
    this function only uploads and exports.
    """
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    pdffile = 'sample.pdf'
    txtfile = 'sample.txt'

    # Step 1: upload the PDF. The metadata mimeType names the *target* type
    # (a Google Document), while the media mimetype describes the *source*
    # bytes being uploaded, i.e. a PDF.
    res = service.files().create(
        body={
            'name': pdffile,
            'mimeType': 'application/vnd.google-apps.document'
        },
        media_body=MediaFileUpload(
            pdffile, mimetype='application/pdf', resumable=True)
    ).execute()

    # Step 2: export the converted document as plain text. The output file is
    # opened in a ``with`` block so the handle is closed (and its contents
    # flushed) even if a chunk download raises.
    request = service.files().export_media(
        fileId=res['id'], mimeType="text/plain")
    with io.FileIO(txtfile, 'wb') as fh:
        dl = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            # next_chunk() returns (status, done); only ``done`` is needed.
            _, done = dl.next_chunk()
    print("Done.")


# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment