# Source: GitHub gist paultopia/3e38653f1a061d7d6ab5c985b35b324c
# Author: @paultopia -- created February 12, 2019 02:04
# (GitHub page chrome -- "Skip to content", star/fork/save controls -- is not part of the script.)
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import json
# OAuth scope requested for the Docs API (read-only access to documents).
# If modifying these scopes, delete the file token.pickle so that new
# credentials are requested on the next run.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
# This is all based on this quickstart: https://developers.google.com/docs/api/quickstart/js
# NOTE(review): the link above is the JavaScript quickstart, but the code below
# looks like the Python one (presumably .../docs/api/quickstart/python) -- confirm.
# To run this, you need to follow the activation and credential download steps
# on that page first.
# ID of the document that is downloaded and parsed.
DOCUMENT_ID = '1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A'
# Matches document: https://docs.google.com/document/d/1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A/edit?usp=sharing
def download_test_document(document_id=None, output_path="testdoc.json"):
    """Fetch a Google Doc through the Docs API and save it locally as JSON.

    Prints the document title, then writes the full API response to
    ``output_path`` (indented by four spaces, keys sorted).

    OAuth credentials are cached in ``token.pickle`` in the current working
    directory. When that file is missing or the cached credentials are not
    valid, they are refreshed if a refresh token is available; otherwise the
    local-server authorization flow is run from ``credentials.json``. The
    resulting credentials are then written back to ``token.pickle``.

    Args:
        document_id: ID of the document to fetch. ``None`` (the default)
            means the module-level ``DOCUMENT_ID``.
        output_path: File the JSON dump is written to. Defaults to
            ``"testdoc.json"``.
    """
    if document_id is None:
        document_id = DOCUMENT_ID

    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    # NOTE: unpickling runs arbitrary code from the file. This is acceptable
    # only because token.pickle is written by this same function; never point
    # it at a file from an untrusted source.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server()
        # Save the credentials for the next run.
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=document_id).execute()
    print('The title of the document is: {}'.format(document.get('title')))

    with open(output_path, 'w') as tj:
        json.dump(document, tj, indent=4, sort_keys=True)
# Step 1 is download_test_document() above: it saves a local copy of the
# document in JSON format to parse through.
# Step 2 is everything below: parsing that saved copy.
# Helper function for making sense of the heavily nested JSON.
def pretty_print(data):
    """Print ``data`` as JSON, indented by two spaces with keys sorted.

    Args:
        data: Any JSON-serializable value.
    """
    print(json.dumps(data, indent=2, sort_keys=True))
def check_italics(textrun):
    """Return the text of ``textrun`` if it is italic, otherwise ``False``.

    Args:
        textrun: Dict with a "content" string and, normally, a "textStyle"
            dict. A run without a "textStyle" key is treated as not italic.

    Returns:
        ``textrun["content"]`` when the style has a truthy "italic" entry,
        else ``False``. Note that an italic run whose content is the empty
        string returns ``""``, which is also falsy.
    """
    if textrun.get("textStyle", {}).get("italic"):
        return textrun["content"]
    return False
def check_smallcaps(textrun):
    """Return the text of ``textrun`` if it is in small caps, otherwise ``False``.

    Args:
        textrun: Dict with a "content" string and, normally, a "textStyle"
            dict. A run without a "textStyle" key is treated as not small caps.

    Returns:
        ``textrun["content"]`` when the style has a truthy "smallCaps" entry,
        else ``False``. Note that a small-caps run whose content is the empty
        string returns ``""``, which is also falsy.
    """
    if textrun.get("textStyle", {}).get("smallCaps"):
        return textrun["content"]
    return False
def print_article_title_journal(footnote):
    """Print "title, journal, in note N" for article citations in a footnote.

    A citation is recognised when a small-caps text run (taken to be the
    journal name) has an italic text run (taken to be the article title)
    exactly two elements before it in the same paragraph.

    Args:
        footnote: Dict with a "content" list of structural elements and a
            "footnoteId" string. The last character of "footnoteId" is what
            gets printed as the note label.
            NOTE(review): presumably that character is the note number in the
            test document -- confirm for documents with ten or more notes.

    Raises:
        KeyError: If "content" is missing, or if a match is found and
            "footnoteId" is missing.
    """
    for content in footnote["content"]:
        paragraph = content.get("paragraph")
        if paragraph is None:
            # Non-paragraph structural elements have no text runs to inspect.
            continue

        # Keep one slot per paragraph element (None where the element is not a
        # text run) so that "two elements earlier" still refers to position in
        # the paragraph.
        textruns = [
            element.get("textRun") for element in paragraph.get("elements", [])
        ]

        for idx, item in enumerate(textruns):
            # idx - 2 must not go below zero; index 2 is the first position
            # that has a run two places before it.
            if idx < 2 or item is None:
                continue
            if not check_smallcaps(item):
                continue
            candidate_title = textruns[idx - 2]
            if candidate_title is not None and check_italics(candidate_title):  # name
                print("{}, {}, in note {}".format(candidate_title["content"], item["content"], footnote["footnoteId"][-1]))
def parse_test_document(path="testdoc.json"):
    """Load a saved document JSON and print its journal-article citations.

    Reads the JSON file at ``path``, pretty-prints every footnote, then calls
    ``print_article_title_journal`` on each footnote so that only footnotes
    citing a journal article (rather than a book) produce a citation line.

    Args:
        path: JSON file to read. Defaults to ``"testdoc.json"``.
    """
    with open(path) as tj:
        document = json.load(tj)

    # Just for convenience's sake, have the footnotes as a list of dicts
    # rather than one big dict. A document with no "footnotes" entry yields
    # an empty list.
    footnotes = list(document.get("footnotes", {}).values())

    # Now loop over the footnotes and print article and title only for those
    # footnotes with a journal article rather than a book.
    pretty_print(footnotes)
    for footnote in footnotes:
        try:
            print_article_title_journal(footnote)
        except (KeyError, IndexError, TypeError, AttributeError):
            # Deliberately best-effort: skip idiosyncratic bits of JSON in the
            # footnotes block that lack the expected content. Only data-shape
            # errors are swallowed, so Ctrl-C and real bugs still surface.
            continue
if __name__ == '__main__':
    # Step 1: fetch the document and save it locally as JSON.
    download_test_document()
    # Step 2: parse the saved JSON and print the journal-article citations.
    parse_test_document()
# [GitHub page footer] Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.