# paultopia/extracting_reference.py

## extracting_reference.py
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import json

# OAuth scope requested from Google: read-only access to Docs documents.
# If modifying these scopes, delete the file token.pickle so that the
# authorization flow in download_test_document() runs again with the new scopes.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']

# This is all based on the Google Docs API quickstart. The original note linked
# the JS page (https://developers.google.com/docs/api/quickstart/js); the code
# in this file is Python, so the matching page is presumably
# https://developers.google.com/docs/api/quickstart/python

# To run this, you need to follow the activation and credential download steps
# on that page first (download_test_document() reads 'credentials.json').

# ID of the Google Doc that gets downloaded and parsed.
DOCUMENT_ID = '1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A'

# Matches document: https://docs.google.com/document/d/1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A/edit?usp=sharing

def download_test_document():
    """Fetch the test document through the Docs API and save it locally.

    Credentials are taken from ``token.pickle`` when that file exists. If they
    are missing or not valid they are refreshed, or obtained by running the
    installed-app OAuth flow with ``credentials.json`` and ``SCOPES``; the
    resulting credentials are then written back to ``token.pickle``.

    The document identified by ``DOCUMENT_ID`` is downloaded, its title is
    printed, and the whole response is written to ``testdoc.json`` as
    indented, key-sorted JSON.
    """
    token_path = 'token.pickle'
    credentials = None
    # Reuse the tokens cached by an earlier run, if there are any.
    # NOTE(review): unpickling can execute code from the file being read, so
    # this is only safe while token.pickle is written solely by this script.
    if os.path.exists(token_path):
        with open(token_path, 'rb') as cached:
            credentials = pickle.load(cached)

    # No usable credentials: refresh them if possible, otherwise log in.
    if not (credentials and credentials.valid):
        refreshable = (
            credentials and credentials.expired and credentials.refresh_token
        )
        if refreshable:
            credentials.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            credentials = flow.run_local_server()
        # Cache the credentials for the next run.
        with open(token_path, 'wb') as cached:
            pickle.dump(credentials, cached)

    docs_service = build('docs', 'v1', credentials=credentials)

    # Retrieve the document's contents from the Docs service.
    request = docs_service.documents().get(documentId=DOCUMENT_ID)
    document = request.execute()

    title = document.get('title')
    print('The title of the document is: {}'.format(title))
    with open("testdoc.json", 'w') as out_file:
        json.dump(document, out_file, indent=4, sort_keys=True)


# that function is step 1, saving a local copy of the document in json format to parse through.

# this is step 2.

# helper function for making sense of the heavily nested json
def pretty_print(data):
    """Print ``data`` as JSON with two-space indentation and sorted keys."""
    rendered = json.dumps(data, sort_keys=True, indent=2)
    print(rendered)

def check_italics(textrun):
    """Return the run's text if it is styled italic, otherwise ``False``.

    ``textrun`` must have a ``"textStyle"`` mapping; ``"content"`` is only
    read when the style's ``"italic"`` entry is truthy.
    """
    is_italic = textrun["textStyle"].get("italic")
    return textrun["content"] if is_italic else False

def check_smallcaps(textrun):
    """Return the run's text if it is styled small caps, otherwise ``False``.

    ``textrun`` must have a ``"textStyle"`` mapping; ``"content"`` is only
    read when the style's ``"smallCaps"`` entry is truthy.
    """
    is_smallcaps = textrun["textStyle"].get("smallCaps")
    return textrun["content"] if is_smallcaps else False

def print_article_title_journal(footnote):
    """Print "title, journal, in note X" for each matching pair of text runs.

    For every paragraph in ``footnote["content"]``, a small-caps text run is
    treated as a journal name, and the run two positions before it is treated
    as the article title if it is italic. Each such pair is printed.

    Raises ``KeyError`` if a content item has no ``"paragraph"``/``"elements"``
    or an element has no ``"textRun"``.
    """
    for block in footnote["content"]:
        runs = [piece["textRun"] for piece in block["paragraph"]["elements"]]
        # Start at index 3; the original guard exists so that the
        # ``position - 2`` lookup never goes below zero.
        for position in range(3, len(runs)):
            journal = runs[position]
            if not check_smallcaps(journal):
                continue
            title = runs[position - 2]  # candidate article title
            if not check_italics(title):
                continue
            # NOTE(review): the label is only the last character of the
            # footnote id - confirm that this is the intended note number.
            print("{}, {}, in note {}".format(
                title["content"], journal["content"], footnote["footnoteId"][-1]))


def parse_test_document():
    """Read ``testdoc.json`` and print the journal citations in its footnotes.

    Loads the JSON saved by the download step, pretty-prints the list of
    footnotes, then calls ``print_article_title_journal`` on each footnote.
    Footnotes whose JSON does not have the expected structure are skipped.
    """
    with open("testdoc.json") as tj:
        document = json.load(tj)
    # The document stores footnotes as one dict keyed by id; a list of the
    # footnote dicts is more convenient. Default to no footnotes when the key
    # is absent (presumably the case for a document without footnotes).
    footnotes = list(document.get("footnotes", {}).values())
    # Print article and title only for those footnotes that cite a journal
    # article rather than a book.
    pretty_print(footnotes)
    for footnote in footnotes:
        try:
            print_article_title_journal(footnote)
        except (KeyError, IndexError, TypeError):
            # Best effort: skip idiosyncratic bits of JSON in the footnotes
            # block that lack the expected content. Only structural lookup
            # errors are ignored, so Ctrl-C and real bugs still surface.
            continue

# Script entry point: step 1 downloads the document to testdoc.json,
# step 2 parses that file and prints the journal citations it finds.
if __name__ == '__main__':
    download_test_document()
    parse_test_document()
	import pickle
	import os.path
	from googleapiclient.discovery import build
	from google_auth_oauthlib.flow import InstalledAppFlow
	from google.auth.transport.requests import Request
	import json

	# NOTE(review): this tab-indented section appears to repeat the definitions
	# earlier in the file with their nesting flattened - confirm whether it
	# should be kept at all.

	# OAuth scope requested from Google: read-only access to Docs documents.
	# If modifying these scopes, delete the file token.pickle.
	SCOPES = ['https://www.googleapis.com/auth/documents.readonly']

	# This is all based on this quickstart: https://developers.google.com/docs/api/quickstart/js
	# (the code here is Python, so the matching page is presumably
	# https://developers.google.com/docs/api/quickstart/python).

	# To run this, you need to follow the activation and credential download steps on that page first.

	# ID of the Google Doc that gets downloaded and parsed.
	DOCUMENT_ID = '1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A'

	# Matches document: https://docs.google.com/document/d/1w8kimbrzfIwvbfaUFEVzL5x1XrUc_Fy8YxhR6mRn10A/edit?usp=sharing

	def download_test_document():
		"""Fetch the test document through the Docs API and save it locally.

		Credentials are taken from ``token.pickle`` when that file exists. If
		they are missing or not valid they are refreshed, or obtained by running
		the installed-app OAuth flow with ``credentials.json`` and ``SCOPES``;
		the resulting credentials are then written back to ``token.pickle``.

		The document identified by ``DOCUMENT_ID`` is downloaded, its title is
		printed, and the whole response is written to ``testdoc.json`` as
		indented, key-sorted JSON.
		"""
		creds = None
		# The file token.pickle stores the user's access and refresh tokens, and
		# is created automatically when the authorization flow completes for the
		# first time.
		# NOTE(review): unpickling can execute code from the file being read, so
		# this is only safe while token.pickle is written solely by this script.
		if os.path.exists('token.pickle'):
			with open('token.pickle', 'rb') as token:
				creds = pickle.load(token)
		# If there are no (valid) credentials available, let the user log in.
		if not creds or not creds.valid:
			if creds and creds.expired and creds.refresh_token:
				creds.refresh(Request())
			else:
				flow = InstalledAppFlow.from_client_secrets_file(
					'credentials.json', SCOPES)
				creds = flow.run_local_server()
			# Save the credentials for the next run.
			with open('token.pickle', 'wb') as token:
				pickle.dump(creds, token)

		service = build('docs', 'v1', credentials=creds)

		# Retrieve the document's contents from the Docs service.
		document = service.documents().get(documentId=DOCUMENT_ID).execute()

		print('The title of the document is: {}'.format(document.get('title')))
		with open("testdoc.json", 'w') as tj:
			json.dump(document, tj, indent=4, sort_keys=True)


	# that function is step 1, saving a local copy of the document in json format to parse through.

	# this is step 2.

	# helper function for making sense of the heavily nested json
	def pretty_print(data):
		"""Print ``data`` as JSON with two-space indentation and sorted keys."""
		print(json.dumps(data, indent=2, sort_keys=True))

	def check_italics(textrun):
		"""Return the run's text if it is styled italic, otherwise ``False``.

		``textrun`` must have a ``"textStyle"`` mapping; ``"content"`` is only
		read when the style's ``"italic"`` entry is truthy.
		"""
		if textrun["textStyle"].get("italic"):
			return textrun["content"]
		return False

	def check_smallcaps(textrun):
		"""Return the run's text if it is styled small caps, otherwise ``False``.

		``textrun`` must have a ``"textStyle"`` mapping; ``"content"`` is only
		read when the style's ``"smallCaps"`` entry is truthy.
		"""
		if textrun["textStyle"].get("smallCaps"):
			return textrun["content"]
		return False

	def print_article_title_journal(footnote):
		"""Print "title, journal, in note X" for each matching pair of text runs.

		For every paragraph in ``footnote["content"]``, a small-caps text run is
		treated as a journal name, and the run two positions before it is
		treated as the article title if it is italic. Each such pair is printed.

		Raises ``KeyError`` if a content item has no ``"paragraph"``/``"elements"``
		or an element has no ``"textRun"``.
		"""
		for content in footnote["content"]:
			elements = content["paragraph"]["elements"]
			textruns = [element["textRun"] for element in elements]
			for idx, item in enumerate(textruns):
				if idx >= 3:  # to avoid searching below zero
					if check_smallcaps(item):
						candidate_title = textruns[idx - 2]
						if check_italics(candidate_title):  # name
							# NOTE(review): the label is only the last character
							# of the footnote id - confirm that is intended.
							print("{}, {}, in note {}".format(candidate_title["content"], item["content"], footnote["footnoteId"][-1]))




	def parse_test_document():
		"""Read ``testdoc.json`` and print the journal citations in its footnotes.

		Loads the JSON saved by the download step, pretty-prints the list of
		footnotes, then calls ``print_article_title_journal`` on each footnote.
		Footnotes whose JSON does not have the expected structure are skipped.
		"""
		with open("testdoc.json") as tj:
			document = json.load(tj)
		# The document stores footnotes as one dict keyed by id; a list of the
		# footnote dicts is more convenient. Default to no footnotes when the
		# key is absent (presumably the case for a document without footnotes).
		footnotes = list(document.get("footnotes", {}).values())
		# Print article and title only for those footnotes that cite a journal
		# article rather than a book.
		pretty_print(footnotes)
		for footnote in footnotes:
			try:
				print_article_title_journal(footnote)
			except (KeyError, IndexError, TypeError):
				# Best effort: skip idiosyncratic bits of JSON in the footnotes
				# block that lack the expected content. Only structural lookup
				# errors are ignored, so Ctrl-C and real bugs still surface.
				continue

	# Script entry point: step 1 downloads the document to testdoc.json,
	# step 2 parses that file and prints the journal citations it finds.
	if __name__ == '__main__':
		download_test_document()
		parse_test_document()