sspaeti/onenote_export.py

## onenote_export.py
### README
# This Python script exports the OneNote notebooks linked to your Microsoft account (those listed in
# `notebooks_to_download` below) to HTML files.

## Output
# The notebooks will each become a subdirectory of the `output` folder, with further subdirectories
# for the sections within each notebook and the pages within each section. Each page is a directory
# containing the HTML file `<page-name>.html` and its images and attachments. I changed the script so
# that it no longer creates separate directories for `images` and `attachments`, because my Markdown
# editor (Obsidian) would show these directories separately, which didn't look nice in the file tree.
# Like this, images and attachments are not shown in the tree but within the Markdown, which is what
# I wanted. Any sub-pages will be subdirectories within this one.

## Setup
# In order to run the script, you must first do the following:
# 1. Go to https://aad.portal.azure.com/ and log in with your Microsoft account.
# 2. Select "Azure Active Directory" and then "App registrations" under "Manage".
# 3. Select "New registration". Choose any name, set "Supported account types" to "Accounts in any
#    organizational directory and personal Microsoft accounts" and under "Redirect URI", select Web
#    and enter `http://localhost:5000/getToken`. Register.
# 4. Copy "Application (client) ID" and export it as the environment variable
#    `AZURE_ONENOTE_CLIENT_ID` (the script reads it into `client_id`).
# 5. Select "Certificates & secrets" under "Manage". Press "New client secret", choose a name and
#    confirm.
# 6. Copy the client secret and export it as the environment variable `AZURE_ONENOTE_SECRET`
#    (the script reads it into `secret`).
# 7. Select "API permissions" under "Manage". Press "Add a permission", scroll down and select OneNote,
#    choose "Delegated permissions" and check "Notes.Read" and "Notes.Read.All". Press "Add
#    permissions".
# 8. Make sure you have Python 3.7 (or newer) installed and install the dependencies using the command
#    `pip install flask msal requests_oauthlib`.

## Running
# In a terminal, navigate to the directory where this script is located and run it using
# `python onenote_export.py`. This will start a local web server on port 5000.
# In your browser navigate to http://localhost:5000 and log in to your Microsoft account.
# The first time you do it, you will also have to accept that the app can read your OneNote notes.
# (This does not give any third parties access to your data, as long as you don't share the client id
# and secret you created on the Azure portal). After this, go back to the terminal to follow the progress.

## Note
# Microsoft limits how many requests you can do within a given time period. Therefore, if you have many
# notes you might eventually see messages like this in the terminal: "Too many requests, waiting 20s and
# trying again." This is not a problem, but it means the entire process can take a while. Also, the login
# session can expire after a while, which results in a TokenExpiredError. If this happens, simply reload
# `http://localhost:5000` and the script will continue (skipping the files it already downloaded).

import os
import random
import re
import shutil
import string
import time
import uuid
from html.parser import HTMLParser
from pathlib import Path
from xml.etree import ElementTree

import flask
import msal
from requests_oauthlib import OAuth2Session

# Only notebooks whose display name appears in this list are exported.
notebooks_to_download = ['Personal', 'KnowHow-Box']

# Credentials of the Azure app registration. They are read from the
# environment; a missing variable raises KeyError as soon as the module loads.
client_id = os.environ['AZURE_ONENOTE_CLIENT_ID']
secret = os.environ['AZURE_ONENOTE_SECRET']

# Root directory of the export (a relative path, so it depends on the
# current working directory).
output_path = Path('../output')
# Base URL of the Microsoft Graph API used to list the notebooks.
graph_url = 'https://graph.microsoft.com/v1.0'
# Sign-in authority handed to MSAL.
authority_url = 'https://login.microsoftonline.com/common'
# Delegated permissions requested during login.
scopes = ['Notes.Read', 'Notes.Read.All']
# Must match the redirect URI of the app registration; served by the
# "/getToken" route below.
redirect_uri = 'http://localhost:5000/getToken'

app = flask.Flask(__name__)
app.debug = True
# A fresh random key per process; Flask needs a secret key for the session
# in which the login route stores the OAuth state.
app.secret_key = os.urandom(16)

# MSAL client used to build the login URL and to redeem the authorization code.
application = msal.ConfidentialClientApplication(
    client_id, authority=authority_url, client_credential=secret
)


@app.route("/")
def main():
    """Answer requests for the root URL with a 307 redirect to ``/login``."""
    return flask.Response(status=307, headers={'location': '/login'})


@app.route("/login")
def login():
    """Redirect the browser to the Microsoft sign-in page.

    A random state value is stored in the Flask session and passed along
    with the authorization request.
    """
    state_token = str(uuid.uuid4())
    flask.session['state'] = state_token
    sign_in_url = application.get_authorization_request_url(
        scopes, state=state_token, redirect_uri=redirect_uri
    )
    return flask.Response(status=307, headers={'location': sign_in_url})


def get_json(graph_client, url, params=None):
    """Fetch every item of a paged Microsoft Graph collection.

    Follows ``@odata.nextLink`` until the server stops returning one and
    concatenates the ``value`` arrays of all pages.

    Args:
        graph_client: Session-like object passed through to ``get``.
        url: URL of the first page of the collection.
        params: Optional query parameters, sent with the first request only.

    Returns:
        A list with the items of all pages, in the order received.

    Raises:
        RuntimeError: If a page could not be fetched (``get`` returned
            ``None``) or its JSON body has no ``value`` key.
    """
    values = []
    next_page = url
    while next_page:
        resp = get(graph_client, next_page, params=params)
        if resp is None:
            # get() gives up on HTTP 500. Continuing with a partial listing
            # would silently drop items, so fail with a clear message.
            raise RuntimeError(f'Could not fetch {next_page}')
        payload = resp.json()
        if 'value' not in payload:
            raise RuntimeError(f'Invalid server response: {payload}')
        values += payload['value']
        next_page = payload.get('@odata.nextLink')
        # The next link is a complete URL that is meant to be requested
        # as-is, so the query parameters must not be appended a second time.
        params = None
    return values


def get(graph_client, url, params=None):
    """GET ``url``, retrying for as long as the server throttles the client.

    Args:
        graph_client: Object with a requests-style ``get(url, params=...)``.
        url: Address to request.
        params: Optional query parameters.

    Returns:
        The response object, or ``None`` when the server answered HTTP 500.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for other error statuses.
    """
    default_delay = 20
    while True:
        resp = graph_client.get(url, params=params)
        status = resp.status_code
        if status == 429:
            # We are being throttled due to too many requests.
            # See https://docs.microsoft.com/en-us/graph/throttling
            # Wait for as long as the server asks via Retry-After and fall
            # back to 20 seconds when the header is missing or not a number.
            headers = getattr(resp, 'headers', None) or {}
            try:
                delay = max(0, int(headers.get('Retry-After', default_delay)))
            except (TypeError, ValueError):
                delay = default_delay
            print(f'        Too many requests, waiting {delay}s and trying again.')
            time.sleep(delay)
            continue
        if status == 500:
            # In my case, one specific note page consistently gave this status
            # code when trying to get the content. The error was "19999:
            # Something failed, the API cannot share any more information
            # at the time of the request."
            print('        Error 500, skipping this page.')
            return None
        resp.raise_for_status()
        return resp


def download_attachments(graph_client, content, out_dir):
    """Download the images and attachments of a page and relink them.

    Every self-closing ``<img ... />`` and ``<object ... />`` tag found in
    ``content`` is fetched through ``get`` and stored in ``out_dir``; the tag
    is then rewritten so that it points at the local copy.

    Args:
        graph_client: Session-like object passed through to ``get``.
        content: HTML of the page.
        out_dir: ``pathlib.Path`` of the page directory; it must exist.

    Returns:
        The HTML with the rewritten ``<img>`` / ``<object>`` tags. A tag
        whose download fails is left as it was.
    """
    image_dir = out_dir  # / 'images'
    attachment_dir = out_dir  # / 'attachments'

    class TagAttributeParser(HTMLParser):
        """Remember the attributes of the last start tag that was fed."""

        def __init__(self):
            super().__init__()
            self.attrs = {}

        def handle_starttag(self, tag, attrs):
            # HTMLParser reports value-less attributes as None, which
            # ElementTree cannot serialise; store an empty string instead.
            self.attrs = {k: '' if v is None else v for k, v in attrs}

    def parse_attributes(tag_text):
        parser = TagAttributeParser()
        parser.feed(tag_text)
        return parser.attrs

    def generate_html(tag, props):
        element = ElementTree.Element(tag, attrib=props)
        return ElementTree.tostring(element, encoding='unicode')

    def local_link(path):
        # Links are relative to out_dir so that they always match the place
        # the file was written to, whichever sub-directory is configured.
        return path.relative_to(out_dir).as_posix()

    def download_image(tag_match):
        # <img width="843" height="218.5" src="..." data-src-type="image/png" data-fullres-src="..." data-fullres-src-type="image/png" />
        props = parse_attributes(tag_match[0])

        # Prefer the full resolution image; a tag without any usable URL is
        # a corrupt or not recognised image and gets a placeholder source.
        image_url = props.get('data-fullres-src') or props.get('src')
        if not image_url:
            print("Unexpected error with fetching image_url" + str(props))
            props['src'] = "images/error"
            return generate_html('img', props)

        # Fall back to png when the tag does not say what type the image has.
        mime_type = (
            props.get('data-fullres-src-type') or props.get('data-src-type') or 'image/png'
        )
        image_type = mime_type.split("/")[-1]
        file_name = (
            ''.join(random.choice(string.ascii_lowercase) for _ in range(10)) + '.' + image_type
        )
        response = get(graph_client, image_url)
        if response is None:
            print('      Could not download image; keeping the original tag.')
            return tag_match[0]
        img = response.content
        print(f'      Downloaded image of {len(img)} bytes.')
        image_path = image_dir / file_name
        with open(image_path, "wb") as f:
            f.write(img)
        props['src'] = local_link(image_path)
        props = {k: v for k, v in props.items() if 'data-fullres-src' not in k}
        return generate_html('img', props)

    def download_attachment(tag_match):
        # <object data-attachment="Trig_Cheat_Sheet.pdf" type="application/pdf" data="..." style="position:absolute;left:528px;top:139px" />
        props = parse_attributes(tag_match[0])
        data_url = props.get('data')
        # Keep only the last path component so that a crafted name cannot
        # point outside the page directory.
        file_name = Path(props.get('data-attachment') or '').name
        if not data_url or not file_name:
            # Not an attachment tag we know how to handle; leave it alone.
            return tag_match[0]
        attachment_path = attachment_dir / file_name
        if attachment_path.exists():
            print(f'      Attachment {file_name} already downloaded; skipping.')
        else:
            response = get(graph_client, data_url)
            if response is None:
                print(f'      Could not download attachment {file_name}; keeping the original tag.')
                return tag_match[0]
            data = response.content
            print(f'      Downloaded attachment {file_name} of {len(data)} bytes.')
            with open(attachment_path, "wb") as f:
                f.write(data)
        # Point at the file where it was actually saved (previously the link
        # always used an "attachments/" prefix that did not exist on disk).
        props['data'] = local_link(attachment_path)
        return generate_html('object', props)

    content = re.sub(r"<img .*?\/>", download_image, content, flags=re.DOTALL)
    content = re.sub(r"<object .*?\/>", download_attachment, content, flags=re.DOTALL)
    return content


def recursion_section_group(nb_name, graph_client, sectionGroup, level, output_section_path):
    """Export a notebook or section group, nested section groups first.

    Args:
        nb_name: Display name of the notebook; first directory level below
            ``output_path``.
        graph_client: Session-like object passed through to ``get_json`` /
            ``get``.
        sectionGroup: Graph JSON object providing ``sectionGroupsUrl`` and
            ``sectionsUrl`` (a notebook or a section group).
        level: Recursion depth, only used for progress messages.
        output_section_path: '/'-joined names of the enclosing section
            groups, or '' at notebook level.

    Pages whose HTML file already exists are skipped, which makes an
    interrupted export resumable.
    """
    section_groups = get_json(graph_client, sectionGroup['sectionGroupsUrl'])
    print(f'  Got {len(section_groups)} section groups.')
    for sec_gpr in section_groups:
        group_name = sec_gpr["displayName"]
        if output_section_path != '':
            out_path = output_section_path + '/' + group_name
        else:
            out_path = group_name
        print(
            f' level {level} - {output_section_path} - recursively go through section group: {group_name}.'
        )
        recursion_section_group(nb_name, graph_client, sec_gpr, level + 1, out_path)

    # go through sections all the time, even if there is no section groups, otherwise we lose sections besides the section groups
    print('no more section section groups. Going through sections')

    sections = get_json(graph_client, sectionGroup['sectionsUrl'])
    print(f'  Got {len(sections)} sections.')

    for sec in sections:
        sec_name = sec["displayName"]
        print(f'  Opening section {sec_name}')
        section_dir = output_path / nb_name / output_section_path / sec_name
        pages = get_json(graph_client, sec['pagesUrl'] + '?pagelevel=true')
        print(f'    Got {len(pages)} pages.')
        # Sort on the order alone: sorting (order, page) tuples would try to
        # compare the page dicts whenever two orders are equal.
        pages = sorted(pages, key=lambda page: page['order'])
        # Directory of the most recent page seen at each indentation level;
        # a sub-page is nested inside the page one level above it.
        level_dirs = {}
        for page in pages:
            order = page['order']
            page_level = page['level']
            page_title_rep = page["title"].replace("/", "&").replace(":", "-").replace(".", "-")
            page_title = f'{order}_{page_title_rep}'
            print(f'    Opening page {page_title}')
            if page_level == 0:
                parent_dir = section_dir
            else:
                # Fall back to the section directory if no parent page was seen.
                parent_dir = level_dirs.get(page_level - 1, section_dir)
            out_dir = parent_dir / page_title
            level_dirs[page_level] = out_dir
            out_html = out_dir / (page_title_rep + ".html")
            if out_html.exists():
                print('      HTML file already exists; skipping this page')
                continue
            out_dir.mkdir(parents=True, exist_ok=True)
            response = get(graph_client, page['contentUrl'])
            if response is None:
                continue
            content = response.text
            print(f'      Got content of length {len(content)}')
            content = download_attachments(graph_client, content, out_dir)
            # Write UTF-8 explicitly; the platform default encoding cannot
            # represent every character on all systems.
            with open(out_html, "w", encoding="utf-8") as f:
                f.write(content)


@app.route("/getToken")
def main_logic():
    """OAuth redirect target: redeem the code and export the notebooks.

    Validates the request, exchanges the authorization code for a token and
    then exports every notebook named in ``notebooks_to_download``. The
    export runs inside this request, so the browser waits until it is done.

    Returns:
        A small "Done" page on success, or a plain-text error response when
        the login did not succeed.
    """
    args = flask.request.args
    if 'code' not in args:
        # E.g. the user declined the consent screen; the authority then
        # redirects back with error details instead of a code.
        reason = (
            args.get('error_description') or args.get('error') or 'no authorization code received'
        )
        return flask.Response(f'Login failed: {reason}', status=400, mimetype='text/plain')

    # The state stored by the login route must come back unchanged,
    # otherwise this request was not started by our own login redirect.
    expected_state = flask.session.pop('state', None)
    if expected_state is None or args.get('state') != expected_state:
        return flask.Response(
            'Login failed: unexpected OAuth state. Open http://localhost:5000 to start again.',
            status=400,
            mimetype='text/plain',
        )

    token = application.acquire_token_by_authorization_code(
        args['code'], scopes=scopes, redirect_uri=redirect_uri
    )
    if 'access_token' not in token:
        # MSAL reports failures as a dict with error details rather than by
        # raising, so check before using the result as a token.
        reason = (
            token.get('error_description') or token.get('error') or 'no access token received'
        )
        return flask.Response(f'Login failed: {reason}', status=401, mimetype='text/plain')
    graph_client = OAuth2Session(token=token)

    notebooks = get_json(graph_client, f'{graph_url}/me/onenote/notebooks')
    print(f'Got {len(notebooks)} notebooks.')

    for nb in notebooks:
        nb_name = nb["displayName"]
        if nb_name not in notebooks_to_download:
            continue
        print(f'Opening notebook {nb_name}')
        print('section group url: {}'.format(nb['sectionGroupsUrl']))
        recursion_section_group(nb_name, graph_client, nb, 1, '')

    print("Done!")
    return flask.render_template_string(
        '<html><head><title>Done</title></head><body><p1><b>Done</b></p1></body></html>'
    )


if __name__ == "__main__":
    # Start the Flask server; the README above expects it to listen on
    # port 5000.
    app.run()
	### README
	# This Python script exports the OneNote notebooks linked to your Microsoft account (those listed in
	# `notebooks_to_download` below) to HTML files.

	## Output
	# The notebooks will each become a subdirectory of the `output` folder, with further subdirectories
	# for the sections within each notebook and the pages within each section. Each page is a directory
	# containing the HTML file `<page-name>.html` and its images and attachments. I changed the script so
	# that it no longer creates separate directories for `images` and `attachments`, because my Markdown
	# editor (Obsidian) would show these directories separately, which didn't look nice in the file tree.
	# Like this, images and attachments are not shown in the tree but within the Markdown, which is what
	# I wanted. Any sub-pages will be subdirectories within this one.

	## Setup
	# In order to run the script, you must first do the following:
	# 1. Go to https://aad.portal.azure.com/ and log in with your Microsoft account.
	# 2. Select "Azure Active Directory" and then "App registrations" under "Manage".
	# 3. Select "New registration". Choose any name, set "Supported account types" to "Accounts in any
	# organizational directory and personal Microsoft accounts" and under "Redirect URI", select Web
	# and enter `http://localhost:5000/getToken`. Register.
	# 4. Copy "Application (client) ID" and export it as the environment variable
	# `AZURE_ONENOTE_CLIENT_ID` (the script reads it into `client_id`).
	# 5. Select "Certificates & secrets" under "Manage". Press "New client secret", choose a name and
	# confirm.
	# 6. Copy the client secret and export it as the environment variable `AZURE_ONENOTE_SECRET`
	# (the script reads it into `secret`).
	# 7. Select "API permissions" under "Manage". Press "Add a permission", scroll down and select OneNote,
	# choose "Delegated permissions" and check "Notes.Read" and "Notes.Read.All". Press "Add
	# permissions".
	# 8. Make sure you have Python 3.7 (or newer) installed and install the dependencies using the command
	# `pip install flask msal requests_oauthlib`.

	## Running
	# In a terminal, navigate to the directory where this script is located and run it using
	# `python onenote_export.py`. This will start a local web server on port 5000.
	# In your browser navigate to http://localhost:5000 and log in to your Microsoft account.
	# The first time you do it, you will also have to accept that the app can read your OneNote notes.
	# (This does not give any third parties access to your data, as long as you don't share the client id
	# and secret you created on the Azure portal). After this, go back to the terminal to follow the progress.

	## Note
	# Microsoft limits how many requests you can do within a given time period. Therefore, if you have many
	# notes you might eventually see messages like this in the terminal: "Too many requests, waiting 20s and
	# trying again." This is not a problem, but it means the entire process can take a while. Also, the login
	# session can expire after a while, which results in a TokenExpiredError. If this happens, simply reload
	# `http://localhost:5000` and the script will continue (skipping the files it already downloaded).

	import os
	import random
	import re
	import shutil
	import string
	import time
	import uuid
	from html.parser import HTMLParser
	from pathlib import Path
	from xml.etree import ElementTree

	import flask
	import msal
	from requests_oauthlib import OAuth2Session

	# Only notebooks whose display name appears in this list are exported.
	notebooks_to_download = ['Personal', 'KnowHow-Box']

	# Credentials of the Azure app registration, read from the environment.
	client_id = os.environ['AZURE_ONENOTE_CLIENT_ID']
	secret = os.environ['AZURE_ONENOTE_SECRET']

	# Root directory of the export (a relative path).
	output_path = Path('../output')
	# Base URL of the Microsoft Graph API used to list the notebooks.
	graph_url = 'https://graph.microsoft.com/v1.0'
	# Sign-in authority handed to MSAL.
	authority_url = 'https://login.microsoftonline.com/common'
	# Delegated permissions requested during login.
	scopes = ['Notes.Read', 'Notes.Read.All']
	# Must match the redirect URI of the app registration.
	redirect_uri = 'http://localhost:5000/getToken'

	app = flask.Flask(__name__)
	app.debug = True
	# A fresh random key per process; Flask needs a secret key for the session.
	app.secret_key = os.urandom(16)

	# MSAL client used to build the login URL and to redeem the authorization code.
	application = msal.ConfidentialClientApplication(
	client_id, authority=authority_url, client_credential=secret
	)


	@app.route("/")
	def main():
	    """Answer requests for the root URL with a 307 redirect to ``/login``."""
	    return flask.Response(status=307, headers={'location': '/login'})


	@app.route("/login")
	def login():
	    """Redirect the browser to the Microsoft sign-in page.

	    A random state value is stored in the Flask session and passed along
	    with the authorization request.
	    """
	    state_token = str(uuid.uuid4())
	    flask.session['state'] = state_token
	    sign_in_url = application.get_authorization_request_url(
	        scopes, state=state_token, redirect_uri=redirect_uri
	    )
	    return flask.Response(status=307, headers={'location': sign_in_url})


	def get_json(graph_client, url, params=None):
	    """Fetch every item of a paged Microsoft Graph collection.

	    Follows ``@odata.nextLink`` until the server stops returning one and
	    concatenates the ``value`` arrays of all pages.

	    Args:
	        graph_client: Session-like object passed through to ``get``.
	        url: URL of the first page of the collection.
	        params: Optional query parameters, sent with the first request only.

	    Returns:
	        A list with the items of all pages, in the order received.

	    Raises:
	        RuntimeError: If a page could not be fetched (``get`` returned
	            ``None``) or its JSON body has no ``value`` key.
	    """
	    values = []
	    next_page = url
	    while next_page:
	        resp = get(graph_client, next_page, params=params)
	        if resp is None:
	            # get() gives up on HTTP 500. Continuing with a partial listing
	            # would silently drop items, so fail with a clear message.
	            raise RuntimeError(f'Could not fetch {next_page}')
	        payload = resp.json()
	        if 'value' not in payload:
	            raise RuntimeError(f'Invalid server response: {payload}')
	        values += payload['value']
	        next_page = payload.get('@odata.nextLink')
	        # The next link is a complete URL that is meant to be requested
	        # as-is, so the query parameters must not be appended a second time.
	        params = None
	    return values


	def get(graph_client, url, params=None):
	    """GET ``url``, retrying for as long as the server throttles the client.

	    Args:
	        graph_client: Object with a requests-style ``get(url, params=...)``.
	        url: Address to request.
	        params: Optional query parameters.

	    Returns:
	        The response object, or ``None`` when the server answered HTTP 500.

	    Raises:
	        Whatever ``resp.raise_for_status()`` raises for other error statuses.
	    """
	    default_delay = 20
	    while True:
	        resp = graph_client.get(url, params=params)
	        status = resp.status_code
	        if status == 429:
	            # We are being throttled due to too many requests.
	            # See https://docs.microsoft.com/en-us/graph/throttling
	            # Wait for as long as the server asks via Retry-After and fall
	            # back to 20 seconds when the header is missing or not a number.
	            headers = getattr(resp, 'headers', None) or {}
	            try:
	                delay = max(0, int(headers.get('Retry-After', default_delay)))
	            except (TypeError, ValueError):
	                delay = default_delay
	            print(f'        Too many requests, waiting {delay}s and trying again.')
	            time.sleep(delay)
	            continue
	        if status == 500:
	            # In my case, one specific note page consistently gave this status
	            # code when trying to get the content. The error was "19999:
	            # Something failed, the API cannot share any more information
	            # at the time of the request."
	            print('        Error 500, skipping this page.')
	            return None
	        resp.raise_for_status()
	        return resp


	def download_attachments(graph_client, content, out_dir):
	    """Download the images and attachments of a page and relink them.

	    Every self-closing ``<img ... />`` and ``<object ... />`` tag found in
	    ``content`` is fetched through ``get`` and stored in ``out_dir``; the tag
	    is then rewritten so that it points at the local copy.

	    Args:
	        graph_client: Session-like object passed through to ``get``.
	        content: HTML of the page.
	        out_dir: ``pathlib.Path`` of the page directory; it must exist.

	    Returns:
	        The HTML with the rewritten ``<img>`` / ``<object>`` tags. A tag
	        whose download fails is left as it was.
	    """
	    image_dir = out_dir  # / 'images'
	    attachment_dir = out_dir  # / 'attachments'

	    class TagAttributeParser(HTMLParser):
	        """Remember the attributes of the last start tag that was fed."""

	        def __init__(self):
	            super().__init__()
	            self.attrs = {}

	        def handle_starttag(self, tag, attrs):
	            # HTMLParser reports value-less attributes as None, which
	            # ElementTree cannot serialise; store an empty string instead.
	            self.attrs = {k: '' if v is None else v for k, v in attrs}

	    def parse_attributes(tag_text):
	        parser = TagAttributeParser()
	        parser.feed(tag_text)
	        return parser.attrs

	    def generate_html(tag, props):
	        element = ElementTree.Element(tag, attrib=props)
	        return ElementTree.tostring(element, encoding='unicode')

	    def local_link(path):
	        # Links are relative to out_dir so that they always match the place
	        # the file was written to, whichever sub-directory is configured.
	        return path.relative_to(out_dir).as_posix()

	    def download_image(tag_match):
	        # <img width="843" height="218.5" src="..." data-src-type="image/png" data-fullres-src="..." data-fullres-src-type="image/png" />
	        props = parse_attributes(tag_match[0])

	        # Prefer the full resolution image; a tag without any usable URL is
	        # a corrupt or not recognised image and gets a placeholder source.
	        image_url = props.get('data-fullres-src') or props.get('src')
	        if not image_url:
	            print("Unexpected error with fetching image_url" + str(props))
	            props['src'] = "images/error"
	            return generate_html('img', props)

	        # Fall back to png when the tag does not say what type the image has.
	        mime_type = (
	            props.get('data-fullres-src-type') or props.get('data-src-type') or 'image/png'
	        )
	        image_type = mime_type.split("/")[-1]
	        file_name = (
	            ''.join(random.choice(string.ascii_lowercase) for _ in range(10)) + '.' + image_type
	        )
	        response = get(graph_client, image_url)
	        if response is None:
	            print('      Could not download image; keeping the original tag.')
	            return tag_match[0]
	        img = response.content
	        print(f'      Downloaded image of {len(img)} bytes.')
	        image_path = image_dir / file_name
	        with open(image_path, "wb") as f:
	            f.write(img)
	        props['src'] = local_link(image_path)
	        props = {k: v for k, v in props.items() if 'data-fullres-src' not in k}
	        return generate_html('img', props)

	    def download_attachment(tag_match):
	        # <object data-attachment="Trig_Cheat_Sheet.pdf" type="application/pdf" data="..." style="position:absolute;left:528px;top:139px" />
	        props = parse_attributes(tag_match[0])
	        data_url = props.get('data')
	        # Keep only the last path component so that a crafted name cannot
	        # point outside the page directory.
	        file_name = Path(props.get('data-attachment') or '').name
	        if not data_url or not file_name:
	            # Not an attachment tag we know how to handle; leave it alone.
	            return tag_match[0]
	        attachment_path = attachment_dir / file_name
	        if attachment_path.exists():
	            print(f'      Attachment {file_name} already downloaded; skipping.')
	        else:
	            response = get(graph_client, data_url)
	            if response is None:
	                print(f'      Could not download attachment {file_name}; keeping the original tag.')
	                return tag_match[0]
	            data = response.content
	            print(f'      Downloaded attachment {file_name} of {len(data)} bytes.')
	            with open(attachment_path, "wb") as f:
	                f.write(data)
	        # Point at the file where it was actually saved (previously the link
	        # always used an "attachments/" prefix that did not exist on disk).
	        props['data'] = local_link(attachment_path)
	        return generate_html('object', props)

	    content = re.sub(r"<img .*?\/>", download_image, content, flags=re.DOTALL)
	    content = re.sub(r"<object .*?\/>", download_attachment, content, flags=re.DOTALL)
	    return content


	def recursion_section_group(nb_name, graph_client, sectionGroup, level, output_section_path):
	    """Export a notebook or section group, nested section groups first.

	    Args:
	        nb_name: Display name of the notebook; first directory level below
	            ``output_path``.
	        graph_client: Session-like object passed through to ``get_json`` /
	            ``get``.
	        sectionGroup: Graph JSON object providing ``sectionGroupsUrl`` and
	            ``sectionsUrl`` (a notebook or a section group).
	        level: Recursion depth, only used for progress messages.
	        output_section_path: '/'-joined names of the enclosing section
	            groups, or '' at notebook level.

	    Pages whose HTML file already exists are skipped, which makes an
	    interrupted export resumable.
	    """
	    section_groups = get_json(graph_client, sectionGroup['sectionGroupsUrl'])
	    print(f'  Got {len(section_groups)} section groups.')
	    for sec_gpr in section_groups:
	        group_name = sec_gpr["displayName"]
	        if output_section_path != '':
	            out_path = output_section_path + '/' + group_name
	        else:
	            out_path = group_name
	        print(
	            f' level {level} - {output_section_path} - recursively go through section group: {group_name}.'
	        )
	        recursion_section_group(nb_name, graph_client, sec_gpr, level + 1, out_path)

	    # go through sections all the time, even if there is no section groups, otherwise we lose sections besides the section groups
	    print('no more section section groups. Going through sections')

	    sections = get_json(graph_client, sectionGroup['sectionsUrl'])
	    print(f'  Got {len(sections)} sections.')

	    for sec in sections:
	        sec_name = sec["displayName"]
	        print(f'  Opening section {sec_name}')
	        section_dir = output_path / nb_name / output_section_path / sec_name
	        pages = get_json(graph_client, sec['pagesUrl'] + '?pagelevel=true')
	        print(f'    Got {len(pages)} pages.')
	        # Sort on the order alone: sorting (order, page) tuples would try to
	        # compare the page dicts whenever two orders are equal.
	        pages = sorted(pages, key=lambda page: page['order'])
	        # Directory of the most recent page seen at each indentation level;
	        # a sub-page is nested inside the page one level above it.
	        level_dirs = {}
	        for page in pages:
	            order = page['order']
	            page_level = page['level']
	            page_title_rep = page["title"].replace("/", "&").replace(":", "-").replace(".", "-")
	            page_title = f'{order}_{page_title_rep}'
	            print(f'    Opening page {page_title}')
	            if page_level == 0:
	                parent_dir = section_dir
	            else:
	                # Fall back to the section directory if no parent page was seen.
	                parent_dir = level_dirs.get(page_level - 1, section_dir)
	            out_dir = parent_dir / page_title
	            level_dirs[page_level] = out_dir
	            out_html = out_dir / (page_title_rep + ".html")
	            if out_html.exists():
	                print('      HTML file already exists; skipping this page')
	                continue
	            out_dir.mkdir(parents=True, exist_ok=True)
	            response = get(graph_client, page['contentUrl'])
	            if response is None:
	                continue
	            content = response.text
	            print(f'      Got content of length {len(content)}')
	            content = download_attachments(graph_client, content, out_dir)
	            # Write UTF-8 explicitly; the platform default encoding cannot
	            # represent every character on all systems.
	            with open(out_html, "w", encoding="utf-8") as f:
	                f.write(content)


	@app.route("/getToken")
	def main_logic():
	    """OAuth redirect target: redeem the code and export the notebooks.

	    Validates the request, exchanges the authorization code for a token and
	    then exports every notebook named in ``notebooks_to_download``. The
	    export runs inside this request, so the browser waits until it is done.

	    Returns:
	        A small "Done" page on success, or a plain-text error response when
	        the login did not succeed.
	    """
	    args = flask.request.args
	    if 'code' not in args:
	        # E.g. the user declined the consent screen; the authority then
	        # redirects back with error details instead of a code.
	        reason = (
	            args.get('error_description') or args.get('error') or 'no authorization code received'
	        )
	        return flask.Response(f'Login failed: {reason}', status=400, mimetype='text/plain')

	    # The state stored by the login route must come back unchanged,
	    # otherwise this request was not started by our own login redirect.
	    expected_state = flask.session.pop('state', None)
	    if expected_state is None or args.get('state') != expected_state:
	        return flask.Response(
	            'Login failed: unexpected OAuth state. Open http://localhost:5000 to start again.',
	            status=400,
	            mimetype='text/plain',
	        )

	    token = application.acquire_token_by_authorization_code(
	        args['code'], scopes=scopes, redirect_uri=redirect_uri
	    )
	    if 'access_token' not in token:
	        # MSAL reports failures as a dict with error details rather than by
	        # raising, so check before using the result as a token.
	        reason = (
	            token.get('error_description') or token.get('error') or 'no access token received'
	        )
	        return flask.Response(f'Login failed: {reason}', status=401, mimetype='text/plain')
	    graph_client = OAuth2Session(token=token)

	    notebooks = get_json(graph_client, f'{graph_url}/me/onenote/notebooks')
	    print(f'Got {len(notebooks)} notebooks.')

	    for nb in notebooks:
	        nb_name = nb["displayName"]
	        if nb_name not in notebooks_to_download:
	            continue
	        print(f'Opening notebook {nb_name}')
	        print('section group url: {}'.format(nb['sectionGroupsUrl']))
	        recursion_section_group(nb_name, graph_client, nb, 1, '')

	    print("Done!")
	    return flask.render_template_string(
	        '<html><head><title>Done</title></head><body><p1><b>Done</b></p1></body></html>'
	    )


	if __name__ == "__main__":
	    # Start the Flask server; the README above expects it to listen on
	    # port 5000.
	    app.run()