Skip to content

Instantly share code, notes, and snippets.

@justengel
Last active April 23, 2023 19:15
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save justengel/dff226eb565ee19a9552a8f2e76e5612 to your computer and use it in GitHub Desktop.
Save justengel/dff226eb565ee19a9552a8f2e76e5612 to your computer and use it in GitHub Desktop.
Convert Microsoft Office documents to html or pdf files that can be viewed in a web browser.
"""Convert Office documents to file formats that can be viewed in a web browser. This file uses Microsoft Office to
perform the conversion, so Windows is required!
Requirements:
* pywin32>=228 # Not available for Python3.8 at this time
Server Requirements:
* uvicorn>=0.11.5
* fastapi>=0.58.0
* python-multipart>=0.0.5
* aiofiles>=0.5.0
Client Requirements:
* requests>=2.24.0
"""
import os
import asyncio
from typing import Callable
import pythoncom
from win32com.client.gencache import EnsureDispatch # pip install pywin32 Not available for python3.8 at this time
from win32com.client import constants
from urllib.request import pathname2url, url2pathname
import shutil
# Public names exported by `from <module> import *`. Note that delete_file is not listed here.
__all__ = ['register_converter', 'get_converter', 'file_to_html', 'file_to_html_async',
'word_to_html', 'excel_to_html', 'powerpoint_to_pdf', 'copy_pdf',
'get_app', 'convert_client',]
# Registry of converter functions keyed by lower-cased file extension; filled in by register_converter
# and read by get_converter.
CONVERTERS = {}
def register_converter(ext: str, func: Callable[[str, str], str] = None):
    """Register a converter function for a file extension.

    May be called directly, ``register_converter('.doc', func)``, or used as a decorator,
    ``@register_converter('.doc')``.

    :param ext: File extension used as the registry key (stored lower-cased).
    :param func: Converter taking ``(filename, save_filename)`` and returning the saved filename.
    :return: ``func`` when it is given, otherwise a decorator that registers the function it receives.
    """
    if func is not None:
        key = str(ext).lower()
        CONVERTERS[key] = func
        return func

    # No function yet: hand back a decorator that completes the registration later.
    def _bind(func: Callable[[str, str], str] = None):
        return register_converter(ext, func)

    return _bind
def get_converter(ext: str) -> Callable[[str, str], str]:
    """Return the converter registered for ``ext`` (compared lower-cased), or None when there is none."""
    key = str(ext).lower()
    return CONVERTERS.get(key)
def file_to_html(filename: str, save_filename: str) -> str:
    """Convert ``filename`` with the converter registered for its extension.

    :param filename: Path of the file to convert; its extension selects the converter.
    :param save_filename: Path the converted file should be written to.
    :return: The filename returned by the converter, or ``''`` when no converter is registered
        for the extension.
    """
    ext = os.path.splitext(filename)[-1]
    func = get_converter(ext)
    if not callable(func):
        return ''

    # COM must be initialised on the calling thread (this function is also run on executor threads).
    pythoncom.CoInitialize()
    try:
        return func(filename, save_filename)
    finally:
        # Always balance CoInitialize, even when the converter raises.
        pythoncom.CoUninitialize()
async def file_to_html_async(filename: str, save_filename: str, loop: asyncio.AbstractEventLoop = None) -> str:
    """Run :func:`file_to_html` in the loop's default executor and await its result.

    :param filename: Path of the file to convert.
    :param save_filename: Path the converted file should be written to.
    :param loop: Event loop to schedule on; the running loop is used when omitted.
    :return: Whatever ``file_to_html`` returns.
    """
    event_loop = loop if loop is not None else asyncio.get_running_loop()
    pending = event_loop.run_in_executor(None, file_to_html, filename, save_filename)
    return await pending
@register_converter('.docx')
@register_converter('.doc')
def word_to_html(filename: str, save_filename: str) -> str:
    """Convert a Word document to HTML using a Word COM automation instance.

    :param filename: Path of the ``.doc``/``.docx`` file to open.
    :param save_filename: Path to save the HTML output to.
    :return: ``save_filename`` as given by the caller.
    :raises Exception: Any COM error raised while opening or saving propagates to the caller
        (after Word has been asked to quit).
    """
    word = EnsureDispatch('Word.Application')
    try:
        word.Visible = False
        word.DisplayAlerts = False
        # Word runs in its own process, so pass absolute paths; relative ones would not be
        # resolved against this process's working directory.
        doc = word.Documents.Open(os.path.abspath(filename))
        doc.SaveAs(os.path.abspath(save_filename), constants.wdFormatHTML)  # alternative: wdFormatFilteredHTML
    finally:
        # Quit even on failure so a hidden Word process is not left running.
        word.Quit()
    return save_filename
@register_converter('.xlsx')
@register_converter('.xls')
def excel_to_html(filename: str, save_filename: str) -> str:
    """Convert an Excel workbook to HTML using an Excel COM automation instance.

    :param filename: Path of the ``.xls``/``.xlsx`` file to open.
    :param save_filename: Path to save the HTML output to.
    :return: ``save_filename`` as given by the caller.
    :raises Exception: Any COM error raised while opening or saving propagates to the caller
        (after Excel has been asked to quit).
    """
    excel = EnsureDispatch('Excel.Application')
    try:
        excel.Visible = False
        excel.DisplayAlerts = False
        # Excel runs in its own process, so pass absolute paths; relative ones would not be
        # resolved against this process's working directory.
        wkbk = excel.Workbooks.Open(os.path.abspath(filename))
        wkbk.SaveAs(os.path.abspath(save_filename), constants.xlHtml)
    finally:
        # Quit even on failure so a hidden Excel process is not left running.
        excel.Quit()
    return save_filename
@register_converter('.pptx')
def powerpoint_to_pdf(filename: str, save_filename: str) -> str:
    """Convert a PowerPoint presentation to PDF using a PowerPoint COM automation instance.

    The output is always a PDF: the original author notes that their Office version does not
    support HTML export, so any other extension on ``save_filename`` is replaced with ``.pdf``.

    :param filename: Path of the ``.pptx`` file to open.
    :param save_filename: Requested output path; its extension is forced to ``.pdf``.
    :return: The output path with the ``.pdf`` extension applied.
    :raises Exception: Any COM error raised while opening or saving propagates to the caller
        (after PowerPoint has been asked to quit).
    """
    root, ext = os.path.splitext(save_filename)
    if ext.lower() != '.pdf':
        save_filename = root + '.pdf'

    powerpoint = EnsureDispatch('Powerpoint.Application')
    try:
        try:
            powerpoint.Visible = False
        except Exception:
            # Best effort only: presumably some PowerPoint versions refuse to hide the main window.
            pass
        powerpoint.DisplayAlerts = False
        # PowerPoint runs in its own process, so pass absolute paths; relative ones would not be
        # resolved against this process's working directory.
        pres = powerpoint.Presentations.Open(os.path.abspath(filename), WithWindow=False)
        pres.SaveAs(os.path.abspath(save_filename), constants.ppSaveAsPDF)
    finally:
        # Quit even on failure so a hidden PowerPoint process is not left running.
        powerpoint.Quit()
    return save_filename
@register_converter('.pdf')
def copy_pdf(filename: str, save_filename: str) -> str:
    """Handle PDF input by copying it to the output location unchanged.

    :param filename: Path of the source PDF.
    :param save_filename: Requested output path; its extension is forced to ``.pdf``.
    :return: The path the PDF was copied to.
    """
    base, suffix = os.path.splitext(save_filename)
    # A browser can show the PDF directly, so only the extension of the target is normalised.
    target = save_filename if suffix.lower() == '.pdf' else base + '.pdf'
    shutil.copyfile(filename, target)
    return target
def delete_file(filename: str):
    """Remove ``filename`` if possible; failures are ignored on purpose (best-effort cleanup).

    :param filename: Path of the file to delete. Values that are not usable paths (for example
        ``None``) are ignored as well.
    """
    try:
        os.remove(filename)
    except (OSError, TypeError, ValueError):
        # OSError: missing/locked file; TypeError/ValueError: not a usable path.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        pass
def get_app(save_url: str = '/converted/', save_path: str = './converted', adjust_path: Callable[[str], str] = None):
    """Create a FastAPI app that converts office documents and serves the results.

    Routes:

    * ``/`` - index page listing the converted files found in ``save_path``.
    * ``/convert/`` - convert a file that already exists at a path the server can reach.
    * ``/file_convert/`` - upload form; the uploaded file is converted and sent back.

    :param save_url: Static file url to access the converted files
    :param save_path: Path to save the converted files (created if it does not exist)
    :param adjust_path: Function that takes in a path and returns a new path that may be modified.
    :return: FastAPI application.
    """
    from html import escape
    from urllib.parse import quote

    from fastapi import FastAPI, Request, BackgroundTasks
    from fastapi.staticfiles import StaticFiles
    from fastapi.responses import HTMLResponse, FileResponse

    # StaticFiles refuses to mount a directory that does not exist yet.
    os.makedirs(save_path, exist_ok=True)

    app = FastAPI()
    app.mount(save_url, StaticFiles(directory=save_path))

    @app.get('/')
    async def list_convert(request: Request):
        """Render the index page with usage links and the list of converted files."""
        header = '<h1><a href="{base_url}">Office to HTML</a></h1>' \
                 '<p><b>File Converter:</b> <a href="{base_url}file_convert/">{base_url}file_convert/</a></p>' \
                 '<p><b>API Converter:</b> ' \
                 '{base_url}convert/?filename=server_file_path&amp;save_filename=save_file_path<br>' \
                 'This uses a servers filepath (samba) to convert files.' \
                 '</p>'.format(base_url=request.base_url)

        # Build links with URL separators (not os.path.join) and escape names before embedding them in HTML.
        url_prefix = save_url.rstrip('/')
        items = '\n'.join(
            '<li><a href="{}">{}</a></li>'.format(escape(url_prefix + '/' + quote(name)), escape(name))
            for name in os.listdir(save_path) if '.' in name)
        listing = '<p><b>Converted Files:</b>' \
                  '<ul>{}</ul>' \
                  '</p>'.format(items)
        return HTMLResponse(header + listing)

    @app.get('/convert/')
    async def convert(filename: str = None, save_filename: str = None):
        """Convert a server-side file and return the static url of the result as ``{'results': url}``."""
        if not filename:
            # Nothing to convert; report an empty result instead of failing inside url2pathname(None).
            return {'results': ''}

        filename = url2pathname(filename)
        if save_filename is None:
            save_filename = os.path.join(save_path, os.path.splitext(os.path.basename(filename))[0] + '.html')
        save_filename = url2pathname(save_filename)

        if adjust_path:
            filename = adjust_path(filename)
            save_filename = adjust_path(save_filename)

        saved = await file_to_html_async(filename, save_filename)
        if saved:
            saved = pathname2url(saved.replace(save_path, save_url))
        return {'results': saved}  # Return the staticfile url

    @app.route('/file_convert/', methods=['GET', 'POST'])
    async def file_convert(request: Request):
        """Show the upload form (GET) or convert the uploaded file and send it back (POST)."""
        tasks = BackgroundTasks()
        msg: str = ''
        form = await request.form()
        file = form.get('file', None)
        keep_converted = form.get('keep_converted', False)

        if file is not None:
            saved = ''
            # The client controls the upload name: keep only its final path component so it
            # cannot point outside the intended directories.
            upload_name = os.path.basename(getattr(file, 'filename', None) or '')
            if upload_name and upload_name not in ('.', '..'):
                # Get the save filename
                save_filename = os.path.join(save_path, os.path.splitext(upload_name)[0] + '.html')
                try:
                    # (Windows Issue) Cannot read from temp location? Move file AppData/local/Temp -> AppData
                    parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(file.file.name)))
                except Exception:
                    # Data is in memory. Temporary file not created.
                    parent_dir = save_path
                fname = os.path.join(parent_dir, upload_name)

                # Possibly adjust the path
                if adjust_path:
                    fname = adjust_path(fname)
                    save_filename = adjust_path(save_filename)

                # Save in memory file to disk
                with open(fname, 'wb') as f:
                    shutil.copyfileobj(file.file, f)
                delete_file(file.file.name)

                # Convert, then always remove the temporary copy of the upload
                try:
                    saved = await file_to_html_async(fname, save_filename)
                except Exception:
                    saved = ''
                finally:
                    delete_file(fname)

            if saved:
                if not keep_converted:
                    tasks.add_task(delete_file, saved)  # Delete after FileResponse
                return FileResponse(saved, background=tasks)

            # Failed to convert!
            msg = '<p>Failed to convert the given file!</p>\n'

        # GET to convert file
        page = '<h1><a href="{base_url}">Office to HTML</a></h1>' \
               '{msg}' \
               '<form method="POST" enctype="multipart/form-data">' \
               '  <label for="file">Select a file (.xls to html is not supported on all browsers):</label><br>' \
               '  <input type="file" id="file" name="file"><br><br>' \
               '  <input type="checkbox" id="keep_converted" name="keep_converted">' \
               '  <label for="keep_converted">Keep Converted</label><br><br>' \
               '  <input type="submit">' \
               '</form>'.format(msg=msg, base_url=request.base_url)
        return HTMLResponse(page)

    return app
def convert_client(filename: str, save_filename: str = None, url='http://127.0.0.1:9001/convert/') -> dict:
    """Ask a running conversion server to convert ``filename`` and return its JSON reply.

    :param filename: Path of the file to convert, sent url-encoded as the ``filename`` query parameter.
    :param save_filename: Optional output path, sent as ``save_filename`` when truthy.
    :param url: Address of the server's convert endpoint.
    :return: The decoded JSON response, or ``{'results': ''}`` when the response cannot be decoded.
    """
    import requests  # Imported here so requests stays optional unless the client is actually used

    query = {'filename': pathname2url(filename)}
    if save_filename:
        query['save_filename'] = pathname2url(save_filename)

    response = requests.get(url, params=query)
    try:
        payload = response.json()
    except Exception:
        payload = {'results': ''}

    print(response.status_code, payload)
    return payload
if __name__ == '__main__':
    # Script entry point: serve the converter app with uvicorn on every interface, port 9001.
    import uvicorn

    app = get_app()
    server_options = {'host': '0.0.0.0', 'port': 9001}
    uvicorn.run(app, **server_options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment