Skip to content

Instantly share code, notes, and snippets.

@mmacfadden
Last active September 3, 2022 19:46
Show Gist options
  • Save mmacfadden/49a2782ce056aef29b77d546bd067d28 to your computer and use it in GitHub Desktop.
Save mmacfadden/49a2782ce056aef29b77d546bd067d28 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
###############################################################################
# CSC-846 Lab 02
# Michael MacFadden
#
# This script unhides hidden sheets from a legacy Microsoft Office spreadsheet
# using the BIFF8 format (".xls"). The script takes a single argument which
# is the name of the file to process. The script will take the following
# actions:
# 1. Validate that the file exists, is readable, and is an OLE file.
# 2. Open the file and scan for the Workbook containing worksheets.
# 3. Inspect each worksheet it finds printing its name, type, and state.
# 4. If a worksheet is hidden, it will be made visible.
# 5. If any worksheet was unhidden, the original file will remain
# unmodified, but a new copy of the file with unhidden sheets will
# be saved in the working directory.
# 6. The script will print the name of all unhidden sheets.
#
# The script is mostly self-contained (it does not launch any external
# processes). However, it makes use of the olefile package to read
# and write OLE data.
#
# Usage:
# unhide-sheets.py myfile.xls
#
# Dependencies:
# - python => 3.10
# - olefile: => 0.46
#
# References:
# - https://blog.reversinglabs.com/blog/excel-4.0-macros
# - https://interoperability.blob.core.windows.net/files/MS-XLS/%5bMS-XLS%5d.pdf ([MS-XLS]: Excel Binary File Format)
# - https://docs.microsoft.unpackcom/en-us/openspecs/office_file_formats/ms-xls/b9ec509a-235d-424e-871d-f8e721106501
#
###############################################################################
import argparse
import os
import random
import string
import struct
import tempfile
from pathlib import Path
from shutil import copy2, move
import olefile
##
## Main entry point when run as a script.
##
def main() -> None:
"""This method is the main entry point of the script.
It will validate that a filename was passed in, ensure that the file
exists, is readable, and is an OLE file. It will then process the
file to unhide any hidden sheets.
"""
filename = process_arguments()
validate_file(filename)
process_ole_xls_file(filename)
##
## Argument Processing and Validation
##
def process_arguments() -> str:
"""Obtains the filename parameter passed in on the command line.
If the parameter was not passed in, a usage statement will be presented
and the script will terminate.
"""
parser = argparse.ArgumentParser(description='Unhide XLS hidden sheets.')
parser.add_argument('filename', type=str, nargs="+", help='The name of the file to open.')
args = parser.parse_args()
filename = args.filename[0]
return filename
def validate_file(filename: str) -> None:
"""Validates that the file exists, is readable, and an OLE file.
If not, the method will print an error message and exit.
"""
file_path = Path(filename)
if not file_path.is_file():
print_file_error_and_exit("File does not exist", filename)
if not os.access(filename, os.R_OK):
print_file_error_and_exit("File is not readable", filename)
if not olefile.isOleFile(filename):
print_file_error_and_exit("invalid OLE file", filename)
def print_file_error_and_exit(error: str, filename: str) -> None:
"""A helper method that prints an error and exits."""
print(f"\nERROR - {error}: {filename}")
exit(1)
##
## OLE File Processing
##
def process_ole_xls_file(filename) -> None:
"""Unhides hidden sheets in an OLE XLS file.
If hidden sheets are found in the input file a new file, prepended with
'unhidden" will be written to the current directory. This file will have
all sheets unhidden. If no hidden sheets were found in the input file
then no action will be taken.
The method will print out a listing of all sheets in the workbook, along
with their type and state. The method will also print a summary of which
sheets were unhidden, if any.
"""
print(f"Unhiding sheets in MS-XLS file: {filename}")
# Create a tmp file that is a copy of the file we are processing. It is
# easier to operate on a copy, so we can edit as we go. If we don't
# find any hidden sheets the tmp file is deleted.
random_file_name = ''.join(random.choices(string.ascii_letters + string.digits, k=20))
tmp_filename = os.path.join(tempfile.gettempdir(), random_file_name)
copy2(filename, tmp_filename)
print(f"\nWorkbook Sheets:")
modified_sheets = list()
# Open file tmp file we created and find the workbook stream.
with olefile.OleFileIO(tmp_filename, write_mode=True) as ole_file:
streams = ole_file.listdir()
for path in streams:
if 'Book' in path or 'Workbook' in path:
# We have found the path corresponding to the workbook
# that should contain the worksheets. Processes this
# path and obtain the list of sheets (if any) that
# were unhidden.
modified_sheets = process_ole_workbook_stream(ole_file, path)
if len(modified_sheets) > 0:
print("\nUnhidden Sheets:")
for sheet_name in modified_sheets:
print(f" - {sheet_name}")
source_path = Path(filename).resolve()
unhidden_name = f"unhidden_{source_path.name}"
dest_path = source_path.parent / unhidden_name
print(f"\nWritting modifications to: {unhidden_name}")
move(tmp_filename, dest_path.absolute())
else:
# There were no unhidden sheets, so remove the tmp file.
print("\nNo hidden sheets were found in the file.")
os.remove(tmp_filename)
def process_ole_workbook_stream(ole_file: olefile.OleFileIO, path: list[str]) -> list[str]:
"""Processes the workbook stream and unhides any hidden sheets.
The method returns the names of any unhidden sheets.
"""
modified_sheets = list()
stream_path = '/'.join(path)
stream = ole_file.openstream(stream_path).read()
# Tracks the current offset into the stream.
offset = 0
while offset < len(stream):
# Extract the record header that contains the type of entry as well as
# the length of the record.
#
# From: [MS-XLS] - v20220816 Excel Binary File Format
#
# |----------------------------------|
# |0 | 1 |
# |0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 |
# |----------------------------------|
# | type (1 byte) | length (1-byte) |
# |----------------------------------|
reord_header_format = 'HH'
record_header_size = struct.calcsize(reord_header_format)
if len(stream[offset:offset + record_header_size]) < record_header_size:
break
header = stream[offset:offset + record_header_size]
record_type, record_length = struct.unpack(reord_header_format, header)
# Grab a slice of the stream the represents the data for the
# current record.
record_data_start = offset + record_header_size
record_data = stream[record_data_start:record_data_start + record_length]
# Check to see if this record is a BoundSheet record (type of 85)
if record_type == bound_sheet_record_type:
# Extract the sheet type, state, and name.
#
# From: [MS-XLS] - v20220816 Excel Binary File Format
#
# |----------------------------------------------------------------------|
# |0 |1 |2 |3 |
# |0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7 |
# |----------------------------------------------------------------------|
# | BOF Start Position (4 bytes) |
# |----------------------------------------------------------------------|
# | st | unused | type (1 byte) | name (variable 1-31 bytes) |
# |----------------------------------------------------------------------|
sheet_meta_format = 'BB'
bof_ptr_len = 4
sheet_meta_size = struct.calcsize(sheet_meta_format)
sheet_meta = record_data[bof_ptr_len:bof_ptr_len + sheet_meta_size]
sheet_state, sheet_type = struct.unpack(sheet_meta_format, sheet_meta)
# The sheet name is the rest of the data in this record.
sheet_name_start = bof_ptr_len + sheet_meta_size
sheet_name = decode_sheetname(record_data[sheet_name_start:])
print(f" - Name: {sheet_name}\n Type: {sheet_types[sheet_type]}\n State: {sheet_states[sheet_state]}\n")
# If the sheet is not visible, update the byte that represents the
# visibility state to make it visible and add the sheet to the list of
# unhidden sheets.
if (sheet_state != 0):
new_state = bytes([sheet_state & ~0x03])
stream = stream[:record_data_start + bof_ptr_len] + \
new_state + stream[record_data_start + bof_ptr_len + 1:]
modified_sheets.append(sheet_name)
# Increment the offset to just after the record we just processed.
offset = offset + record_header_size + record_length
if len(modified_sheets) > 0:
# If any sheets were unhidden, we need to write the updated stream
# back to the file.
ole_file.write_stream(stream_path, stream)
return modified_sheets
##
## Helper Methods
##
def decode_sheetname(data):
"""A helper utility to decode the BoundSheet name"""
cch = int_or_unicode_value(data[0])
highbyte = int_or_unicode_value(data[1])
if highbyte == 0:
return data[2:2 + cch].decode("utf-8")
else:
return repr(data[2:2 + cch * 2])
def int_or_unicode_value(value: int | str) -> int:
"""A helper to coalesce a one char string or int to an int"""
return value if type(value) == int else ord(value)
##
## Constants
##
bound_sheet_record_type = 0x85
sheet_states = {
0x00: "visible",
0x01: "hidden",
0x02: "very hidden"
}
sheet_types = {
0x0: "Worksheet or Dialog Sheet",
0x1: "Macro Sheet",
0x2: "Chart Sheet",
0x6: "VBA Module"
}
##
## Script Entry Point
##
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment