Skip to content

Instantly share code, notes, and snippets.

@blakeNaccarato
Last active January 12, 2024 04:35
Show Gist options
  • Save blakeNaccarato/078c45c4cf6dd6cff4e8dd3819dc9830 to your computer and use it in GitHub Desktop.
Save blakeNaccarato/078c45c4cf6dd6cff4e8dd3819dc9830 to your computer and use it in GitHub Desktop.
Generate data for different lab sections.
"""
From all Excel workbooks in the working directory, generate multiple distinct Excel
workbooks in subfolders with different data and computed results removed. Retain an
instructor's key for each distinct dataset.
- Remove cell names.
- Remove sheets matching "pivot".
- Multiply bold cells by a factor from a Gaussian distribution and unbold them.
- Clear italic cells.
- Clear table column formulas.
"""
import os
import random
from decimal import Decimal
from glob import glob
from typing import List, Tuple
import openpyxl
import pyxltab
from openpyxl.cell.cell import TYPE_NUMERIC, Cell
from openpyxl.styles import Font
from openpyxl.workbook.workbook import Workbook
# * ------------------------------------------------------------------------------ # *
# * MAIN * #
def main():
"""
From all Excel workbooks in the working directory, generate multiple distinct Excel
workbooks in subfolders with different data and computed results removed. Retain an
instructor's key for each distinct dataset.
- Remove cell names.
- Remove sheets matching "pivot".
- Multiply bold cells by a factor from a Gaussian distribution and unbold them.
- Clear italic cells.
- Clear table column formulas.
"""
# Remove sheets matching "pivot".
match = "pivot"
# Of the cells modified, 95% will fall within 96% and 104% of the original value.
# Over 99% will fall between 94% and 106%.
factor_mean = 1
factor_std = 0.02
# Generate this many files with the suffix starting at a certain number and
# incrementing by one with each file. Useful for appending course section numbers.
num_files = 4
suffix_begin = 1001
in_files = glob("[!~$]*.xlsx") # Don't include temp XLSX files marked by "~$"
for in_file in in_files:
out_files_key, out_files_student = generate_out_files(
in_file, num_files, suffix_begin
)
for out_key, out_student in zip(out_files_key, out_files_student):
modify_book(in_file, out_key, out_student, match, factor_mean, factor_std)
# * ------------------------------------------------------------------------------ # *
# * PRIMARY FUNCTIONS * #
def generate_out_files(
in_filename: str, num_files: int, suffix_begin: int
) -> Tuple[List[str], List[str]]:
"""
Generates output filenames given an input filename. Make directories as needed.
"""
(in_file, ext) = os.path.splitext(in_filename)
out_root = os.path.abspath(in_file)
repeated_args = (out_root, in_file, num_files, suffix_begin, ext)
out_files_key = generate_out_files_helper(*repeated_args, additional_suffix="key")
out_files_student = generate_out_files_helper(*repeated_args)
return (out_files_key, out_files_student)
def modify_book(
book_path_in: str,
book_path_key: str,
book_path_student: str,
match: str,
factor_mean: float,
factor_std: float,
):
"""
From one master workbook, generate a workbook with different data and computed
results removed. "Bold" cells will be modified, "italic" cells will be cleared.
Sheets containing the word "pivot" will be deleted.
"""
# First just apply Gaussian scaling and save the instructor's key.
openpyxl_book = openpyxl.load_workbook(book_path_in)
gaussian_scale_bold(openpyxl_book, factor_mean, factor_std)
openpyxl_book.save(book_path_key)
# Then, clear all computed values and remove formatting.
remove_cell_names(openpyxl_book)
remove_matching_sheets(openpyxl_book, match)
format_and_clear_formulas(openpyxl_book)
openpyxl_book.save(book_path_student)
# * ------------------------------------------------------------------------------ # *
# * SECONDARY FUNCTIONS * #
def generate_out_files_helper(
out_root: str,
in_file: str,
num_files: int,
suffix_begin: int,
ext: str,
additional_suffix: str = "",
) -> List[str]:
"""
Generate output files based on the input file.
"""
if additional_suffix != "":
additional_suffix = "_" + additional_suffix
out_files = [
os.path.join(
out_root, in_file + "_" + str(suffix_begin + num) + additional_suffix + ext
)
for num in range(num_files)
]
if not os.path.exists(out_root):
os.makedirs(out_root)
return out_files
def remove_cell_names(book: Workbook):
"""
Remove all defined names from the workbook. Does not remove table references.
"""
defined_names = book.defined_names
names = [dn.name for dn in defined_names.definedName]
for name in names:
del defined_names[name]
def remove_matching_sheets(book: Workbook, match: str):
"""
Remove all sheets in the workbook that match the supplied string.
"""
match = match.lower()
for name in book.sheetnames:
if match in name.lower():
del book[name]
def gaussian_scale_bold(book: Workbook, factor_mean: float, factor_std: float):
"""
Get the instructor key workbok by modifying bold, numeric cell values by a random
factor from a Gaussian distribution.
"""
tables = pyxltab.get_tables(book)
tables_cells = [table.get_cells() for table in tables.values()]
for table_cells in tables_cells:
for col_cells in table_cells:
for cell in col_cells:
if cell.data_type == TYPE_NUMERIC and cell.font.bold:
modify_cell_gauss(cell, factor_mean, factor_std)
def format_and_clear_formulas(book: Workbook):
"""
Clear italic cells. Unbold an unitalicize. If a table column has a formula, clear
all cells in that column, whether or not they were italicized. Clear column
formulas.
"""
not_bold = Font(bold=False)
not_italic = Font(italic=False)
tables = pyxltab.get_tables(book)
tables_cells = [table.get_cells() for table in tables.values()]
for (table, table_cells) in zip(tables.values(), tables_cells):
for (col, col_cells) in zip(table.columns.values(), table_cells):
col_formula = col.openpyxl_column.calculatedColumnFormula
for cell in col_cells:
# Unbold
if cell.data_type == TYPE_NUMERIC and cell.font.bold:
cell.font = not_bold
# Clear italic cells. Unitalicize.
elif cell.font.italic:
cell.value = None
cell.font = not_italic
# If a table column has a formula, clear all cells in that column,
# whether or not they were italicized.
if cell.data_type == TYPE_NUMERIC and col_formula:
cell.value = None
# Clear column formulas.
if col_formula:
col.openpyxl_column.calculatedColumnFormula = None
# * ------------------------------------------------------------------------------ # *
# * TERTIARY FUNCTIONS * #
def modify_cell_gauss(cell: Cell, factor_mean: float, factor_std: float):
"""
Change cell value by a random factor from a Gaussian distribution.
"""
num_places = get_num_places(cell)
value = Decimal(cell.value)
factor = Decimal(random.gauss(factor_mean, factor_std))
cell.value = (value * factor).quantize(num_places)
def get_num_places(cell: Cell) -> Decimal:
"""
For the given cell, get the number of places beyond the decimal point. Return a
pattern to be used with `Decimal.quantize(result)` to adjust other numbers to the
same precision.
"""
if "." in str(cell.value):
digits_after_decimal = len(str(cell.value).split(".")[-1])
else:
digits_after_decimal = 0
num_places = Decimal(10) ** -digits_after_decimal
return num_places
# * ------------------------------------------------------------------------------ # *
# * RUN MAIN * #
# Run the `main()` function defined above if this script is directly invoked.
if __name__ == "__main__":
main()
openpyxl
pyxltab
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment