Skip to content

Instantly share code, notes, and snippets.

@xyb
Created April 4, 2023 08:39
Show Gist options
  • Save xyb/0a3023196a77cfd700f2c461d42b91b5 to your computer and use it in GitHub Desktop.
Save xyb/0a3023196a77cfd700f2c461d42b91b5 to your computer and use it in GitHub Desktop.
some functions to help read or generate excel files
import csv
import io
import logging
import openpyxl
logger = logging.getLogger("excel")
def excel_to_csv(path):
"""
Converts an Excel file to a list of CSV strings,
one for each sheet in the Excel file.
Args:
- path (str): The path to the Excel file to be converted.
Returns:
- result (list): A list of dictionaries, where each dictionary
contains the following keys:
- sheet_no (int): The number of the sheet in the Excel file.
- sheet_name (str): The name of the sheet in the Excel file.
- csv_content (str): The CSV content of the sheet.
Raises:
- FileNotFoundError: If the Excel file does not exist.
- openpyxl.utils.exceptions.InvalidFileException: If the Excel
file is not a valid Excel file.
"""
wb = openpyxl.load_workbook(path)
result = []
for id, sheet_name in enumerate(wb.sheetnames):
sheet = wb[sheet_name]
csv_content = io.StringIO()
csv_writer = csv.writer(csv_content)
for row in sheet.iter_rows(values_only=True):
csv_writer.writerow(row)
result.append(
dict(
sheet_no=id + 1,
sheet_name=sheet_name,
csv_content=csv_content.getvalue(),
),
)
return result
def sheets_to_excel(path, sheets):
"""
Write all sheets into a excel file.
It takes in two parameters: path and sheets. The path parameter is the path
to the excel file. The sheets parameter is a list of tuples, where each tuple
contains the sheet name and the csv content. The csv content is in csv format,
where the first line is the field names and the other lines are the data itself.
Example Usage:
content = 'a,b,c\\n1,2,3\\n4,5,6\\n'
excel_path = '/tmp/test.xlsx'
sheets_to_excel(excel_path, [('test sheet', content)])
Parameters:
- path (str): The path to the excel file.
- sheets (list): A list of tuples, where each tuple contains the sheet name
and the csv content.
Returns:
- None
>>> content = 'a,b,c\\n1,2,3\\n'
>>> excel_path = '/tmp/test.xlsx'
>>> sheets_to_excel(excel_path, [('test', content)])
>>> excel_to_csv(excel_path)
[{'sheet_no': 1, 'sheet_name': 'test', 'csv_content': 'a,b,c\\r\\n1,2,3\\r\\n'}]
>>> sheets_to_excel(excel_path, [])
>>> excel_to_csv(excel_path)
[{'sheet_no': 1, 'sheet_name': 'Sheet', 'csv_content': ''}]
"""
workbook = openpyxl.Workbook()
default_sheet = workbook["Sheet"]
if sheets:
workbook.remove(default_sheet)
for sheet in sheets:
sheet_name, csv_content = sheet
worksheet = workbook.create_sheet(title=sheet_name)
reader = csv.reader(csv_content.split("\n"))
for row_index, row in enumerate(reader):
for col_index, cell_value in enumerate(row):
worksheet.cell(
row=row_index + 1,
column=col_index + 1,
value=cell_value,
)
workbook.save(path)
def merge_same_cell(xlsx_filename):
"""
>>> content = 'a,b,c\\na,b,e\\na,d,e'
>>> excel_path = '/tmp/test.xlsx'
>>> sheets_to_excel(excel_path, [('test', content)])
>>> excel_to_csv(excel_path)
[... 'csv_content': 'a,b,c\\r\\na,b,e\\r\\na,d,e\\r\\n'}]
>>> merge_same_cell(excel_path)
>>> excel_to_csv(excel_path)
[... 'csv_content': 'a,b,c\\r\\n,,e\\r\\n,d,\\r\\n'}]
"""
wb = openpyxl.load_workbook(xlsx_filename)
for sheet in wb:
merge_list = []
for col in sheet.columns:
start_row = 1
for i in range(1, len(col)):
if col[i].value != col[i - 1].value:
if start_row != i:
cell_range = (
sheet.cell(row=start_row, column=col[0].column).coordinate
+ ":"
+ sheet.cell(row=i, column=col[0].column).coordinate
)
merge_list.append(cell_range)
start_row = i + 1
if start_row != len(col) + 1:
cell_range = (
sheet.cell(row=start_row, column=col[0].column).coordinate
+ ":"
+ sheet.cell(row=len(col), column=col[0].column).coordinate
)
merge_list.append(cell_range)
for cell_range in merge_list:
if cell_range in sheet.merged_cells:
sheet.unmerge_cells(cell_range)
sheet.merge_cells(cell_range)
wb.save(xlsx_filename)
def adjust_col_width_to_fit_content(xlsx_filename):
"""
Adjusts the column width of an Excel file to fit its content.
Args:
- xlsx_filename (str): The path to the Excel file.
Returns:
- None
"""
wb = openpyxl.load_workbook(xlsx_filename)
for sheet in wb:
for col in sheet.columns:
max_length = 0
column = col[0].column_letter # Get the column name
for cell in col:
try: # Necessary to avoid error on empty cells
if len(str(cell.value)) > max_length:
max_length = len(cell.value)
except Exception:
pass
adjusted_width = (max_length + 2) * 1.2 # Adjust the width
sheet.column_dimensions[column].width = adjusted_width
wb.save(xlsx_filename)
def auto_remove_unused_cols_and_rows(xlsx_filename):
wb = openpyxl.load_workbook(xlsx_filename)
for sheet in wb:
# Get the range of cells that contain data
used_range = sheet.calculate_dimension()
# Get the maximum row and column indices in the used range
max_row, min_col, max_col, min_row = openpyxl.utils.cell.range_boundaries(
used_range,
)
# Identify the header row
header_row = None
for row in range(min_row, max_row + 1):
if all(cell.value is None for cell in sheet[row]):
header_row = row
else:
break
# Delete any unused columns
for col in range(max_col, sheet.max_column + 1):
if all(
sheet[row][col].value is None for row in range(min_row, max_row + 1)
):
sheet.delete_cols(col)
# Delete any unused rows
if header_row is not None:
for row in range(max_row, header_row - 1, -1):
if all(
sheet[row][col].value is None for col in range(min_col, max_col + 1)
):
sheet.delete_rows(row)
else:
for row in range(max_row, 0, -1):
if all(
sheet[row][col].value is None for col in range(min_col, max_col + 1)
):
sheet.delete_rows(row)
# Unmerge any merged cells before deleting them
for merged_cell_range in sheet.merged_cells.ranges:
merged_cell_range.unmerge()
wb.save(xlsx_filename)
def auto_height_row(xlsx_filename):
wb = openpyxl.load_workbook(xlsx_filename)
for sheet in wb:
for row in sheet.rows:
max_height = 0
for cell in row:
try:
if cell.value:
cell_height = len(str(cell.value)) * 1.2
if cell_height > max_height:
max_height = cell_height
except Exception:
pass
if max_height > 0:
sheet.row_dimensions[row[0].row].height = max_height
wb.save(xlsx_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment