Skip to content

Instantly share code, notes, and snippets.

@zafercavdar
Created August 7, 2023 11:10
Show Gist options
  • Save zafercavdar/d7294ce815aef81bb9e61cda5dcd7a67 to your computer and use it in GitHub Desktop.
Save zafercavdar/d7294ce815aef81bb9e61cda5dcd7a67 to your computer and use it in GitHub Desktop.
Excel content reformatting
import argparse
import re
import pandas as pd
parser = argparse.ArgumentParser(description="Reformatter app")
parser.add_argument("path", type=str, help="Path to an Excel file")
def uppercase_first_letter(text: str) -> str:
return text[0].upper() + text[1:] if len(text) > 0 else text
def normalize_not_available_info(text: str) -> str:
phrases = [
r"no available information",
r"no information available",
r"no information provided",
r"no.*mention of",
r"is not disclosed",
r"no.*involved",
r"does not provide",
r"doesn't provide",
r"not mentioned",
r"no.*signs",
]
search_text = str(text).lower()
for phrase in phrases:
if re.search(phrase, search_text) is not None:
return "N/A"
return uppercase_first_letter(str(text).strip())
def export_as_excel(df, filename, sheet_name="Sheet 1"):
writer = pd.ExcelWriter(filename, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheet_name, index=True) # send df to writer
worksheet = writer.sheets[sheet_name] # pull worksheet object
worksheet.set_column(0, 0, 60)
for column in df: # loop through all columns
series = df[column]
col_idx = df.columns.get_loc(column)
max_len = max((
series.astype(str).map(len).max(), # len of largest item
len(str(series.name)), # len of column name/header
))
worksheet.set_column(col_idx + 1, col_idx + 1,
max_len // 1.1) # set column width
writer.save()
def reformat(filename):
df = pd.read_excel(filename, parse_dates=False)
column_order = sorted(df["Dimensions"].unique())
df["Results_fixed"] = df["Results"].apply(normalize_not_available_info)
formatted_df = df.pivot_table(values='Results_fixed',
index=df["Company name"],
columns='Dimensions',
aggfunc='first')[column_order].fillna("N/A")
new_filename = filename.rsplit(".", maxsplit=1)[0] + "-reformatted.xlsx"
export_as_excel(formatted_df, new_filename)
print(f"The new file is saved as {new_filename}")
if __name__ == "__main__":
args = parser.parse_args()
reformat(args.path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment