Skip to content

Instantly share code, notes, and snippets.

@mehalter
Created September 2, 2021 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mehalter/2898d701539be2a67ba43dd6c7192eb9 to your computer and use it in GitHub Desktop.
Save mehalter/2898d701539be2a67ba43dd6c7192eb9 to your computer and use it in GitHub Desktop.
xlsx's to csv
openpyxl==3.0.7
pandas==1.3.2
import os
import numpy as np
import pandas as pd
# Directory to read in xlsx files from
data_dir = r"data"
output_file = r"output.csv"
# Indexes of interested columns
columns = {
"TP1": 1,
"TP2": 2,
"TP3": 3,
}
# Indexes of interested rows
rows = {
"TEST": 5,
"BP": 6,
"AMBIOLIC": 7,
"SOMETHING": 14,
"WELP": 15,
}
# Define entry key
key = {
"": 0,
"x": 1,
"xx": 2,
"xxx": 3,
"o": -1,
}
final_df = None
# Iterate over files in data directory, recursively
for subdir, dirs, files in os.walk(data_dir):
for filename in files:
# Only look at the xlsx files
if filename.endswith(".xlsx"):
# Get the full file path from the file system walk
filepath = os.path.join(subdir, filename)
# Open the file into a pandas data frame
data = pd.read_excel(filepath).replace(np.nan, "", regex=True)
# Get the id from the filename
id = os.path.splitext(filename)[0]
# Make a temporary data frame to hold the data
entry = pd.DataFrame(columns=[id])
# Iterate over the interested rows and columns
for row in rows:
for col in columns:
# Generate the unique variable name
variable = "{}_{}".format(row, col)
# Set the variable in the temp data frame using the lookup key
entry.loc[variable] = key[data.iloc[rows[row], columns[col]]]
# Transpose entry to be a single row with several columns
entry = entry.T
# If a final dataframe hasn't been started, start it
if final_df is None:
final_df = entry
# If there is a final data frame, append the new entry to it
else:
final_df = final_df.append(entry)
# Print the final dataframe
print(final_df)
# Output generated data frame as a CSV file
if final_df is not None:
final_df.to_csv(output_file, index_label="id")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment