Created
September 2, 2021 14:26
-
-
Save mehalter/2898d701539be2a67ba43dd6c7192eb9 to your computer and use it in GitHub Desktop.
xlsx's to csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
openpyxl==3.0.7 | |
pandas==1.3.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import numpy as np | |
import pandas as pd | |
# Directory to read in xlsx files from | |
data_dir = r"data" | |
output_file = r"output.csv" | |
# Indexes of interested columns | |
columns = { | |
"TP1": 1, | |
"TP2": 2, | |
"TP3": 3, | |
} | |
# Indexes of interested rows | |
rows = { | |
"TEST": 5, | |
"BP": 6, | |
"AMBIOLIC": 7, | |
"SOMETHING": 14, | |
"WELP": 15, | |
} | |
# Define entry key | |
key = { | |
"": 0, | |
"x": 1, | |
"xx": 2, | |
"xxx": 3, | |
"o": -1, | |
} | |
final_df = None | |
# Iterate over files in data directory, recursively | |
for subdir, dirs, files in os.walk(data_dir): | |
for filename in files: | |
# Only look at the xlsx files | |
if filename.endswith(".xlsx"): | |
# Get the full file path from the file system walk | |
filepath = os.path.join(subdir, filename) | |
# Open the file into a pandas data frame | |
data = pd.read_excel(filepath).replace(np.nan, "", regex=True) | |
# Get the id from the filename | |
id = os.path.splitext(filename)[0] | |
# Make a temporary data frame to hold the data | |
entry = pd.DataFrame(columns=[id]) | |
# Iterate over the interested rows and columns | |
for row in rows: | |
for col in columns: | |
# Generate the unique variable name | |
variable = "{}_{}".format(row, col) | |
# Set the variable in the temp data frame using the lookup key | |
entry.loc[variable] = key[data.iloc[rows[row], columns[col]]] | |
# Transpose entry to be a single row with several columns | |
entry = entry.T | |
# If a final dataframe hasn't been started, start it | |
if final_df is None: | |
final_df = entry | |
# If there is a final data frame, append the new entry to it | |
else: | |
final_df = final_df.append(entry) | |
# Print the final dataframe | |
print(final_df) | |
# Output generated data frame as a CSV file | |
if final_df is not None: | |
final_df.to_csv(output_file, index_label="id") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment