Last active
May 11, 2023 13:08
-
-
Save frainfreeze/ba1612249293efc5bfb1e4c0cd549627 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
__author__ = "Tomislav Kucar" | |
__email__ = "kucar.tomislav@gmail.com" | |
__status__ = "Prototype" | |
"""Excel sheet to database table extractor | |
This script allows the user to extract data from excel sheet, | |
transform it and load it into a database table. | |
This tool accepts excel (.xls, .xlsx) files for input and any | |
sqlalchemy supported database for output. | |
This script requires that `pandas` and `sqlalchemy` be installed | |
within the Python environment you are running this script in. | |
This file can also be imported as a module. | |
""" | |
import pandas as pd | |
from sqlalchemy import create_engine | |
def excel_extract(filepath, sheet, rm_rows, rm_columns, rm_tail, table_name, db): | |
''' | |
Extracts data from excel sheet, transforms it and loads it into database table. | |
Parameters: | |
filepath (str): path to excel file | |
sheet (int): excel sheet number | |
rm_rows (int): number of rows to trim from top | |
rm_columns (list): list of columns to remove | |
rm_tail (int): tail length to trim | |
table_name (str): database table name | |
db (sqlalchemy.engine.Engine): sqlalchemy engine instance | |
''' | |
df = pd.read_excel(filepath, sheet_name = sheet) | |
df.drop(df.head(rm_rows).index, inplace=True) | |
df.drop(df.columns[rm_columns], axis=1, inplace=True) | |
df.drop(df.tail(rm_tail).index, inplace=True) | |
# bug workaround, see https://github.com/pandas-dev/pandas/issues/34211 | |
df.replace({'%': 'pct'}, regex=True, inplace=True) | |
df = df.rename(columns=df.iloc[0]).drop(df.index[0]) # set column names | |
df.to_sql(table_name, db, index=False) | |
if __name__ == "__main__": | |
filepath = "SpaceNK_20181222 2.0.xlsx" | |
table_name = "last_week_report" | |
engine = create_engine('postgresql://bluset:1234@localhost:5432/foreo_task', echo=False) | |
excel_extract(filepath, sheet=0, rm_rows=4, rm_columns=[0,1,4], rm_tail=1, table_name=table_name, db=engine) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment