Created
May 14, 2020 20:26
-
-
Save mlisovyi/480642b63381876d79a14a4d0af702cd to your computer and use it in GitHub Desktop.
Compare an excel with many tabs vs files in a set of folders
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import shutil | |
import pandas as pd | |
# inputs | |
dirs = [Path.cwd().joinpath("dir2"), Path.cwd().joinpath("dir1")] | |
for _dir in dirs: | |
if not _dir.is_dir(): | |
raise IOError(f"Can not find {_dir}.") | |
# excel table | |
file_excel_fin = "CUM" | |
file_excel_path = Path.cwd().joinpath(f"{file_excel_fin}.xlsx") | |
xlsx_in = pd.ExcelFile(file_excel_path) | |
# folder contents to compare | |
files_input_netsed = [[f for f in _dir.rglob("*") if f.is_file() and f.parent.name != "FOUND"] for _dir in dirs] | |
files_input = [f for sublist in files_input_netsed for f in sublist] | |
files_input_stems = [f.stem for f in files_input] | |
print(files_input) | |
out_excel_path = Path.cwd().joinpath(f"{file_excel_fin}_processed.xlsx") | |
with pd.ExcelWriter(out_excel_path) as xlsx_out: # pylint: disable=abstract-class-instantiated | |
for sheet in xlsx_in.sheet_names: | |
df = pd.read_excel(xlsx_in, sheet) | |
if 'Date' not in df: | |
continue | |
df['jc_year'] = (df['Date'].dt.month >= 10) + df['Date'].dt.year | |
df['file_name'] = '7101' + df['Number'].astype(str) + df['jc_year'].astype(str) | |
status = [] | |
for file_name_base in df['file_name'].values: | |
if file_name_base in files_input_stems: | |
status.append("FOUND") | |
f_idx = files_input_stems.index(file_name_base) | |
file_path = files_input[f_idx] | |
out_dir = file_path.parent.joinpath("FOUND") | |
out_dir.mkdir(parents=True, exist_ok=True) | |
shutil.move(str(file_path), str(out_dir)) | |
else: | |
status.append("NOT FOUND") | |
df['Status'] = status | |
df.drop(['jc_year', "file_name"], axis=1, inplace=True) | |
df.to_excel(xlsx_out, sheet, index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment