Skip to content

Instantly share code, notes, and snippets.

@qnkhuat
Last active November 2, 2021 09:29
Show Gist options
  • Save qnkhuat/691435797e59bfbe26595b6fe0f1da65 to your computer and use it in GitHub Desktop.
Save qnkhuat/691435797e59bfbe26595b6fe0f1da65 to your computer and use it in GitHub Desktop.
# install
# pip3 install pandas
# pip3 install xlrd
# pip3 install openpyxl
import pandas as pd
from glob import glob
import argparse
import os
from multiprocessing import Pool, cpu_count
def work(path, n=1, out_folder="./", override=False):
    """Split one Excel workbook into ``n`` CSV files of consecutive rows.

    The workbook is read lazily: it is only opened once the first output
    file that actually has to be written is encountered, so a fully
    converted workbook is skipped without being parsed.

    Args:
        path: Path to the source ``.xlsx`` file.
        n: Number of CSV files to produce. Each of the first ``n - 1`` files
            gets ``len(df) // n`` rows; the last file additionally receives
            the remainder so that no row is lost.
        out_folder: Existing directory that receives the CSV files.
        override: When false, output files that already exist are left
            untouched; when true they are rewritten.

    Output files are named ``<workbook stem>_<k>.csv`` with ``k`` running
    from 1 to ``n``.
    """
    print("---------------")
    print("Start process file: ", path)

    # Derive the output name from the workbook's own file name only. Using
    # splitext keeps dots that are part of the name ("a.b.xlsx" -> "a.b") and
    # basename keeps the outputs directly inside out_folder regardless of
    # which directory the workbook was found in.
    stem = os.path.splitext(os.path.basename(path))[0]

    df = None
    rows_per_file = 0
    for i in range(n):
        out_path = os.path.join(out_folder, f"{stem}_{i+1}.csv")

        # skip if file existed and not override
        if os.path.exists(out_path) and not override:
            print(f"SKIP due to existed: {out_path}")
            continue

        if df is None:
            # The "Số tờ khai" column is forced to str; presumably so that
            # identifier-like values are not converted to numbers -- confirm
            # against the data.
            df = pd.read_excel(path, engine='openpyxl', dtype={"Số tờ khai": str})
            rows_per_file = len(df) // n

        start = i * rows_per_file
        # The final chunk runs to the end of the frame so the
        # len(df) % n leftover rows are written instead of dropped.
        stop = len(df) if i == n - 1 else start + rows_per_file
        df.iloc[start:stop].to_csv(out_path, index=False, encoding="utf-8")
        print(f"Created file: {out_path}")
def main():
    """Command-line entry point: convert every matched xlsx file to CSVs.

    Parses the command-line options, creates the output folder, finds the
    workbooks matching ``--search`` with ``glob`` and, after the user
    confirms with Enter, calls ``work`` on each of them -- sequentially by
    default, or across a process pool when ``--fast`` is given.
    """
    parser = argparse.ArgumentParser(description='Convert xlsx to multiple csvs')
    parser.add_argument('-n', type=int,
                        default=2,
                        help='Convert 1 xlsx to n csv. Default is 2')
    parser.add_argument('--out',
                        default="./",
                        help='Path to folder to save files')
    # The value is handed to glob(), so it is a glob pattern, not a regex.
    parser.add_argument('--search',
                        default=f".{os.sep}*.xlsx",
                        help=f'Glob pattern to search xlsx files. Default is: ".{os.sep}*.xlsx"')
    parser.add_argument("--override", action="store_true",
                        default=False,
                        help="Override existed file")
    parser.add_argument("--fast", action="store_true",
                        default=False,
                        help="Brrrrr")
    args = parser.parse_args()

    # With n < 1 the conversion loop would run zero times and the script
    # would appear to succeed while producing nothing.
    if args.n < 1:
        parser.error("-n must be a positive integer")

    # create output folder
    os.makedirs(args.out, exist_ok=True)

    files = glob(args.search)
    print(f"Found {len(files)} files")
    if not files:
        # Nothing to convert: do not prompt or start worker processes.
        return

    print("Enter to continue, Ctlr-C to cancle")
    input("")

    if args.fast:
        # No point in starting more workers than there are files.
        with Pool(min(cpu_count(), len(files))) as p:
            p.starmap(work, [(file, args.n, args.out, args.override) for file in files])
    else:
        # 1-based counter so the last file is reported as N/N.
        for i, file in enumerate(files, start=1):
            print(f"{i}/{len(files)}")
            work(file, args.n, args.out, args.override)
# Run the converter only when this file is executed as a script, not when
# it is imported (e.g. by multiprocessing workers).
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment