Last active
November 2, 2021 09:29
-
-
Save qnkhuat/691435797e59bfbe26595b6fe0f1da65 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install dependencies:
#   pip3 install pandas
#   pip3 install xlrd
#   pip3 install openpyxl
import pandas as pd | |
from glob import glob | |
import argparse | |
import os | |
from multiprocessing import Pool, cpu_count | |
def work(path, n=1, out_folder="./", override=False):
    """Split one Excel workbook into ``n`` CSV files of consecutive rows.

    Output files are named ``<stem>_<k>.csv`` (``k`` = 1..n) and written
    under ``out_folder``, where ``<stem>`` is ``path`` without its final
    extension. Any directory part of ``path`` is kept relative to
    ``out_folder``; leading ``./``, ``../`` and root separators are dropped.

    Args:
        path: Path of the ``.xlsx`` file to convert.
        n: Number of CSV files to produce. Values below 1 produce nothing.
        out_folder: Directory that receives the CSV files.
        override: When false, an output file that already exists is skipped
            instead of being rewritten.

    The workbook is only read once, and only if at least one output file
    actually has to be written.
    """
    print("---------------")
    print("Start process file: ", path)

    # Strip only the last extension so dots elsewhere in the name (or in a
    # directory name) survive, then drop components that would make the
    # result absolute or point outside out_folder.
    stem, _ext = os.path.splitext(os.path.normpath(path))
    stem = os.sep.join(
        part
        for part in stem.split(os.sep)
        if part not in ("", os.curdir, os.pardir)
    )

    df = None
    rows_per_file = 0
    for i in range(n):
        out_path = os.path.join(out_folder, f"{stem}_{i + 1}.csv")

        # skip if file existed and not override
        if os.path.exists(out_path) and not override:
            print(f"SKIP due to existed: {out_path}")
            continue

        # Lazy load: reading the workbook is the expensive step, so it is
        # deferred until a chunk really needs to be written.
        if df is None:
            df = pd.read_excel(path, engine='openpyxl', dtype={"Số tờ khai": str})
            rows_per_file = len(df) // n

        start = i * rows_per_file
        # The last chunk also takes the remainder rows, so no data is lost
        # when len(df) is not a multiple of n.
        stop = len(df) if i == n - 1 else start + rows_per_file

        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        df.iloc[start:stop].to_csv(out_path, index=False, encoding="utf-8")
        print(f"Created file: {out_path}")
def main():
    """Command-line entry point: convert every matching xlsx file to CSVs.

    Parses the command-line options, creates the output folder, lists the
    files matching the ``--search`` glob pattern, waits for the user to
    confirm, then calls ``work`` for each file -- sequentially by default,
    or through a process pool when ``--fast`` is given.
    """
    parser = argparse.ArgumentParser(description='Convert xlsx to multiple csvs')
    parser.add_argument('-n', type=int,
                        default=2,
                        help='Convert 1 xlsx to n csv. Default is 2')
    parser.add_argument('--out',
                        default="./",
                        help='Path to folder to save files')
    parser.add_argument('--search',
                        default=f".{os.sep}*.xlsx",
                        help=f'Glob pattern to search xlsx files. Default is: ".{os.sep}*.xlsx"')
    parser.add_argument("--override", action="store_true",
                        default=False,
                        help="Override existed file")
    parser.add_argument("--fast", action="store_true",
                        default=False,
                        help="Brrrrr")
    args = parser.parse_args()

    # A non-positive split count would make every file a silent no-op.
    if args.n < 1:
        parser.error("-n must be a positive integer")

    # create output folder
    os.makedirs(args.out, exist_ok=True)

    files = glob(args.search)
    print(f"Found {len(files)} files")
    if not files:
        # Nothing to convert, so there is nothing to confirm either.
        return

    print("Enter to continue, Ctrl-C to cancel")
    input("")

    if args.fast:
        # One task per workbook; each worker process converts whole files.
        with Pool(cpu_count()) as p:
            p.starmap(work, [(file, args.n, args.out, args.override) for file in files])
    else:
        # 1-based counter so the last file reports N/N.
        for i, file in enumerate(files, start=1):
            print(f"{i}/{len(files)}")
            work(file, args.n, args.out, args.override)
# Run the CLI only when executed as a script, not when imported (this also
# keeps multiprocessing workers from re-running main on import).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment