Skip to content

Instantly share code, notes, and snippets.

@idiomer
Last active July 17, 2020 03:10
Show Gist options
  • Save idiomer/b0b7eaa8f2882edc2f269f816d89702f to your computer and use it in GitHub Desktop.
Save idiomer/b0b7eaa8f2882edc2f269f816d89702f to your computer and use it in GitHub Desktop.
多进程读取多个数据文件
from glob import glob
import multiprocessing
from tqdm import tqdm
import pandas as pd
def json_reader(fname):
    """Load a JSON Lines file into a DataFrame.

    Args:
        fname: Path of the file to read; handed unchanged to
            ``pandas.read_json``.

    Returns:
        The ``pandas.DataFrame`` produced by
        ``pd.read_json(fname, lines=True)``, i.e. the file is parsed as one
        JSON object per line.
    """
    return pd.read_json(fname, lines=True)
def parquet_reader(fname):
    """Load a single Parquet file into a DataFrame.

    Args:
        fname: Path of the Parquet file; handed unchanged to
            ``pandas.read_parquet``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_parquet(fname)``.
    """
    # A per-file row filter can be chained onto the read if only a subset of
    # rows is wanted, e.g. ``.query('has_exposure==1')``.
    return pd.read_parquet(fname)
def _load_frames(filenames, reader, processes=10):
    """Read every file with ``reader`` in a process pool and concatenate.

    Args:
        filenames: Sequence of paths; each one is passed to ``reader`` in a
            worker process.
        reader: Picklable, module-level callable taking one path and
            returning a ``pandas.DataFrame``.
        processes: Upper bound on the number of worker processes.

    Returns:
        One ``pandas.DataFrame`` with the per-file frames concatenated in the
        order of ``filenames`` (fresh index, columns sorted). An empty
        DataFrame is returned when ``filenames`` is empty.

    Raises:
        Any exception raised by ``reader`` in a worker is re-raised here when
        its result is collected.
    """
    if not filenames:
        # pd.concat raises ValueError on an empty sequence of frames.
        return pd.DataFrame()

    # No point in starting more workers than there are files to read.
    n_workers = max(1, min(processes, len(filenames)))

    # Both context managers guarantee cleanup (pool terminated, bar closed)
    # even if a reader fails part-way through.
    with multiprocessing.Pool(n_workers) as pool, \
            tqdm(total=len(filenames)) as pbar:
        pending = [
            pool.apply_async(
                reader,
                args=(fname,),
                # Runs in the parent process each time one file is finished.
                callback=lambda _: pbar.update(1),
            )
            for fname in filenames
        ]
        pool.close()
        pool.join()
        frames = [task.get() for task in pending]

    return pd.concat(frames, ignore_index=True, sort=True, copy=False)


# The guard keeps worker processes from re-running the driver when the main
# module is re-imported under the "spawn" start method (Windows, macOS).
if __name__ == "__main__":
    # Sorted so the row order of the result does not depend on directory
    # listing order.
    filenames = sorted(glob('data/dt=2020-04-01/part*'))
    df = _load_frames(filenames, parquet_reader)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment