Skip to content

Instantly share code, notes, and snippets.

@mappingvermont
Created April 3, 2017 21:36
Show Gist options
  • Save mappingvermont/c8d36fe49c7bfd5163e102f01f87fa2a to your computer and use it in GitHub Desktop.
Save mappingvermont/c8d36fe49c7bfd5163e102f01f87fa2a to your computer and use it in GitHub Desktop.
Join pandas DataFrames and write the results to CSV using multiprocessing
import feather
import os
import pandas as pd
import multiprocessing
# Hard-coded input directory that is scanned for files to process.
folder_path = r'/home/ubuntu/raster-vector-to-tsv/output/925ff7f3-c08b-4de5-a385-469e0b26975a/final_tsv/'

# Full path of every directory entry whose extension is exactly '.tsv'.
csv_list = [
    os.path.join(folder_path, name)
    for name in os.listdir(folder_path)
    if os.path.splitext(name)[-1] == '.tsv'
]
def mp_worker(csv_path):
    """Merge the frame stored at ``csv_path`` with itself and write a CSV.

    The file is loaded twice with ``feather.read_dataframe`` and the two
    copies are left-joined on the columns named ``'0'`` and ``'1'``. The
    result is written, without index or header row, to a file next to the
    input whose name is the input's stem plus ``_out.csv``.

    Args:
        csv_path: Path of the input file to read.

    Returns:
        None. The only effect is the output file and two progress messages.
    """
    # print() with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('starting merge')

    # NOTE(review): both sides are read from the same path, so this is a
    # self-join -- confirm that is intended rather than a placeholder for a
    # second input.
    df1 = feather.read_dataframe(csv_path)
    df2 = feather.read_dataframe(csv_path)
    df = pd.merge(df1, df2, how='left', on=['0', '1'])

    print('writing')
    out_csv = os.path.splitext(csv_path)[0] + '_out.csv'
    # to_csv documents index/header as booleans; False states explicitly
    # what None only achieved by being falsy.
    df.to_csv(out_csv, index=False, header=False)
def mp_handler():
    """Run ``mp_worker`` over every path in ``csv_list`` in a process pool.

    ``Pool.map`` blocks until all paths have been handled. The pool is then
    closed and joined so its worker processes are shut down here instead of
    being left for interpreter teardown.

    Returns:
        None. The results of ``mp_worker`` are discarded.
    """
    pool = multiprocessing.Pool()
    try:
        pool.map(mp_worker, csv_list)
    finally:
        # close() stops new submissions; join() waits for the workers to
        # exit. try/finally is used rather than a with-statement so the
        # function also runs on Python 2, where Pool is not a context manager.
        pool.close()
        pool.join()
# Script entry point: start the pool only when this file is run directly,
# not when it is imported as a module.
if __name__ == '__main__':
    mp_handler()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment