Skip to content

Instantly share code, notes, and snippets.

@johnb30
Created December 7, 2012 23:06
Show Gist options
  • Save johnb30/4237347 to your computer and use it in GitHub Desktop.
Save johnb30/4237347 to your computer and use it in GitHub Desktop.
Parallel implementation of data subsetting
import numpy as np
from joblib import Parallel, delayed
def subset(file):
    """Return the rows of a tab-separated file whose fourth field is '57'.

    The first line of the file is treated as a header and skipped.

    Args:
        file: Path of the tab-delimited text file to read.

    Returns:
        A list of rows; each row is the list of strings produced by
        splitting the line on tabs. Fields are not stripped, so the last
        field of a row keeps its trailing newline.
    """
    dataOut = []
    # The context manager guarantees the handle is closed, including when
    # this runs inside a worker process.
    with open(file, 'r') as data:
        data.readline()  # discard the header line
        for line in data:
            splitLine = line.split('\t')
            # Blank or truncated lines have no fourth field; skip them
            # instead of raising IndexError.
            if len(splitLine) > 3 and splitLine[3] == '57':
                dataOut.append(splitLine)
    return dataOut
def stack(list_of_data, hold_data):
    """Stack every chunk of rows beneath the rows in ``hold_data``.

    Args:
        list_of_data: Sequence of row collections (one per input file);
            each collection is a list of equal-length rows.
        hold_data: Rows placed at the top of the result (e.g. a header row).

    Returns:
        A 2-D numpy array containing ``hold_data`` followed by the rows of
        each non-empty chunk, in the order given.
    """
    pieces = [np.array(hold_data)]
    for chunk in list_of_data:
        # An empty chunk converts to a shape-(0,) array, which vstack cannot
        # combine with 2-D data, so chunks with no rows are skipped.
        if len(chunk) == 0:
            continue
        pieces.append(np.array(chunk))
    # A single vstack over all pieces accumulates every chunk and avoids
    # re-copying the growing array on each iteration.
    return np.vstack(pieces)
if __name__ == "__main__":
    filepath = ['testData1.txt', 'testData2.txt']
    # Seed the result with the header row of the first input file; the
    # context manager closes the handle as soon as the header is read.
    with open(filepath[0], 'r') as temp:
        hold = [temp.readline().split('\t')]
    # Filter each input file in its own job; n_jobs=-1 asks joblib to use
    # all available CPUs.
    data = Parallel(n_jobs=-1)(delayed(subset)(x) for x in filepath)
    # Combine the header with the filtered rows from every file.
    finalData = stack(data, hold)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment