Skip to content

Instantly share code, notes, and snippets.

@dorneanu
Created May 6, 2015 18:44
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save dorneanu/2393d8d03d0ed443baeb to your computer and use it in GitHub Desktop.
Save dorneanu/2393d8d03d0ed443baeb to your computer and use it in GitHub Desktop.
Simple parallelism with Python
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
""" Simple demo of using multiprocessing when parsing files """
import codecs
import os
import sys
from itertools import islice
from multiprocessing import Pool, Process, Queue, cpu_count
def get_file_list(folder):
    """Return the paths of all regular files found beneath ``folder``.

    The tree is traversed with ``os.walk``. The path of every sub-directory
    encountered on the way is printed to stdout.

    Args:
        folder: Root directory to scan.

    Returns:
        A list of file paths (walked directory joined with the file name),
        in the order ``os.walk`` yields them. The list is empty when nothing
        is found.
    """
    file_list = []
    # "Walk" through folder
    for root, dirs, files in os.walk(folder):
        for dir_name in dirs:
            print(os.path.join(root, dir_name))
        for filename in files:
            filepath = os.path.join(root, filename)
            # os.walk reports every non-directory entry as a file (broken
            # symlinks included), so keep only what really is a file.
            if os.path.isfile(filepath):
                file_list.append(filepath)
    return file_list
def parse_file(file_path, nr_lines):
    """Return the first ``nr_lines`` lines of a text file.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors='ignore'``). Line terminators are kept on the returned lines.

    Args:
        file_path: Path of the file to read.
        nr_lines: Maximum number of lines to return (non-negative).

    Returns:
        A list of at most ``nr_lines`` strings. It is shorter when the file
        has fewer lines and empty for an empty file.
    """
    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        # islice stops iterating after nr_lines lines, so the whole file is
        # not loaded just to keep its head; the with-block closes the file.
        return list(islice(f, nr_lines))
def process_files(file_list, q, nr_lines=3):
    """Process every file in ``file_list`` with parse_file() and queue the results.

    Exactly one item is put on ``q`` per call, whether or not parsing
    succeeds.

    Args:
        file_list: Iterable of file paths to parse.
        q: Queue-like object with a ``put`` method that receives the results.
        nr_lines: Value passed to parse_file() as its ``nr_lines`` argument
            for every file. Defaults to 3.

    Raises:
        Whatever parse_file() raises. In that case an empty list is queued
        first and the results gathered so far are discarded.
    """
    try:
        results = [parse_file(file_path, nr_lines) for file_path in file_list]
    except BaseException:
        # Still queue one item before re-raising, so a consumer that does
        # one get() per call is not left waiting.
        q.put([])
        raise
    # Put results into queue
    q.put(results)
def parallelize_parsing(file_list):
    """Parse the files in ``file_list`` with up to one process per CPU core.

    The list is split round-robin into ``cpu_count()`` sub-lists. Every
    non-empty sub-list is handed to its own ``Process`` running
    process_files(). The collected results are printed to stdout.

    Args:
        file_list: Sequence of file paths to parse.

    Returns:
        A list with one entry per started worker (whatever that worker put
        on the queue), in the order the entries arrive. Empty when
        ``file_list`` is empty.
    """
    cpu_cores = cpu_count()
    q = Queue()
    procs = []
    # Split the input file list into sublists according to the number
    # of the available CPU cores
    for i in range(cpu_cores):
        # Worker i takes items i, i + cpu_cores, i + 2 * cpu_cores, ...
        sub_list = file_list[i::cpu_cores]
        if sub_list:
            p = Process(target=process_files, args=(sub_list, q))
            p.start()
            procs.append(p)
    # Collect the results: one queue item is expected per started worker.
    all_results = [q.get() for _ in procs]
    print(all_results)
    # Join only after the queue has been drained; joining a process that
    # still has queued data to flush can block.
    for p in procs:
        p.join()
    return all_results
if __name__ == "__main__":
    # Script entry point: expects exactly one argument, the folder to scan.
    if len(sys.argv) != 2:
        print("Usage: %s location" % sys.argv[0])
        # sys.exit() instead of the bare exit() helper, which is injected by
        # the site module and is not guaranteed to exist in every run mode.
        sys.exit(1)
    # Get list of file paths
    file_list = get_file_list(sys.argv[1])
    parallelize_parsing(file_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment