Skip to content

Instantly share code, notes, and snippets.

@robert-lucente
Last active November 20, 2022 17:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robert-lucente/2c324ae6fc9b780b51cb3267f8f93676 to your computer and use it in GitHub Desktop.
Save robert-lucente/2c324ae6fc9b780b51cb3267f8f93676 to your computer and use it in GitHub Desktop.
Parallelizing Nested For Loops in Python
Often nested for-loops cause performance issues. Remember that the complexity
rises as the power of the number of nested for loops. So, if you
have 3 nested for-loops, the complexity is N**3. The ideal case would be to
refactor the algorithm and associated code not to use nested for loops,
although this is not always possible.
A potential alternative might be to just throw hardware at the problem. The
use case that we are going to explore will permit this.
The use case that we are going to explore is a hierarchy of categories that
is three level deep. The first level of the hierarchy has an associated priority
and consequently, they all must be processed before the level 2 categories
can be processed. However, since level 2 categories have no associated
priority, they can all be executed in parallel. Similarly, once a category 2
has been processed, all its associated level 3 categories can be processed
in parallel.
For a tabular format of the data used for the categories, go to the links below
Excel: https://docs.google.com/spreadsheets/d/1w9NJ9y_Pcb9xXITmG8PU66NovBZtajqt/
PDF: https://drive.google.com/file/d/1HcYyObk4LMAczN5FyJu_H6l2rv75E29k/view?usp=sharing
For the associated code/data structures for the categories, refer to
common_processing.py.
The program is initially written using nested for loops. For the details, refer
to nested_for_loops.py.
The program is then re-written such that it is parallelized. Python's
multiprocessing pool is used. For the details, refer to
parallelize_nested_for_loops.py.
To demonstrate that the fundamental code for nested for loops vs parallelized
nested for loops is not that different, the common code was placed in
common_processing.py.
"""
Common processing for "nested for loops" and "parallelized nested
for loops."
"""
def process_categories_lvl_1():
"""
Since level 1 categories are prioritized,
they require special processing. Furthermore,
all level 1 category processing muust be
completed before any level 2 categories can
be processed. Consequently, they have a separate
function.
"""
print("Process level 1 categories", flush=True)
def process_categories_non_lvl_1(
category_lvl, category_previos_lvl, category_current_lvl
):
"""
All non-level 1 categories can be processed
using the same function.
The inputs are needed because in the actual application they
are used to dynamically build a query.
"""
if category_lvl == 2:
leading_blank_spaces = " "
else:
leading_blank_spaces = " "
print(flush=True)
print(
f" {leading_blank_spaces}**** Category level being processed: {category_lvl}",
flush=True,
)
print(
f" {leading_blank_spaces}**** Category from previous level: {category_previos_lvl}",
flush=True,
)
print(
f" {leading_blank_spaces}**** Category from current level: {category_current_lvl}",
flush=True,
)
print(flush=True)
def summary_counts():
"""
Summary counts can be computed only after all
the categories and their associated hierarchies have been processed.
"""
print("Compute summary counts", flush=True)
"""
Hierarchical categories of 3-level deep to be processed.
For a tabular format of the data used for the categories,
go to the links below
Excel: https://docs.google.com/spreadsheets/d/1w9NJ9y_Pcb9xXITmG8PU66NovBZtajqt/
PDF: https://drive.google.com/file/d/1HcYyObk4LMAczN5FyJu_H6l2rv75E29k/view?usp=sharing
Stand in for the category hiearchy data that is most likely stored in a database.
"""
from typing import Dict, List, Set
def load_categories_lvl_1() -> List[str]:
"""
Since level 1 categories are prioritized,
a list data structure is used.
"""
categories_lvl_1 = [
"category_lvl_1_a",
"category_lvl_1_b",
"category_lvl_1_c",
"category_lvl_1_d",
"category_lvl_1_e",
"category_lvl_1_f",
"category_lvl_1_g",
]
return categories_lvl_1
def load_categories_lvl_1_2() -> Dict[str, Set[str]]:
""" "
Since level 2 categories are not prioritized,
the data structure is a dictionary keyed by
the level 1 category followed by a set of
level 2 categories.
Each level 2 category can be associated with only
one level 1 category.
"""
categories_lvl_1_2 = {
"category_lvl_1_b": {"category_lvl_2_a"},
"category_lvl_1_c": {"category_lvl_2_b"},
"category_lvl_1_d": {"category_lvl_2_c", "category_lvl_2_d"},
"category_lvl_1_e": {
"category_lvl_2_e",
"category_lvl_2_f",
"category_lvl_2_g",
},
"category_lvl_1_f": {"category_lvl_2_h"},
"category_lvl_1_g": {"category_lvl_2_i"},
}
return categories_lvl_1_2
def load_categories_lvl_2_3() -> Dict[str, Set[str]]:
"""
Since level 3 categories are not prioritized,
the data structure is a dictionary keyed by
the level 2 category followed by a set of
level 3 categories.
"""
categories_lvl_2_3 = {
"category_lvl_2_b": {"category_lvl_3_a"},
"category_lvl_2_h": {"category_lvl_3_b", "category_lvl_3_c"},
"category_lvl_2_i": {
"category_lvl_3_d",
"category_lvl_3_e",
"category_lvl_3_f",
},
}
return categories_lvl_2_3
"""
Hierarchical categories of 3-level deep are processed.
Since the depth of the hierarchy is shallow and fixed, no recursion
is necessary. Consequently, will use simple nested for loops.
"""
import common_processing
import input_data
def main():
"""
1) Load category hierarchy
2) Process level 1 category hierarchy
3) Loop through the category hierarchy
4) Compute summary counts
"""
# 1) Load category hierarchy
categories_lvl_1 = input_data.load_categories_lvl_1()
categories_lvl_1_2 = input_data.load_categories_lvl_1_2()
categories_lvl_2_3 = input_data.load_categories_lvl_2_3()
# 2) Process level 1 category hierarchy
common_processing.process_categories_lvl_1()
# 3) Loop through the category hierarchy
for category_lvl_1 in categories_lvl_1:
print()
print(f"Category Level 1: {category_lvl_1}")
try:
categories_lvl_2 = categories_lvl_1_2[category_lvl_1]
for category_lvl_2 in categories_lvl_2:
print()
print(f" Category Level 2: {category_lvl_2}")
common_processing.process_categories_non_lvl_1(
2, category_lvl_1, category_lvl_2
)
try:
categories_lvl_3 = categories_lvl_2_3[category_lvl_2]
for category_lvl_3 in categories_lvl_3:
print(f" Category Level 3: {category_lvl_3}")
common_processing.process_categories_non_lvl_1(
3, category_lvl_2, category_lvl_3
)
except KeyError:
print(" There are no associated level 3 categories")
print()
except KeyError:
print()
print(" There are no associated level 2 categories")
# 4) Compute summary counts
common_processing.summary_counts()
if __name__ == "__main__":
main()
Process level 1 categories
Category Level 1: category_lvl_1_a
There are no associated level 2 categories
Category Level 1: category_lvl_1_b
Category Level 2: category_lvl_2_a
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_b
**** Category from current level: category_lvl_2_a
There are no associated level 3 categories
Category Level 1: category_lvl_1_c
Category Level 2: category_lvl_2_b
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_c
**** Category from current level: category_lvl_2_b
Category Level 3: category_lvl_3_a
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_b
**** Category from current level: category_lvl_3_a
Category Level 1: category_lvl_1_d
Category Level 2: category_lvl_2_d
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_d
**** Category from current level: category_lvl_2_d
There are no associated level 3 categories
Category Level 2: category_lvl_2_c
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_d
**** Category from current level: category_lvl_2_c
There are no associated level 3 categories
Category Level 1: category_lvl_1_e
Category Level 2: category_lvl_2_g
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_e
**** Category from current level: category_lvl_2_g
There are no associated level 3 categories
Category Level 2: category_lvl_2_e
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_e
**** Category from current level: category_lvl_2_e
There are no associated level 3 categories
Category Level 2: category_lvl_2_f
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_e
**** Category from current level: category_lvl_2_f
There are no associated level 3 categories
Category Level 1: category_lvl_1_f
Category Level 2: category_lvl_2_h
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_f
**** Category from current level: category_lvl_2_h
Category Level 3: category_lvl_3_b
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_h
**** Category from current level: category_lvl_3_b
Category Level 3: category_lvl_3_c
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_h
**** Category from current level: category_lvl_3_c
Category Level 1: category_lvl_1_g
Category Level 2: category_lvl_2_i
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_g
**** Category from current level: category_lvl_2_i
Category Level 3: category_lvl_3_d
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_i
**** Category from current level: category_lvl_3_d
Category Level 3: category_lvl_3_e
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_i
**** Category from current level: category_lvl_3_e
Category Level 3: category_lvl_3_f
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_i
**** Category from current level: category_lvl_3_f
Compute summary counts
"""
Parallelize level 2 category processing
"""
from multiprocessing.pool import Pool
import input_data
import common_processing
def get_level_2_categories():
"""
The way that the set of level 2 categories is obtained
wil vary between applications. In my particular application,
the data will be obtained from a database.
"""
# Load category level 1, 2 combinations
categories_lvl_1_2 = input_data.load_categories_lvl_1_2()
# Create an empty set to store level 2 categories
categories_lvl_2 = set()
# Loop through to get the sets containing level 2 categories
for category_lvl_2_set in categories_lvl_1_2.values():
# Loop a level 2 category set to obtain
# an individual level 2 category
for category_lvl_2 in category_lvl_2_set:
categories_lvl_2.add(category_lvl_2)
return categories_lvl_2
def get_category_lvl_1_given_lvl_2(category_lvl_2):
"""
In a production system, this function will probably
obtain the data from a database. If the execution
of this function is time critical, some sort of hash
lookup will have to be used.
Pylint will complain "Either all return statements in a
function should return an expression, or none of them should.
(inconsistent-return-statements)". This isn't true because
the for loop will always find a key since a level 2 category
can't exist without a leve 1 category.
"""
categories_lvl_1_2 = input_data.load_categories_lvl_1_2()
for key, value_set in categories_lvl_1_2.items():
if category_lvl_2 in value_set:
return key
def process_category_lvl_2(category_lvl_2):
"""
1) Obtain the category level 1 associated with the
input level 2 category
2) Process the level 1, 2 category combination
3) Process the level 2, 3 combinations for the
input level 2 category
"""
# 1) Obtain the category level 1 associated with the
# input level 2 category
category_lvl_1 = get_category_lvl_1_given_lvl_2(category_lvl_2)
# 2) Process the level 1, 2 combination
common_processing.process_categories_non_lvl_1(2, category_lvl_1, category_lvl_2)
# 3) Process the level 2, 3 combinations for the
# input level 2 category
categories_lvl_2_3 = input_data.load_categories_lvl_2_3()
try:
categories_lvl_3 = categories_lvl_2_3[category_lvl_2]
for category_lvl_3 in categories_lvl_3:
print(f" Category Level 3: {category_lvl_3}")
common_processing.process_categories_non_lvl_1(
3, category_lvl_2, category_lvl_3
)
except KeyError:
print(" There are no associated level 3 categories")
print()
def main():
"""
1) Process level 1 category hierarchy
2) Execute in parallel all level 2 categories
Each level 2 category in turn will process its
associated level 3 categories
3) Compute summary counts
"""
# 1) Process level 1 category hierarchy
common_processing.process_categories_lvl_1()
print(flush=True)
# 2) Execute in parallel all level 2 categories
with Pool() as pool:
pool.imap_unordered(process_category_lvl_2, get_level_2_categories())
pool.close()
pool.join()
# 3) Compute summary counts
common_processing.summary_counts()
if __name__ == "__main__":
main()
Process level 1 categories
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_e
**** Category from current level: category_lvl_2_f
There are no associated level 3 categories
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_f
**** Category from current level: category_lvl_2_h
Category Level 3: category_lvl_3_b
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_h
**** Category from current level: category_lvl_3_b
Category Level 3: category_lvl_3_c
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_h
**** Category from current level: category_lvl_3_c
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_e
**** Category from current level: category_lvl_2_g
There are no associated level 3 categories
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_c
**** Category from current level: category_lvl_2_b
Category Level 3: category_lvl_3_a
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_b
**** Category from current level: category_lvl_3_a
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_d
**** Category from current level: category_lvl_2_c
There are no associated level 3 categories
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_b
**** Category from current level: category_lvl_2_a
There are no associated level 3 categories
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_d
**** Category from current level: category_lvl_2_d
There are no associated level 3 categories
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_g
**** Category from current level: category_lvl_2_i
Category Level 3: category_lvl_3_d
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_i
**** Category from current level: category_lvl_3_d
Category Level 3: category_lvl_3_f
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_i
**** Category from current level: category_lvl_3_f
Category Level 3: category_lvl_3_e
**** Category level being processed: 3
**** Category from previous level: category_lvl_2_i
**** Category from current level: category_lvl_3_e
**** Category level being processed: 2
**** Category from previous level: category_lvl_1_e
**** Category from current level: category_lvl_2_e
There are no associated level 3 categories
Compute summary counts
"""
Initial exploration at parallelizing nested for-loops. It was
abonded when it was realized that the approach used in the code
will not work. This file produces incorrect results and should be
deleted. It will be deleted on 2/1/2023. Keeping it around unti
then because some people are still referring to it.
Hierarchical categories 3 levels deep need to be processed.
Since the depth of the hierarchy is shallow and fixed, no recursion
is necessary. Consequently, will use simple nested for loops.
The first level of the hierarchy has an associated priority. However,
the other levels do not. Consequently, once the level 1 categories have
been processed, all the level 2 categories can be processed in parallel.
Similarly, once a particular level 2 category has been processed,
all its associated category 3 levels can be processed in parallel.
TODO (Robert): Add processing that once a particular level 2 category has been
processed, all its associated category 3 levels can be processed in parallel.
"""
from multiprocessing.pool import Pool
import common_processing
import input_data
def process_categories_non_lvl_1(categories_lvl_x_y):
"""
The processing for "nested for loops" and "parallelized nested
for loops" is identical except that in the parallel processing
the iterable needs to be unpacked.
"""
common_processing.process_categories_non_lvl_1(
categories_lvl_x_y[0], categories_lvl_x_y[1], categories_lvl_x_y[2]
)
def main():
"""
1) Load category hierarchy
2) Process level 1 category hierarchy
3) Create an empty set to hold a tuple containing
the current level, the category from the previous
level and the category from the current level
4) Loop through the category hierarchy
5) Process categories in level 2 in parallel
6) Compute summary counts
"""
# 1) Load category hierarchy
categories_lvl_1 = input_data.load_categories_lvl_1()
categories_lvl_1_2 = input_data.load_categories_lvl_1_2()
# 2) Process level 1 category hierarchy
common_processing.process_categories_lvl_1()
# 3) Create an empty set to hold a tuple containing
# the current level, the category from the previous
# level and the category from the current level
cat_lvl_1_2_combos = set()
# 4) Loop through the category hierarchy
for category_lvl_1 in categories_lvl_1:
try:
categories_lvl_2 = categories_lvl_1_2[category_lvl_1]
for category_lvl_2 in categories_lvl_2:
cat_lvl_1_2_combos.add((2, category_lvl_1, category_lvl_2))
except KeyError:
# Some level 1 categories do not have an associated
# level 2 category. This will result in a KeyError
# during the dictionary lookup. In this case the
# KeyError can be ignored.
pass
# 5) Process categories in level 2 in parallel
with Pool() as pool:
pool.imap_unordered(process_categories_non_lvl_1, cat_lvl_1_2_combos)
pool.close()
pool.join()
# 6) Compute summary counts
common_processing.summary_counts()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment