Skip to content

Instantly share code, notes, and snippets.

@jwpeterson
Last active October 3, 2018 22:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwpeterson/5b4f0fd0d92939c618ea3a976b3808cc to your computer and use it in GitHub Desktop.
Save jwpeterson/5b4f0fd0d92939c618ea3a976b3808cc to your computer and use it in GitHub Desktop.
#include "libmesh/mesh_generation.h"
#include "libmesh/mesh.h"
#include "libmesh/elem.h"
#include "libmesh/getpot.h"
#include "libmesh/string_to_enum.h"
#include "libmesh/elem_range.h"
// C++ includes
#include <chrono>
using namespace libMesh;
// This program looks at the performance of the Elem::key() function
// both before and after the std::array optimizations.
//
// Timings before the std::array optimization (Mac Pro 2.7 GHz):
// 1a.) HEX27, the most expensive element to hash (elapsed time per loop).
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4
// np.mean([0.238691,0.239391,0.238902,0.238897,0.244535]) = 0.2400832
//
// 2a.) TRI3, one of the cheapest elements to hash (total elapsed time).
// ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100
// np.mean([0.911238,0.89935,0.90599,0.87904,0.902818,0.907416,0.884893,0.908562,0.927676,0.880153]) = 0.9007136
//
// Timings _after_ the std::array optimization (Mac Pro 2.7 GHz):
// 1b.) HEX27, the most expensive element to hash (elapsed time per loop).
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4
// np.mean([0.173798,0.170249,0.171532,0.168717,0.170337]) = 0.1709266
//
// 2b.) TRI3, one of the cheapest elements to hash (total elapsed time).
// ./key_performance_test-opt -d 2 -n 500 --verbose -e TRI3 --num-loops 100
// np.mean([0.875458,0.904809,0.893347,0.90164,0.909899,0.894806,0.878814,0.902609,0.907773,0.911029]) = 0.8980184
//
// (single-threaded) Speedup
// Case 1: 0.2400832 / 0.1709266 = 1.4
// Case 2: 0.9007136 / 0.8980184 = 1.00
//
// --------------------------------------------------------------------------------
//
// Multithreaded timings _before_ the std::array optimization (total elapsed time).
// a.) One thread
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1
// np.mean([0.954329,0.974633,0.953297,0.983907,0.949742,]) = 0.9631816
//
// b.) Two threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2
// np.mean([0.541939,0.528525,0.505729,0.532151,0.51816,]) = 0.5253008
//
// c.) Three threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3
// np.mean([0.394029,0.353447,0.39675,0.397024,0.404526,]) = 0.3891552
//
// d.) Four threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4
// np.mean([0.36172,0.339432,0.338645,0.355434,0.345501,]) = 0.3481464
//
// e.) Five threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5
// np.mean([0.223091,0.27234,0.237944,0.282024,0.222382]) = 0.2475562
//
// f.) Six threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6
// np.mean([0.241564,0.205035,0.240761,0.235877,0.239974]) = 0.2326422
//
// g.) Seven threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7
// np.mean([0.174728,0.206139,0.215146,0.213032,0.211148]) = 0.2040386
//
// h.) Eight threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8
// np.mean([0.18606,0.192954,0.184264,0.16539,0.193858]) = 0.1845052
//
// --------------------------------------------------------------------------------
//
// Multithreaded timings _after_ the std::array optimization (total elapsed time).
// a.) One thread
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=1
// np.mean([0.674726,0.688726,0.675761,0.680911,0.676321,0.685274,0.681034,0.683756]) = 0.680813625
//
// b.) Two threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=2
// np.mean([0.369587,0.361204,0.387461,0.363354,0.365055,0.362668,0.365292,0.364438]) = 0.367382375
//
// c.) Three threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=3
// np.mean([0.253700,0.254675,0.300733,0.257468,0.301073,0.279767,0.27771,0.256436]) = 0.27269525
//
// d.) Four threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=4
// np.mean([0.242026,0.251493,0.237071,0.211548,0.240836,0.209865,0.25133,0.249079]) = 0.236656
//
// e.) Five threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=5
// np.mean([0.183068,0.196724,0.201339,0.206538,0.192243]) = 0.1959824
//
// f.) Six threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=6
// np.mean([0.183316,0.180756,0.144275,0.180142,0.178662]) = 0.1734302
//
// g.) Seven threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=7
// np.mean([0.157155,0.142657,0.157594,0.13912,0.157656]) = 0.1508364
//
// h.) Eight threads
// ./key_performance_test-opt -d 3 -n 100 --verbose -e HEX27 --num-loops 4 --n-threads=8
// np.mean([0.141037,0.146088,0.132373,0.145368,0.138546]) = 0.1406824
/**
* For use when debugging: compare single-threaded algorithm to
* multi-threaded algorithm running on one thread, make sure they get
* the same result.
*/
// #define SINGLE_THREADED 1
/**
* For multi-threaded version: a functor that can be called in a parallel_for loop.
*/
class Body
{
public:
Body () : key_sum() {}
Body (Body &, Threads::split) : key_sum(0) {}
~Body() = default;
std::size_t key_sum;
void operator() (const ConstElemRange & range)
{
for (const auto & elem : range)
key_sum += elem->key();
}
void join(const Body & other)
{
key_sum += other.key_sum;
}
};
/**
 * Main program.
 *
 * Builds a structured mesh of the requested dimension/element type,
 * then repeatedly computes Elem::key() for every element (either
 * single-threaded or via Threads::parallel_reduce) and reports the
 * elapsed wall-clock time.
 */
int main (int argc, char ** argv)
{
  LibMeshInit init (argc, argv);
  GetPot command_line (argc, argv);

  // Parse command line arguments
  unsigned dim = 2;
  if (command_line.search(2, "-d", "--dim"))
    dim = command_line.next(dim);

  std::string elem_type = "QUAD4";
  if (command_line.search(2, "-e", "--elem_type"))
    elem_type = command_line.next(elem_type);
  ElemType et = Utility::string_to_enum<ElemType>(elem_type);

  // Since building the mesh takes a lot longer than computing keys,
  // we can get longer duration timing by computing the keys multiple
  // times.
  int num_loops = 10;
  if (command_line.search(2, "-L", "--num-loops"))
    num_loops = command_line.next(num_loops);

  // Print thread status message
  libMesh::out << "Running on " << libMesh::n_threads() << " thread(s)." << std::endl;

  Mesh mesh(init.comm(), dim);

  // If true, print more output.
  bool verbose = false;
  if (command_line.search(2, "-v", "--verbose"))
    verbose = true;

  unsigned n_elem = 10;
  if (command_line.search(2, "-n", "--n_elem"))
    n_elem = command_line.next(n_elem);

  // Note: you _must_ have an equal sign, i.e. --n-threads=2, you _cannot_ have a space
  // between the argument and the value. The following code can be used to ensure that
  // it is working...
  // std::vector<std::string> n_threads(2);
  // n_threads[0] = "--n_threads";
  // n_threads[1] = "--n-threads";
  // int nt = libMesh::command_line_value (n_threads, 1);
  // libMesh::out << "Number of threads read from command line " << nt << std::endl;

  if (verbose)
    libMesh::out << "Building " << n_elem << "^" << dim << " mesh." << std::endl;

  if (dim==3)
    {
      // Note: A grid of N^3 HEX8's will have 24*N^3 TET4's.
      // The hashing function for a Tetrahedral face only involves
      // 3 nodes, and hence is different from the hashing function for
      // a Hex, which involves 4.
      MeshTools::Generation::build_cube (mesh,
                                         n_elem, n_elem, n_elem,
                                         -1., 1.,
                                         -1., 1.,
                                         -1., 1.,
                                         et);
    }
  else
    {
      // This will test 2-node hashes
      MeshTools::Generation::build_square (mesh,
                                           n_elem, n_elem,
                                           -1., 1.,
                                           -1., 1.,
                                           et);
    }

  if (verbose)
    libMesh::out << "Constructing range... " << std::endl;

  // Single-threaded version:
  // To minimize the impact of iteration itself in the timing, we'll
  // make a flat vector of Elem* first.
#ifdef SINGLE_THREADED
  std::vector<Elem *> elem_vec;
  // Reserve space for the number of *elements* we are about to store
  // (previously this reserved mesh.n_nodes(), which is the wrong
  // quantity and could over- or under-reserve).
  elem_vec.reserve(mesh.n_elem());
  for (const auto & elem : mesh.element_ptr_range())
    elem_vec.push_back(elem);
#else
  // Multi-threaded version:
  ConstElemRange elem_range (mesh.elements_begin(), mesh.elements_end());
  Body body;
#endif

  if (verbose)
    libMesh::out << "Computing keys... " << std::endl;

  // Keep the compiler honest by maintaining a running sum of the key
  // values. This might overflow (?) but who cares.
#ifdef SINGLE_THREADED
  std::size_t key_sum = 0;
#endif

  // Use steady_clock for interval timing: unlike system_clock it is
  // monotonic, so the measurement cannot be corrupted by wall-clock
  // adjustments (NTP, DST, manual changes).
  typedef std::chrono::steady_clock clock_type;
  std::chrono::time_point<clock_type> start = clock_type::now();

#ifdef SINGLE_THREADED
  // Single-threaded version
  for (int loop=0; loop<num_loops; ++loop)
    for (const auto & elem : elem_vec)
      key_sum += elem->key();
#else
  // Multi-threaded (parallel_reduce) version.
  for (int loop=0; loop<num_loops; ++loop)
    Threads::parallel_reduce (elem_range, body);
#endif

  std::chrono::time_point<clock_type> stop = clock_type::now();
  clock_type::duration dur = stop - start;
  Real elapsed = std::chrono::duration_cast<std::chrono::microseconds>(dur).count() * 1.e-6;

  // Report results
#ifdef SINGLE_THREADED
  libMesh::out << "key_sum = " << key_sum << std::endl;
#else
  libMesh::out << "key_sum = " << body.key_sum << std::endl;
#endif
  libMesh::out << "Total elapsed time: " << elapsed << " s\n";
  libMesh::out << "Elapsed time/loop: " << elapsed / num_loops << " s\n";

  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment