kumagi/LICENSE.txt

## commandline
$ clang++ spin_lock.cc -Ofast -march=native -std=c++20 -DNDEBUG

## LICENSE.txt
Copyright 2023 Hiroki Kumazaki

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## spin_lock.cc
#include <assert.h>
#include <pthread.h>
#include <sched.h>

#include <array>
#include <atomic>
#include <cerrno>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <random>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
#define unlikely(x) __builtin_expect(!!(x), 0)

// Baseline lock: adapts std::mutex to the Lock()/Unlock() interface used by
// the other lock types in this file.
class Mutex {
public:
  std::mutex mutex;

  // Blocks until the wrapped std::mutex has been acquired.
  void Lock() {
    mutex.lock();
  }

  // Releases the wrapped std::mutex.
  void Unlock() {
    mutex.unlock();
  }
};

// Simplest spin lock: keeps swapping 1 into the flag until the value that was
// swapped out is not 1, i.e. until this thread is the one that set it.
class ExchangeLock {
public:
  // Busy-waits (no yield, no backoff) until the lock is acquired.
  void Lock() {
    for (;;) {
      const int previous = flag_.exchange(1);
      if (previous != 1) {
        return;
      }
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() {
    flag_.store(0, std::memory_order_release);
  }

  // 0 = free, 1 = held; placed on a 64-byte boundary.
  alignas(64) std::atomic<int> flag_{0};
  // Fills the remainder of the 64 bytes that start at flag_.
  char padding_[64 - sizeof(std::atomic<int>)];
};

// Exchange-based spin lock that calls std::this_thread::yield() after every
// failed attempt instead of retrying immediately.
class ExchangeLockWithYield {
public:
  // Repeats exchange(1) until the previous value is not 1, yielding between
  // attempts.
  void Lock() {
    for (;;) {
      if (flag_.exchange(1) != 1) {
        return;
      }
      std::this_thread::yield();
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() {
    flag_.store(0, std::memory_order_release);
  }

  // 0 = free, 1 = held. No alignas here; only the trailing padding.
  std::atomic<int> flag_{0};
  char padding_[64 - sizeof(std::atomic<int>)];
};

// Exchange-based spin lock with randomized exponential backoff: after each
// failed attempt the thread sleeps a random number of nanoseconds drawn from
// a window that doubles until it exceeds kSleepLimit.
class ExchangeLockWithBackoff {
public:
  ExchangeLockWithBackoff() : flag_(0) {}
  // Once the backoff window is larger than this, it stops doubling.
  size_t kSleepLimit = 2048;

  // Acquires the lock, sleeping between failed exchange(1) attempts.
  void Lock() {
    size_t sleep_max = 1;
    // Seed per thread (as the TTAS/TCAS backoff locks do). A fixed seed gives
    // every thread the same delay sequence, so contending threads would back
    // off and retry in lockstep.
    thread_local std::mt19937 rng(static_cast<std::mt19937::result_type>(
        std::hash<std::thread::id>()(std::this_thread::get_id())));
    while (flag_.exchange(1) == 1) {
      const size_t sleep = rng() % sleep_max;
      std::this_thread::sleep_for(std::chrono::nanoseconds(sleep));
      if (sleep_max <= kSleepLimit) {
        sleep_max *= 2;
      }
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() { flag_.store(0, std::memory_order_release); }

  // 0 = free, 1 = held.
  std::atomic<int> flag_;
  char padding_[64 - sizeof(std::atomic<int>)];
};

// Test-and-set spin lock on a std::atomic_flag.
class TASLock {
public:
  TASLock() = default;

  // Spins on test_and_set() until it reports that the flag was clear before
  // this call set it.
  void Lock() {
    while (flag_.test_and_set()) {
    }
  }

  // Clears the flag with release ordering. Uses the standard
  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
  // builtin applied to the address of a std::atomic_flag object.
  void Unlock() { flag_.clear(std::memory_order_release); }

  // Value-initialized, so the lock starts out clear (unlocked).
  std::atomic_flag flag_{};
  char padding_[64 - sizeof(std::atomic_flag)];
};

// Test-and-test-and-set spin lock: waits with plain reads of the flag and
// only attempts the read-modify-write once the flag has been seen clear.
// Requires C++20 for std::atomic_flag::test().
class TTASLock {
public:
  TTASLock() = default;

  // Busy-waits until test_and_set() reports the flag was previously clear.
  void Lock() {
    for (;;) {
      // Read-only wait while the lock is observed as held.
      while (flag_.test(std::memory_order_relaxed)) {
      }
      if (!flag_.test_and_set()) {
        return;
      }
    }
  }

  // Clears the flag with release ordering. Uses the standard
  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
  // builtin applied to the address of a std::atomic_flag object.
  void Unlock() { flag_.clear(std::memory_order_release); }

  // Value-initialized, so the lock starts out clear (unlocked).
  std::atomic_flag flag_{};
  char padding_[64 - sizeof(std::atomic_flag)];
};

// Test-and-test-and-set spin lock that calls std::this_thread::yield() each
// time it observes the flag as set. Requires C++20 for
// std::atomic_flag::test().
class TTASLockWithYield {
public:
  TTASLockWithYield() = default;

  // Waits until test_and_set() reports the flag was previously clear,
  // yielding whenever a plain read shows the lock is held.
  void Lock() {
    for (;;) {
      if (flag_.test(std::memory_order_relaxed)) {
        std::this_thread::yield();
        continue;
      }
      if (!flag_.test_and_set()) {
        return;
      }
    }
  }

  // Clears the flag with release ordering. Uses the standard
  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
  // builtin applied to the address of a std::atomic_flag object.
  void Unlock() { flag_.clear(std::memory_order_release); }

  // Value-initialized, so the lock starts out clear (unlocked).
  std::atomic_flag flag_{};
  char padding_[64 - sizeof(std::atomic_flag)];
};

// Test-and-test-and-set spin lock with randomized exponential backoff: each
// time the flag is observed set, the thread sleeps a random number of
// nanoseconds drawn from a window that doubles until it exceeds kSleepLimit.
// Requires C++20 for std::atomic_flag::test().
class TTASLockWithBackoff {
public:
  TTASLockWithBackoff() = default;
  // Once the backoff window is larger than this, it stops doubling.
  size_t kSleepLimit = 2048;

  // Waits until test_and_set() reports the flag was previously clear.
  void Lock() {
    size_t sleep_max = 1;
    // Per-thread generator seeded from the thread id.
    thread_local std::mt19937 rng(
        std::hash<std::thread::id>()(std::this_thread::get_id()));
    for (;;) {
      if (flag_.test(std::memory_order_relaxed)) {
        const size_t sleep = rng() % sleep_max;
        std::this_thread::sleep_for(std::chrono::nanoseconds(sleep));
        if (sleep_max <= kSleepLimit) {
          sleep_max *= 2;
        }
        continue;
      }
      if (!flag_.test_and_set()) {
        return;
      }
    }
  }

  // Clears the flag with release ordering. Uses the standard
  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
  // builtin applied to the address of a std::atomic_flag object.
  void Unlock() { flag_.clear(std::memory_order_release); }

  // Value-initialized, so the lock starts out clear (unlocked).
  std::atomic_flag flag_{};
  char padding_[64 - sizeof(std::atomic_flag)];
};

// Spin lock built on compare-and-swap: tries to move flag_ from 0 to 1.
class CASLock {
public:
  // Busy-waits until the 0 -> 1 transition succeeds.
  void Lock() {
    int expected = 0;
    while (!flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
                                        std::memory_order_relaxed)) {
      // A failed compare_exchange overwrites `expected` with the value it
      // found (1 while the lock is held). Without this reset the next attempt
      // would compare against 1 and "succeed" on a lock someone else holds.
      expected = 0;
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() { flag_.store(0, std::memory_order_release); }

  // 0 = free, 1 = held; placed on a 64-byte boundary.
  alignas(64) std::atomic<int> flag_{0};
  char padding_[64 - sizeof(std::atomic<int>)];
};

// Test-and-CAS spin lock: after a failed compare-and-swap it waits with plain
// loads until the flag is no longer 1 before trying the CAS again.
class TCASLock {
public:
  // Busy-waits until the 0 -> 1 transition succeeds.
  void Lock() {
    for (;;) {
      // `expected` must be 0 on every attempt: a failed compare_exchange
      // overwrites it with the observed value, and retrying with 1 could
      // "succeed" on a lock that another thread has just taken.
      int expected = 0;
      if (flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
        return;
      }
      // Read-only wait while the lock is observed as held.
      while (flag_.load(std::memory_order_relaxed) == 1) {
      }
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() { flag_.store(0, std::memory_order_release); }

  // 0 = free, 1 = held; placed on a 64-byte boundary.
  alignas(64) std::atomic<int> flag_{0};
  char padding_[64 - sizeof(std::atomic<int>)];
};

// Test-and-CAS spin lock that calls std::this_thread::yield() while it
// observes the flag as 1.
class TCASLockWithYield {
public:
  // Waits until the 0 -> 1 transition succeeds.
  void Lock() {
    for (;;) {
      // `expected` must be 0 on every attempt: a failed compare_exchange
      // overwrites it with the observed value, and retrying with 1 could
      // "succeed" on a lock that another thread has just taken.
      int expected = 0;
      if (flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
        return;
      }
      while (flag_.load(std::memory_order_relaxed) == 1) {
        std::this_thread::yield();
      }
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() { flag_.store(0, std::memory_order_release); }

  // 0 = free, 1 = held; placed on a 64-byte boundary.
  alignas(64) std::atomic<int> flag_{0};
  char padding_[64 - sizeof(std::atomic<int>)];
};

// Test-and-CAS spin lock with randomized exponential backoff: while the flag
// is observed as 1, the thread sleeps a random number of nanoseconds drawn
// from a window that doubles until it exceeds kSleepLimit.
class TCASLockWithBackoff {
public:
  // Once the backoff window is larger than this, it stops doubling.
  size_t kSleepLimit = 2048;

  // Waits until the 0 -> 1 transition succeeds.
  void Lock() {
    size_t sleep_max = 1;
    // Per-thread generator seeded from the thread id.
    thread_local std::mt19937 rng(
        std::hash<std::thread::id>()(std::this_thread::get_id()));
    for (;;) {
      // `expected` must be 0 on every attempt: a failed compare_exchange
      // overwrites it with the observed value, and retrying with 1 could
      // "succeed" on a lock that another thread has just taken.
      int expected = 0;
      if (flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
                                      std::memory_order_relaxed)) {
        return;
      }
      while (flag_.load(std::memory_order_relaxed) == 1) {
        const size_t sleep = rng() % sleep_max;
        std::this_thread::sleep_for(std::chrono::nanoseconds(sleep));
        if (sleep_max <= kSleepLimit) {
          sleep_max *= 2;
        }
      }
    }
  }

  // Frees the lock by storing 0 with release ordering.
  void Unlock() { flag_.store(0, std::memory_order_release); }

  // 0 = free, 1 = held; placed on a 64-byte boundary.
  alignas(64) std::atomic<int> flag_{0};
  char padding_[64 - sizeof(std::atomic<int>)];
};

// MCS queue lock: waiters form a linked list through per-thread nodes, and
// each waiter spins only on the `waiting` flag of its own node.
//
// Node slots are handed out from a counter that is a function-local static,
// i.e. shared by every MCSLock instance; a thread keeps the slot index it was
// given for its whole lifetime. Only kMaxThreads slots exist.
class MCSLock {
  static constexpr int kMaxThreads = 192;
  struct Node {
    alignas(64) std::atomic<bool> waiting{true};
    std::atomic<Node *> next{nullptr};
  };

  // Returns the calling thread's node inside this lock. With reset == true it
  // instead rewinds the shared slot counter to 0 and returns nullptr.
  Node *GetMyNode(bool reset = false) {
    static std::atomic<int> used_id_{0};
    if (reset) {
      used_id_.store(0);
      return nullptr;
    }
    thread_local int my_id = std::atomic_fetch_add(&used_id_, 1);
    // nodes_ has exactly kMaxThreads entries; a larger index would read and
    // write past the end of the array.
    assert(my_id >= 0 && my_id < kMaxThreads);
    return &nodes_[my_id];
  }

public:
  // Rewinds the shared slot counter when a lock is destroyed.
  // NOTE(review): presumably relies on every benchmark run using fresh
  // threads; a thread that outlives this lock keeps its old slot index while
  // new threads start again from 0 - confirm before reusing elsewhere.
  ~MCSLock() { GetMyNode(true); }

  // Appends the caller's node to the queue and, if there was a predecessor,
  // spins until that predecessor hands the lock over.
  void Lock() {
    Node *mine = GetMyNode();
    mine->waiting.store(false, std::memory_order_relaxed);
    mine->next.store(nullptr, std::memory_order_relaxed);

    Node *old_tail = tail_.exchange(mine);
    if (old_tail == nullptr) {
      return; // queue was empty: the lock is ours immediately
    }
    mine->waiting.store(true, std::memory_order_relaxed);
    // Release: the store of waiting = true above must be visible before the
    // predecessor can find this node. Otherwise its waiting = false hand-off
    // could be ordered first and this thread would spin forever.
    old_tail->next.store(mine, std::memory_order_release);
    while (mine->waiting.load(std::memory_order_acquire)) {
    }
  }

  // Hands the lock to the next queued node, or empties the queue if the
  // caller is still the tail.
  void Unlock() {
    Node *mine = GetMyNode();
    Node *successor = mine->next.load(std::memory_order_acquire);
    if (successor == nullptr) {
      // Strong CAS on purpose: a spurious failure of the weak form with no
      // successor queued would send us into the wait loop below with nobody
      // left to set `next`. A separate `expected` keeps `mine` intact when
      // the CAS fails.
      Node *expected = mine;
      if (tail_.compare_exchange_strong(expected, nullptr,
                                        std::memory_order_release,
                                        std::memory_order_relaxed)) {
        return;
      }
      // Another thread has already swapped itself in as tail but has not
      // linked itself behind us yet; wait for the link.
      while ((successor = mine->next.load(std::memory_order_acquire)) ==
             nullptr) {
      }
    }
    successor->waiting.store(false, std::memory_order_release);
    mine->next.store(nullptr, std::memory_order_relaxed);
  }

  // Last node in the queue; nullptr when the lock is free.
  alignas(64) std::atomic<Node *> tail_{};
  // One node per slot index, indexed by the per-thread id.
  alignas(64) std::array<Node, kMaxThreads> nodes_;
};

// NOTE(review): looks unfinished. Every member is private (including the
// constructor), Lock() only assigns a per-thread slot number and never
// touches slots_, and there is no Unlock().
class ByteLock {
private:
  ByteLock() {}

  // On the calling thread's first call, takes the next value of workers_ as
  // that thread's slot number; later calls on the same thread do nothing.
  void Lock() {
    // Zero-initialized per thread; 0 means "no slot assigned yet".
    thread_local size_t my_slot_ = 0;
    if (unlikely(my_slot_ == 0)) {
      my_slot_ = workers_.fetch_add(1);
    }
  }

  // Next slot number to hand out; starts at 1 so 0 can mean "unassigned".
  std::atomic<size_t> workers_{1};
  // 64 bytes on a 64-byte boundary; not read or written by any code here.
  alignas(64) std::array<char, 64> slots_;
};

// Runs f once and returns how long it took, in whole microseconds.
//
// Uses std::chrono::steady_clock: it is guaranteed monotonic, whereas
// high_resolution_clock may be an alias of the adjustable system clock, which
// can jump while f is running.
uint64_t TimeEvalUs(std::function<void(void)> f) {
  using Clock = std::chrono::steady_clock;
  const auto start = Clock::now();
  f();
  const auto elapsed = Clock::now() - start;
  return std::chrono::duration_cast<std::chrono::microseconds>(elapsed)
      .count();
}

// Measures uncontended Lock()/Unlock() throughput of lock type L on the
// calling thread.
//
// count: number of Lock()/Unlock() pairs to time; defaults to the previously
//        hard-coded 50,000,000.
// Returns pairs per millisecond (count * 1000 / elapsed microseconds) - a
// rate, not a time. The result is inf or NaN if the loop takes less than one
// microsecond.
template <typename L> double BenchmarkSingle(uint64_t count = 50000000) {
  L target;
  const double duration_us = TimeEvalUs([&] {
    for (uint64_t i = 0; i < count; ++i) {
      target.Lock();
      target.Unlock();
    }
  });
  return count * 1000.0 / duration_us;
}

// Counter aligned to 64 bytes, so consecutive elements of an array of
// PaddedCount start 64 bytes apart.
struct PaddedCount {
  alignas(64) uint64_t count{0};

  // Adds one to count; marked always_inline.
  __attribute__((always_inline)) void increment() { count += 1; }
};

// Measures contended throughput of lock type L.
//
// Starts `parallel` worker threads, pins worker i to CPU i, and lets each one
// repeatedly Lock(), bump its own PaddedCount, and Unlock() until the main
// thread raises the stop flag after sleeping for one second.
//
// Returns the total number of increments per millisecond, summed over all
// workers (sum * 1000 / elapsed microseconds). The measured time covers
// thread creation, the one-second sleep and joining the workers.
// Terminates the process with exit(1) if a worker cannot be pinned.
template <typename L> double BenchmarkMulti(int parallel) {
  alignas(64) L target;
  std::vector<PaddedCount> counters;
  counters.resize(parallel);
  const double duration = TimeEvalUs([&] {
    std::atomic<bool> stop_flag{false};
    std::vector<std::thread> workers;
    workers.reserve(parallel);
    for (int i = 0; i < parallel; ++i) {
      workers.emplace_back([&, i]() {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(i, &cpuset);
        // pthread functions report failure through their return value (an
        // error number), not by returning -1 and setting errno. Copy the code
        // into errno so perror() prints the matching message.
        const int rc =
            pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
        if (rc != 0) {
          errno = rc;
          perror("pthread_setaffinity_np");
          exit(1);
        }
        while (!stop_flag.load(std::memory_order_relaxed)) {
          target.Lock();
          counters[i].increment();
          target.Unlock();
        }
      });
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    stop_flag.store(true, std::memory_order_release);
    for (auto &w : workers) {
      w.join();
    }
  });

  // All workers have been joined, so the counters can be read directly.
  uint64_t sum = 0;
  for (const auto &c : counters) {
    sum += c.count;
  }
  return sum * 1000 / duration;
}

double BenchmarkByName(std::string_view name, int parallel) {
  if (name == "Mutex") {
    return BenchmarkMulti<Mutex>(parallel);
  } else if (name == "Exchange") {
    return BenchmarkMulti<ExchangeLock>(parallel);
  } else if (name == "ExchangeWithYield") {
    return BenchmarkMulti<ExchangeLockWithYield>(parallel);
  } else if (name == "ExchangeWithBackoff") {
    return BenchmarkMulti<ExchangeLockWithBackoff>(parallel);
  } else if (name == "TAS") {
    return BenchmarkMulti<TASLock>(parallel);
  } else if (name == "TTAS") {
    return BenchmarkMulti<TTASLock>(parallel);
  } else if (name == "TTASWithYield") {
    return BenchmarkMulti<TTASLockWithYield>(parallel);
  } else if (name == "TTASWithBackoff") {
    return BenchmarkMulti<TTASLockWithBackoff>(parallel);
  } else if (name == "CAS") {
    return BenchmarkMulti<CASLock>(parallel);
  } else if (name == "TCAS") {
    return BenchmarkMulti<TCASLock>(parallel);
  } else if (name == "TCASWithYield") {
    return BenchmarkMulti<TCASLockWithYield>(parallel);
  } else if (name == "TCASWithBackoff") {
    return BenchmarkMulti<TCASLockWithBackoff>(parallel);
  } else if (name == "MCS") {
    return BenchmarkMulti<MCSLock>(parallel);
  }
  return 0;
}

// Prints a CSV-like table to stdout: a header row with the lock names, then
// kRepeats rows per thread count, each holding the thread count followed by
// one BenchmarkByName result per lock.
int main() {
  constexpr int kMaxThreads = 12;
  constexpr int kDatapoints = 8;
  constexpr int kRepeats = 10;

  // Unbuffered stdout, 10 significant digits for the results.
  setvbuf(stdout, nullptr, _IONBF, 0);
  std::cout << std::setprecision(10);

  const std::vector<std::string> targets = {
      "Mutex",
      "Exchange", "ExchangeWithYield", "ExchangeWithBackoff",
      "TAS", "TTAS", "TTASWithYield", "TTASWithBackoff",
      "CAS", "TCAS", "TCASWithYield", "TCASWithBackoff",
      "MCS"
  };

  // Thread counts to measure: 1, then kMaxThreads * step / kDatapoints
  // rounded up for the interior steps, then kMaxThreads itself.
  std::vector<int> parallels{1};
  for (int step = 1; step + 1 < kDatapoints; ++step) {
    const int rounded_up = (kMaxThreads * step + kDatapoints - 1) / kDatapoints;
    parallels.push_back(rounded_up);
  }
  parallels.push_back(kMaxThreads);

  // Header row.
  std::cout << " ";
  for (const std::string &label : targets) {
    std::cout << ", " << label;
  }
  std::cout << "\n";

  // Data rows.
  for (const int parallel : parallels) {
    for (int round = 0; round < kRepeats; ++round) {
      std::cout << parallel;
      for (const std::string &label : targets) {
        std::cout << ", " << BenchmarkByName(label, parallel);
      }
      std::cout << "\n";
    }
  }
}
	Copyright 2023 Hiroki Kumazaki

	Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	#include <assert.h>
	#include <atomic>
	#include <chrono>
	#include <functional>
	#include <iomanip>
	#include <iostream>
	#include <mutex>
	#include <random>
	#include <string>
	#include <string_view>
	#include <thread>
	#include <vector>
	#define unlikely(x) __builtin_expect(!!(x), 0)

	// Baseline lock: adapts std::mutex to the Lock()/Unlock() interface used by
	// the other lock types in this file.
	class Mutex {
	public:
	  std::mutex mutex;

	  // Blocks until the wrapped std::mutex has been acquired.
	  void Lock() {
	    mutex.lock();
	  }

	  // Releases the wrapped std::mutex.
	  void Unlock() {
	    mutex.unlock();
	  }
	};

	// Simplest spin lock: keeps swapping 1 into the flag until the value that was
	// swapped out is not 1, i.e. until this thread is the one that set it.
	class ExchangeLock {
	public:
	  // Busy-waits (no yield, no backoff) until the lock is acquired.
	  void Lock() {
	    for (;;) {
	      const int previous = flag_.exchange(1);
	      if (previous != 1) {
	        return;
	      }
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() {
	    flag_.store(0, std::memory_order_release);
	  }

	  // 0 = free, 1 = held; placed on a 64-byte boundary.
	  alignas(64) std::atomic<int> flag_{0};
	  // Fills the remainder of the 64 bytes that start at flag_.
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// Exchange-based spin lock that calls std::this_thread::yield() after every
	// failed attempt instead of retrying immediately.
	class ExchangeLockWithYield {
	public:
	  // Repeats exchange(1) until the previous value is not 1, yielding between
	  // attempts.
	  void Lock() {
	    for (;;) {
	      if (flag_.exchange(1) != 1) {
	        return;
	      }
	      std::this_thread::yield();
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() {
	    flag_.store(0, std::memory_order_release);
	  }

	  // 0 = free, 1 = held. No alignas here; only the trailing padding.
	  std::atomic<int> flag_{0};
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// Exchange-based spin lock with randomized exponential backoff: after each
	// failed attempt the thread sleeps a random number of nanoseconds drawn from
	// a window that doubles until it exceeds kSleepLimit.
	class ExchangeLockWithBackoff {
	public:
	  ExchangeLockWithBackoff() : flag_(0) {}
	  // Once the backoff window is larger than this, it stops doubling.
	  size_t kSleepLimit = 2048;

	  // Acquires the lock, sleeping between failed exchange(1) attempts.
	  void Lock() {
	    size_t sleep_max = 1;
	    // Seed per thread (as the TTAS/TCAS backoff locks do). A fixed seed gives
	    // every thread the same delay sequence, so contending threads would back
	    // off and retry in lockstep.
	    thread_local std::mt19937 rng(static_cast<std::mt19937::result_type>(
	        std::hash<std::thread::id>()(std::this_thread::get_id())));
	    while (flag_.exchange(1) == 1) {
	      const size_t sleep = rng() % sleep_max;
	      std::this_thread::sleep_for(std::chrono::nanoseconds(sleep));
	      if (sleep_max <= kSleepLimit) {
	        sleep_max *= 2;
	      }
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() { flag_.store(0, std::memory_order_release); }

	  // 0 = free, 1 = held.
	  std::atomic<int> flag_;
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// Test-and-set spin lock on a std::atomic_flag.
	class TASLock {
	public:
	  TASLock() = default;

	  // Spins on test_and_set() until it reports that the flag was clear before
	  // this call set it.
	  void Lock() {
	    while (flag_.test_and_set()) {
	    }
	  }

	  // Clears the flag with release ordering. Uses the standard
	  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
	  // builtin applied to the address of a std::atomic_flag object.
	  void Unlock() { flag_.clear(std::memory_order_release); }

	  // Value-initialized, so the lock starts out clear (unlocked).
	  std::atomic_flag flag_{};
	  char padding_[64 - sizeof(std::atomic_flag)];
	};

	// Test-and-test-and-set spin lock: waits with plain reads of the flag and
	// only attempts the read-modify-write once the flag has been seen clear.
	// Requires C++20 for std::atomic_flag::test().
	class TTASLock {
	public:
	  TTASLock() = default;

	  // Busy-waits until test_and_set() reports the flag was previously clear.
	  void Lock() {
	    for (;;) {
	      // Read-only wait while the lock is observed as held.
	      while (flag_.test(std::memory_order_relaxed)) {
	      }
	      if (!flag_.test_and_set()) {
	        return;
	      }
	    }
	  }

	  // Clears the flag with release ordering. Uses the standard
	  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
	  // builtin applied to the address of a std::atomic_flag object.
	  void Unlock() { flag_.clear(std::memory_order_release); }

	  // Value-initialized, so the lock starts out clear (unlocked).
	  std::atomic_flag flag_{};
	  char padding_[64 - sizeof(std::atomic_flag)];
	};

	// Test-and-test-and-set spin lock that calls std::this_thread::yield() each
	// time it observes the flag as set. Requires C++20 for
	// std::atomic_flag::test().
	class TTASLockWithYield {
	public:
	  TTASLockWithYield() = default;

	  // Waits until test_and_set() reports the flag was previously clear,
	  // yielding whenever a plain read shows the lock is held.
	  void Lock() {
	    for (;;) {
	      if (flag_.test(std::memory_order_relaxed)) {
	        std::this_thread::yield();
	        continue;
	      }
	      if (!flag_.test_and_set()) {
	        return;
	      }
	    }
	  }

	  // Clears the flag with release ordering. Uses the standard
	  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
	  // builtin applied to the address of a std::atomic_flag object.
	  void Unlock() { flag_.clear(std::memory_order_release); }

	  // Value-initialized, so the lock starts out clear (unlocked).
	  std::atomic_flag flag_{};
	  char padding_[64 - sizeof(std::atomic_flag)];
	};

	// Test-and-test-and-set spin lock with randomized exponential backoff: each
	// time the flag is observed set, the thread sleeps a random number of
	// nanoseconds drawn from a window that doubles until it exceeds kSleepLimit.
	// Requires C++20 for std::atomic_flag::test().
	class TTASLockWithBackoff {
	public:
	  TTASLockWithBackoff() = default;
	  // Once the backoff window is larger than this, it stops doubling.
	  size_t kSleepLimit = 2048;

	  // Waits until test_and_set() reports the flag was previously clear.
	  void Lock() {
	    size_t sleep_max = 1;
	    // Per-thread generator seeded from the thread id.
	    thread_local std::mt19937 rng(
	        std::hash<std::thread::id>()(std::this_thread::get_id()));
	    for (;;) {
	      if (flag_.test(std::memory_order_relaxed)) {
	        const size_t sleep = rng() % sleep_max;
	        std::this_thread::sleep_for(std::chrono::nanoseconds(sleep));
	        if (sleep_max <= kSleepLimit) {
	          sleep_max *= 2;
	        }
	        continue;
	      }
	      if (!flag_.test_and_set()) {
	        return;
	      }
	    }
	  }

	  // Clears the flag with release ordering. Uses the standard
	  // atomic_flag::clear(memory_order) rather than the __atomic_clear compiler
	  // builtin applied to the address of a std::atomic_flag object.
	  void Unlock() { flag_.clear(std::memory_order_release); }

	  // Value-initialized, so the lock starts out clear (unlocked).
	  std::atomic_flag flag_{};
	  char padding_[64 - sizeof(std::atomic_flag)];
	};

	// Spin lock built on compare-and-swap: tries to move flag_ from 0 to 1.
	class CASLock {
	public:
	  // Busy-waits until the 0 -> 1 transition succeeds.
	  void Lock() {
	    int expected = 0;
	    while (!flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
	                                        std::memory_order_relaxed)) {
	      // A failed compare_exchange overwrites `expected` with the value it
	      // found (1 while the lock is held). Without this reset the next attempt
	      // would compare against 1 and "succeed" on a lock someone else holds.
	      expected = 0;
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() { flag_.store(0, std::memory_order_release); }

	  // 0 = free, 1 = held; placed on a 64-byte boundary.
	  alignas(64) std::atomic<int> flag_{0};
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// Test-and-CAS spin lock: after a failed compare-and-swap it waits with plain
	// loads until the flag is no longer 1 before trying the CAS again.
	class TCASLock {
	public:
	  // Busy-waits until the 0 -> 1 transition succeeds.
	  void Lock() {
	    for (;;) {
	      // `expected` must be 0 on every attempt: a failed compare_exchange
	      // overwrites it with the observed value, and retrying with 1 could
	      // "succeed" on a lock that another thread has just taken.
	      int expected = 0;
	      if (flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
	                                      std::memory_order_relaxed)) {
	        return;
	      }
	      // Read-only wait while the lock is observed as held.
	      while (flag_.load(std::memory_order_relaxed) == 1) {
	      }
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() { flag_.store(0, std::memory_order_release); }

	  // 0 = free, 1 = held; placed on a 64-byte boundary.
	  alignas(64) std::atomic<int> flag_{0};
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// Test-and-CAS spin lock that calls std::this_thread::yield() while it
	// observes the flag as 1.
	class TCASLockWithYield {
	public:
	  // Waits until the 0 -> 1 transition succeeds.
	  void Lock() {
	    for (;;) {
	      // `expected` must be 0 on every attempt: a failed compare_exchange
	      // overwrites it with the observed value, and retrying with 1 could
	      // "succeed" on a lock that another thread has just taken.
	      int expected = 0;
	      if (flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
	                                      std::memory_order_relaxed)) {
	        return;
	      }
	      while (flag_.load(std::memory_order_relaxed) == 1) {
	        std::this_thread::yield();
	      }
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() { flag_.store(0, std::memory_order_release); }

	  // 0 = free, 1 = held; placed on a 64-byte boundary.
	  alignas(64) std::atomic<int> flag_{0};
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// Test-and-CAS spin lock with randomized exponential backoff: while the flag
	// is observed as 1, the thread sleeps a random number of nanoseconds drawn
	// from a window that doubles until it exceeds kSleepLimit.
	class TCASLockWithBackoff {
	public:
	  // Once the backoff window is larger than this, it stops doubling.
	  size_t kSleepLimit = 2048;

	  // Waits until the 0 -> 1 transition succeeds.
	  void Lock() {
	    size_t sleep_max = 1;
	    // Per-thread generator seeded from the thread id.
	    thread_local std::mt19937 rng(
	        std::hash<std::thread::id>()(std::this_thread::get_id()));
	    for (;;) {
	      // `expected` must be 0 on every attempt: a failed compare_exchange
	      // overwrites it with the observed value, and retrying with 1 could
	      // "succeed" on a lock that another thread has just taken.
	      int expected = 0;
	      if (flag_.compare_exchange_weak(expected, 1, std::memory_order_acquire,
	                                      std::memory_order_relaxed)) {
	        return;
	      }
	      while (flag_.load(std::memory_order_relaxed) == 1) {
	        const size_t sleep = rng() % sleep_max;
	        std::this_thread::sleep_for(std::chrono::nanoseconds(sleep));
	        if (sleep_max <= kSleepLimit) {
	          sleep_max *= 2;
	        }
	      }
	    }
	  }

	  // Frees the lock by storing 0 with release ordering.
	  void Unlock() { flag_.store(0, std::memory_order_release); }

	  // 0 = free, 1 = held; placed on a 64-byte boundary.
	  alignas(64) std::atomic<int> flag_{0};
	  char padding_[64 - sizeof(std::atomic<int>)];
	};

	// MCS queue lock: waiters form a linked list through per-thread nodes, and
	// each waiter spins only on the `waiting` flag of its own node.
	//
	// Node slots are handed out from a counter that is a function-local static,
	// i.e. shared by every MCSLock instance; a thread keeps the slot index it was
	// given for its whole lifetime. Only kMaxThreads slots exist.
	class MCSLock {
	  static constexpr int kMaxThreads = 192;
	  struct Node {
	    alignas(64) std::atomic<bool> waiting{true};
	    std::atomic<Node *> next{nullptr};
	  };

	  // Returns the calling thread's node inside this lock. With reset == true it
	  // instead rewinds the shared slot counter to 0 and returns nullptr.
	  Node *GetMyNode(bool reset = false) {
	    static std::atomic<int> used_id_{0};
	    if (reset) {
	      used_id_.store(0);
	      return nullptr;
	    }
	    thread_local int my_id = std::atomic_fetch_add(&used_id_, 1);
	    // nodes_ has exactly kMaxThreads entries; a larger index would read and
	    // write past the end of the array.
	    assert(my_id >= 0 && my_id < kMaxThreads);
	    return &nodes_[my_id];
	  }

	public:
	  // Rewinds the shared slot counter when a lock is destroyed.
	  // NOTE(review): presumably relies on every benchmark run using fresh
	  // threads; a thread that outlives this lock keeps its old slot index while
	  // new threads start again from 0 - confirm before reusing elsewhere.
	  ~MCSLock() { GetMyNode(true); }

	  // Appends the caller's node to the queue and, if there was a predecessor,
	  // spins until that predecessor hands the lock over.
	  void Lock() {
	    Node *mine = GetMyNode();
	    mine->waiting.store(false, std::memory_order_relaxed);
	    mine->next.store(nullptr, std::memory_order_relaxed);

	    Node *old_tail = tail_.exchange(mine);
	    if (old_tail == nullptr) {
	      return; // queue was empty: the lock is ours immediately
	    }
	    mine->waiting.store(true, std::memory_order_relaxed);
	    // Release: the store of waiting = true above must be visible before the
	    // predecessor can find this node. Otherwise its waiting = false hand-off
	    // could be ordered first and this thread would spin forever.
	    old_tail->next.store(mine, std::memory_order_release);
	    while (mine->waiting.load(std::memory_order_acquire)) {
	    }
	  }

	  // Hands the lock to the next queued node, or empties the queue if the
	  // caller is still the tail.
	  void Unlock() {
	    Node *mine = GetMyNode();
	    Node *successor = mine->next.load(std::memory_order_acquire);
	    if (successor == nullptr) {
	      // Strong CAS on purpose: a spurious failure of the weak form with no
	      // successor queued would send us into the wait loop below with nobody
	      // left to set `next`. A separate `expected` keeps `mine` intact when
	      // the CAS fails.
	      Node *expected = mine;
	      if (tail_.compare_exchange_strong(expected, nullptr,
	                                        std::memory_order_release,
	                                        std::memory_order_relaxed)) {
	        return;
	      }
	      // Another thread has already swapped itself in as tail but has not
	      // linked itself behind us yet; wait for the link.
	      while ((successor = mine->next.load(std::memory_order_acquire)) ==
	             nullptr) {
	      }
	    }
	    successor->waiting.store(false, std::memory_order_release);
	    mine->next.store(nullptr, std::memory_order_relaxed);
	  }

	  // Last node in the queue; nullptr when the lock is free.
	  alignas(64) std::atomic<Node *> tail_{};
	  // One node per slot index, indexed by the per-thread id.
	  alignas(64) std::array<Node, kMaxThreads> nodes_;
	};

	// NOTE(review): looks unfinished. Every member is private (including the
	// constructor), Lock() only assigns a per-thread slot number and never
	// touches slots_, and there is no Unlock().
	class ByteLock {
	private:
	  ByteLock() {}

	  // On the calling thread's first call, takes the next value of workers_ as
	  // that thread's slot number; later calls on the same thread do nothing.
	  void Lock() {
	    // Zero-initialized per thread; 0 means "no slot assigned yet".
	    thread_local size_t my_slot_ = 0;
	    if (unlikely(my_slot_ == 0)) {
	      my_slot_ = workers_.fetch_add(1);
	    }
	  }

	  // Next slot number to hand out; starts at 1 so 0 can mean "unassigned".
	  std::atomic<size_t> workers_{1};
	  // 64 bytes on a 64-byte boundary; not read or written by any code here.
	  alignas(64) std::array<char, 64> slots_;
	};

	// Runs f once and returns how long it took, in whole microseconds.
	//
	// Uses std::chrono::steady_clock: it is guaranteed monotonic, whereas
	// high_resolution_clock may be an alias of the adjustable system clock, which
	// can jump while f is running.
	uint64_t TimeEvalUs(std::function<void(void)> f) {
	  using Clock = std::chrono::steady_clock;
	  const auto start = Clock::now();
	  f();
	  const auto elapsed = Clock::now() - start;
	  return std::chrono::duration_cast<std::chrono::microseconds>(elapsed)
	      .count();
	}

	// Measures uncontended Lock()/Unlock() throughput of lock type L on the
	// calling thread.
	//
	// count: number of Lock()/Unlock() pairs to time; defaults to the previously
	//        hard-coded 50,000,000.
	// Returns pairs per millisecond (count * 1000 / elapsed microseconds) - a
	// rate, not a time. The result is inf or NaN if the loop takes less than one
	// microsecond.
	template <typename L> double BenchmarkSingle(uint64_t count = 50000000) {
	  L target;
	  const double duration_us = TimeEvalUs([&] {
	    for (uint64_t i = 0; i < count; ++i) {
	      target.Lock();
	      target.Unlock();
	    }
	  });
	  return count * 1000.0 / duration_us;
	}

	// Counter aligned to 64 bytes, so consecutive elements of an array of
	// PaddedCount start 64 bytes apart.
	struct PaddedCount {
	  alignas(64) uint64_t count{0};

	  // Adds one to count; marked always_inline.
	  __attribute__((always_inline)) void increment() { count += 1; }
	};

	// Measures contended throughput of lock type L.
	//
	// Starts `parallel` worker threads, pins worker i to CPU i, and lets each one
	// repeatedly Lock(), bump its own PaddedCount, and Unlock() until the main
	// thread raises the stop flag after sleeping for one second.
	//
	// Returns the total number of increments per millisecond, summed over all
	// workers (sum * 1000 / elapsed microseconds). The measured time covers
	// thread creation, the one-second sleep and joining the workers.
	// Terminates the process with exit(1) if a worker cannot be pinned.
	template <typename L> double BenchmarkMulti(int parallel) {
	  alignas(64) L target;
	  std::vector<PaddedCount> counters;
	  counters.resize(parallel);
	  const double duration = TimeEvalUs([&] {
	    std::atomic<bool> stop_flag{false};
	    std::vector<std::thread> workers;
	    workers.reserve(parallel);
	    for (int i = 0; i < parallel; ++i) {
	      workers.emplace_back([&, i]() {
	        cpu_set_t cpuset;
	        CPU_ZERO(&cpuset);
	        CPU_SET(i, &cpuset);
	        // pthread functions report failure through their return value (an
	        // error number), not by returning -1 and setting errno. Copy the code
	        // into errno so perror() prints the matching message.
	        const int rc =
	            pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
	        if (rc != 0) {
	          errno = rc;
	          perror("pthread_setaffinity_np");
	          exit(1);
	        }
	        while (!stop_flag.load(std::memory_order_relaxed)) {
	          target.Lock();
	          counters[i].increment();
	          target.Unlock();
	        }
	      });
	    }
	    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
	    stop_flag.store(true, std::memory_order_release);
	    for (auto &w : workers) {
	      w.join();
	    }
	  });

	  // All workers have been joined, so the counters can be read directly.
	  uint64_t sum = 0;
	  for (const auto &c : counters) {
	    sum += c.count;
	  }
	  return sum * 1000 / duration;
	}

	double BenchmarkByName(std::string_view name, int parallel) {
	if (name == "Mutex") {
	return BenchmarkMulti<Mutex>(parallel);
	} else if (name == "Exchange") {
	return BenchmarkMulti<ExchangeLock>(parallel);
	} else if (name == "ExchangeWithYield") {
	return BenchmarkMulti<ExchangeLockWithYield>(parallel);
	} else if (name == "ExchangeWithBackoff") {
	return BenchmarkMulti<ExchangeLockWithBackoff>(parallel);
	} else if (name == "TAS") {
	return BenchmarkMulti<TASLock>(parallel);
	} else if (name == "TTAS") {
	return BenchmarkMulti<TTASLock>(parallel);
	} else if (name == "TTASWithYield") {
	return BenchmarkMulti<TTASLockWithYield>(parallel);
	} else if (name == "TTASWithBackoff") {
	return BenchmarkMulti<TTASLockWithBackoff>(parallel);
	} else if (name == "CAS") {
	return BenchmarkMulti<CASLock>(parallel);
	} else if (name == "TCAS") {
	return BenchmarkMulti<TCASLock>(parallel);
	} else if (name == "TCASWithYield") {
	return BenchmarkMulti<TCASLockWithYield>(parallel);
	} else if (name == "TCASWithBackoff") {
	return BenchmarkMulti<TCASLockWithBackoff>(parallel);
	} else if (name == "MCS") {
	return BenchmarkMulti<MCSLock>(parallel);
	}
	return 0;
	}

	// Prints a CSV-like table to stdout: a header row with the lock names, then
	// kRepeats rows per thread count, each holding the thread count followed by
	// one BenchmarkByName result per lock.
	int main() {
	  constexpr int kMaxThreads = 12;
	  constexpr int kDatapoints = 8;
	  constexpr int kRepeats = 10;

	  // Unbuffered stdout, 10 significant digits for the results.
	  setvbuf(stdout, nullptr, _IONBF, 0);
	  std::cout << std::setprecision(10);

	  const std::vector<std::string> targets = {
	      "Mutex",
	      "Exchange", "ExchangeWithYield", "ExchangeWithBackoff",
	      "TAS", "TTAS", "TTASWithYield", "TTASWithBackoff",
	      "CAS", "TCAS", "TCASWithYield", "TCASWithBackoff",
	      "MCS"
	  };

	  // Thread counts to measure: 1, then kMaxThreads * step / kDatapoints
	  // rounded up for the interior steps, then kMaxThreads itself.
	  std::vector<int> parallels{1};
	  for (int step = 1; step + 1 < kDatapoints; ++step) {
	    const int rounded_up = (kMaxThreads * step + kDatapoints - 1) / kDatapoints;
	    parallels.push_back(rounded_up);
	  }
	  parallels.push_back(kMaxThreads);

	  // Header row.
	  std::cout << " ";
	  for (const std::string &label : targets) {
	    std::cout << ", " << label;
	  }
	  std::cout << "\n";

	  // Data rows.
	  for (const int parallel : parallels) {
	    for (int round = 0; round < kRepeats; ++round) {
	      std::cout << parallel;
	      for (const std::string &label : targets) {
	        std::cout << ", " << BenchmarkByName(label, parallel);
	      }
	      std::cout << "\n";
	    }
	  }
	}