Last active
April 26, 2021 21:28
-
-
Save alkis/7fd9678e64ae885fd9a4135ee7411360 to your computer and use it in GitHub Desktop.
c++ library multiversioning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Compile-time map of processor -> feature.
//
// This is a declaration only; no definition or specialization appears in this
// snippet. The templates that follow use FeatureMap<CPU, X> to pick a bool
// template argument, so each (CPU, FEATURE) specialization presumably has to
// yield a compile-time boolean.
// NOTE(review): confirm against the real specializations.
template <int CPU, int FEATURE>
struct FeatureMap;
// all intrinsics | |
template <int CPU, bool has_bmi2 = FeatureMap<CPU, BMI2>> | |
struct PDep { | |
uint64_t pdep64(uint64_t x, uint64_t m) { /* implementation without pdep */ } | |
}; | |
template <int CPU> | |
struct PDep<CPU, true> | |
uint64_t pdep64(uint64_t x, uint64_t m) __attribute__((target("bmi2"))) { | |
_pdep_u64(x, m); | |
} | |
}; | |
// a bunch of higher level primitives | |
template <int CPU, | |
bool has_bmi = FeatureMap<CPU, BMI>, | |
bool has_bmi2 = FeatureMap<CPU, BMI2>> | |
struct Select { | |
int select64(uint64_t x, int i) { /* implementation without bmi + bmi2 */ } | |
}; | |
template <int CPU> | |
struct Select<CPU, true, true> { | |
int select64(uint64_t x, int i) __attribute__((target("bmi,bmi2"))) { | |
// This will be properly inlined and will result in 2 instructions. | |
// This is great because we have intrinsics/primitives that are composable | |
// and inlinable. | |
return TZCnt<CPU>::tzcnt64(PDep<CPU>::pdep64(uint64_t{1} << n, x)); | |
} | |
}; | |
// Bundle all ops (intrinsics + primitives) in a single struct. | |
template <int CPU> | |
struct Ops : Pdep<CPU>, Select<CPU>, etc... {}; | |
// The CPU dispatcher. This is where we need the requested intrinsic. | |
template <class Fn, class... Args> | |
auto SelectCPU(Fn&& fn, Args&&... args) { | |
switch (GetCPU()) { | |
case SKYLAKE: | |
// While our ops are composable and inlinable we cannot inline them into fn. | |
// This is because the compiler does not know that at this point we are running | |
// on SKYLAKE so our fn when instantiated with Ops<SKYLAKE> implicitly has | |
// __attributes__((target("arch=skylake")). As such it will not inline any of | |
// the functions in Ops<SKYLAKE>. Also it will not use target specific info | |
// for this instantiation of fn (instruction latencies, etc). | |
return fn(Ops<SKYLAKE>{}, std::forward<Args>(args)...); | |
case IVYBRIDGE: | |
// Same here. | |
return fn(Ops<IVYBRIDGE>{}, std::forward<Args>(args)...); | |
// etc | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment