Skip to content

Instantly share code, notes, and snippets.

@alkis
Last active April 26, 2021 21:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alkis/7fd9678e64ae885fd9a4135ee7411360 to your computer and use it in GitHub Desktop.
Save alkis/7fd9678e64ae885fd9a4135ee7411360 to your computer and use it in GitHub Desktop.
c++ library multiversioning
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Compile-time map of processor -> feature.
// Only the primary template is declared here; no definition or specialization
// is visible in this part of the file.
// NOTE(review): presumably specialized per (cpu, feature) pair elsewhere --
// confirm what the specializations expose.
template <int Cpu, int Feature>
struct FeatureMap;
// all intrinsics
template <int CPU, bool has_bmi2 = FeatureMap<CPU, BMI2>>
struct PDep {
uint64_t pdep64(uint64_t x, uint64_t m) { /* implementation without pdep */ }
};
template <int CPU>
struct PDep<CPU, true>
uint64_t pdep64(uint64_t x, uint64_t m) __attribute__((target("bmi2"))) {
_pdep_u64(x, m);
}
};
// a bunch of higher level primitives
template <int CPU,
bool has_bmi = FeatureMap<CPU, BMI>,
bool has_bmi2 = FeatureMap<CPU, BMI2>>
struct Select {
int select64(uint64_t x, int i) { /* implementation without bmi + bmi2 */ }
};
// Specialization for CPUs that have both BMI and BMI2.
template <int CPU>
struct Select<CPU, true, true> {
  // Returns the bit index of the i-th (0-based) set bit of `x`.
  //
  // Depositing the single bit (1 << i) through mask `x` lands it on the i-th
  // set bit of `x`; counting trailing zeros then gives that bit's index.
  // `i` must be in [0, 63], otherwise the shift is undefined.
  //
  // This is meant to inline down to two instructions (pdep + tzcnt): the
  // intrinsics/primitives are composable and inlinable because they carry
  // compatible target attributes.
  //
  // The target attribute is written before the declarator because GCC does
  // not accept a trailing attribute on a function definition.
  __attribute__((target("bmi,bmi2")))
  static int select64(uint64_t x, int i) {
    return TZCnt<CPU>::tzcnt64(PDep<CPU>::pdep64(uint64_t{1} << i, x));
  }
};
// Bundle all ops (intrinsics + primitives) in a single struct, so that one
// Ops<CPU> object gives access to every operation specialized for that CPU.
template <int CPU>
struct Ops : PDep<CPU>, Select<CPU> /* , ...remaining intrinsic/primitive bundles */ {};
// The CPU dispatcher. This is where we need the requested intrinsic.
//
// Calls `fn` with an Ops<CPU> object for the CPU reported by GetCPU(),
// followed by the perfectly forwarded `args`, and returns fn's result.
// Every case must yield the same return type, because the return type is
// deduced.
template <class Fn, class... Args>
auto SelectCPU(Fn&& fn, Args&&... args) {
  switch (GetCPU()) {
    case SKYLAKE:
      // While our ops are composable and inlinable, they cannot be inlined
      // into fn. The compiler has no way of knowing that this branch only
      // runs on SKYLAKE, so the instantiation of fn with Ops<SKYLAKE> is not
      // compiled as if it carried __attribute__((target("arch=skylake"))).
      // As a result none of the target-attributed functions in Ops<SKYLAKE>
      // get inlined into it, and this instantiation of fn is not tuned with
      // target-specific information (instruction latencies, etc).
      return std::forward<Fn>(fn)(Ops<SKYLAKE>{}, std::forward<Args>(args)...);
    case IVYBRIDGE:
      // Same here.
      return std::forward<Fn>(fn)(Ops<IVYBRIDGE>{}, std::forward<Args>(args)...);
    // etc
    // NOTE(review): every value GetCPU() can return needs a case (or a
    // default that dispatches to a baseline Ops). Otherwise control flows off
    // the end of a value-returning function, which is undefined behavior.
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment