Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <xmmintrin.h>
typedef int32_t v4 __attribute__ ((vector_size(sizeof(int32_t) * 4)));
inline bool v4equal(v4 a, v4 b)
{
__m128i v1 = _mm_load_si128((__m128i*)&a);
__m128i v2 = _mm_load_si128((__m128i*)&b);
__m128i vcmp = _mm_cmpeq_epi32(v1, v2);
uint16_t mask = _mm_movemask_epi8(vcmp);
return mask == 0xffff;
}
inline bool v4anyGreater(v4 a, v4 b)
{
__m128i v1 = _mm_load_si128((__m128i*)&a);
__m128i v2 = _mm_load_si128((__m128i*)&b);
__m128i vcmp = _mm_cmpgt_epi32(v1, v2);
uint16_t mask = _mm_movemask_epi8(vcmp);
return mask != 0;
}
class Sujiko
{
private:
uint16_t _usedSet;
uint16_t _usedCells;
uint8_t _cells[9];
v4 _sums;
uint8_t _diag1, _diag2;
v4 _clues;
inline bool filled();
inline bool used(uint8_t v);
inline void unset(size_t cell, uint8_t value);
inline bool cantBeValid();
inline bool valid();
public:
Sujiko(uint8_t clue1, uint8_t clue2, uint8_t clue3, uint8_t clue4)
: _usedSet(0),
_usedCells(0),
_cells {0},
_sums {0,0,0,0},
_diag1(clue1 + clue3),
_diag2(clue2 + clue4),
_clues { clue4, clue3, clue2, clue1 }
{}
inline void set(size_t cell, uint8_t value);
bool solve();
void print();
};
static const v4 multipliers[] = {
{ 0, 0, 0, 1 },
{ 0, 0, 1, 1 },
{ 0, 0, 1, 0 },
{ 0, 1, 0, 1 },
{ 1, 1, 1, 1 },
{ 1, 0, 1, 0 },
{ 0, 1, 0, 0 },
{ 1, 1, 0, 0 },
{ 1, 0, 0, 0 },
};
void Sujiko::set(size_t cell, uint8_t value)
{
_sums += multipliers[cell] * value;
_cells[cell] = value;
_usedSet |= 1 << value;
_usedCells |= 1 << cell;
}
void Sujiko::unset(size_t cell, uint8_t value)
{
_usedSet ^= 1 << value;
_usedCells ^= 1 << cell;
_cells[cell] = 0;
_sums -= multipliers[cell] * value;
}
bool Sujiko::filled()
{
return _usedSet == 0b1111111110;
}
bool Sujiko::valid()
{
return filled() && v4equal(_sums, _clues);
}
bool Sujiko::cantBeValid()
{
return
// These identities are from Wikipedia and allow us to bail earlier
(_cells[0] + _cells[8]) + _diag2 > 45 + _cells[4] ||
(_cells[2] + _cells[6]) + _diag1 > 45 + _cells[4] ||
// General properties
v4anyGreater(_sums, _clues);
}
bool Sujiko::used(uint8_t v)
{
return (_usedSet & (1 << v)) != 0;
}
// Precondition: not filled
bool Sujiko::solve()
{
int firstUnusedCell = 4;
// This loop gets unrolled, experiments with running from a higher known
// index didn't make it faster. This static order works from most
// constrained to least constrained.
static const size_t constrainedOrder[] = { 4, 1, 5, 7, 3, 0, 2, 6, 8 };
for (int i = 0; i < 9; i++) {
// The compiler inlines the test values from above
if ((_usedCells & (1 << constrainedOrder[i])) == 0) {
firstUnusedCell = constrainedOrder[i];
break;
}
}
v4 diff = _clues - _sums;
if (firstUnusedCell == 0 || firstUnusedCell == 2 || firstUnusedCell == 6 || firstUnusedCell == 8) {
uint16_t toUse = (((1 << diff[0]) | (1 << diff[1])) | ((1 << diff[2]) | (1 << diff[3]))) & 0b1111111110;
if ((_usedSet ^ toUse) != 0b1111111110) {
// Means we would need to duplicate a value
return false;
} else {
if ((_usedCells & (1 << 0)) == 0)
set(0, diff[3]);
if ((_usedCells & (1 << 2)) == 0)
set(2, diff[2]);
if ((_usedCells & (1 << 6)) == 0)
set(6, diff[1]);
if ((_usedCells & (1 << 8)) == 0)
set(8, diff[0]);
return true;
}
}
uint8_t highestUnused = 63 - __builtin_clzl(~_usedSet & 0b1111111111);
const uint16_t originalSet = _usedSet;
for (uint8_t testValue = highestUnused; testValue != 0; --testValue) {
// Testing against the original set is a bit faster because there is no
// longer a data dependency
if ((originalSet & (1 << testValue)) == 0) {
set(firstUnusedCell, testValue);
if (!cantBeValid()) {
// This is the same as not filled
if (firstUnusedCell != 8) {
if (solve()) {
return true;
}
}
if (valid()) {
return true;
}
}
unset(firstUnusedCell, testValue);
}
}
return false;
}
void Sujiko::print()
{
printf("%d,%d,%d,%d\n%d%d%d\n%d%d%d\n%d%d%d\n",
_clues[3], _clues[2], _clues[1], _clues[0],
_cells[0], _cells[1], _cells[2],
_cells[3], _cells[4], _cells[5],
_cells[6], _cells[7], _cells[8]);
}
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
static const int ATTEMPTS = 300000;
int main(int argc, char* argv[])
{
Sujiko board(22, 21, 24, 24);
board.set(1, 4);
board.set(2, 3);
board.solve();
unsigned long long start = rdtsc();
for (int i = 0; i < ATTEMPTS; i++) {
Sujiko board(22, 21, 24, 24);
board.set(1, 4);
board.set(2, 3);
board.solve();
}
unsigned long long end = rdtsc();
unsigned long long cycles = end - start;
board.print();
printf("Average time is %f cycles, %f us\n", (double)cycles / ATTEMPTS, (double)cycles / 2.8e9 * 1e6 / (double)ATTEMPTS);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.