#include <algorithm> | |
#include <cstdint> | |
#include <cstdio> | |
#include <xmmintrin.h> | |
typedef int32_t v4 __attribute__ ((vector_size(sizeof(int32_t) * 4))); | |
inline bool v4equal(v4 a, v4 b) | |
{ | |
__m128i v1 = _mm_load_si128((__m128i*)&a); | |
__m128i v2 = _mm_load_si128((__m128i*)&b); | |
__m128i vcmp = _mm_cmpeq_epi32(v1, v2); | |
uint16_t mask = _mm_movemask_epi8(vcmp); | |
return mask == 0xffff; | |
} | |
inline bool v4anyGreater(v4 a, v4 b) | |
{ | |
__m128i v1 = _mm_load_si128((__m128i*)&a); | |
__m128i v2 = _mm_load_si128((__m128i*)&b); | |
__m128i vcmp = _mm_cmpgt_epi32(v1, v2); | |
uint16_t mask = _mm_movemask_epi8(vcmp); | |
return mask != 0; | |
} | |
class Sujiko | |
{ | |
private: | |
uint16_t _usedSet; | |
uint16_t _usedCells; | |
uint8_t _cells[9]; | |
v4 _sums; | |
uint8_t _diag1, _diag2; | |
v4 _clues; | |
inline bool filled(); | |
inline bool used(uint8_t v); | |
inline void unset(size_t cell, uint8_t value); | |
inline bool cantBeValid(); | |
inline bool valid(); | |
public: | |
Sujiko(uint8_t clue1, uint8_t clue2, uint8_t clue3, uint8_t clue4) | |
: _usedSet(0), | |
_usedCells(0), | |
_cells {0}, | |
_sums {0,0,0,0}, | |
_diag1(clue1 + clue3), | |
_diag2(clue2 + clue4), | |
_clues { clue4, clue3, clue2, clue1 } | |
{} | |
inline void set(size_t cell, uint8_t value); | |
bool solve(); | |
void print(); | |
}; | |
static const v4 multipliers[] = { | |
{ 0, 0, 0, 1 }, | |
{ 0, 0, 1, 1 }, | |
{ 0, 0, 1, 0 }, | |
{ 0, 1, 0, 1 }, | |
{ 1, 1, 1, 1 }, | |
{ 1, 0, 1, 0 }, | |
{ 0, 1, 0, 0 }, | |
{ 1, 1, 0, 0 }, | |
{ 1, 0, 0, 0 }, | |
}; | |
void Sujiko::set(size_t cell, uint8_t value) | |
{ | |
_sums += multipliers[cell] * value; | |
_cells[cell] = value; | |
_usedSet |= 1 << value; | |
_usedCells |= 1 << cell; | |
} | |
void Sujiko::unset(size_t cell, uint8_t value) | |
{ | |
_usedSet ^= 1 << value; | |
_usedCells ^= 1 << cell; | |
_cells[cell] = 0; | |
_sums -= multipliers[cell] * value; | |
} | |
bool Sujiko::filled() | |
{ | |
return _usedSet == 0b1111111110; | |
} | |
bool Sujiko::valid() | |
{ | |
return filled() && v4equal(_sums, _clues); | |
} | |
bool Sujiko::cantBeValid() | |
{ | |
return | |
// These identities are from Wikipedia and allow us to bail earlier | |
(_cells[0] + _cells[8]) + _diag2 > 45 + _cells[4] || | |
(_cells[2] + _cells[6]) + _diag1 > 45 + _cells[4] || | |
// General properties | |
v4anyGreater(_sums, _clues); | |
} | |
bool Sujiko::used(uint8_t v) | |
{ | |
return (_usedSet & (1 << v)) != 0; | |
} | |
// Precondition: not filled | |
bool Sujiko::solve() | |
{ | |
int firstUnusedCell = 4; | |
// This loop gets unrolled, experiments with running from a higher known | |
// index didn't make it faster. This static order works from most | |
// constrained to least constrained. | |
static const size_t constrainedOrder[] = { 4, 1, 5, 7, 3, 0, 2, 6, 8 }; | |
for (int i = 0; i < 9; i++) { | |
// The compiler inlines the test values from above | |
if ((_usedCells & (1 << constrainedOrder[i])) == 0) { | |
firstUnusedCell = constrainedOrder[i]; | |
break; | |
} | |
} | |
v4 diff = _clues - _sums; | |
if (firstUnusedCell == 0 || firstUnusedCell == 2 || firstUnusedCell == 6 || firstUnusedCell == 8) { | |
uint16_t toUse = (((1 << diff[0]) | (1 << diff[1])) | ((1 << diff[2]) | (1 << diff[3]))) & 0b1111111110; | |
if ((_usedSet ^ toUse) != 0b1111111110) { | |
// Means we would need to duplicate a value | |
return false; | |
} else { | |
if ((_usedCells & (1 << 0)) == 0) | |
set(0, diff[3]); | |
if ((_usedCells & (1 << 2)) == 0) | |
set(2, diff[2]); | |
if ((_usedCells & (1 << 6)) == 0) | |
set(6, diff[1]); | |
if ((_usedCells & (1 << 8)) == 0) | |
set(8, diff[0]); | |
return true; | |
} | |
} | |
uint8_t highestUnused = 63 - __builtin_clzl(~_usedSet & 0b1111111111); | |
const uint16_t originalSet = _usedSet; | |
for (uint8_t testValue = highestUnused; testValue != 0; --testValue) { | |
// Testing against the original set is a bit faster because there is no | |
// longer a data dependency | |
if ((originalSet & (1 << testValue)) == 0) { | |
set(firstUnusedCell, testValue); | |
if (!cantBeValid()) { | |
// This is the same as not filled | |
if (firstUnusedCell != 8) { | |
if (solve()) { | |
return true; | |
} | |
} | |
if (valid()) { | |
return true; | |
} | |
} | |
unset(firstUnusedCell, testValue); | |
} | |
} | |
return false; | |
} | |
void Sujiko::print() | |
{ | |
printf("%d,%d,%d,%d\n%d%d%d\n%d%d%d\n%d%d%d\n", | |
_clues[3], _clues[2], _clues[1], _clues[0], | |
_cells[0], _cells[1], _cells[2], | |
_cells[3], _cells[4], _cells[5], | |
_cells[6], _cells[7], _cells[8]); | |
} | |
static __inline__ unsigned long long rdtsc(void) | |
{ | |
unsigned hi, lo; | |
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); | |
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); | |
} | |
static const int ATTEMPTS = 300000; | |
int main(int argc, char* argv[]) | |
{ | |
Sujiko board(22, 21, 24, 24); | |
board.set(1, 4); | |
board.set(2, 3); | |
board.solve(); | |
unsigned long long start = rdtsc(); | |
for (int i = 0; i < ATTEMPTS; i++) { | |
Sujiko board(22, 21, 24, 24); | |
board.set(1, 4); | |
board.set(2, 3); | |
board.solve(); | |
} | |
unsigned long long end = rdtsc(); | |
unsigned long long cycles = end - start; | |
board.print(); | |
printf("Average time is %f cycles, %f us\n", (double)cycles / ATTEMPTS, (double)cycles / 2.8e9 * 1e6 / (double)ATTEMPTS); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment