// marchete/CSB_avx_sim.cpp Secret

// CSB_avx_sim.cpp
#pragma GCC optimize("Ofast","unroll-loops","omit-frame-pointer","inline")
#pragma GCC option("arch=native","tune=native","no-zeroupper")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2")
#include <immintrin.h> //SSE Extensions
//{Headers
#include <bits/stdc++.h> //All main STD libraries
using namespace std;

// File-local shorthands used by every helper below.
// STI: helpers are declared `static inline`.
#define STI static inline
// FOR0 / FOR: half-open int loops [0,n) and [a,b). Note that FOR does not parenthesize `a`.
#define FOR0(i,n) for(int i=0;i<(n);++i)
#define FOR(i,a,b) for(int i=a;i<(b);++i)
// SQ evaluates its argument twice; do not pass expressions with side effects.
#define SQ(x) ((x)*(x))
// OP abbreviates the `operator` keyword in the overload lists.
#define OP operator
// Short aliases for wider / unsigned integer types.
typedef long long ll;
typedef unsigned long ulong;
typedef unsigned int uint;

//} Headers

//SIMCOUNT:2686800 MAX:2732400 Min:1803600 Mean:2484715
#define MAX_ITERATIONS_TURN 6
// Alignment attributes: 32 bytes for __m256 data, 16 bytes for __m128/__m128i data.
#define ALIGN __attribute__((aligned(32)))
#define ALIGN16 __attribute__((aligned(16)))
//divisions y= _mm_castps_si128(_mm_div_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(q));

//>>Simulation Tests, you need to play against yourself!!!!!
#define SIMLOG
int SIM_TURNS_FAILED = 0; //Turns with simulation fails
int SIM_OVERLOAD = 0; //Turns with excessive simulation steps
int SIM_Vector_Not_Noised = 0; //AVX index without noise, will be randomized
//<<Simulation Tests


//Unit Tests GA. Seems broken....
//#define TESTGENOMA

// Global counter, zero-initialised here; it is not updated anywhere in this part of the file.
int SIMCOUNT = 0;

typedef __m256 v8;//AVX

// Returns a vector whose 8 float lanes all hold the compile-time ratio i0/i1.
// The table is a function-local static per instantiation; it is written through the
// float array and read back through the __m256 member of the same union.
template <int i0,int i1>
static inline  __m256 C8f(){
    static const union {float lane[8]; __m256 packed;} table = {{
        (float)i0/i1, (float)i0/i1, (float)i0/i1, (float)i0/i1,
        (float)i0/i1, (float)i0/i1, (float)i0/i1, (float)i0/i1}};
    return table.packed;
}


// Tuning constants. Their consumers are outside this part of the file, so the names are the
// only indication of their role (presumably genetic-algorithm parameters — confirm at use sites).
constexpr int DEPTH = 6;
constexpr int POPULATION = 50;
constexpr int ELITISM = 4;
constexpr int MUTATION = 4;
constexpr int MUTATION_RATE = 1;
constexpr int MixElite = ELITISM /2;   // integer division: 2
constexpr int SHIELD_PROB = 20*127/100; //20% in base 127 (integer arithmetic: 25)

#define USEMALUSSCORE
constexpr int MalusPercent = 70;

//#define TESTSIM
//#define VERBOSELOG

bool FirstPlayer = false;


//{Angles
// Number of entries in each angle lookup table below.
const int16_t ANGLES_LENGTH = 360;
// Scalar conversion factors between degrees and radians, plus 1/pi.
const float INV_PI = 1.0f/M_PI;
const float TO_RAD = M_PI / 180.0f;
const float TO_DEG = 180.0f/M_PI;
// 8-lane broadcasts of TO_DEG and of 360.
const ALIGN v8 v8TO_DEG = {TO_DEG,TO_DEG,TO_DEG,TO_DEG,TO_DEG,TO_DEG,TO_DEG,TO_DEG};
const ALIGN v8 v8_360 = {360.0f,360.0f,360.0f,360.0f,360.0f,360.0f,360.0f,360.0f};

// Lookup tables filled by InitAngles(); they hold zeros until it has been called.
static float fast_cos[ANGLES_LENGTH];
static float fast_sin[ANGLES_LENGTH];
static float fast_angle[ANGLES_LENGTH];
// Fills fast_angle / fast_sin / fast_cos: slot i holds the angle i*360/ANGLES_LENGTH in
// degrees together with the sine and cosine of that angle.
static inline void InitAngles() {
	constexpr float slots = (float)ANGLES_LENGTH;
	int i = 0;
	while (i < ANGLES_LENGTH) {
		const float degrees = i*360.0f/slots;
		const float radians = degrees * TO_RAD;
		fast_angle[i] = degrees;
		fast_sin[i] = sinf(radians);
		fast_cos[i] = cosf(radians);
		++i;
	}
}
//} Angles

//{Stopwatch
#define TIMEOUT0_0 300000
#define TIMEOUT0_1 950000
#define TIMEOUTN_0  30000  //Calc enemy
#define TIMEOUTN_1 144500  //Calc me
#define Now() chrono::high_resolution_clock::now()
static struct Stopwatch {
	chrono::high_resolution_clock::time_point c_time,c_timeout;
	void Start() {c_time=Now();}
	void setTimeout(int us){c_timeout=c_time+chrono::microseconds(us);}
	inline bool Timeout(){return Now()>c_timeout;}
	ll EllapsedMicroseconds(){return chrono::duration_cast<chrono::microseconds>(Now()-c_time).count();}
	ll EllapsedMilliseconds(){return chrono::duration_cast<chrono::milliseconds>(Now()-c_time).count();}
} stopwatch;
//} Stopwatch


//{SSE Short 16-bit integers signed 8x16
#define v8izero _mm_setzero_si128()
#define Pv8iab v8i const &a,v8i const &b
#define Pv8ias v8i const &a,int16_t b
// Value wrapper around one __m128i treated as 8 signed 16-bit lanes.
// The converting constructors are deliberately implicit: the operator overloads defined
// after the class return raw __m128i values and rely on these conversions.
class v8i{
public:
  __m128i ALIGN16 v;
 // All lanes zero.
 v8i(){v= v8izero;}
 // Broadcasts f (truncated to 16 bits) to every lane.
 v8i(int f){v=_mm_set1_epi16((int16_t)f);}
 // Lane 0 receives i0, lane 7 receives i7.
 v8i(int16_t i0,int16_t i1,int16_t i2,int16_t i3,int16_t i4,int16_t i5,int16_t i6,int16_t i7){v=_mm_setr_epi16(i0, i1, i2, i3,i4,i5,i6,i7);}
 v8i(__m128i const& x){v=x;}
// OP __m128i() const {return v;}
  // Unaligned / 16-byte-aligned loads and stores of 8 consecutive int16_t values.
  void load(int16_t const * p){v=_mm_loadu_si128((__m128i const*)p);}
  void loada(int16_t const * p){v=_mm_load_si128((__m128i const*)p);}
  void store(int16_t * p) {_mm_storeu_si128((__m128i*)p,v);}
  void storea(int16_t * p){_mm_store_si128((__m128i*)p, v);}
  // Overwrites lane `index` with `value`; an index above 7 leaves the vector untouched.
  // The switch exists because _mm_insert_epi16 needs a compile-time lane number.
  void insert(uint32_t index, int16_t value) {
        switch(index) {
        case 0:v = _mm_insert_epi16(v,value,0);  break;
        case 1:v = _mm_insert_epi16(v,value,1);  break;
        case 2:v = _mm_insert_epi16(v,value,2);  break;
        case 3:v = _mm_insert_epi16(v,value,3);  break;
        case 4:v = _mm_insert_epi16(v,value,4);  break;
        case 5:v = _mm_insert_epi16(v,value,5);  break;
        case 6:v = _mm_insert_epi16(v,value,6);  break;
        case 7:v = _mm_insert_epi16(v,value,7);  break;
        default: break;  }
    }
  // Returns lane `index` as a signed 16-bit value.
  // An index above 7 yields 0; previously control fell off the end of the function,
  // which is undefined behaviour for a function with a non-void return type.
  int16_t OP [] (uint32_t index) const {
    switch(index) {
		case 0:return (int16_t)_mm_extract_epi16(v,0);
		case 1:return (int16_t)_mm_extract_epi16(v,1);
		case 2:return (int16_t)_mm_extract_epi16(v,2);
		case 3:return (int16_t)_mm_extract_epi16(v,3);
		case 4:return (int16_t)_mm_extract_epi16(v,4);
		case 5:return (int16_t)_mm_extract_epi16(v,5);
		case 6:return (int16_t)_mm_extract_epi16(v,6);
		case 7:return (int16_t)_mm_extract_epi16(v,7);
		default:return 0; }
  }
};
// Lane-wise operators on 8 packed signed 16-bit integers.
// The (v8i,int16_t) overloads broadcast the scalar to all lanes first.
// Add / subtract / multiply keep the low 16 bits of each result (no saturation).
STI v8i OP +(Pv8iab){return _mm_add_epi16(a.v,b.v);}
STI v8i OP +(Pv8ias){return _mm_add_epi16(a.v,_mm_set1_epi16((int16_t)b));}
STI v8i OP -(Pv8iab){return _mm_sub_epi16(a.v,b.v);}
STI v8i OP -(Pv8ias){return _mm_sub_epi16(a.v,_mm_set1_epi16((int16_t)b));}
STI v8i OP -(v8i const &a){return _mm_sub_epi16(_mm_setzero_si128(),a.v);}

// Increment / decrement every lane by one; compound assignments defer to the binary operators.
STI v8i &OP ++(v8i &a){a=a+1;return a;}
STI v8i OP ++(v8i &a,int){v8i ALIGN16 a0(a);a=a+1;return a0;}
STI v8i &OP +=(v8i &a,v8i const &b){a=a+b;return a;}
STI v8i &OP --(v8i &a){a=a-1;return a;}
STI v8i OP --(v8i &a,int){v8i ALIGN16 a0=a;a=a-1;return a0;}
STI v8i &OP -=(v8i &a,v8i const &b){a=a-b;return a;}

STI v8i OP *(Pv8iab){return _mm_mullo_epi16(a.v,b.v);}
STI v8i OP *(Pv8ias){return _mm_mullo_epi16(a.v,_mm_set1_epi16((int16_t)b));}
STI v8i &OP *=(v8i &a,v8i const &b){a=a*b;return a;}
// operator! is a lane-wise "== 0" test (all-ones where the lane is zero); operator~ flips every bit.
STI v8i OP !(v8i const &a){return _mm_cmpeq_epi16(a.v,_mm_setzero_si128());}  //  return _mm_xor_si128(a, _mm_set1_epi16(-1));}
STI v8i OP ~(v8i const &a){return _mm_xor_si128(a.v, _mm_set1_epi32(-1));}
// && and || are plain bitwise AND / OR here: both operands are always evaluated (no short-circuit).
STI v8i OP &(Pv8iab){return _mm_and_si128(a.v,b.v);}
STI v8i OP &&(Pv8iab){return _mm_and_si128(a.v,b.v);}
STI v8i &OP &=(v8i &a,v8i const &b){a=a&b;return a;}
STI v8i OP |(Pv8iab){return _mm_or_si128(a.v,b.v);}
STI v8i OP ||(Pv8iab){return _mm_or_si128(a.v,b.v);}
STI v8i &OP |=(v8i &a,v8i const &b){a=a|b;return a;}
STI v8i OP ^(Pv8iab){return _mm_xor_si128(a.v,b.v);}
STI v8i &OP ^=(v8i &a,v8i const &b){a=a^b;return a;}
// Comparisons return a mask: 0xFFFF in lanes where the relation holds, 0 elsewhere (signed compare).
STI v8i OP ==(Pv8iab){return _mm_cmpeq_epi16(a.v, b.v);}
STI v8i OP !=(Pv8iab){return  ~(_mm_cmpeq_epi16(a.v, b.v));}
STI v8i OP >(Pv8iab){return  _mm_cmpgt_epi16(a.v,b.v); }
STI v8i OP <(Pv8iab){return  _mm_cmpgt_epi16(b.v,a.v); }
STI v8i OP >=(Pv8iab){return  ~(_mm_cmpgt_epi16(b.v,a.v)); }
STI v8i OP <=(Pv8iab){return  ~(_mm_cmpgt_epi16(a.v,b.v)); }
// NOTE(review): _mm_andnot_si128 complements its FIRST operand, so this returns (~a)&b.
// The float overload andnot(v8,v8) in this file returns a&(~b) instead — confirm which order callers expect.
STI v8i andnot(Pv8iab){return _mm_andnot_si128(a.v,b.v);}  //(~a)&b
// Shifts apply the same count to every lane: << shifts in zeros, >> is arithmetic (sign-filling).
STI v8i OP <<(Pv8ias){return _mm_sll_epi16(a.v,_mm_cvtsi32_si128(b));}  //STI v8i shl8(Pv8iab){return _mm_sll_epi16(a,b);}
STI v8i OP >>(Pv8ias){return _mm_sra_epi16(a.v,_mm_cvtsi32_si128(b));}  //STI v8i shr8(Pv8iab){return _mm_sra_epi16(a,b);}
STI v8i &OP <<=(v8i &a,int const &b){a = a<<b;return a;}
STI v8i &OP >>=(v8i &a,int const &b){a = a>>b;return a;}
// Divides every signed 16-bit lane of `a` by the scalar `b`, truncating toward zero exactly
// like C++ integer division. `b` must be non-zero.
//
// Divisors 2,4,...,256 take a shift-based fast path. A bare arithmetic shift rounds toward
// minus infinity (-3>>1 == -2) whereas the scalar fallback truncates (-3/2 == -1), so
// negative lanes used to get different results depending on the divisor. The fast path now
// adds (2^k - 1) to negative lanes before shifting, which makes both paths agree.
// Results for non-negative lanes are unchanged.
STI v8i OP /(Pv8ias){
    int k = 0;                       // log2(b) for the power-of-two divisors
    switch(b) {
     case 1:return a;
     case 2:k=1;break;   case 4:k=2;break;   case 8:k=3;break;
     case 16:k=4;break;  case 32:k=5;break;  case 64:k=6;break;
     case 128:k=7;break; case 256:k=8;break;
     default: {    //Baaaaaaaad way :S  (scalar fallback, lane by lane)
        int16_t d[8];_mm_storeu_si128((__m128i*)d,a.v);
        FOR0(i,8) d[i] /= b;
        return _mm_loadu_si128((__m128i const*)d);
     }
    }
    const __m128i sign = _mm_srai_epi16(a.v,15);                          // 0 or 0xFFFF per lane
    const __m128i bias = _mm_srl_epi16(sign,_mm_cvtsi32_si128(16-k));     // 2^k-1 for negative lanes, else 0
    return _mm_sra_epi16(_mm_add_epi16(a.v,bias),_mm_cvtsi32_si128(k));
}
// Lane-wise signed division a[i]/b[i], done with scalar code because there is no packed
// 16-bit integer divide. Every lane of b must be non-zero.
STI v8i OP /(Pv8iab){ //Baaaaaaaad way :S
    int16_t quotient[8];
    _mm_storeu_si128((__m128i*)quotient,a.v);
    for (int lane=0;lane<8;++lane) {
        quotient[lane] /= b[lane];
    }
    return _mm_loadu_si128((__m128i const*)quotient);
}
// In-place lane-wise division.
STI v8i &OP /=(v8i &a,v8i const &b){
    a = a/b;
    return a;
}
// Prints the 8 lanes as "v8i: [l0,l1,...,l7,]" (a comma follows every lane, including the last).
STI ostream &OP<<(ostream& output, const v8i& p){
    output<<"v8i: [";
    for (int lane=0;lane<8;++lane) {
        output<<p[lane]<<",";
    }
    output << "]";
    return output;
}
// Conditional helpers. The selector (s / f) is expected to be a comparison mask: every 16-bit
// lane either 0 (false) or 0xFFFF (true). _mm_blendv_epi8 selects per byte from the top bit of
// each selector byte, and the if_add/if_sub forms AND the mask with b, so other values give mixed results.
// if_select: s ? a : b per lane.
STI v8i if_select(v8i const &s,Pv8iab){return _mm_blendv_epi8(b.v,a.v,s.v);}
// if_add / if_sub: f ? a+b : a  and  f ? a-b : a.
STI v8i if_add(v8i const &f,Pv8iab){return a + (f&b);}
STI v8i if_sub(v8i const &f,Pv8iab){return a - (f&b);}
// if_mul / if_div: unselected lanes are multiplied / divided by 1, i.e. left unchanged.
STI v8i if_mul(v8i const &f,Pv8iab){return a*if_select(f,b,_mm_set1_epi16(1));}
STI v8i if_div(v8i const &f,Pv8iab){return a/if_select(f,b,_mm_set1_epi16(1));}
// when_* variants: same operations applied in place to `res`.
STI void when_select(v8i& res,v8i const &s,const v8i&a){res=_mm_blendv_epi8(res.v,a.v,s.v);}
STI void when_add(v8i& res,const v8i &f,const v8i& b){res+=(f&b);}
STI void when_sub(v8i& res,const v8i &f,const v8i& b){res-=(f&b);}
STI void when_mul(v8i& res,const v8i &f,const v8i& b){res*=if_select(f,b,_mm_set1_epi16(1));}
STI void when_div(v8i& res,const v8i &f,const v8i& b){res/=if_select(f,b,_mm_set1_epi16(1));}

// Lane-wise signed maximum / minimum.
STI v8i max(Pv8iab){return _mm_max_epi16(a.v,b.v);}
STI v8i min(Pv8iab){return _mm_min_epi16(a.v,b.v);}
// Lane-wise absolute value: _mm_sign_epi16(a,a) negates the lanes of a that are negative.
STI v8i abs(v8i const &a){ return _mm_sign_epi16(a.v,a.v);}
// True when the top bit of every byte is set (i.e. every lane of a comparison mask is true).
STI bool horizontal_and(v8i const &a){return _mm_movemask_epi8(a.v) == 0xFFFF;}
// True when any bit of the vector is set.
STI bool horizontal_or(v8i const &a){return ! _mm_testz_si128(a.v,a.v);}
// Sum of the 8 lanes, truncated to 16 bits.
STI int16_t horizontal_add(v8i const & a) {
    __m128i sum1  = _mm_hadd_epi16(a.v,a.v);                   // horizontally add 8 elements in 3 steps
    __m128i sum2  = _mm_hadd_epi16(sum1,sum1);
    __m128i sum3  = _mm_hadd_epi16(sum2,sum2);
    int16_t sum4  = (int16_t)_mm_cvtsi128_si32(sum3);      // 16 bit sum
    return  sum4;                                          // returned as int16_t (wraps on overflow)
}


// Lane accessors returning int: lane b, or lane 0 when no index is given.
STI int get(const v8i&a,const int& b){return (int)a[b];}
STI int get(const v8i&a){return (int)a[0];}

// Rotates the bits of every 16-bit lane left by b positions; only the low 4 bits of the
// count are used, so the rotation amount is effectively b mod 16.
STI v8i rotate_left(v8i const & a, int16_t b) {
    const __m128i upCount   = _mm_cvtsi32_si128(b & 0x0F);
    const __m128i downCount = _mm_cvtsi32_si128((16-b) & 0x0F);
    const __m128i shiftedUp = _mm_sll_epi16(a.v,upCount);      // a << b
    const __m128i wrapped   = _mm_srl_epi16(a.v,downCount);    // a >> (16 - b), zero-filled
    return _mm_or_si128(shiftedUp,wrapped);
}
// Rotates right by b positions, expressed as a left rotation by -b.
STI v8i rotate_right(v8i const & a, int16_t b) {
    return rotate_left(a,-b);
}

// Compile-time v8i constants. The values are written into the int16_t array of a function-local
// static union and read back through its v8i member (type punning through a union, which GCC supports).
// 8-argument form: lane k holds ik.
template <int16_t i0,int16_t i1,int16_t i2,int16_t i3,int16_t i4,int16_t i5,int16_t i6,int16_t i7>
STI v8i C8i(){static const union {int16_t f[8];v8i ymm;} u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; return u.ymm;}
// 1-argument form: i0 broadcast to all 8 lanes.
template <int16_t i0>
STI v8i C8i(){static const union {int16_t f[8];v8i ymm;} u = {{i0,i0,i0,i0,i0,i0,i0,i0}}; return u.ymm;}

//}SSE short 16-bit integers signed 8x16

//{AVX Float operators 8x32
#define P8fa v8 const &a
#define P8fab v8 const &a,v8 const &b
// Vector whose 8 lanes all hold numeric_limits<uint>::max() converted to float.
STI v8 C8fMax(){
    const float top = static_cast<float>(numeric_limits<uint>::max());
    static const union {float lane[8];v8 packed;} table = {{top,top,top,top,top,top,top,top}};
    return table.packed;
}
// Vector built from 8 raw 32-bit integer bit patterns (lane k holds the bits of ik); the
// integers are reinterpreted as floats, not converted.
template <int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7>
STI v8 constant8f(){
    static const union {int bits[8];v8 packed;} table = {{i0,i1,i2,i3,i4,i5,i6,i7}};
    return table.packed;
}
// Prints the 8 float lanes as "[l0,l1,...,l7,]" (a comma follows every lane, including the last).
ostream &operator<<(ostream& output, const v8& p){
	output <<  "[";//"v8 : [";
	for (int lane = 0; lane < 8; ++lane) {
		output << p[lane] << ",";
	}
	output << "]";
	return output;
}
// Large negative sentinel value (scalar form); v8minusINF below is its 8-lane broadcast.
static const float minusINF = -850000.0f;

//static const ALIGN v8 v8zero = {0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f};
#define v8zero _mm256_setzero_ps()
// 8-lane broadcasts of frequently used float constants.
static const ALIGN v8 v8one = {1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f,1.0f};
static const ALIGN v8 v8minusone = {-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f,-1.0f};
static const ALIGN v8 v8two = {2.0f,2.0f,2.0f,2.0f,2.0f,2.0f,2.0f,2.0f};
static const ALIGN v8 v8minustwo = {-2.0f,-2.0f,-2.0f,-2.0f,-2.0f,-2.0f,-2.0f,-2.0f};
static const ALIGN v8 v8three = {3.0f,3.0f,3.0f,3.0f,3.0f,3.0f,3.0f,3.0f};
static const ALIGN v8 v8four = {4.0f,4.0f,4.0f,4.0f,4.0f,4.0f,4.0f,4.0f};
static const ALIGN v8 v8five = {5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f,5.0f};
static const ALIGN v8 v8minusINF = {-850000.0f,-850000.0f,-850000.0f,-850000.0f,-850000.0f,-850000.0f,-850000.0f,-850000.0f};
// Broadcasts of 1/pi, pi and pi/2.
static const ALIGN v8 v8INV_PI = {INV_PI,INV_PI,INV_PI,INV_PI,INV_PI,INV_PI,INV_PI,INV_PI};
static const ALIGN v8 v8PI = {(float)M_PI,(float)M_PI,(float)M_PI,(float)M_PI,(float)M_PI,(float)M_PI,(float)M_PI,(float)M_PI};
static const ALIGN v8 v8HALFPI = {(float)M_PI/2.0,(float)M_PI/2.0,(float)M_PI/2.0,(float)M_PI/2.0,(float)M_PI/2.0,(float)M_PI/2.0,(float)M_PI/2.0,(float)M_PI/2.0};

// Broadcasts f to all 8 lanes.
STI v8 loadv8(float f){return _mm256_set1_ps(f);}


// Builds a vector with f0 in lane 0 ... f7 in lane 7.
STI v8 loadv8(float f0,float f1,float f2,float f3,float f4,float f5,float f6,float f7){return _mm256_setr_ps(f0,f1,f2,f3,f4,f5,f6,f7);}
// Loads / stores 8 consecutive floats. The *u_ forms accept any address; the *a_ forms
// use the aligned intrinsics, which require p to be 32-byte aligned.
STI v8 loadu_v8(float const * p){return _mm256_loadu_ps(p);}
STI v8 loada_v8(float const * p){return _mm256_load_ps(p); }
STI void storeu_v8(v8 const &ymm,float * p)   {_mm256_storeu_ps(p,ymm);  }
STI void storea_v8(v8 const &ymm,float * p)   {_mm256_store_ps(p,ymm);   }

// Lane-wise arithmetic; the (v8,float) overloads broadcast the scalar first.
STI v8 add(P8fab){return _mm256_add_ps(a,b);}
STI v8 add(P8fa,float b){return _mm256_add_ps(a,loadv8(b));}
STI v8 sub(P8fab){return _mm256_sub_ps(a,b);}
STI v8 sub(P8fa,float b){return _mm256_sub_ps(a,loadv8(b));}
STI v8 mul(P8fab){return _mm256_mul_ps(a,b);}
STI v8 mul(P8fa,float b){return _mm256_mul_ps(a,loadv8(b));}
STI v8 div(P8fab){return _mm256_div_ps(a,b);}
STI v8 div(P8fa,float b){return _mm256_div_ps(a,loadv8(b));}

//Booleans
// Lane-wise comparisons returning a mask: all bits set in lanes where the relation holds,
// zero elsewhere. The predicates are spelled with the named constants from <immintrin.h>
// (_CMP_EQ_OQ=0, _CMP_LT_OS=1, _CMP_LE_OS=2, _CMP_NEQ_UQ=4).
STI v8 eq(P8fab){
    return _mm256_cmp_ps(a,b,_CMP_EQ_OQ);
}
STI v8 neq(P8fab){
    return _mm256_cmp_ps(a,b,_CMP_NEQ_UQ);
}
STI v8 lt(P8fab){
    return _mm256_cmp_ps(a,b,_CMP_LT_OS);
}
STI v8 leq(P8fab){
    return _mm256_cmp_ps(a,b,_CMP_LE_OS);
}
// a > b and a >= b are evaluated as b < a and b <= a.
STI v8 gt(P8fab){
    return _mm256_cmp_ps(b,a,_CMP_LT_OS);
}
STI v8 geq(P8fab){
    return _mm256_cmp_ps(b,a,_CMP_LE_OS);
}
// Bitwise combinations of masks (or of any float bit patterns).
STI v8 _and(P8fab){
    return _mm256_and_ps(a,b);
}
STI v8 _or(P8fab){
    return _mm256_or_ps(a,b);
}
STI v8 _xor(P8fab){
    return _mm256_xor_ps(a,b);
}
// Logical negation expressed as a lane-wise "a == 0" comparison.
STI v8 _not(P8fa){
    return _mm256_cmp_ps(a,v8zero,_CMP_EQ_OQ);
}

// Conditional helpers. The selector (s / f) is expected to be a comparison mask: every 32-bit
// lane either all zeros (false) or all ones 0xFFFFFFFF (true). _mm256_blendv_ps only looks at
// the sign bit of each selector lane, but if_add / if_sub AND the whole mask with b.
STI v8 if_select(v8 const &s,P8fab){return _mm256_blendv_ps (b,a,s);}//s[i] ? a[i] : b[i]
STI v8 if_add(v8 const &f,P8fab){return add(a, _and(f, b));}//f[i] ? (a[i] + b[i]) : a[i]
STI v8 if_sub(v8 const &f,P8fab){return sub(a,_and(f, b));}//f[i] ? (a[i] - b[i]) : a[i]
STI v8 if_mul(v8 const &f,P8fab){return mul(a, if_select(f,b,v8one));}//f[i] ? (a[i] * b[i]) : a[i]
STI v8 if_div(v8 const &f,P8fab){return div(a, if_select(f,b,v8one));}//f[i] ? (a[i] / b[i]) : a[i]

// Returns the sum of the 8 lanes of a.
// Two horizontal adds leave the sum of lanes 0-3 in the low 128-bit half and the sum of
// lanes 4-7 in the high half; the two partial sums are then added as scalars.
STI float horizontal_add (P8fa){
    const __m256 pairSums = _mm256_hadd_ps(a,a);
    const __m256 quadSums = _mm256_hadd_ps(pairSums,pairSums);
    const __m128 highQuad = _mm256_extractf128_ps(quadSums,1);
    const __m128 lowQuad  = _mm256_castps256_ps128(quadSums);
    const __m128 total    = _mm_add_ss(lowQuad,highQuad);
    return _mm_cvtss_f32(total);
}
// Lane-wise maximum / minimum.
STI v8 max(P8fab){return _mm256_max_ps(a,b);}
STI v8 min(P8fab){return _mm256_min_ps(a,b);}
// Lane-wise absolute value: clears the sign bit (bit 31) of every lane.
STI v8 abs(P8fa){
    __m256 mask = constant8f<0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF> ();
    return _mm256_and_ps(a,mask);
}
// Lane-wise square root.
STI v8 sqrt(P8fa){return _mm256_sqrt_ps(a);}

// Lane-wise a*a, written with the compiler's built-in vector operator*.
STI v8 square(P8fa){return a * a;}


// Returns a & (~b): _mm256_andnot_ps complements its first operand, and b is passed first.
// NOTE(review): the v8i overload of andnot in this file passes (a,b) and therefore returns (~a)&b.
STI v8 andnot(P8fab){return _mm256_andnot_ps(b,a);}
//STI bool horizontal_and (P8fa){return neq(_mm256_testc_ps(a,constant8f<-1,-1,-1,-1,-1,-1,-1,-1>()), v8zero);}
// True when at least one lane has its sign bit set (_mm256_testz_ps inspects sign bits only),
// which for comparison masks means "any lane is true".
STI bool horizontal_or(P8fa){return !(_mm256_testz_ps(a,a));}


// Rounding helpers; every result stays a float vector. All four suppress precision
// exceptions (_MM_FROUND_NO_EXC) and differ only in the rounding direction.

// round: to the nearest integer, ties to even.
STI v8 round(P8fa){
    return _mm256_round_ps(a,_MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
}
// truncate: toward zero.
STI v8 truncate(P8fa){
    return _mm256_round_ps(a,_MM_FROUND_TO_ZERO|_MM_FROUND_NO_EXC);
}
// floor: toward minus infinity.
STI v8 floor(P8fa){
    return _mm256_round_ps(a,_MM_FROUND_TO_NEG_INF|_MM_FROUND_NO_EXC);
}
// ceil: toward plus infinity.
STI v8 ceil(P8fa){
    return _mm256_round_ps(a,_MM_FROUND_TO_POS_INF|_MM_FROUND_NO_EXC);
}

// Hardware approximations of 1/a and 1/sqrt(a); faster but less precise than div() / sqrt().
STI v8 approx_recipr(P8fa){return _mm256_rcp_ps(a);}
STI v8 approx_rsqrt(P8fa) {return _mm256_rsqrt_ps(a);}
// All lanes set to the bit pattern 0x7F800000 (IEEE-754 single-precision +infinity).
STI v8 infinite8f() {return constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>();}
// All lanes set to the bit pattern 0x7FC00000 + n (a quiet NaN carrying n in its low mantissa bits).
STI v8 nan8f(int n = 0x10) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x7FC00000 + n));}
// Flips the sign of the lanes whose template flag is non-zero (i0 controls lane 0 ... i7
// controls lane 7) by XOR-ing their sign bit; returns a unchanged when every flag is zero.
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
STI v8 change_sign(P8fa) {
    constexpr bool anyFlip = (i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) != 0;
    if (!anyFlip) {
        return a;
    }
    const __m256 flipBits = constant8f<
        i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0,
        i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0,
        i4 ? (int)0x80000000 : 0, i5 ? (int)0x80000000 : 0,
        i6 ? (int)0x80000000 : 0, i7 ? (int)0x80000000 : 0> ();
    return _mm256_xor_ps(a, flipBits);
}

//Medium precision atan2. 3.5e-5 0.000035  2x accuracy than atan2c
// Lane-wise atan2(a,b) with a as the "y" and b as the "x" argument.
// min(|a|,|b|)/max(|a|,|b|) is formed with the approximate reciprocal, fed to an odd
// polynomial, and the result is then mapped to the right octant/quadrant.
STI v8 atan2(P8fab) { //8*83M opers/sec 58x faster than real atan2, 468x faster due to vectorization. 2x precision than atan2c
  const v8 ALIGN magB = abs(b);
  const v8 ALIGN magA = abs(a);
  const v8 ALIGN larger  = max(magB, magA);
  const v8 ALIGN smaller = min(magB, magA);
  const v8 ALIGN ratio = mul(smaller, approx_recipr(larger)); //approximate
  const v8 ALIGN ratio2 = mul(ratio, ratio);
  // Horner evaluation of the polynomial in ratio^2.
  v8 ALIGN poly = loadv8(-0.013480470f);
  poly = add(mul(poly , ratio2),0.057477314f);
  poly = sub(mul(poly , ratio2),0.121239071f);
  poly = add(mul(poly , ratio2),0.195635925f);
  poly = sub(mul(poly , ratio2),0.332994597f);
  poly = add(mul(poly , ratio2),0.999995630f);
  v8 ALIGN angle = mul(poly ,ratio);
  // |a| > |b|: the ratio was inverted, so the angle is pi/2 - angle.
  angle = if_select( gt(abs(a),abs(b)), sub(v8HALFPI,angle),angle);
  // b < 0: mirror into the left half-plane.
  angle = if_select( lt(b,v8zero), sub(v8PI,angle),angle);
  // a < 0: negate for the lower half-plane.
  angle = if_select( lt(a,v8zero), sub(v8zero,angle),angle);
  return angle;
}

//Higher precision atan2. 1.2e-6 = 0.0000012 60x accuracy than atan2c, 30x accuracy atan2
// Same scheme as atan2() but the ratio min(|a|,|b|)/max(|a|,|b|) uses a true division
// instead of the approximate reciprocal.
STI v8 atan2E(P8fab) { //8*62M opers/sec 43x faster than real atan2, 350x faster due to vectorization
  const v8 ALIGN magB = abs(b);
  const v8 ALIGN magA = abs(a);
  const v8 ALIGN larger  = max(magB, magA);
  const v8 ALIGN smaller = min(magB, magA);
  const v8 ALIGN ratio = div(smaller,larger);
  const v8 ALIGN ratio2 = mul(ratio, ratio);
  // Horner evaluation of the polynomial in ratio^2.
  v8 ALIGN poly = loadv8(-0.013480470f);
  poly = add(mul(poly , ratio2),0.057477314f);
  poly = sub(mul(poly , ratio2),0.121239071f);
  poly = add(mul(poly , ratio2),0.195635925f);
  poly = sub(mul(poly , ratio2),0.332994597f);
  poly = add(mul(poly , ratio2),0.999995630f);
  v8 ALIGN angle = mul(poly ,ratio);
  // |a| > |b|: the ratio was inverted, so the angle is pi/2 - angle.
  angle = if_select( gt(abs(a),abs(b)), sub(v8HALFPI,angle),angle);
  // b < 0: mirror into the left half-plane.
  angle = if_select( lt(b,v8zero), sub(v8PI,angle),angle);
  // a < 0: negate for the lower half-plane.
  angle = if_select( lt(a,v8zero), sub(v8zero,angle),angle);
  return angle;
}

//Low precision atan2. 7.35e-5
// Lane-wise atan2(a,b) with a as the "y" and b as the "x" argument.
// The polynomial works on n = min(|a|,|b|)/max(|a|,|b|) and yields a first-octant angle,
// which is then mapped to the correct octant and quadrant from the signs of a and b.
//
// Fix: the quadrant tests used lt(abs(b),0) and lt(abs(a),0), which can never be true, so the
// result was always confined to [0, pi/2]. They now test the signed inputs, matching atan2()
// and atan2E().
// Lanes where a and b are both zero divide 0 by 0 and produce NaN.
STI v8 atan2c(P8fab) {  //8*86M opers/sec   60x faster than real atan2, 485x faster due to vectorization
	const v8 ALIGN magA = abs(a);
	const v8 ALIGN magB = abs(b);
	v8 ALIGN n = div(min(magA,magB),max(magA,magB));
	v8 ALIGN s = mul(n,n);
	v8 ALIGN r = add(mul(sub(mul(add(mul(s,-0.0464964749), 0.15931422) , s), 0.327622764) , mul(s , n)) , n);
	// |a| > |b|: the ratio was inverted, so the angle is pi/2 - r.
	r= if_select( gt(magA,magB), sub(v8HALFPI,r),r);
	// b < 0: mirror into the left half-plane.
	r= if_select( lt(b,v8zero), sub(v8PI,r),r);
	// a < 0: negate for the lower half-plane.
	r= if_select( lt(a,v8zero), sub(v8zero,r),r);
	return r;
}

// Reference implementation: calls the scalar atan2f once per lane. Useful for checking the
// approximations above; far too slow for the hot path.
STI v8 atan2_ref(P8fab) {  //8*177k opers/sec. Very slow!!!
	float lanes[8];
	for (int lane = 0; lane < 8; ++lane) {
		lanes[lane] = atan2f(a[lane], b[lane]);
	}
	return loadu_v8(lanes);
}

// Compile-time float vector: lane k holds (float)ik.
// Fix: the union's array member was `int f[8]`, so each (float)ik was converted back to an
// integer and those integer bits were then read as floats (e.g. 1 became a denormal instead
// of 1.0f). The array is now `float f[8]`, so the lanes really contain the float values.
template <int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7>
STI v8 C_F(){
    static const union {float f[8];v8 ymm;} u = {{(float)i0,(float)i1,(float)i2,(float)i3,(float)i4,(float)i5,(float)i6,(float)i7}};
    return u.ymm;
}
// Compile-time float vector with the ratio i0/i1 broadcast to all 8 lanes.
template <int i0,int i1>
STI v8 C_F(){
    static const union {float lane[8];v8 packed;} table = {{
        (float)i0/i1, (float)i0/i1, (float)i0/i1, (float)i0/i1,
        (float)i0/i1, (float)i0/i1, (float)i0/i1, (float)i0/i1}};
    return table.packed;
}

   // Shorthands for common broadcast constants.
   #define F_0 _mm256_setzero_ps()
   #define F_1 C_F<1,1>()
   #define F_2 C_F<2,1>()
   #define F_3 C_F<3,1>()
   #define F_minus1 C_F<-1,1>()


// Lane-wise Euclidean distance between the points (x1,y1) and (x2,y2).
STI v8 dist(v8 const &x1,v8 const &y1,v8 const &x2,v8 const &y2)
{
	const v8 ALIGN deltaX = sub(x1,x2);
	const v8 ALIGN deltaY = sub(y1,y2);
	const v8 ALIGN squaredX = mul(deltaX,deltaX);
	const v8 ALIGN squaredY = mul(deltaY,deltaY);
	return sqrt(add(squaredX,squaredY));
}
// Lane-wise squared Euclidean distance between (x1,y1) and (x2,y2); avoids the square root.
STI v8 dist2(v8 const &x1,v8 const &y1,v8 const &x2,v8 const &y2)
{
	const v8 ALIGN deltaX = sub(x1,x2);
	const v8 ALIGN deltaY = sub(y1,y2);
	const v8 ALIGN squaredX = mul(deltaX,deltaX);
	const v8 ALIGN squaredY = mul(deltaY,deltaY);
	return add(squaredX,squaredY);
}

// Lane-wise polynomial sine approximation ("seno" is Spanish for sine):
//   x - x^2 * ( x * (0.00585375*x^2 + 0.1587164) )
STI v8 Seno(v8 const &x)
{  //http://lolengine.net/wiki/doc/maths/remez
  const v8 ALIGN xSquared = mul(x,x);
  const v8 ALIGN inner = add(mul(xSquared, 0.00585375),0.1587164);
  const v8 ALIGN correction = mul(xSquared, mul(x , inner));
  return sub(x, correction);
}

// Counts the lanes in which a and b differ by more than 0.000001 in absolute value.
STI int compare(P8fab)
{
 int mismatches = 0;
 for (int lane = 0; lane < 8; ++lane) {
   const auto gap = abs(a[lane] - b[lane]);
   if (gap > 0.000001) {
     ++mismatches;
   }
 }
 return mismatches;
}

// 8 x int16_t -> 8 x float. Each half of the input is sign-extended to 32-bit integers and
// converted; the two 4-lane results are then joined into one 256-bit vector.
STI v8 convert(const v8i& vector_int)  //8xint16_t to 8xfloat
{
 const __m128i upperHalf = _mm_srli_si128(vector_int.v, 8);   // moves lanes 4-7 down to lanes 0-3
 const __m128 ALIGN16 lanes0to3 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(vector_int.v));
 const __m128 ALIGN16 lanes4to7 = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(upperHalf));
 return _mm256_insertf128_ps(_mm256_castps128_ps256(lanes0to3),(lanes4to7),1);
}

// 8 x float -> 8 x int16_t. Each half is converted to 32-bit integers with _mm_cvtps_epi32
// and the two halves are packed to 16 bits with signed saturation.
STI __m128i convert(const v8& vector_float) //8xfloat to 8xint16_t
{
 const __m128 lowerFloats = _mm256_castps256_ps128(vector_float);
 const __m128 upperFloats = _mm256_extractf128_ps(vector_float,1);
 const __m128i ALIGN16 lanes0to3 = _mm_cvtps_epi32(lowerFloats);
 const __m128i ALIGN16 lanes4to7 = _mm_cvtps_epi32(upperFloats);
 return _mm_packs_epi32(lanes0to3,lanes4to7);
}

// Convenience overload: builds a float vector from a v8i.
STI v8 loadv8(const v8i& x){
 return convert(x);
}
//} AVX Operators


//{FAST RANDOM
// Scalar linear congruential generator. State lives in g_seed, initially RND_SEED.
constexpr int RND_SEED=8000;
static unsigned int g_seed=RND_SEED;
// Advances the state (state*214013 + 2531011, modulo 2^32) and returns bits 16..30 of the
// new state, i.e. a value in [0, 32767].
inline int fastrand() {
	g_seed = g_seed * 214013 + 2531011;
	const unsigned int upperBits = g_seed >> 16;
	return upperBits & 0x7FFF;
}
// Value in [0, maxSize) obtained as fastrand() % maxSize; maxSize must be non-zero.
inline int fastRandInt(int maxSize) {
	const int raw = fastrand();
	return raw % maxSize;
}
// Advances the generator and returns an index in [0, ANGLES_LENGTH). Unlike fastrand() the
// upper 16 state bits are used without the 0x7FFF mask.
inline int fastRandAngle(){
	g_seed = g_seed * 214013 + 2531011;
	const unsigned int upperBits = g_seed >> 16;
	return upperBits % ANGLES_LENGTH;
}
// Value in [a, b): a plus fastRandInt(b - a). Requires b != a.
inline int fastRandInt(int a, int b) {
	const int span = b - a;
	return(a + fastRandInt(span));
}
// Float in [0, 1]: fastrand() divided by its maximum value 0x7FFF.
inline float fastRandDouble(){
	const float raw = static_cast<float>(fastrand());
	return raw/0x7FFF;
}
// Float in [a, b], obtained by linear interpolation with the fraction above.
inline float fastRandDouble(float a,float b){
	const float fraction = static_cast<float>(fastrand())/0x7FFF;
	return a+fraction*(b-a);
}
//} FAST RANDOM

//{FAST RANDOM VECTORIZED
// The __m128i brace initialisers below take two 64-bit lanes; narrowing warnings are silenced
// because several constants do not fit a signed 64-bit literal.
#pragma GCC diagnostic ignored "-Wnarrowing"
// State advanced by xnext(); the two 64-bit lanes start at 2000 and 8000.
__m128i SEED_RANDOM={UINT64_C(2000),UINT64_C(8000)};
// 64-bit multipliers / increment, duplicated in both lanes (k1..k3 are used by xnext()).
static const __m128i v2i_k1 = {UINT64_C(0x9E3779B97F4A7C15),UINT64_C(0x9E3779B97F4A7C15)};
static const __m128i v2i_k2 = {UINT64_C(0xBF58476D1CE4E5B9),UINT64_C(0xBF58476D1CE4E5B9)};
static const __m128i v2i_k3 = {UINT64_C(0x94D049BB133111EB),UINT64_C(0x94D049BB133111EB)};
static const __m128i v2i_k4 = {2685821657736338717LL,2685821657736338717LL};
// Shift counts for _mm_sll_epi64 / _mm_srl_epi64, which read the count from the low 64 bits.
static const __m128i v2i_9  = {UINT64_C(9),UINT64_C(9)};
static const __m128i v2i_27 = {UINT64_C(27),UINT64_C(27)};
static const __m128i v2i_28 = {UINT64_C(28),UINT64_C(28)};
static const __m128i v2i_30 = {UINT64_C(30),UINT64_C(30)};
static const __m128i v2i_31 = {UINT64_C(31),UINT64_C(31)};
static const __m128i v2i_55 = {UINT64_C(55),UINT64_C(55)};
static const __m128i v2i_36 = {UINT64_C(36),UINT64_C(36)};
static const __m128i v2i_12 = {UINT64_C(12),UINT64_C(12)};
static const __m128i v2i_14 = {UINT64_C(14),UINT64_C(14)};
static const __m128i v2i_25 = {UINT64_C(25),UINT64_C(25)};
// Bit masks: all ones; 0x7FFF per 16-bit lane (clears the sign bit of int16 lanes);
// 0x7F per byte.
static const __m128i v8i_FF = {UINT64_C(0xFFFFFFFFFFFFFFFF),UINT64_C(0xFFFFFFFFFFFFFFFF)};
static const __m128i v8i_UINT={UINT64_C(0x7FFF7FFF7FFF7FFF),UINT64_C(0x7FFF7FFF7FFF7FFF)};
static const __m128i v8i_UB = {UINT64_C(0x7F7F7F7F7F7F7F7F),UINT64_C(0x7F7F7F7F7F7F7F7F)};

// Multiplies the two 64-bit lanes of a and b, keeping the low 64 bits of each product.
// There is no packed 64x64 multiply in SSE4, so each product is assembled from 32-bit parts:
//   low64(a*b) = aL*bL + ((aL*bH + aH*bL) << 32)
// The statement order matters: each step consumes the lane layout produced by the previous one.
inline __m128i mul64(__m128i const & a,__m128i const & b){
    __m128i bswap   = _mm_shuffle_epi32(b,0xB1);           // b0H,b0L,b1H,b1L (swap H<->L)
    __m128i prodlh  = _mm_mullo_epi32(a,bswap);            // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products
    __m128i zero    = _mm_setzero_si128();                 // 0
    __m128i prodlh2 = _mm_hadd_epi32(prodlh,zero);         // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
    __m128i prodlh3 = _mm_shuffle_epi32(prodlh2,0x73);     // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
    __m128i prodll  = _mm_mul_epu32(a,b);                  // a0Lb0L,a1Lb1L, 64 bit unsigned products
    __m128i prod    = _mm_add_epi64(prodll,prodlh3);       // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
    return  prod;
}

__m128i xnext() {
    SEED_RANDOM = _mm_add_epi64(SEED_RANDOM,v2i_k1);
    __m128i ALIGN16 z = SEED_RANDOM;
	z=mul64(_mm_xor_si128(z,_mm_srl_epi64(z,v2i_30)),v2i_k2);
	z=mul64(_mm_xor_si128(z,_mm_srl_epi64(z,v2i_27)),v2i_k3);
	return _mm_xor_si128(z,_mm_srl_epi64(z,v2i_31));
}
__m128i ALIGN16 seed[2]={ xnext(),xnext() };

// Main vector generator: two independent 64-bit xoroshiro128+ streams, one per 64-bit lane
// of seed[0] / seed[1]. Returns 128 fresh random bits per call.
//
//   result  = s0 + s1
//   s1     ^= s0
//   seed[0] = rotl(s0,55) ^ s1 ^ (s1 << 14)
//   seed[1] = rotl(s1,36)
//
// Fix: rotl(s0,55) was assembled as (s0 << 55) | (s1 >> 9), mixing in the wrong register, so
// it was not a rotation of s0 and the state update no longer matched the algorithm.
// It is now (s0 << 55) | (s0 >> 9). The generated sequence therefore differs from before.
inline __m128i Xrandom() {
	const __m128i ALIGN16 s0 = seed[0];
	__m128i ALIGN16 s1 = seed[1];
	const __m128i ALIGN16 result = _mm_add_epi64(s0,s1);
	s1 = _mm_xor_si128(s1,s0);
	const __m128i rotl_s0_55 = _mm_or_si128(_mm_sll_epi64(s0,v2i_55),_mm_srl_epi64(s0,v2i_9));
	const __m128i rotl_s1_36 = _mm_or_si128(_mm_sll_epi64(s1,v2i_36),_mm_srl_epi64(s1,v2i_28));
	seed[0] = _mm_xor_si128(rotl_s0_55,_mm_xor_si128(s1,_mm_sll_epi64(s1,v2i_14)));
	seed[1] = rotl_s1_36;
	return result;
}

 inline __m128i Irandom(){return _mm_and_si128(Xrandom(),v8i_UINT);}
 template <int i0>
 inline __m128i Irandom(){
switch(i0) {
 case 2:case 4:case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:

  static const union {int16_t f[8];__m128i v;} u = {{(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1)
                                                    ,(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1)}};
    return _mm_and_si128(Xrandom(),u.v);
 default: break;
}

//Slower calculation
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)i0);
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)i0));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return convert(_mm256_sub_ps(a, base));
 }
 template <int init,int end>
 inline __m128i Irandom(){
switch(end-init) {
 case 2:case 4:case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:

  static const union {int16_t f[8];__m128i v;} u = {{(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1)
                                                    ,(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1)}};
    return _mm_add_epi16(_mm_and_si128(Xrandom(),u.v),_mm_set1_epi16(init));
 default: break;
}
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)(end-init));
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)(end-init)));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return convert(_mm256_add_ps(_mm256_sub_ps(a, base),_mm256_set1_ps((float)init)));
 }
 inline __m128i Irandom(const int& i0){
switch(i0) {
 case 2:case 4:case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:

  static const union {int16_t f[8];__m128i v;} u = {{(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1)
                                                    ,(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1)}};
    return _mm_and_si128(Xrandom(),u.v);
 default: break;
}

//Slower calculation
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)i0);
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)i0));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return convert(_mm256_sub_ps(a, base));
 }
 inline __m128i Irandom(const int& init,const int& end){
switch(end-init) {
 case 2:case 4:case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:

    static const union {int16_t f[8];__m128i v;} u = {{(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1)
                                                    ,(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1)}};
    return _mm_add_epi16(_mm_and_si128(Xrandom(),u.v),_mm_set1_epi16(init));
 default: break;
}
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)(end-init));
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)(end-init)));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return convert(_mm256_add_ps(_mm256_sub_ps(a, base),_mm256_set1_ps((float)init)));
 }
 inline __m128i IrandomBool(){return  _mm_cmpeq_epi16(_mm_and_si128(Xrandom(),C8i<1>().v), v8izero);}


 inline __m256 Frandom(){
	 return convert(_mm_and_si128(Xrandom(),v8i_UINT));
 }

 template <int i0>
 inline __m256 Frandom(){
switch(i0) {
 case 2:case 4:case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:

  static const union {int16_t f[8];__m128i v;} u = {{(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1)
                                                    ,(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1),(int16_t)(i0-1)}};
    return convert(_mm_and_si128(Xrandom(),u.v));
 default: break;
}
//Slower calculation
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)i0);
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)i0));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return _mm256_sub_ps(a, base);
 }

 template <int init,int end>
 inline __m256 Frandom(){
switch(end-init) {
 case 2:case 4:case 8:case 16:case 32:case 64:case 128:case 256:case 512:case 1024:

  static const union {int16_t f[8];__m128i v;} u = {{(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1)
                                                    ,(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1),(int16_t)(end-init-1)}};
    return _mm256_add_ps(convert(_mm_and_si128(Xrandom(),u.v)),_mm256_set1_ps((float)init));
 default: break;
}
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)(end-init));
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)(end-init)));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return _mm256_add_ps(_mm256_sub_ps(a, base),_mm256_set1_ps((float)init));
 }

 inline __m256 Frandom(const int& i0){
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)i0);
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)i0));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return _mm256_sub_ps(a, base);
 }

 inline __m256 Frandom(const int& init,const int& end){
    __m256 ALIGN a = convert(_mm_and_si128(Xrandom(),v8i_UINT));
    __m256 ALIGN den = _mm256_set1_ps((float)(end-init));
    __m256 ALIGN c = _mm256_mul_ps(a,_mm256_set1_ps(1.0f/(float)(end-init)));
    __m256 ALIGN base = _mm256_mul_ps(_mm256_round_ps(c,3+8),den);
    return _mm256_add_ps(_mm256_sub_ps(a, base),_mm256_set1_ps((float)init));
 }

 inline __m256 FrandomBool(){
     static const union {int16_t f[8];__m128i v;} V_1 = {{1,1,1,1,1,1,1,1}};
     return _mm256_cmp_ps(convert(_mm_and_si128(Xrandom(),V_1.v)),F_0,0);
     }
//}FAST RANDOM VECTORIZED


//{GAME CONSTANTS
#define GAME_IS_CODERS_STRIKE_BACK

// Thrust limit, with an 8-lane int16 broadcast.
constexpr int MAX_THRUST = 200;
static const v8i ALIGN16 v8i_MAX_THRUST = C8i<MAX_THRUST>();

// Rotation limits (-18 .. +18), with 8-lane int16 broadcasts.
constexpr int16_t MIN_ROTATION = -18;
constexpr int16_t MAX_ROTATION =  18;
static const v8i ALIGN16 v8i_MIN_ROTATION = C8i<MIN_ROTATION>();
static const v8i ALIGN16 v8i_MAX_ROTATION = C8i<MAX_ROTATION>();

// Broadcast of SHIELD_PROB (20% expressed on a 0..127 scale).
static const v8i ALIGN16 v8i_SHIELD_PROB = C8i<SHIELD_PROB>();

int CG_playerId=0;

constexpr int MAX_PLAYERS = 2;

// Scalar game constants, each followed by its 8-lane float broadcast where one is defined.
constexpr float MAX_TIMEOUT = 100.0f;
static const ALIGN v8 v8MAX_TIMEOUT = {MAX_TIMEOUT,MAX_TIMEOUT,MAX_TIMEOUT,MAX_TIMEOUT,MAX_TIMEOUT,MAX_TIMEOUT,MAX_TIMEOUT,MAX_TIMEOUT};
constexpr int BOOST = 650;
static const ALIGN v8 v8BOOST = {BOOST,BOOST,BOOST,BOOST,BOOST,BOOST,BOOST,BOOST};
constexpr float MIN_IMPULSE = 120.0;
constexpr float MIN_IMPULSE2 = MIN_IMPULSE*MIN_IMPULSE;   // squared, for comparisons without sqrt


// Radii of a checkpoint and of a pod.
constexpr float CP_RADIUS = 200.0;

static const ALIGN v8 v8_Dist_Fitness = {195.0f,195.0f,195.0f,195.0f,195.0f,195.0f,195.0f,195.0f};
constexpr float POD_RADIUS = 400.0;
constexpr float FRICTION = 0.85;

// Squared sums of radii: (pod+pod)^2 and (pod+checkpoint)^2, to be compared against squared distances.
constexpr float POD_RADIUS2_POD_POD = (POD_RADIUS+POD_RADIUS)*(POD_RADIUS+POD_RADIUS);
constexpr float POD_RADIUS2_POD_CP = (POD_RADIUS+CP_RADIUS)*(POD_RADIUS+CP_RADIUS);

static const ALIGN v8 v8RADIUS2_POD_POD = {POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD,POD_RADIUS2_POD_POD};
static const ALIGN v8 v8RADIUS2_POD_CP = {POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP,POD_RADIUS2_POD_CP};

// Collision kind tags.
constexpr int COLL_POD_POD = 0;
constexpr int COLL_POD_CP = 1;

// Pod mass without / with shield, and the corresponding inverse masses.
constexpr float MASS_NORMAL = 1.0;
constexpr float MASS_SHIELD = 10.0;
constexpr float INVMASS_NORMAL = 1.0;
constexpr float INVMASS_SHIELD = 0.1;
static const ALIGN v8 v8Mass = {MASS_NORMAL,MASS_NORMAL,MASS_NORMAL,MASS_NORMAL,MASS_NORMAL,MASS_NORMAL,MASS_NORMAL,MASS_NORMAL};
static const ALIGN v8 v8MassShield = {MASS_SHIELD,MASS_SHIELD,MASS_SHIELD,MASS_SHIELD,MASS_SHIELD,MASS_SHIELD,MASS_SHIELD,MASS_SHIELD};
static const ALIGN v8 v8INVMASS_NORMAL = {INVMASS_NORMAL,INVMASS_NORMAL,INVMASS_NORMAL,INVMASS_NORMAL,INVMASS_NORMAL,INVMASS_NORMAL,INVMASS_NORMAL,INVMASS_NORMAL};
static const ALIGN v8 v8INVMASS_SHIELD = {INVMASS_SHIELD,INVMASS_SHIELD,INVMASS_SHIELD,INVMASS_SHIELD,INVMASS_SHIELD,INVMASS_SHIELD,INVMASS_SHIELD,INVMASS_SHIELD};

constexpr float SHIELD_BLOCK = 4.0;
static const ALIGN v8 v8SHIELD_BLOCK = {SHIELD_BLOCK,SHIELD_BLOCK,SHIELD_BLOCK,SHIELD_BLOCK,SHIELD_BLOCK,SHIELD_BLOCK,SHIELD_BLOCK,SHIELD_BLOCK};

// Entity counts. MAX_COLLISIONS is the number of distinct pod pairs: 4*3/2 = 6.
constexpr int TOTAL_PODS = 4;
constexpr int PLAYER_PODS = 2;
constexpr int MAX_COLLISIONS = TOTAL_PODS*(TOTAL_PODS-1)/2;
constexpr int MAX_CHECKPOINTS = 8;

// m1*m2/(m1+m2) for the three mass pairings: normal/normal, normal/shield, shield/shield.
constexpr float COEF[] = { (MASS_NORMAL * MASS_NORMAL) / (MASS_NORMAL+MASS_NORMAL),
                       (MASS_NORMAL * MASS_SHIELD) / (MASS_NORMAL+MASS_SHIELD),
                       (MASS_SHIELD * MASS_SHIELD) / (MASS_SHIELD+MASS_SHIELD) };


static const ALIGN v8 v8COEF[] = {
	{COEF[0],COEF[0],COEF[0],COEF[0],COEF[0],COEF[0],COEF[0],COEF[0]},
	{COEF[1],COEF[1],COEF[1],COEF[1],COEF[1],COEF[1],COEF[1],COEF[1]},
	{COEF[2],COEF[2],COEF[2],COEF[2],COEF[2],COEF[2],COEF[2],COEF[2]}
}					   ;

constexpr float MIN_ENERGY = 0.00006f;//1/10000;
static const ALIGN v8 v8MIN_IMPULSE = {MIN_IMPULSE,MIN_IMPULSE,MIN_IMPULSE,MIN_IMPULSE,MIN_IMPULSE,MIN_IMPULSE,MIN_IMPULSE,MIN_IMPULSE};
static const ALIGN v8 v8MIN_IMPULSE2 = {MIN_IMPULSE2,MIN_IMPULSE2,MIN_IMPULSE2,MIN_IMPULSE2,MIN_IMPULSE2,MIN_IMPULSE2,MIN_IMPULSE2,MIN_IMPULSE2};
static const ALIGN v8 v8ErrorColl = {1.5f,1.5f,1.5f,1.5f,1.5f,1.5f,1.5f,1.5f};
static const ALIGN v8 v8Friction = {FRICTION,FRICTION,FRICTION,FRICTION,FRICTION,FRICTION,FRICTION,FRICTION};
static const ALIGN v8 v8minEnergy = {MIN_ENERGY,MIN_ENERGY,MIN_ENERGY,MIN_ENERGY,MIN_ENERGY,MIN_ENERGY,MIN_ENERGY,MIN_ENERGY};
//}GAME CONSTANTS

//{GAME ENTITIES
// State of one pod. Every field is a vector, so the same field is held for
// all simulated lanes at once (v8 is __m256, i.e. 8 floats).
//   x, y          position
//   vx, vy        velocity
//   shield        shield counter: set to v8SHIELD_BLOCK when a shield is
//                 triggered, decremented once per turn while above zero
//   countCP       number of checkpoints passed so far
//   nextCP        index of the checkpoint the pod is heading to
//   used_booster  set to one once a BOOST thrust has been applied
//   angle         heading in whole degrees (ApplyGenoma aborts outside [0,360))
struct ALIGN Entity{ v8 x, y, vx, vy, shield, countCP,nextCP,used_booster; v8i angle;  };
// Debug printer for a pod: writes lane 0 of position, velocity, shield
// counter, heading and next checkpoint index.
ostream &operator<<(ostream& output, const Entity& p){
    output << "Pod:XY:" << p.x[0] << "," << p.y[0];
    output << " V:" << p.vx[0] << "," << p.vy[0];
    output << " S:" << p.shield[0];
    output << " A:" << p.angle[0];
    output << " Next:" << p.nextCP[0];
    return output;
}

// Checkpoint centre, one copy per simulated lane.
struct ALIGN CheckPoint{ v8 x, y; };
// One candidate collision found during a simulated turn.
 struct ALIGN Collision{
	v8 Time; // Per-lane time of impact within the turn (v8ErrorColl-based value when a lane does not collide)
	int A; // Index of the first pod
	int B; // Index of the second pod (same as A for checkpoint collisions)
	int Type; // COLL_POD_POD or COLL_POD_CP
 };
//}GAME ENTITIES

//{GAME STATE
// Complete game state for 8 parallel simulation lanes: the four pods, their
// current target checkpoints, the checkpoint list and per-team bookkeeping.
class ALIGN GameState{
public:
  Entity ALIGN pods[TOTAL_PODS];                 // Pods 0-1 belong to team 0, pods 2-3 to team 1 (team = index >> 1)
  CheckPoint ALIGN next_CP[TOTAL_PODS];          // Coordinates of each pod's current target checkpoint
  CheckPoint ALIGN checkpoints[MAX_CHECKPOINTS]; // Checkpoint list loaded by ReadConfig

  v8 timeout[2];  // Per-team countdown, reset when one of the team's pods passes a checkpoint
  v8 victory[2];  // Per-team mask, set once a pod's countCP reaches victoryCP

  v8 totalCP;     // Number of checkpoints (broadcast)
  v8 totalLaps;   // Number of laps (broadcast)
  v8 victoryCP;   // laps * checkpoints
  v8 v8turn;      // Real turn number (broadcast)
  v8 v8simturn;   // Turn number advanced by playTurn during simulation
  int turn;       // Real turn number, -1 before the first ReadTurn

  int runner[2];  // Per-team pod index chosen by defineRunners as the runner
  int blocker[2]; // Per-team pod index chosen by defineRunners as the blocker


GameState(){
	CG_playerId = 0;
	turn = -1;
	timeout[0] = v8MAX_TIMEOUT;
	timeout[1] = v8MAX_TIMEOUT;
	victory[0] = gt(timeout[1],timeout[1]); //Set False
	victory[1] = victory[0];
	FOR0(i,TOTAL_PODS)
	{
	 pods[i].nextCP=v8one;
	 pods[i].x=v8zero; pods[i].y=v8zero; pods[i].vx=v8zero; pods[i].vy=v8zero;
     pods[i].angle=v8izero;
	 pods[i].shield=v8zero; pods[i].countCP=v8zero;pods[i].used_booster=v8zero;
	}
 };

// Reads the one-off race description from `input`: lap count, checkpoint
// count, then one "x y" pair per checkpoint. Values are broadcast to all lanes.
void ReadConfig(istream& input){
  float lapcount;
  input >> lapcount;
  input.ignore();

  float CPcount;
  input >> CPcount;
  input.ignore();

  cerr << "Total LAPS:"<<lapcount<<" TotalCP:"<<CPcount<<endl;
  totalLaps = loadv8(lapcount);
  totalCP   = loadv8(CPcount);
  victoryCP = loadv8(lapcount*CPcount);

  for (int cp = 0; cp < CPcount; ++cp) {
    float cpX, cpY;
    input >> cpX >> cpY;
    input.ignore();
    checkpoints[cp].x = loadv8(cpX);
    checkpoints[cp].y = loadv8(cpY);
  }
}

// Reads one real game turn from `input` (four pods: x y vx vy angle nextCP)
// and refreshes the state: checkpoint progress, victory masks, per-team
// timeouts, pod kinematics, heading and shield counter.
// With SIMLOG defined, every lane except SIM_Vector_Not_Noised gets random
// noise added to position and velocity.
void ReadTurn(istream& input){
  float pod_x;
  float pod_y;
  float pod_vx;
  float pod_vy;
  int pod_angle;
  float nCP;
  //New turn
  ++turn;
  v8turn = loadv8((float)turn);
  v8simturn = v8turn;
  // resetTimeout[i] is set when pod i advanced a checkpoint this turn.
  bool resetTimeout[4] = {false,false,false,false};
  //Read input
  FOR0(i,4) {
	 int team = i >>1;
     input >> pod_x>> pod_y>> pod_vx>> pod_vy>> pod_angle>> nCP; input.ignore();
     //cerr<< "Read Pod: " << i<<" " << pod_x<<" "<< pod_y<<" "<< pod_vx<<" "<< pod_vy<<" "<< pod_angle<<" "<< nCP<<endl;
     if (pod_angle>=360) pod_angle-=360;
	 // Start timing as soon as the first pod line has been read.
	 if (i == 0) stopwatch.Start();
	 if (turn == 0 && i > 1 && pod_angle > -1) CG_playerId = 1;

	 //Change CP
     if (nCP <0) {
		 // Message: "enemy checkpoint is -1, converting to zero".
		 cerr << "El Checkpoint del enemigo es -1. Convirtiendo a cero"<<endl;
         nCP = 0;
     }
	 else
	 {
		if (nCP != pods[i].nextCP[0])
		{ //Advance CheckPoint
          pods[i].countCP = add(pods[i].countCP,v8one);
          cerr << pods[i].countCP[0] << " >= "<<victoryCP[0]<< (pods[i].countCP[0] >= victoryCP[0]?"ES UNA VICTORIA!!!!!!":"")<<endl;
	      victory[team] = _or(victory[team],geq(pods[i].countCP,victoryCP));
	      timeout[team] = v8MAX_TIMEOUT;
		  resetTimeout[i] = true;
		}
	 }
     next_CP[i].x = checkpoints[(int)nCP].x;
     next_CP[i].y = checkpoints[(int)nCP].y;

	 pods[i].nextCP = loadv8(nCP);
     pods[i].nextCP = if_select(victory[team], v8zero,pods[i].nextCP);
	 //Load AVX
	 pods[i].x = loadv8(pod_x);
	 pods[i].y = loadv8(pod_y);
	 pods[i].vx = loadv8(pod_vx);
	 pods[i].vy = loadv8(pod_vy);
	 #ifdef SIMLOG
	  FOR0(j,8)
	  if ( j != SIM_Vector_Not_Noised)
	  {
  	   pods[i].vx[j] = pods[i].vx[j] + (float)fastRandInt(-30,30);
	   pods[i].vy[j] = pods[i].vy[j] + (float)fastRandInt(-30,30);
  	   pods[i].x[j] = pods[i].x[j] + (float)fastRandInt(-8,8);
	   pods[i].y[j] = pods[i].y[j] + (float)fastRandInt(-8,8);

	  }
	  if (CG_playerId== 0)
	  {
	  if (i==0)  cerr << "Non randomized Vectors:"<<SIM_Vector_Not_Noised<<endl;
	  cerr << "Noised Pod"<<i<<" Pos:"<<pods[i].x
	  //<<"|"<<pods[i].y<<": V:"<<pods[i].vx<<"|"<<pods[i].vy
	  <<endl;
	  }
     #endif

	 if (turn >0 || pod_angle >= 0)
	 {
//	     cerr << "Pod "<<i<<" Angle:"<<pod_angle<<endl;
    pods[i].angle = v8i(pod_angle);

	 } else
	 {
		 // First turn without a reported heading: face checkpoint 1.
		 pods[i].angle = getAngle(i,checkpoints[1].x,checkpoints[1].y);
	 }
     // Count the shield down by one while it is still active.
     pods[i].shield = if_sub( (pods[i].shield>v8zero),pods[i].shield,v8one );
  }

//	 cerr << get_int16_t(pods[0].angle,0)<<" "<<get_int16_t(pods[1].angle,0)<<" "<< get_int16_t(pods[2].angle,0)<<" "<<get_int16_t(pods[3].angle,0)<<" "<<endl;


   // A team's timeout only ticks down when neither of its pods advanced.
   // Team 0 owns pods 0 and 1.
   if (!resetTimeout[0] && !resetTimeout[1])
	   timeout[0] -= v8one;
   // Team 1 owns pods 2 and 3 (the original tested indices 1 and 2 here).
   if (!resetTimeout[2] && !resetTimeout[3])
	   timeout[1] -= v8one;
}

// Compares pod POD of this state with the same pod in `b`, looking only at
// the lane SIM_Vector_Not_Noised. Position and velocity may differ by up to 1,
// nextCP and angle must match. Both pods are always logged; a mismatch is
// reported on cerr. Returns true when nothing differs.
bool PodCompare(GameState& b,const int& POD)
{
  const int lane = SIM_Vector_Not_Noised;
  const Entity& mine = pods[POD];
  const Entity& theirs = b.pods[POD];

  string mismatches;
  if (abs(mine.x[lane] - theirs.x[lane]) > 1) mismatches += "X,";
  if (abs(mine.y[lane] - theirs.y[lane]) > 1) mismatches += "Y,";
  if (abs(mine.vx[lane] - theirs.vx[lane]) > 1) mismatches += "VX,";
  if (abs(mine.vy[lane] - theirs.vy[lane]) > 1) mismatches += "VY,";
  if (abs(mine.nextCP[lane] - theirs.nextCP[lane]) > 0) mismatches += "NextCP,";
  if (mine.angle[lane] != theirs.angle[lane]) mismatches += "A,";

  cerr<<"Work"<<POD<<":"<<theirs<<endl;
  cerr<<"Curr"<<POD<<":"<<mine<<endl;

  const bool identical = (mismatches == "");
  if (!identical)
      cerr << "POD ERROR "<<POD<<":"<<mismatches<<endl;
  return identical;
}

// Runs PodCompare on pods 0..3, stopping at the first pod that differs.
// A failure increments SIM_TURNS_FAILED. Returns true when every pod matched.
bool Compare(GameState& b){
  bool allMatch = true;
  for (int pod = 0; pod < 4 && allMatch; ++pod)
    allMatch = PodCompare(b,pod);
  if (!allMatch)
    ++SIM_TURNS_FAILED;
  return allMatch;
}

// Looks for a collision between pods i and j in all lanes and, if any lane
// collides before the end of the turn, appends it to `collisions`.
//
// With d = pos_i - pos_j and dv = vel_i - vel_j the code forms
//   a = |dv|^2, b = -2 (d . dv), delta = b^2 - 4 a (|d|^2 - R^2)
// and takes t = (b - sqrt(delta)) / (2a), with R^2 = v8RADIUS2_POD_POD.
// Lanes with no usable solution get the sentinel v8ErrorColl.
//
//   collisions / CollCount  output list and its running length
//   Turn_Time               time already elapsed in the turn; added to t
//   iterations              not used in this function
inline void searchCollisions(const int& i, const int& j,Collision collisions[MAX_COLLISIONS],int& CollCount,const v8& Turn_Time,const int& iterations){
    v8 ALIGN dx = (pods[i].x-pods[j].x);
	v8 ALIGN dy = (pods[i].y-pods[j].y);
	v8 ALIGN dvx = (pods[i].vx-pods[j].vx);
	v8 ALIGN dvy = (pods[i].vy-pods[j].vy);
	v8 ALIGN a = ((dvx*dvx)+(dvy*dvy));
	v8 ALIGN b = (v8minustwo*((dx*dvx)+(dy*dvy)));
	v8 ALIGN delta = ((b*b) - (v8four*(a*(((dx*dx)+( (dy*dy)-v8RADIUS2_POD_POD))))));
	v8 ALIGN t =((b-sqrt(delta))/(a*v8two));
    t = if_select((a <= v8minEnergy), v8ErrorColl,t ); //Without enough E, don't collide
    t = if_select((delta < v8zero), v8ErrorColl,t ); //If delta<0, don't use collision
//    t = if_select(lt(t , v8zero),v8ErrorColl,t ); //If t<0, don't use collision
    t = if_select((t<v8minEnergy),v8ErrorColl,t); //Ignore negatives
    // t itself is used as the selection mask here.
    // NOTE(review): presumably if_select keys on the sign bit, so -nan lanes are replaced - confirm against if_select.
    t = if_select(t,v8ErrorColl,t); //Remove -nan
/*//TODO>>Check whether this is needed
    a = add(mul(dx,dx),mul(dy,dy));
    t = if_select( lt(a,v8RADIUS2_POD_POD),v8zero,t); //But check if dist2 < r2*r2, then t = 0
	*/
	t += Turn_Time;
    // Record the pair as soon as at least one lane collides within this turn.
    if (horizontal_or(t<v8one))
	{
#ifdef SIMLOG
//if (FirstPlayer)
cerr <<" ##COLIS ENTs: "<<i<<" y "<<j<<"T:"<<t<<" entre entidades "<<endl;
#endif
		//Add Collision
		collisions[CollCount].A = i;
		collisions[CollCount].B = j;
		collisions[CollCount].Time = t;
		collisions[CollCount].Type = COLL_POD_POD;
		++CollCount;
	}
}

// Looks for the moment pod i reaches its current target checkpoint in all
// lanes and, if any lane does so before the end of the turn, appends the
// event to `CPcollisions`.
//
// Same quadratic as searchCollisions, but the checkpoint is static (dv is the
// pod velocity) and the contact distance is v8RADIUS2_POD_CP. Lanes with no
// usable solution get the sentinel v8ErrorColl.
//
//   CPcollisions / CPCollCount  output list and its running length
//   Turn_Time                   time already elapsed in the turn; added to t
//   iterations                  not used in this function
inline void searchCPCollisions(const int& i,Collision CPcollisions[MAX_COLLISIONS],int& CPCollCount,const v8& Turn_Time,const int& iterations){
    v8 ALIGN dx = (pods[i].x-next_CP[i].x);
	v8 ALIGN dy = (pods[i].y-next_CP[i].y);
	v8 ALIGN dvx = pods[i].vx;
	v8 ALIGN dvy = pods[i].vy;
	v8 ALIGN a = ((dvx*dvx)+(dvy*dvy));
	v8 ALIGN b = (v8minustwo*((dx*dvx)+(dy*dvy)));
	v8 ALIGN delta = ((b*b) - (v8four*(a*(((dx*dx)+((dy*dy)-v8RADIUS2_POD_CP))))));
	v8 ALIGN t =((b-sqrt(delta))/(a*v8two));
    t = if_select((a <= v8minEnergy), v8ErrorColl,t ); //Without enough E, don't collide
    t = if_select((delta < v8zero), v8ErrorColl,t ); //If delta<0, don't use collision
    t = if_select((t < v8zero),v8ErrorColl,t ); //If t<0, don't use collision
//    t = if_select( lt(t,v8minEnergy),v8ErrorColl,t); //Ignore negatives
    // t itself is used as the selection mask here.
    // NOTE(review): presumably if_select keys on the sign bit, so -nan lanes are replaced - confirm against if_select.
    t = if_select( t,v8ErrorColl,t); //Remove -nan

	t += Turn_Time;
    // Record the event as soon as at least one lane reaches the checkpoint this turn.
    if (horizontal_or(t<v8one))
	{
		//Add Collision
		CPcollisions[CPCollCount].A = i;
		CPcollisions[CPCollCount].B = i;
		CPcollisions[CPCollCount].Time = t;
		CPcollisions[CPCollCount].Type = COLL_POD_CP;
		++CPCollCount;
	}
}


inline void defineRunners(){
//TODO:
 float runScore[4]={0};
 FOR0(i,4)
 {
     float dx = next_CP[i].x[0]- pods[i].x[0];
     float dy = next_CP[i].y[0]- pods[i].y[0];
     float d = sqrt(dx*dx+dy*dy);
     runScore[i] = pods[i].countCP[0]*50000.0f-d;
 }
 runner[0] = (runScore[0] > runScore[1]?0:1);
 blocker[0] = (runScore[0] > runScore[1]?1:0);
 runner[1] = (runScore[2] > runScore[3]?2:3);
 blocker[1] = (runScore[2] > runScore[3]?3:2);
}

inline void moveUnits(const v8& step_time){
  FOR0(i,TOTAL_PODS)
  {
    pods[i].x +=  pods[i].vx*step_time;
    pods[i].y +=  pods[i].vy*step_time;
  }
}


//Removing ugly functions
// Resolves the pod-pod collision `c` in the lanes selected by `collCheck`.
// The impulse is computed along the line joining the two pods, scaled by the
// reduced mass of the pair (v8COEF, chosen from the shield state), and raised
// to at least MIN_IMPULSE. Velocities of both pods are then updated, weighted
// by each pod's inverse mass. Lanes outside `collCheck` are left untouched.
inline void Collide(const Collision& c,const v8& collCheck) {
	    // A pod counts as shielded only while its counter equals SHIELD_BLOCK.
	    v8 ALIGN shieldA = (pods[c.A].shield == v8SHIELD_BLOCK);
		v8 ALIGN shieldB = (pods[c.B].shield == v8SHIELD_BLOCK);
		v8 ALIGN coef = if_select(_and(shieldA,shieldB),v8COEF[2],if_select(_or(shieldA,shieldB),v8COEF[1],v8COEF[0]));
        v8 ALIGN dx = (pods[c.A].x-pods[c.B].x);
        v8 ALIGN dy = (pods[c.A].y-pods[c.B].y);
		v8 ALIGN product = ((dx*(pods[c.A].vx-pods[c.B].vx))+(dy*(pods[c.A].vy-pods[c.B].vy)));
		coef = ((coef*product)/((dx*dx)+(dy*dy)));
		// From here on (dx,dy) is the impulse vector, no longer the position delta.
		dx *= coef;
		dy *= coef;
        // Squared impulse; inactive lanes get MIN_IMPULSE2 so they never trip the test below.
        v8 ALIGN impulse = if_select(collCheck,((dx*dx)+(dy*dy)),v8MIN_IMPULSE2);
		// coef is reused as the mask of lanes whose impulse is below the minimum.
		coef = (impulse<v8MIN_IMPULSE2);
		if (horizontal_or(coef)) //Only if some vector value is below minimpulse, to save sqrt and div
               //impulse = (v8one+if_select(coef, (v8MIN_IMPULSE/sqrt(impulse)),v8one));
               impulse = (v8one+if_select(coef, (v8MIN_IMPULSE*approx_rsqrt(impulse)),v8one)); //TODO: Check if I can approx sqrt or not, too many errors?
		  else impulse = v8two;

		// impulse now holds the multiplier applied to the impulse vector.
		dx *= impulse;
		dy *= impulse;
		// shieldA/shieldB are reused to hold each pod's inverse mass.
		shieldA = if_select(shieldA,v8INVMASS_SHIELD,v8INVMASS_NORMAL);
		shieldB = if_select(shieldB,v8INVMASS_SHIELD,v8INVMASS_NORMAL);
   	    pods[c.A].vx -= if_select(collCheck,shieldA*dx,v8zero);
   	    pods[c.A].vy -= if_select(collCheck,shieldA*dy,v8zero);
 		pods[c.B].vx += if_select(collCheck,shieldB*dx,v8zero);
 		pods[c.B].vy += if_select(collCheck,shieldB*dy,v8zero);
  }


// Simulates one full turn for all 8 lanes at once.
//
// Loop: find pod-pod and pod-checkpoint events for the remainder of the turn,
// take the earliest pod-pod collision time per lane, credit checkpoints reached
// up to that time, move every pod to that time, resolve the collisions that
// happen there, and repeat. The loop ends when no pod-pod collision is left or
// after MAX_ITERATIONS_TURN rounds; the pods are then moved to the end of the
// turn. Finally v8simturn is advanced and endTurn() is applied.
inline void playTurn(){
    ++SIMCOUNT;
 v8 ALIGN Turn_Time = v8zero;		//Now the hard part, we'll do 8 simulations in parallel
 v8 ALIGN FirstCollisionTime = v8two;
 v8 ALIGN minCollTime = v8minEnergy;
 Collision ALIGN collisions[MAX_COLLISIONS];
 Collision ALIGN CPcollisions[TOTAL_PODS];

 int iterations = 0;
 int endA = TOTAL_PODS-1;

 // Sim_nextCP[i] is set once pod i has passed its checkpoint in every lane;
 // the checkpoint search for that pod is skipped for the rest of the turn.
 bool Sim_nextCP[TOTAL_PODS] = {false,false,false,false};

 //**** LOOPING COLLISIONS *****
 while(1) //Loop is common for 8 simulations, we'll stop when all sims are finished
 {
 int CollCount = 0;
 int CPCollCount = 0;

//Search Entity-Entity Collisions
 FOR0(i,endA)
   FOR(j,i+1,TOTAL_PODS)
   {
     searchCollisions(i,j,collisions,CollCount,Turn_Time,iterations);
   }
//Search Entity-Checkpoint Collisions
FOR0(i,TOTAL_PODS)
 if (!Sim_nextCP[i])
  {
	searchCPCollisions(i,CPcollisions,CPCollCount,Turn_Time,iterations);
  }

// Per-lane earliest pod-pod collision time.
if (CollCount > 0)
{
	 FOR0(col,CollCount)
	 {
	    FirstCollisionTime = min(FirstCollisionTime,collisions[col].Time);

 #ifdef SIMLOG
	 if ( FirstPlayer)
	 {
	     cerr <<"Collision Time es    :"<<collisions[col].Time<<" CollCount"<<CollCount<<":"<<col<<endl;
		 cerr <<"FirstCollisionTime es:"<<FirstCollisionTime<<" CollCount"<<CollCount<<":"<<col<<endl;
	 }
#endif

	 }
}
 // Clamp to the turn: lanes without a collision end up at 1.0.
 FirstCollisionTime = min(FirstCollisionTime,v8one);
 FirstCollisionTime = max(FirstCollisionTime,v8zero); //TODO: Needed?

//cerr <<"First Collision Time:"<<FirstCollisionTime<<endl;

//Checkpoint collisions
if (CPCollCount > 0)
 	 FOR0(col,CPCollCount)
	 {
		// Lanes that reach the checkpoint no later than the first pod-pod collision.
		v8 ALIGN CPpassed = (CPcollisions[col].Time <= FirstCollisionTime);
#ifdef SIMLOG
 if (CG_playerId== 0)
		cerr << "CP Collision "<<col<<" "<<CPpassed[0]<<" Tiempo "<<CPcollisions[col].Time[0]<<"<="<<FirstCollisionTime[0]<<endl;
#endif
		if (horizontal_or(CPpassed))
		{
#ifdef SIMLOG
 if (CG_playerId== 0)
		    cerr <<"Avanced CP"<<col<<" Pod:"<<CPcollisions[col].A<<endl;
#endif
			advanceCP(CPcollisions[col].A,CPpassed);
		}
		//all passed?
        if (!horizontal_or(CPcollisions[col].Time > FirstCollisionTime))
		{
#ifdef SIMLOG
 if (CG_playerId== 0)
		    cerr <<"All passed CP"<<col<<endl;
#endif
			Sim_nextCP[CPcollisions[col].A] = true;
		}
	 }

 if (/*FirstCollisionTime[0] >= 1.0f &&*/  (CollCount == 0 || iterations >= MAX_ITERATIONS_TURN))  //If none of the pods have a collision, just move all and continue
 {
    #ifdef TESTSIM
	if (iterations >= MAX_ITERATIONS_TURN)
	{
	     ++SIM_OVERLOAD;
         //cerr << "Loop collision ERROR!"<<endl;
		 abort();
	}
	#endif
	// Finish the turn: move by whatever time is left.
	moveUnits(v8one-Turn_Time);
	break;
 }
  ++iterations;

//Advance time to the first collision
  moveUnits(FirstCollisionTime-Turn_Time);
  minCollTime = min(FirstCollisionTime+v8minEnergy,v8one);
 #ifdef SIMLOG
	 if ( FirstPlayer)
		 cerr <<"FirstCollisionTime es:"<<FirstCollisionTime<<" CollCount"<<CollCount<<endl;
#endif
  //Now the collisions: resolve every collision whose time is within v8minEnergy
  //of the first collision time (minCollTime); later ones are found again by the
  //next search round.
  FOR0(col,CollCount)
  {
	  v8 ALIGN collCheck = (collisions[col].Time <= minCollTime);
	  if (horizontal_or(collCheck)) //if CollisionTime <= minCollTime then Collide
	  {
		 Collide(collisions[col],collCheck);
	  }
  }
  //And Advance turn Time
  Turn_Time = FirstCollisionTime; //Move
  FirstCollisionTime = v8two;
#ifdef TESTSIM
	 if (iterations >0 && FirstPlayer)
		 cerr << "---------> ITERATION:"<< iterations<<" CollCount:"<<CollCount<<endl;
#endif

 }
  v8simturn += v8one;
  endTurn();
}

inline void advanceCP(const int& i,const v8& Mask)
{
int team = i >>1;

//cerr <<">>Avanzo Pod "<<i<<":"<<pods[i].nextCP[0]<<" X:"<<next_CP[i].x[0]<<" V:"<<victory[team][0]<<" T:"<<timeout[team][0]<<endl;
	pods[i].nextCP = if_add(Mask,pods[i].nextCP,v8one);
	pods[i].nextCP = if_select(pods[i].nextCP>=totalCP,v8zero,pods[i].nextCP);
    pods[i].countCP = if_add(Mask,pods[i].countCP,v8one);

	victory[team] = _or(victory[team],pods[i].countCP>=victoryCP);
	pods[i].nextCP = if_select(victory[team], v8zero,pods[i].nextCP);
	timeout[team] = if_select(Mask,v8MAX_TIMEOUT+v8one,timeout[team]);

    FOR0(j,8) //TODO: Vectorize!
	{
	  int nCP = (int)pods[i].nextCP[j];
      next_CP[i].x[j] = checkpoints[nCP].x[j];
	  next_CP[i].y[j] = checkpoints[nCP].y[j];
	}
//cerr <<"<<Avanzo Pod "<<i<<":"<<pods[i].nextCP[0]<<" X:"<<next_CP[i].x[0]<<" V:"<<victory[team][0]<<" T:"<<timeout[team][0]<<endl;
}


inline void endTurn(){
  FOR0(i,TOTAL_PODS)
  {
	 pods[i].x = round(pods[i].x);
	 pods[i].y = round(pods[i].y);
	 pods[i].vx = truncate(pods[i].vx*v8Friction);
     pods[i].vy = truncate(pods[i].vy*v8Friction);
	 pods[i].shield = if_sub( pods[i].shield>v8zero,pods[i].shield,v8one );
  }
  timeout[0] -= v8one;
  timeout[1] -= v8one;
}

// Wraps an angle by a single turn: lanes below 0 get +360, lanes at or above
// 360 get -360. Both tests look at the incoming value `a`.
inline v8i clampAngle(const v8i& a)
{
 v8i fullTurn = C8i<360>();
 v8i negativesLifted = if_add( (a<v8izero), a, fullTurn);
 return if_sub( a>=fullTurn, negativesLifted, fullTurn);
}

// Applies one gene (rotation delta + thrust) to pod `Pod` in every lane:
// rotates the heading, then either raises the shield (negative thrust) or
// adds thrust along the new heading. Thrust is only added in lanes whose
// shield counter is zero. Aborts if a heading falls outside [0,360).
inline void ApplyGenoma(const int& Pod,const v8i& angle,const v8i& thrust){
	//Rotate
	pods[Pod].angle = clampAngle(pods[Pod].angle+angle );
    v8 ALIGN f_cos,f_sin;
  	FOR0(i,8){ //Vectorize?!
	 // Per-lane table lookup of cos/sin for the integer heading.
	 int16_t ang = pods[Pod].angle[i];
	 if (ang < 0 || ang >=360)
	 {
	     cerr <<"FALLO "<<ang<<" "<<angle<<" "<<thrust<<endl;
	     abort();
	 }
	 f_cos[i] = fast_cos[ang];
	 f_sin[i] = fast_sin[ang];
	}
	v8 ALIGN power = loadv8(thrust);
    // NOTE(review): both branches of this select yield the incoming power, so
    // a BOOST thrust is not reduced when the booster was already used - confirm intent.
    power = if_select( _and((power==v8BOOST), (pods[Pod].used_booster == v8zero)),v8BOOST,power);//Boost
    pods[Pod].used_booster = if_select((power==v8BOOST),v8one,pods[Pod].used_booster); //Boost

    pods[Pod].shield = if_select( (power < v8zero) ,v8SHIELD_BLOCK,pods[Pod].shield); //Shield
	pods[Pod].vx = if_add((pods[Pod].shield ==v8zero) ,pods[Pod].vx, (power *f_cos));
	pods[Pod].vy = if_add((pods[Pod].shield ==v8zero) ,pods[Pod].vy, (power *f_sin));
}

// Per-lane bearing, in integer degrees, from pod `Pod` to the point (x, y).
// Negative atan2 results are shifted up by 360 before conversion.
inline v8i getAngle(const int& Pod,const v8& x,const v8& y)
{
    //TODO: atan2 is the approximate variant; recheck that it is accurate enough (atan2E is the more accurate one).
    v8 ALIGN toTargetY = (y-pods[Pod].y);
    v8 ALIGN toTargetX = (x-pods[Pod].x);
    v8 ALIGN degrees = (atan2(toTargetY,toTargetX)*v8TO_DEG);
    return convert(if_add(degrees<v8zero,degrees,v8_360));
}

// Scalar bearing in degrees, in [0,360), from (x0,y0) to (x1,y1).
// Returns 0 when both points coincide; without this guard the normalisation
// below would divide by zero and produce NaN.
inline float get_angle(int x0,int y0,int x1,int y1) {
    if (x0 == x1 && y0 == y1) {
        return 0.0f;
    }
    float d = sqrtf((x0-x1)*(x0-x1)+(y0-y1)*(y0-y1));
    float dx = (x1 - x0) / d;
    float dy = (y1 - y0) / d;
    // acos only covers [0,180]; the sign of dy selects the lower half-plane.
    float a = acos(dx) * 180 / M_PI;
    if (dy < 0) {
        a = 360 - a;
    }
    return a;
}

    // Scalar signed turn, in degrees, from heading `angle` towards the bearing
    // of (x1,y1) seen from (x0,y0): positive when the clockwise-increasing
    // turn is strictly shorter, otherwise the negated opposite turn.
    inline float diff_angle(int x0,int y0,int x1,int y1,int angle) {
        const float bearing = get_angle(x0,y0,x1,y1);

        float increasing;
        if (angle <= bearing) increasing = bearing - angle;
        else                  increasing = bearing - angle + 360.;

        float decreasing;
        if (angle >= bearing) decreasing = angle - bearing;
        else                  decreasing = angle - bearing + 360.;

        return (increasing < decreasing) ? increasing : -decreasing;
    }

// Vector counterpart of diff_angle: per-lane signed turn from pod PodA's
// heading towards the point (x, y). The increasing-angle turn is returned as
// a positive value when it is strictly shorter, otherwise the opposite turn
// is returned negated.
inline v8i diffAngle(const int& PodA,const v8& x,const v8& y)
	{
		//TODO: Review diffAngle, can be easier, maybe  abs(a-b),  then if >180, -180
		v8i ALIGN16 bearing = getAngle(PodA,x,y);
		v8i& heading = pods[PodA].angle;
		v8i ALIGN16 increasing = if_add((heading> bearing), (bearing-heading), C8i<360>());
		v8i ALIGN16 decreasing = if_add((heading< bearing), (heading-bearing), C8i<360>() );
		return if_select(increasing<decreasing, increasing, v8izero-decreasing);
	}

// Per-lane Euclidean distance between (x1,y1) and (x2,y2).
inline v8 calcDistances(const v8& x1,const v8& y1,const v8& x2,const v8& y2){
 v8 ALIGN offsetX = (x1-x2);
 v8 ALIGN offsetY = (y1-y2);
 v8 ALIGN squared = offsetX*offsetX+offsetY*offsetY;
 return sqrt(squared);
}

// Per-lane evaluation of the state from the player's point of view.
// Starts at 1000 and, for each of the two pod slots, adds the player pod's
// progress (50000 per checkpoint minus distance to its next checkpoint),
// subtracts 15 per remaining shield count, and subtracts the matching enemy
// pod's progress. Distances are floored at v8_Dist_Fitness.
// playerID and enemyID are accepted but not used here.
inline v8 Fitness(const int& playerID,const int& playerIndex,const int& enemyID,const int& enemyIndex	){
	v8 ALIGN total = C8f<1000,1>();
	for (int slot = 0; slot < 2; ++slot)
	{
		const int own = playerIndex+slot;
		const int foe = enemyIndex+slot;

		v8 ALIGN ownDist = calcDistances(pods[own].x,pods[own].y,next_CP[own].x,next_CP[own].y);
		ownDist = max(ownDist,v8_Dist_Fitness);

		v8 ALIGN foeDist = calcDistances(pods[foe].x,pods[foe].y,next_CP[foe].x,next_CP[foe].y);
		foeDist = max(foeDist,v8_Dist_Fitness);

		total += (pods[own].countCP*50000.0f)-ownDist;
		total -= (pods[own].shield*15.0f);
		total -= ((pods[foe].countCP*50000.0f)-foeDist);
	}
	return total;
}

};
// State refreshed from the real game input each turn (see ReadTurn).
GameState ALIGN gamestate;
// Scratch copy that the genetic algorithm overwrites and simulates on.
GameState ALIGN working;
//} GameState

//{GENETIC ALGORITHM

// Per-depth score weights. Nothing in InitFitnessMalus() writes to this array
// at present, so it keeps its static zero-initialisation.
v8 ALIGN v8_FITNESSMALUS[DEPTH];
// NOTE(review): this function currently has no effect - it only builds a local
// constant; both candidate initialisations below are commented out. Confirm
// which one (if any) is intended before relying on v8_FITNESSMALUS.
void InitFitnessMalus()
{
    v8 ALIGN v8Malus = C8f<MalusPercent,100>();
/*
v8 ALIGN v8Malus = C8f<MalusPercent,100>();
    v8_FITNESSMALUS[0] = v8one;
    FOR(i,1,DEPTH)
        v8_FITNESSMALUS[i] = mul(v8_FITNESSMALUS[i-1],v8Malus);*/
/*   v8 ALIGN v8Malus = C8f<MalusPercent,100>();

   FOR0(i,DEPTH-1)
        v8_FITNESSMALUS[i] = v8Malus;
   v8_FITNESSMALUS[DEPTH-1] = v8one;
  */
};


// One candidate plan: for each of the player's pods and each of DEPTH turns,
// a rotation and a thrust. Every gene is a vector, so a Genoma carries 8
// independent plans (one per lane), each with its own Score.
class Genoma {
public:

  v8 ALIGN Score;                              // Per-lane score assigned by Generacion::simulate
  v8i ALIGN16 thrust[PLAYER_PODS][DEPTH];      // Thrust per pod and turn; -1 means shield (see printmove)
  v8i ALIGN16 rotation[PLAYER_PODS][DEPTH];    // Rotation delta in degrees per pod and turn


#ifdef TESTGENOMA
 // Debug-only validation: aborts if any rotation gene has magnitude above 18
 // or any thrust gene lies outside [-1, 200]. `s` labels the calling operation
 // in the error output.
 void SanityCheck(string s)
 {
   for (int pod = 0; pod < PLAYER_PODS; ++pod)
    for (int step = 0; step < DEPTH; ++step)
     for (int lane = 0; lane < 8; ++lane)
     {
       if (abs(rotation[pod][step][lane]) > 18)
       {
           cerr << "************"<<s<<"*************"<<endl;
           cerr << "Error en rotation:"<<pod<<" "<<step<<" "<<lane<<" Valor:"<<rotation[pod][step][lane]<<" > 18"<<endl;
           abort();
       }
       const int16_t t = thrust[pod][step][lane];
       if (t > 200 || t < -1)
       {
           cerr << "************"<<s<<"*************"<<endl;
           cerr << "Error en thrust:"<<pod<<" "<<step<<" "<<lane<<" Valor:"<<t<<" debe ser entre -1 y 200"<<endl;
           abort();
       }
     }
 }
 #endif

  inline void clear()
  {
	  FOR0(j,PLAYER_PODS) {
        for (int i = 0; i < DEPTH; ++i) {
			rotation[j][i] = v8izero;
			thrust[j][i] = v8izero;
        }
      }
  }

  // Random rotation gene, clamped to [MIN_ROTATION, MAX_ROTATION].
  // At depth 0 it is drawn from Irandom<-25,25>(); at later depths it is the
  // previous step's rotation `prev` plus a draw from Irandom<-10,10>().
  inline v8i randomRotation(const int& depth,const v8i& prev)
  {
    //TODO: FIX THAT
    if (depth != 0)
    {
        return max(v8i_MIN_ROTATION,min(prev+ Irandom<-10,10>() , v8i_MAX_ROTATION));
    }
    return max(v8i_MIN_ROTATION,min( Irandom<-25,25>() , v8i_MAX_ROTATION));
  }
  // Random thrust gene. Lanes where Irandom<128>() falls below
  // v8i_SHIELD_PROB get -1 (shield); the others get Irandom<4>()*70 capped at
  // MAX_THRUST. `depth` is accepted but not used.
  // Both random draws sit in one expression, so their relative order is left
  // to the compiler.
  inline v8i randomThrust(const int& depth)
  {
	 return if_select((Irandom<128>() < v8i_SHIELD_PROB),C8i<-1>(), min((Irandom<4>() *C8i<70>()), v8i_MAX_THRUST));

  }

  inline void aleatorio(){
   FOR0(j,PLAYER_PODS)
    FOR0(i,DEPTH){
	  rotation[j][i] = randomRotation(i, rotation[j][i==0?0:i-1]);
	  thrust[j][i] = randomThrust(i);
	}
 #ifdef TESTGENOMA
   SanityCheck("Aleatorio");
 #endif

  }

  //Random 50% Crossover p()<50%?a:b
  inline void CrossoverUniform(const Genoma& a,const Genoma& b){
	  FOR0(j,PLAYER_PODS) {
        for (int i = 0; i < DEPTH; ++i) {
			v8i ALIGN16 r = IrandomBool();
			rotation[j][i] = if_select( r ,a.rotation[j][i],b.rotation[j][i]);
			thrust[j][i] = if_select( r ,a.thrust[j][i],b.thrust[j][i]);
        }
      }
 #ifdef TESTGENOMA
   SanityCheck("CX Uniform");
 #endif

 }

//Select best vectors from both Genomas
  // Lane-wise merge: for each lane keeps the genes (and score) of whichever of
  // `a` and `b` scores higher, preferring `a` on ties.
  // Safe to call with `a` or `b` aliasing *this: the selection mask is built
  // from the input scores before Score is overwritten. The previous version
  // wrote Score first, so with a == *this every lane compared max(a,b) >= b
  // and always kept `a`'s genes while reporting the higher score.
  inline void PickBest(const Genoma& a,const Genoma& b){
	  // Build the per-lane "a is at least as good" mask from the untouched inputs.
	  int16_t ALIGN16 values[8];
	  FOR0(k,8) if (a.Score[k] >= b.Score[k]) {values[k] = 1;} else {values[k]=0;};
	  v8i ALIGN16 isBest;
	  isBest.loada(values);
	  isBest = (isBest>v8izero);

	  Score = max(a.Score,b.Score);

	  FOR0(j,PLAYER_PODS) {
        for (int i = 0; i < DEPTH; ++i) {
            rotation[j][i] = if_select(isBest,a.rotation[j][i],b.rotation[j][i]);
			thrust[j][i] = if_select(isBest,a.thrust[j][i],b.thrust[j][i]);
        }
	  }
 #ifdef TESTGENOMA
   SanityCheck("PickBest");
 #endif

  }


//Random entity from A, then from B
  inline void CrossoverEntity(const Genoma& a,const Genoma& b){
      bool PickA = fastRandInt(100)<50;
	  FOR0(j,PLAYER_PODS)
	  {
	      if (PickA)
               for (int i = 0; i < DEPTH; ++i) {rotation[j][i] = a.rotation[j][i];thrust[j][i] = a.thrust[j][i];}
	      else for (int i = 0; i < DEPTH; ++i) {rotation[j][i] = b.rotation[j][i];thrust[j][i] = b.thrust[j][i];}
          PickA = !PickA;
	  }
 #ifdef TESTGENOMA
   SanityCheck("CX Entity");
 #endif

  }

  //50% arithmetic Crossover (a+b)/2
  /*inline void CrossoverArithmetic(const Genoma& a,const Genoma& b){
	  FOR0(j,PLAYER_PODS)
        for (int i = 0; i < DEPTH; ++i) {
			thrust[j][i] = if_select8(IrandomBool(),a.thrust[j][i],b.thrust[j][i]);
			rotation[j][i] = shr8(add8(a.rotation[j][i],b.rotation[j][i]),C8i<1>());
        }
  } */

  // AAAAAABBBBB
  inline void CrossoverPoint(const Genoma& a,const Genoma& b, int point){
	  FOR0(j,PLAYER_PODS) {
        for (int i = 0; i < point; ++i) {
			rotation[j][i] = a.rotation[j][i];
			thrust[j][i] = a.thrust[j][i];
        }
        for (int i = point; i < DEPTH; ++i) {
			rotation[j][i] = b.rotation[j][i];
			thrust[j][i] = b.thrust[j][i];
        }

      }
 #ifdef TESTGENOMA
   SanityCheck("CX Point");
 #endif
  }


  // AAAAAAAAAAAA[Random]
  inline void copyFrom(const Genoma& a,int offset = 0){
   FOR0(j,PLAYER_PODS)
   {

    FOR(i,offset,DEPTH){
	  rotation[j][i-offset] = a.rotation[j][i];
	  thrust[j][i-offset] = a.thrust[j][i];
	}

	if (offset>0)
	{
	  int offs = DEPTH-offset;
	  for (int i = offs; i < DEPTH; ++i)
	  {
		rotation[j][i] = randomRotation(i, rotation[j][i==0?0:i-1]);
		thrust[j][i] = randomThrust(i);
	  }
	}
   }
 #ifdef TESTGENOMA
   SanityCheck("mutate");
 #endif
  }


  inline void mutateWholeTurn(int NoMutations=1) {
    FOR0(i,NoMutations)
    {
	  int randDEPTH= fastRandInt(DEPTH);
	  FOR0(j,PLAYER_PODS)
	  {
  		rotation[j][randDEPTH] = randomRotation(randDEPTH, rotation[j][randDEPTH==0?0:randDEPTH-1]);
		thrust[j][randDEPTH] = randomThrust(randDEPTH);
	  }
    }

 #ifdef TESTGENOMA
   SanityCheck("mutateWholeTurn");
 #endif
  }


  inline void mutate(int NoMutations=MUTATION_RATE) {
    if (PLAYER_PODS > 0)
    FOR0(i,NoMutations)
    {
	  int randPod = fastRandInt(PLAYER_PODS);
	  int randDEPTH = fastRandInt(DEPTH);
	  rotation[randPod][randDEPTH] = randomRotation(randDEPTH, rotation[randPod][randDEPTH==0?0:randDEPTH-1]);
	  thrust[randPod][randDEPTH] = randomThrust(randDEPTH);
     }

 #ifdef TESTGENOMA
   SanityCheck("mutate");
 #endif
  }


  void printmove(int count)
  {
	  int index = -1;
	  float bestScore = -9999999.0f;

	  FOR0(i,8)
	  {
	    if (index == -1 || Score[i] > bestScore)
	      {
	          bestScore = Score[i];
			  index = i;
	      }
	  }
      cerr << "Score:"<<Score<<" best: "<<index<<" ("<<bestScore<<")"<<endl;

	  FOR0(j,PLAYER_PODS)
	  {
		 int16_t m_rotation = (rotation[j][0][index]);
		 int16_t m_thrust = (thrust[j][0][index]);

		 int angle = (gamestate.pods[j].angle[index])+ m_rotation;
		 if (angle >= 360) angle -= 360;
		 if (angle < 0) angle += 360;
		 cerr << "angulo es "<< angle<< " rotation: "<<m_rotation<<endl;
         int px =(int)round(gamestate.pods[j].x[index] +fast_cos[angle]*16000.0f);
		 int py =(int)round(gamestate.pods[j].y[index] +fast_sin[angle]*16000.0f);
         if (m_thrust == -1) {
			 cout << px<<" "<<py<<" SHIELD";
             gamestate.pods[j].shield = v8SHIELD_BLOCK;
         } else if (m_thrust == 650) {
			 cout << px<<" "<<py<<" BOOST";
             gamestate.pods[j].used_booster = v8one;
         } else cout << px<<" "<<py<<" "<<m_thrust;
	  	 if (j == 0) cout <<" G:"<<count<<"->S:"<<(count*POPULATION*DEPTH*8);
          else cout << " Sims/turn:"<<SIMCOUNT*8;
		 cout <<endl;
	  }
  }


};

// Debug printer for a genome: for each of the 8 lanes writes its score and
// then the rotation genes of every pod, with a "|" after each pod.
ostream &operator<<(ostream& output, const Genoma& g){
    for (int lane = 0; lane < 8; ++lane)
    {
        output << "Genome ROW:"<<lane<<" ("<<g.Score[lane]<<"):";
        for (int pod = 0; pod < PLAYER_PODS; ++pod)
        {
            for (int step = 0; step < DEPTH; ++step)
                output << setw(4)<<(g.rotation[pod][step][lane])<<",";
            output << "|";
        }
    }
    return output;
}

// One generation of the genetic algorithm: the population of genomes, the
// indices of its elite members and the best score seen so far.
class Generacion{ //POPULATION
public:
  Genoma ALIGN g[POPULATION];   // The population
  int Elite[ELITISM];           // Indices into g of the elite genomes; -1 marks an empty slot
  v8 BestScore;                 // Per-lane best score recorded by checkElitism
  int BestIndex;                // Index into g of the genome holding BestScore, -1 if none
  int playerID,enemyID;         // Team ids (0/1) of the side being optimised and its opponent
  int playerIndex,enemyIndex;   // Index of each side's first pod (2 * team id)
  Genoma* enemyMove;            // Opponent plan to simulate against, or NULL to use the dummy opponent only

  // Starts with empty elite slots and no opponent plan.
  Generacion()
  {
	  ResetElitism();
	  // Was `enemyMove == NULL;` - a comparison with no effect, which left the
	  // pointer uninitialised until evolve() assigned it.
	  enemyMove = NULL;
  }

  void ResetElitism()
  {
 	  FOR0(i,ELITISM)
	  {
		  Elite[i] = -1;
	  }
	  BestScore =  loadv8(-9999999.0f);
	  BestIndex = -1;
  }

  // Offers genome `index` to the elite set.
  //  1. If an elite slot is still empty, the genome takes it.
  //  2. Otherwise it replaces the first elite it beats in at least one lane.
  //  3. If the elite set changed, every elite that beats BestScore in at least
  //     one lane is merged lane-wise with the current best (PickBest) and
  //     becomes the new best.
  inline void checkElitism(int index)
  {
      bool newElite = false;
#ifdef TESTGENOMA
// cerr << "Validando Elitismo. Max Elite:"<<ELITISM<<" New:"<<newElite<<endl;
#endif
      //Step 1 populate all with Elite == -1

      FOR0(i,ELITISM)
      {
          if (Elite[i] == -1)
          {

              newElite = true;
              Elite[i] = index;
#ifdef TESTGENOMA
 cerr << " **Elite"<<i<<" está vacía, usando para index:"<<index<<"<->"<<Elite[i]<<" New:"<<newElite<<endl;
#endif

              break;
          }
      }


      // Step 2: no free slot, so try to displace an existing elite.
      if (!newElite)
      {
 	   FOR0(i,ELITISM)
	   {
	    if (horizontal_or(g[index].Score > g[Elite[i]].Score))
	    {

	       newElite = true;
#ifdef TESTGENOMA
 cerr << "  Reemplazo Elite"<<i<<":"<<Elite[i]<<" por "<<index<<". Anterior Score"<<g[Elite[i]].Score<<"<->"<<g[index].Score<<" New:"<<newElite<<endl;
#endif

           Elite[i] = index;
           break;
	    }
	   }
      }

      // Step 3: the elite set changed, so refresh the best genome.
      if (newElite)
      {
#ifdef TESTGENOMA
 cerr << "   Como hay nueva élite buscamos un nuevo best"<<endl;
#endif
	  FOR0(i,ELITISM)
	  {
	   if (Elite[i] != -1 && horizontal_or((g[Elite[i]].Score > BestScore)))
	   {
	       #ifdef TESTGENOMA
 cerr << "   El Elite["<<i<<"] parece mejor"<<g[Elite[i]].Score<<" "<<BestScore<<endl;
#endif

		  // Merge with the previous best in place; the elite itself is passed as the first input.
		  if (BestIndex >=0)
  		       g[Elite[i]].PickBest(g[Elite[i]],g[BestIndex]);
		  BestScore = g[Elite[i]].Score;
		  BestIndex = Elite[i];
	       #ifdef TESTGENOMA
 cerr << "   BestIndex:"<<BestIndex<<" "<<BestScore<<endl;
#endif
	   }
	  }
	  }
  }

   // Scripted move for team meID's runner in the `working` state: aim at the
   // next checkpoint offset by three turns of the pod's own velocity, turn at
   // most 18 degrees towards it and apply the chosen thrust.
   // meIndex, otID and otIndex are accepted but not used.
   inline void Dummy_runner(const int& meID,const int& meIndex, const int& otID, const int& otIndex)
   {
       int j = working.runner[meID];
       v8 ALIGN next_x = sub(working.next_CP[j].x , mul(working.pods[j].vx,v8three));
       v8 ALIGN next_y = sub(working.next_CP[j].y , mul(working.pods[j].vy,v8three));

       // The turn is measured on the simulated state. The previous code asked
       // `gamestate`, whose pod position and heading are stale after the first
       // simulated turn.
       v8i ALIGN16 r_angle = working.diffAngle(j,next_x,next_y);
       // NOTE(review): thrust is 200 when the target is 90+ degrees off and 0
       // otherwise, which looks inverted - confirm intent before changing.
       v8i ALIGN16 m_thrust = if_select( (abs(r_angle)>=C8i<90>()), C8i<200>(),v8izero);
       r_angle = min(C8i<18>(),max(C8i<-18>(),r_angle));
       working.ApplyGenoma(j,r_angle ,m_thrust);
   }

   // Scripted move for team meID's blocker in the `working` state: aim at the
   // opposing runner's position one turn ahead, offset by three turns of the
   // blocker's own velocity, turn at most 18 degrees towards it and apply the
   // chosen thrust. meIndex and otIndex are accepted but not used.
   inline void Dummy_blocker(const int& meID,const int& meIndex, const int& otID, const int& otIndex)
   {
       //TODO: SHIELD
       int tgt = working.runner[otID];
       int j = working.blocker[meID];
       v8 ALIGN next_x = sub( add(working.pods[tgt].x,working.pods[tgt].vx) , mul(working.pods[j].vx,v8three));
       v8 ALIGN next_y = sub( add(working.pods[tgt].y,working.pods[tgt].vy) , mul(working.pods[j].vy,v8three));

       // The turn is measured on the simulated state. The previous code asked
       // `gamestate`, whose pod position and heading are stale after the first
       // simulated turn.
       v8i ALIGN16 r_angle = working.diffAngle(j,next_x,next_y);
       // NOTE(review): thrust is 200 when the target is 90+ degrees off and 0
       // otherwise, which looks inverted - confirm intent before changing.
       v8i ALIGN16 m_thrust = if_select( (abs(r_angle)>=C8i<90>()), C8i<200>(),v8izero);
       r_angle = min(C8i<18>(),max(C8i<-18>(),r_angle));
       working.ApplyGenoma(j,r_angle ,m_thrust);

   }


  // Scores genome g[index] by rolling it out for DEPTH turns on `working`.
  //
  // Two rollouts are possible:
  //  - against enemyMove (only when it is not NULL): the opponent follows its
  //    genome for turns 0..2 and the scripted dummies afterwards;
  //  - against the scripted dummies for every turn (always run).
  // Each rollout's score is 0.1 * Fitness after the first turn plus Fitness at
  // the end. With an enemyMove the final score is the per-lane minimum of the
  // two rollouts; otherwise it is the dummy rollout's score.
  inline void simulate(int index)
  {
	//g[index] is a Solution on the Population Array
	//enemyMove is the Best Solution from the Enemy GA, can be NULL
	g[index].Score = v8zero; //Reset Score for Solution
	//GA-GA check
  if (enemyMove != NULL)
  {
	working = gamestate; //Copy current GameState to a working, temporary one
	FOR0(turn,DEPTH)//for (int turn=0;turn<DEPTH;++turn)
	{

     FOR0(j,PLAYER_PODS)
	    working.ApplyGenoma(playerIndex+j,g[index].rotation[j][turn]  ,g[index].thrust[j][turn]);

  if (turn > 2)	   //Dummies from the 4th simulated turn on
  {
	  Dummy_runner(enemyID,enemyIndex,playerID,playerIndex);
	  Dummy_blocker(enemyID,enemyIndex,playerID,playerIndex);
  }
  else {
     // Each enemy pod gets its own genes. The previous code passed enemyIndex
     // for both iterations, so the first enemy pod received both moves and the
     // second never moved.
     FOR0(j,PLAYER_PODS)
      working.ApplyGenoma(enemyIndex+j ,enemyMove->rotation[j][turn],enemyMove->thrust[j][turn]);
  }
	  working.playTurn();
      if (turn==0)
	   g[index].Score = mul(working.Fitness(playerID,playerIndex,enemyID,enemyIndex),0.1f);
	}
	g[index].Score = add(g[index].Score,working.Fitness(playerID,playerIndex,enemyID,enemyIndex));  //Fitness is the Scoring eval()
  }

	working = gamestate; //Copy current GameState to a working, temporary one
	v8 ALIGN DummyScore;
	FOR0(turn,DEPTH)//for (int turn=0;turn<DEPTH;++turn)
	{
      FOR0(j,PLAYER_PODS)
	    working.ApplyGenoma(playerIndex+j,g[index].rotation[j][turn]  ,g[index].thrust[j][turn]);
	  Dummy_runner(enemyID,enemyIndex,playerID,playerIndex);
	  Dummy_blocker(enemyID,enemyIndex,playerID,playerIndex);
	  working.playTurn();
	  if (turn==0)
	    DummyScore = mul(working.Fitness(playerID,playerIndex,enemyID,enemyIndex),0.1f);
	}
	DummyScore = add(DummyScore,working.Fitness(playerID,playerIndex,enemyID,enemyIndex));  //Fitness is the Scoring eval()
    // Keep the worse of the two outcomes per lane when both rollouts ran.
    if (enemyMove != NULL)
	      g[index].Score = min(g[index].Score,DummyScore);
     else g[index].Score = DummyScore;
  }
};

class GeneticAlgorithm{
public:
  Generacion ALIGN generacion[2];
  int CurrGen,NextGen;
  int playerID,enemyID;
  int playerIndex,enemyIndex;

  GeneticAlgorithm()
  {
	  CurrGen = -1;
	  NextGen = 0;
	  generacion[0].ResetElitism();
	  generacion[1].ResetElitism();
  }

  void setPlayerID(int _id)
  {
	  playerID = _id;
	  enemyID = (playerID+1)%2;
	  generacion[0].playerID = playerID;
	  generacion[1].playerID = playerID;
	  generacion[0].enemyID = enemyID;
	  generacion[1].enemyID = enemyID;

	  playerIndex = 2*playerID;
	  enemyIndex =  2*enemyID;
	  generacion[0].playerIndex = playerIndex;
	  generacion[1].playerIndex = playerIndex;
	  generacion[0].enemyIndex = enemyIndex;
	  generacion[1].enemyIndex = enemyIndex;
  }

  inline void ChangeGen()
  {
	CurrGen = (CurrGen+1)&1;
	NextGen = (CurrGen+1)&1;
  }

  void IniciarPoblacion()
  {
	int startPoolIndex = 0;
	if (CurrGen >= 0)
	{
 	 //generacion[CurrGen].ResetElitism();
	 FOR0(i,ELITISM){
	   generacion[NextGen].g[startPoolIndex++].copyFrom(generacion[CurrGen].g[generacion[CurrGen].Elite[i]],1);
       generacion[NextGen].g[startPoolIndex++].copyFrom(generacion[CurrGen].g[generacion[CurrGen].Elite[i]],1);
	 }
	}
	generacion[0].ResetElitism();
	generacion[1].ResetElitism();
    ChangeGen();
	//Solutions from simple AI
	constexpr int HALF_POP = POPULATION*3/4;
    FOR(i,startPoolIndex,HALF_POP)
	{
		generacion[CurrGen].g[i].aleatorio(); //TODO: Simple AI

    }
	//Random solutions
	FOR(i,HALF_POP,POPULATION)
	{
		generacion[CurrGen].g[i].aleatorio();
	}
//  cerr << " Antes de simular "<<stopwatch.EllapsedMilliseconds()<<endl;
    FOR0(i,POPULATION)
	{
	  generacion[CurrGen].simulate(i);
	}
  //cerr << " Simulado "<<stopwatch.EllapsedMilliseconds()<<endl;
    FOR0(i,POPULATION)
	{
	  generacion[CurrGen].checkElitism(i);
	}

  }

 int genCount;

  void evolve(Genoma* enemyMove)
  {
//      cerr << " Copiamos los enemy moves "<<endl;
	  generacion[0].enemyMove = enemyMove;
	  generacion[1].enemyMove = enemyMove;
	  genCount = 0;
	  int bestGeneracion = 1;
//	  cerr << " Iniciar Población "<<stopwatch.EllapsedMilliseconds()<<endl;
	  IniciarPoblacion();
//	  cerr << " Evolucionar "<<stopwatch.EllapsedMilliseconds()<<endl;
	  while (!stopwatch.Timeout())
	  {
		++genCount;
		generacion[NextGen].ResetElitism();
		int nextIndex = 0;
//Copiar ELITISM al nuevo pool, y mutar
		FOR0(i,ELITISM)
		 if (generacion[CurrGen].Elite[i] >= 0)
		 {
			generacion[NextGen].g[nextIndex].copyFrom(generacion[CurrGen].g[generacion[CurrGen].Elite[i]],0);
			generacion[NextGen].g[nextIndex].mutate();
			++nextIndex;
		 }
//Elite+Elite
#if ELITISM > 1

        FOR0(i,MixElite)
		{
		 int Elite1 = fastRandInt(ELITISM);
         int Elite2;
		 do {
			 Elite2 = fastRandInt(ELITISM);
                } while (Elite1 == Elite2);
         Elite1 = generacion[CurrGen].Elite[Elite1];
		 Elite2 = generacion[CurrGen].Elite[Elite2];
		 //if (fastRandInt(10) >= 3)
			 generacion[NextGen].g[nextIndex].CrossoverUniform(generacion[CurrGen].g[Elite1],generacion[CurrGen].g[Elite2]);
           //else generacion[NextGen].g[nextIndex].CrossoverEntity(generacion[CurrGen].g[Elite1],generacion[CurrGen].g[Elite2]);
		 if (!fastRandInt(MUTATION)) { generacion[NextGen].g[nextIndex].mutate(); }
	 	 ++nextIndex;
		}
#endif
//Elite+Random
        constexpr int startLoop = ELITISM+MixElite;
		constexpr int endLoop = ELITISM+POPULATION/2;
        FOR(i,startLoop,endLoop)
		{
		 int Elite1 = generacion[CurrGen].Elite[fastRandInt(ELITISM)];
		 int aIndex,bIndex;
		 do {
			 aIndex = fastRandInt(POPULATION);
                } while (Elite1 == aIndex);
		 do {
			 bIndex = fastRandInt(POPULATION);
                } while (Elite1 == bIndex || aIndex == bIndex);
		 Genoma bestGenes;
		 bestGenes.PickBest(generacion[CurrGen].g[aIndex],generacion[CurrGen].g[bIndex]);
		//if (fastRandInt(10) >= 3)
		     generacion[NextGen].g[nextIndex].CrossoverUniform(generacion[CurrGen].g[Elite1],bestGenes);
		 //else generacion[NextGen].g[nextIndex].CrossoverEntity(generacion[CurrGen].g[Elite1],bestGenes);
		 if (fastRandInt(MUTATION)==0) { generacion[NextGen].g[nextIndex].mutate(); }
		++nextIndex;
		}
//Random+Random
        FOR(i,endLoop,POPULATION)
		{
		 Genoma ALIGN16 bestGenes1;
		 Genoma ALIGN16 bestGenes2;
		 int aIndex1,bIndex1;
		 aIndex1 = fastRandInt(POPULATION);
		 do {
			 bIndex1 = fastRandInt(POPULATION);
                } while (aIndex1 == bIndex1);
         bestGenes1.PickBest(generacion[CurrGen].g[aIndex1],generacion[CurrGen].g[bIndex1]);
		 int aIndex2,bIndex2;
		 do {
			 aIndex2 = fastRandInt(POPULATION);
                } while (bIndex1 == aIndex2 || aIndex1 == aIndex2);
		 do {
			 bIndex2 = fastRandInt(POPULATION);
                } while (aIndex2 == bIndex2);
        bestGenes2.PickBest(generacion[CurrGen].g[aIndex2],generacion[CurrGen].g[bIndex2]);
        //if (fastRandInt(10) >= 3)
		    generacion[NextGen].g[nextIndex].CrossoverUniform(bestGenes1,bestGenes2);
         //else generacion[NextGen].g[nextIndex].CrossoverEntity(bestGenes1,bestGenes2);
	    if (fastRandInt(MUTATION)==0) { generacion[NextGen].g[nextIndex].mutate();  }
		++nextIndex;
		}
//Evaluate fitness
		FOR0(i,POPULATION)
		{
		  generacion[NextGen].simulate(i);
		  generacion[NextGen].checkElitism(i);
		}

//Ahora nos quedamos con la mejor generación
        if (horizontal_or(gt(generacion[NextGen].BestScore , generacion[CurrGen].BestScore)))
		{
			//La siguiente generación es mejor, evolucionamos
			ChangeGen();
			bestGeneracion = genCount;
		}
	  }
cerr << "FIN Evolucion: CUENTA:" << genCount<<" Bestgen:"<<bestGeneracion << " Sims:"<<(genCount*POPULATION*DEPTH*8)<<" T:"<<stopwatch.EllapsedMilliseconds()<<"ms"<<endl;
  }

  inline Genoma* getBest()
  {
	 return (&generacion[CurrGen].g[generacion[CurrGen].BestIndex]);
  }

};

//One GA per side. main() gives enemyGA player id 1 and myGA player id 0; each turn enemyGA
//is evolved first (with no opposing plan) and its best genome is passed to myGA.evolve().
GeneticAlgorithm enemyGA;
GeneticAlgorithm myGA;
//}GENETIC ALGORITHM

//{MAIN FUNCTIONS

void Tests()
{
    cerr << "get_angle: "<<gamestate.get_angle(gamestate.pods[0].x[0],gamestate.pods[0].y[0],800,500)<<endl;
    cerr << "diff_angle: "<<gamestate.diff_angle(gamestate.pods[0].x[0],gamestate.pods[0].y[0],800,500,(gamestate.pods[0].angle[0]))<<endl;
    cerr << "getAngle: "<<gamestate.getAngle(0,loadv8(800),loadv8(500))<<endl;
    cerr << "diffAngle: "<<gamestate.diffAngle(0,loadv8(800),loadv8(500))<<endl;


    v8i ALIGN16 entero = v8i(-10,2,-30,4,-50,6,-70,8);
    cerr <<entero<<endl;
    cerr <<loadv8(entero)<<endl;
    cerr <<convert(add(loadv8(entero),v8one))<<endl;
}

//Simulator self-check loop, called from main() when SIMLOG is defined. Never returns.
//Each turn it:
//  1) reads the turn into `gamestate` and, for player 0 after the first turn, calls
//     gamestate.Compare(working) against the state predicted on the previous turn
//     (presumably Compare updates SIM_TURNS_FAILED / SIM_OVERLOAD - confirm in Compare);
//     aborts once SIM_TURNS_FAILED exceeds 2;
//  2) steers every pod towards its next checkpoint (offset by 3x its velocity) with a
//     thrust that depends on the turn number, and SHIELD on some turns;
//  3) prints the command for pods 0 and 1 to cout and plays the same moves on `working`.
int TESTSIM(){
    cerr << "Simulation Test"<<endl;
    srand (time(NULL));
    SIM_Vector_Not_Noised = rand() % 8; //Lane used for the scalar reads below

    while(1)
    {
        SIMCOUNT = 0;
        gamestate.ReadTurn(cin);
        if (gamestate.turn > 0 && (CG_playerId==0) )
        {
            //Compare working with gamestate
            gamestate.Compare(working);
            cerr <<"Fails/Over/Turn:"<<SIM_TURNS_FAILED<<"/"<<SIM_OVERLOAD<<"/"<<gamestate.turn<<endl;
            if (SIM_TURNS_FAILED >2) {
                cerr <<"********************************************************************"<<endl;
                cerr <<"** TOO MANY SIM ERRORS. RECHECK THAT PLAYER 2 IS THIS AI TOO !!!! **"<<endl;
                cerr <<"********************************************************************"<<endl;
                abort();
            }
        }

        working = gamestate; //Copy current GameState to a working, temporary one
        //Move pods
        FOR0(j,TOTAL_PODS)
        {
            //Aim at the next checkpoint, compensating for the current velocity
            int ncp = (int)gamestate.pods[j].nextCP[SIM_Vector_Not_Noised];
            int next_x = gamestate.checkpoints[ncp].x[SIM_Vector_Not_Noised] - 3*gamestate.pods[j].vx[SIM_Vector_Not_Noised];
            int next_y = gamestate.checkpoints[ncp].y[SIM_Vector_Not_Noised] - 3*gamestate.pods[j].vy[SIM_Vector_Not_Noised];

            int r_angle = (int)(gamestate.diffAngle(j,loadv8(next_x),loadv8(next_y))[SIM_Vector_Not_Noised]);
            r_angle = min(18,max(-18,r_angle)); //Clamp the rotation to [-18,18]

            int16_t m_thrust = ((gamestate.turn*4)%201 ); //Test Different SPEEDS
            if ( (gamestate.turn + 3*(j%2)) % 13 == 0) m_thrust = -1; //Test SHIELD
            working.ApplyGenoma(j,v8i(r_angle) ,v8i(m_thrust));

            if (j <2)
            {
                //NOTE(review): position and angle are read from lane 0 here while everything
                //above reads lane SIM_Vector_Not_Noised - confirm this is intended.
                int index = 0;
                int angle = (int)(gamestate.pods[j].angle[index])+ r_angle;
                if (angle >= 360) angle -= 360;
                if (angle < 0) angle += 360;
                //Target point 16000 units away along the new heading
                int px =(int)round(gamestate.pods[j].x[index] +fast_cos[angle]*16000.0f);
                int py =(int)round(gamestate.pods[j].y[index] +fast_sin[angle]*16000.0f);

                if (m_thrust == -1) {
                    cout << px<<" "<<py<<" SHIELD";
                } else if (m_thrust == 650) {
                    //With the thrust schedule above m_thrust is never 650 for a non-negative turn
                    cout << px<<" "<<py<<" BOOST";
                } else cout << px<<" "<<py<<" "<<m_thrust;
                cout <<endl;
            }
            if (m_thrust == -1) {gamestate.pods[j].shield = v8SHIELD_BLOCK;}
            if (m_thrust == 650) {gamestate.pods[j].used_booster = v8one;}
        }
        working.playTurn();
//     SIM_Vector_Not_Noised = (SIM_Vector_Not_Noised+1)%8; //Test another record AVX
    }
}

void testGENOMA(){
    myGA.IniciarPoblacion();
    Generacion* G = &myGA.generacion[myGA.CurrGen];
/*
  Genoma ALIGN g[POPULATION];
  int Elite[ELITISM];
  v8 BestScore;
  int BestIndex;
*/
   cerr << "BestIndex:"<<G->BestIndex<<" ELITES:"<<endl;
   FOR0(i,ELITISM)
     cerr << G->Elite[i]<<":"<<G->g[G->Elite[i]].Score<<" S:"<<G->g[G->Elite[i]].thrust[0][0]<<" R:"<<G->g[G->Elite[i]].rotation[0][0]<<endl;
   cerr<< " BestIndex:"<<G->BestIndex << "BestScore:"<<G->BestScore<<endl;
   cerr << "Best Genoma:"<<G->g[G->BestIndex].thrust[0][0]<<" "<<G->g[G->BestIndex].rotation[0][0]<<endl;
   //cerr << G->g[0]<<" CX

   abort();
}

int main(){

	gamestate.ReadConfig(cin);
	InitAngles();
	myGA.setPlayerID(0);
	enemyGA.setPlayerID(1);
#ifdef SIMLOG
TESTSIM();
#endif


//TESTMOVE();

int minS = 99999999;
int maxS = 0;
ll total =0;
	while(1)
	{
	 SIMCOUNT = 0;
	 gamestate.ReadTurn(cin);
#ifdef TESTGENOMA
testGENOMA();
#endif

//	 Tests();
//return 0;
	 gamestate.defineRunners();
     stopwatch.setTimeout((gamestate.turn>0?TIMEOUTN_0:TIMEOUT0_0));
     enemyGA.evolve(NULL);
     stopwatch.setTimeout((gamestate.turn>0?TIMEOUTN_1:TIMEOUT0_1));
     myGA.evolve(enemyGA.getBest());
     myGA.getBest()->printmove((enemyGA.genCount+myGA.genCount));
if (gamestate.turn > 0)
{
     SIMCOUNT =SIMCOUNT*8;
     total += SIMCOUNT;
     minS = min(minS,SIMCOUNT);
     maxS = max(maxS,SIMCOUNT);
     cerr <<" SIMCOUNT:"<< SIMCOUNT<<" MAX:"<<maxS<<" Min:"<<minS<<" Media:"<<(total/(gamestate.turn))<<endl;
}


	}
}
//}MAIN FUNCTIONS