syoyo/gist:24d0bf30dd2a9b5b2b69

## gistfile1.txt
// Writing (&x)[i] in operator[] is safe/correct C++ code or not?
// The following is the reduced code fragment(Thus not work by just copy&paste) which Intel C++ compier(ver 13 and 15) miscompiles(Release build only) the code for the access to real3 object through operator[] inside OpenMP loop.
// clang and gcc are OK to compile&run

typedef float real;

struct real3 {
  real3() {}
  real3(real xx, real yy, real zz) {
    x = xx;
    y = yy;
    z = zz;
  }
  explicit real3(real *p) {
    x = p[0];
    y = p[1];
    z = p[2];
  }

  real operator[](int i) const { return (&x)[i]; }
  real &operator[](int i) { return (&x)[i]; }

    real x, y, z;
  // real pad;  // for alignment
};

// -----------------------------

void MortonCodesTetraFloat30(
    uint32_t *codes, const float *points, const uint32_t *faces,
    const real3 &bmin, const real3 &bmax, int64_t startIdx, int64_t endIdx) {

  int kDIV = (1 << 10);
  real invx = kDIV / (bmax[0] - bmin[0]);
  real invy = kDIV / (bmax[1] - bmin[1]);
  real invz = kDIV / (bmax[2] - bmin[2]);

  int64_t n = endIdx - startIdx;

  float one_fourth = 1.0f / 4.0f;

#ifdef _OPENMP
#pragma omp parallel for if (n > 4096)
#endif
  for (int64_t i = startIdx; i < endIdx; i++) {
    uint32_t f0 = faces[3 * i + 0];
    uint32_t f1 = faces[3 * i + 1];
    uint32_t f2 = faces[3 * i + 2];
    real3 p0(points[3 * f0 + 0], points[3 * f0 + 1], points[3 * f0 + 2]);
    real3 p1(points[3 * f1 + 0], points[3 * f1 + 1], points[3 * f1 + 2]);
    real3 p2(points[3 * f2 + 0], points[3 * f2 + 1], points[3 * f2 + 2]);
    real3 p_i;

    // *** Intel Compiler miscompiles(?) real3::operator[] and give wrong result inside OpenMP loop ***
    p_i[0] = one_third * (p0[0] + p1[0] + p2[0]);
    p_i[1] = one_third * (p0[1] + p1[1] + p2[1]);
    p_i[2] = one_third * (p0[2] + p1[2] + p2[2]);
    codes[i] = MortionCode30(p_i, bmin, invx, invy, invz);
  }
}
	// Writing (&x)[i] in operator[] is safe/correct C++ code or not?
	// The following is the reduced code fragment(Thus not work by just copy&paste) which Intel C++ compier(ver 13 and 15) miscompiles(Release build only) the code for the access to real3 object through operator[] inside OpenMP loop.
	// clang and gcc are OK to compile&run

	typedef float real;

	struct real3 {
	real3() {}
	real3(real xx, real yy, real zz) {
	x = xx;
	y = yy;
	z = zz;
	}
	explicit real3(real *p) {
	x = p[0];
	y = p[1];
	z = p[2];
	}

	real operator[](int i) const { return (&x)[i]; }
	real &operator[](int i) { return (&x)[i]; }

	real x, y, z;
	// real pad; // for alignment
	};

	// -----------------------------

	void MortonCodesTetraFloat30(
	uint32_t codes, const float points, const uint32_t *faces,
	const real3 &bmin, const real3 &bmax, int64_t startIdx, int64_t endIdx) {

	int kDIV = (1 << 10);
	real invx = kDIV / (bmax[0] - bmin[0]);
	real invy = kDIV / (bmax[1] - bmin[1]);
	real invz = kDIV / (bmax[2] - bmin[2]);

	int64_t n = endIdx - startIdx;

	float one_fourth = 1.0f / 4.0f;

	#ifdef _OPENMP
	#pragma omp parallel for if (n > 4096)
	#endif
	for (int64_t i = startIdx; i < endIdx; i++) {
	uint32_t f0 = faces[3 * i + 0];
	uint32_t f1 = faces[3 * i + 1];
	uint32_t f2 = faces[3 * i + 2];
	real3 p0(points[3 * f0 + 0], points[3 * f0 + 1], points[3 * f0 + 2]);
	real3 p1(points[3 * f1 + 0], points[3 * f1 + 1], points[3 * f1 + 2]);
	real3 p2(points[3 * f2 + 0], points[3 * f2 + 1], points[3 * f2 + 2]);
	real3 p_i;

	// * Intel Compiler miscompiles(?) real3::operator[] and give wrong result inside OpenMP loop *
	p_i[0] = one_third * (p0[0] + p1[0] + p2[0]);
	p_i[1] = one_third * (p0[1] + p1[1] + p2[1]);
	p_i[2] = one_third * (p0[2] + p1[2] + p2[2]);
	codes[i] = MortionCode30(p_i, bmin, invx, invy, invz);
	}
	}