vurtun/obj_filter.c

## obj_filter.c
#ifdef _MSC_VER
#define alignto(x) __declspec(align(x))
#define bit_cnt(u) __popcnt(u)
#define bit_cnt64(u) __popcnt64(u)
static int bit_ffs32(unsigned int u) {_BitScanForward(&u, u); return casti(u);}
static int bit_ffs64(unsigned long long u) {_BitScanForward64(&u, u); return casti(u);}
#else /* GCC, CLANG */
#define alignto(x) __attribute__((aligned(x)))
#define bit_cnt(u) __builtin_popcount(u)
#define bit_cnt64(u) __builtin_popcountll(u)
#define bit_ffs32(u) __builtin_ctz(u)
#define bit_ffs64(u) __builtin_ctzll(u)
#endif

static int
fltr_obj(struct guid* res, const char *obj_desc_buf, const char *obj_desc_buf_end,
         const struct guid* obj_ids, const char *fltr_str, const char  *fltr_str_end){
/*  This algorithm uses as a predicate equality of the first and the last characters from the substring. These two characters are populated in two registers
 *  F and L respectively. Then in each iteration two chunks of strings are loaded. The first chunk (A) is read from offset i (where i is the current offset)
 *  and the second chunk (B) is read from offset i + k - 1, where k is sub string's length. Then we compute a vector expression F == A and B == L.
 *  This step yields a byte vector (or a bit mask), where "true" values denote position of potential substring occurrences.
 *  Finally, just at these positions an exact comparisons of sub strings are performed.
 *
 *  Example: Let's assume 8-byte registers. We're searching for word "cat", thus:
 *
 *    F = [ c | c | c | c | c | c | c | c ]
 *    L = [ t | t | t | t | t | t | t | t ]
 *
 *  We're searching in the string "a_cat_tries". In the first iteration the register A gets data from offset 0, B from offset 2:
 *
 *    A = [ a | _ | c | a | t | _ | t | r ]
 *    B = [ c | a | t | _ | t | r | i | e ]
 *
 *  Now we compare:
 *
 *    AF = ( A == F ) = [ 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 ]
 *    BL = ( B == L ) = [ 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 ]
 *
 *  After merging comparison results, i.e.AF & BL, we get following mask :
 *
 *    mask = [ 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 ]
 *
 *  Since the mask is non-zero, it means there are possible substring occurrences.
 *  As we see, there is only one non-zero element at index 2, thus only one sub string comparison must be performed.
 *
 *  For the actual search of all objects we use a single buffer containing all descriptions of all objects separated by '\0'.
 *  In addition a separate array of object ids is kept to map from description based on number of `\0` encountered to array index
 *  and therefore object id.
 */
  if(fltr_str == fltr_str_end){
    return 0;
  }
  int val_idx = 0;
  int guid_idx = 0;
  int needle_cnt = fltr_str_end - fltr_str;
  const __m128i zero = _mm_set1_epi8(0u);
  const __m128i first = _mm_set1_epi8(fltr_str[0] );
  const __m128i last = _mm_set1_epi8(fltr_str_end[-1] );
  for (const char *it = obj_desc_buf; it < obj_desc_buf_end; it += 16) {
    __m128i blk_first = _mm_loadu_si128((const __m128i*)it );
    __m128i blk_last = _mm_loadu_si128((const __m128i*)it + needle_cnt - 1u);
    __m128i equal_first = _mm_cmpeq_epi8(first, blk_first);
    __m128i equal_last = _mm_cmpeq_epi8(last, blk_last);
    __m128i equal_zero = _mm_cmpeq_epi8(blk_first, zero);
    __m128i equal = _mm_and_si128(equal_first,equal_last);
    unsigned zero_mask = _mm_movemask_epi8(equal_zero);
    unsigned msk = _mm_movemask_epi8(equal);
    while(msk != 0u) {
      unsigned bit_pos = bit_ffs32( msk );
      if(memcmp(it + bit_pos + 1, fltr_str + 1, needle_cnt-2 ) == 0) {
        unsigned cur_idx = guid_idx + bit_cnt(zero_mask & ((1u << bit_pos) - 1u));
        res[val_idx] = obj_ids[cur_idx];
        val_idx++;
      }
      msk = msk & ( msk - 1 );
    }
     guid_idx += bit_cnt(zero_mask);
  }
  return val_idx;
}
	#ifdef _MSC_VER
	#define alignto(x) __declspec(align(x))
	#define bit_cnt(u) __popcnt(u)
	#define bit_cnt64(u) __popcnt64(u)
	static int bit_ffs32(unsigned int u) {_BitScanForward(&u, u); return casti(u);}
	static int bit_ffs64(unsigned long long u) {_BitScanForward64(&u, u); return casti(u);}
	#else /* GCC, CLANG */
	#define alignto(x) __attribute__((aligned(x)))
	#define bit_cnt(u) __builtin_popcount(u)
	#define bit_cnt64(u) __builtin_popcountll(u)
	#define bit_ffs32(u) __builtin_ctz(u)
	#define bit_ffs64(u) __builtin_ctzll(u)
	#endif

	static int
	fltr_obj(struct guid* res, const char obj_desc_buf, const char obj_desc_buf_end,
	const struct guid* obj_ids, const char fltr_str, const char fltr_str_end){
	/* This algorithm uses as a predicate equality of the first and the last characters from the substring. These two characters are populated in two registers
	* F and L respectively. Then in each iteration two chunks of strings are loaded. The first chunk (A) is read from offset i (where i is the current offset)
	* and the second chunk (B) is read from offset i + k - 1, where k is sub string's length. Then we compute a vector expression F == A and B == L.
	* This step yields a byte vector (or a bit mask), where "true" values denote position of potential substring occurrences.
	* Finally, just at these positions an exact comparisons of sub strings are performed.
	*
	* Example: Let's assume 8-byte registers. We're searching for word "cat", thus:
	*
	* F = [ c \| c \| c \| c \| c \| c \| c \| c ]
	* L = [ t \| t \| t \| t \| t \| t \| t \| t ]
	*
	* We're searching in the string "a_cat_tries". In the first iteration the register A gets data from offset 0, B from offset 2:
	*
	* A = [ a \| _ \| c \| a \| t \| _ \| t \| r ]
	* B = [ c \| a \| t \| _ \| t \| r \| i \| e ]
	*
	* Now we compare:
	*
	* AF = ( A == F ) = [ 0 \| 0 \| 1 \| 0 \| 0 \| 0 \| 0 \| 0 ]
	* BL = ( B == L ) = [ 0 \| 0 \| 1 \| 0 \| 1 \| 0 \| 0 \| 0 ]
	*
	* After merging comparison results, i.e.AF & BL, we get following mask :
	*
	* mask = [ 0 \| 0 \| 1 \| 0 \| 0 \| 0 \| 0 \| 0 ]
	*
	* Since the mask is non-zero, it means there are possible substring occurrences.
	* As we see, there is only one non-zero element at index 2, thus only one sub string comparison must be performed.
	*
	* For the actual search of all objects we use a single buffer containing all descriptions of all objects separated by '\0'.
	* In addition a separate array of object ids is kept to map from description based on number of `\0` encountered to array index
	* and therefore object id.
	*/
	if(fltr_str == fltr_str_end){
	return 0;
	}
	int val_idx = 0;
	int guid_idx = 0;
	int needle_cnt = fltr_str_end - fltr_str;
	const __m128i zero = _mm_set1_epi8(0u);
	const __m128i first = _mm_set1_epi8(fltr_str[0] );
	const __m128i last = _mm_set1_epi8(fltr_str_end[-1] );
	for (const char *it = obj_desc_buf; it < obj_desc_buf_end; it += 16) {
	__m128i blk_first = _mm_loadu_si128((const __m128i*)it );
	__m128i blk_last = _mm_loadu_si128((const __m128i*)it + needle_cnt - 1u);
	__m128i equal_first = _mm_cmpeq_epi8(first, blk_first);
	__m128i equal_last = _mm_cmpeq_epi8(last, blk_last);
	__m128i equal_zero = _mm_cmpeq_epi8(blk_first, zero);
	__m128i equal = _mm_and_si128(equal_first,equal_last);
	unsigned zero_mask = _mm_movemask_epi8(equal_zero);
	unsigned msk = _mm_movemask_epi8(equal);
	while(msk != 0u) {
	unsigned bit_pos = bit_ffs32( msk );
	if(memcmp(it + bit_pos + 1, fltr_str + 1, needle_cnt-2 ) == 0) {
	unsigned cur_idx = guid_idx + bit_cnt(zero_mask & ((1u << bit_pos) - 1u));
	res[val_idx] = obj_ids[cur_idx];
	val_idx++;
	}
	msk = msk & ( msk - 1 );
	}
	guid_idx += bit_cnt(zero_mask);
	}
	return val_idx;
	}