Created
September 22, 2022 12:09
-
-
Save HookedBehemoth/ed0725ed571e95ca85eb1dbda8823853 to your computer and use it in GitHub Desktop.
scanmem: AVX2-accelerated initial search patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/ptrace.c b/ptrace.c | |
index 5e71d78..7fd3545 100644 | |
--- a/ptrace.c | |
+++ b/ptrace.c | |
@@ -46,6 +46,8 @@ | |
#include <stdbool.h> | |
#include <limits.h> | |
#include <fcntl.h> | |
+#include <immintrin.h> | |
+#include <avx2intrin.h> | |
// dirty hack for FreeBSD | |
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) | |
@@ -487,9 +489,15 @@ bool sm_checkmatches(globals_t *vars, | |
} | |
+#include <time.h> | |
+#include "endianness.h" | |
+ | |
/* sm_searchregions() performs an initial search of the process for values matching `uservalue` */ | |
bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const uservalue_t *uservalue) | |
{ | |
+ clock_t t; | |
+ t = clock(); | |
+ | |
matches_and_old_values_swath *writing_swath_index; | |
int required_extra_bytes_to_record = 0; | |
unsigned long total_size = 0; | |
@@ -498,6 +506,69 @@ bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const userv | |
region_t *r; | |
unsigned long total_scan_bytes = 0; | |
unsigned char *data = NULL; | |
+ unsigned stride = 0; | |
+ __m256i mask = {}; | |
+ __m256 maskf = {}; | |
+ __m256d maskd = {}; | |
+ unsigned count = 0; | |
+ | |
+ switch (vars->options.scan_data_type) { | |
+ case INTEGER8: | |
+ if (uservalue->flags & flag_s8b) { | |
+ mask = _mm256_set1_epi8(uservalue->int8_value); | |
+ } else { | |
+ mask = _mm256_set1_epi8(uservalue->uint8_value); | |
+ } | |
+ stride = 1; | |
+ count = 32; | |
+ break; | |
+ case INTEGER16: | |
+ if (uservalue->flags & flag_s16b) { | |
+ mask = _mm256_set1_epi16( | |
+ vars->options.reverse_endianness ? swap_bytes16(uservalue->int16_value) : uservalue->int16_value); | |
+ } else { | |
+ mask = _mm256_set1_epi16( | |
+ vars->options.reverse_endianness ? swap_bytes16(uservalue->uint16_value) : uservalue->uint16_value); | |
+ } | |
+ stride = 2; | |
+ count = 16; | |
+ break; | |
+ case INTEGER32: | |
+ if (uservalue->flags & flag_s32b) { | |
+ mask = _mm256_set1_epi32( | |
+ vars->options.reverse_endianness ? swap_bytes32(uservalue->int32_value) : uservalue->int32_value); | |
+ } else { | |
+ mask = _mm256_set1_epi32( | |
+ vars->options.reverse_endianness ? swap_bytes32(uservalue->uint32_value) : uservalue->uint32_value); | |
+ } | |
+ stride = 4; | |
+ count = 8; | |
+ break; | |
+ case INTEGER64: | |
+ if (uservalue->flags & flag_s64b) { | |
+ mask = _mm256_set1_epi64x( | |
+ vars->options.reverse_endianness ? swap_bytes64(uservalue->int64_value) : uservalue->int64_value); | |
+ } else { | |
+ mask = _mm256_set1_epi64x( | |
+ vars->options.reverse_endianness ? swap_bytes64(uservalue->uint64_value) : uservalue->uint64_value); | |
+ } | |
+ stride = 8; | |
+ count = 4; | |
+ break; | |
+ case FLOAT32: | |
+ maskf = _mm256_set1_ps(uservalue->float32_value); | |
+ stride = 4; | |
+ count = 8; | |
+ break; | |
+ case FLOAT64: | |
+ maskd = _mm256_set1_pd(uservalue->float64_value); | |
+ stride = 8; | |
+ count = 4; | |
+ break; | |
+ default: | |
+ printf("invalid scan_data_type: %hhu\n", vars->options.scan_data_type); | |
+ abort(); | |
+ } | |
if (sm_choose_scanroutine(vars->options.scan_data_type, match_type, uservalue, vars->options.reverse_endianness) == false) | |
{ | |
@@ -505,6 +576,10 @@ bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const userv | |
return false; | |
} | |
+ | |
+ printf("%d\n", match_type); | |
+ assert(match_type == MATCHEQUALTO); | |
+ | |
assert(sm_scan_routine); | |
/* stop and attach to the target */ | |
@@ -587,7 +662,7 @@ bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const userv | |
size_t buffer_size = 0; | |
void *reg_pos = r->start; | |
const uint8_t *buf_pos = NULL; | |
- for ( ; ; memlength--, buffer_size--, reg_pos++, buf_pos++) { | |
+ for ( ; ; ) { | |
/* check if the buffer is finished (or we just started) */ | |
if (UNLIKELY(buffer_size == 0)) { | |
@@ -626,6 +701,86 @@ bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const userv | |
buf_pos = data; | |
} | |
+ if (LIKELY(buffer_size >= 32 && ((uintptr_t)buf_pos & 0x1f) == 0)) { | |
+ int result = 0; | |
+ switch (vars->options.scan_data_type) { | |
+ case INTEGER8: | |
+ result = _mm256_movemask_epi8( | |
+ _mm256_cmpeq_epi8(_mm256_load_si256((const __m256i_u*)buf_pos), mask)); | |
+ break; | |
+ case INTEGER16: | |
+ result = _mm256_movemask_epi8( | |
+ _mm256_cmpeq_epi16(_mm256_load_si256((const __m256i_u*)buf_pos), mask)); | |
+ break; | |
+ case INTEGER32: | |
+ result = _mm256_movemask_epi8( | |
+ _mm256_cmpeq_epi32(_mm256_load_si256((const __m256i_u*)buf_pos), mask)); | |
+ break; | |
+ case INTEGER64: | |
+ result = _mm256_movemask_epi8( | |
+ _mm256_cmpeq_epi64(_mm256_load_si256((const __m256i_u*)buf_pos), mask)); | |
+ break; | |
+ case FLOAT32: | |
+ if (vars->options.reverse_endianness) { | |
+ const __m256i toLittleEndian = _mm256_set_epi8( | |
+ 3, 2, 1, 0, 7, 6, 5, 4, | |
+ 11,10, 9, 8,15,14,13,12, | |
+ 19,18,17,16,23,22,21,20, | |
+ 27,26,25,24,31,30,29,28); | |
+ __m256i value = _mm256_shuffle_epi8(_mm256_load_si256((const __m256i_u*)buf_pos), toLittleEndian); | |
+ result = _mm256_movemask_pd( | |
+ _mm256_cmp_pd(value, maskd, _CMP_EQ_OQ) | |
+ ); | |
+ } else { | |
+ result = _mm256_movemask_ps( | |
+ _mm256_cmp_ps(_mm256_load_ps((const float*)buf_pos), maskf, _CMP_EQ_OQ)); | |
+ } | |
+ break; | |
+ case FLOAT64: { | |
+ if (vars->options.reverse_endianness) { | |
+ const __m256i toLittleEndian = _mm256_set_epi8( | |
+ 7, 6, 5, 4, 3, 2, 1, 0, | |
+ 15,14,13,12,11,10, 9, 8, | |
+ 23,22,21,20,19,18,17,16, | |
+ 31,30,29,28,27,26,25,24); | |
+ __m256i value = _mm256_shuffle_epi8(_mm256_load_si256((const __m256i_u*)buf_pos), toLittleEndian); | |
+ result = _mm256_movemask_pd( | |
+ _mm256_cmp_pd(value, maskd, _CMP_EQ_OQ) | |
+ ); | |
+ } else { | |
+ result = _mm256_movemask_pd( | |
+ _mm256_cmp_pd(_mm256_load_pd((const double*)buf_pos), maskd, _CMP_EQ_OQ)); | |
+ } | |
+ } | |
+ break; | |
+ } | |
+ if (UNLIKELY(result)) { | |
+ for (int i = 0; i < count; ++i) { | |
+ const advance = i * stride; | |
+ const mem64_t* memory_ptr = (mem64_t*)(buf_pos + advance); | |
+ const void* reg_ptr = reg_pos + advance; | |
+ | |
+ if ((result >> advance) & 1) { | |
+ // printf("value: %f\n", memory_ptr->float32_value); | |
+ writing_swath_index = add_element(&(vars->matches), writing_swath_index, reg_ptr, | |
+ get_u8b(memory_ptr), uservalue->flags); | |
+ | |
+ for (int j = 1; j < stride; ++j) { | |
+ const mem64_t* memory_ptr = (mem64_t*)(buf_pos + advance + j); | |
+ writing_swath_index = add_element(&(vars->matches), writing_swath_index, reg_ptr + j, | |
+ get_u8b(memory_ptr), flags_empty); | |
+ } | |
+ | |
+ ++vars->num_matches; | |
+ } | |
+ } | |
+ } | |
+ | |
+ memlength -= 32; | |
+ buffer_size -= 32; | |
+ reg_pos += 32; | |
+ buf_pos += 32; | |
+ } else { | |
const mem64_t* memory_ptr = (mem64_t*)buf_pos; | |
unsigned int match_length; | |
match_flags checkflags; | |
@@ -652,6 +807,10 @@ bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const userv | |
--required_extra_bytes_to_record; | |
} | |
+ memlength--; buffer_size--; reg_pos++; buf_pos++; | |
+ | |
+ } | |
+ | |
} | |
free(data); | |
@@ -678,6 +837,11 @@ bool sm_searchregions(globals_t *vars, scan_match_type_t match_type, const userv | |
show_info("we currently have %ld matches.\n", vars->num_matches); | |
+ t = clock() - t; | |
+ double time_taken = ((double)t)/CLOCKS_PER_SEC; // CPU time in seconds (clock() reports processor time, not wall-clock time) | |
+ | |
+ show_info("scan took %f seconds to execute \n", time_taken); | |
+ | |
/* okay, detach */ | |
return sm_detach(vars->target); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment