Skip to content

Instantly share code, notes, and snippets.

@musamaanjum
Created August 8, 2023 08:36
Show Gist options
  • Save musamaanjum/f8c1119ea227a6fe63a7d95e7b464aee to your computer and use it in GitHub Desktop.
Save musamaanjum/f8c1119ea227a6fe63a7d95e7b464aee to your computer and use it in GitHub Desktop.
Test to measure the running time of pagemap_ioctl
/* Feature-test macros only take effect when defined before the first system header. */
#define _GNU_SOURCE
#define _OPEN_THREADS
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>
/* Page size assumed by every address computation in this file. */
#define PAGE_SIZE 0x1000
/* Upper bound on writer threads; main() keeps one thread_info slot per thread. */
#define MAX_THREAD_COUNT 64
/* Duration of the measurement loop in main(), in curr_time_ms() units. */
#define TEST_TIME 3000.0
/*
 * Number of pages spanned by a region with .start/.end members.
 * The argument is parenthesized so expressions such as LEN(*ptr) expand correctly.
 */
#define LEN(region) (((region).end - (region).start) / PAGE_SIZE)
/*
 * printf() wrapper that prefixes the message with the pid (hex) and the
 * calling function. Wrapped in do/while(0) so it behaves as one statement
 * inside unbraced if/else bodies.
 */
#define LOG(format, ...) \
	do { \
		printf("%x:%s: " format, getpid(), __func__ __VA_OPT__(,) __VA_ARGS__); \
	} while (0)

/* Descriptor for PAGEMAP, opened by main(). */
int pagemap_fd;
/*
 * Set by main() to stop the writer threads. volatile keeps thread_proc()'s
 * polling loop from caching the value.
 */
static volatile bool finish;
/* Number of writer threads, taken from the command line. */
static int nthreads;
/* Totals updated by the writer threads with __atomic builtins. */
static volatile long long raw_writes_count, writes_time;
/* Start of the anonymous mapping the writer threads dirty. */
static char *mem;
/* Command-line switches: random page selection, and reset-while-reading. */
static bool random_access, read_reset;
/*
 * rdtsc() - read a monotonically increasing tick counter.
 *
 * Callers only use differences between two readings and scale them with a
 * factor calibrated against curr_time_ms(), so the tick unit does not matter.
 * x86 reads the time-stamp counter; every other architecture falls back to
 * CLOCK_MONOTONIC in nanoseconds so the file still builds there.
 */
#if defined(__i386__)
static __inline__ unsigned long long rdtsc(void)
{
	unsigned long long int x;

	/* 0x0f 0x31 encodes RDTSC; "=A" receives the 64-bit EDX:EAX result. */
	__asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
	return x;
}
#elif defined(__x86_64__)
static __inline__ unsigned long long rdtsc(void)
{
	unsigned hi, lo;

	/* RDTSC returns the low half in EAX and the high half in EDX. */
	__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
	return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
#else
static __inline__ unsigned long long rdtsc(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL +
	       (unsigned long long)ts.tv_nsec;
}
#endif
/* Work description handed to one writer thread (see thread_proc()). */
struct thread_info {
	volatile void *mem;	/* first byte of the slice this thread writes to */
	size_t npages;		/* number of pages in that slice */
};
/*
 * thread_proc() - writer thread body.
 * @data: pointer to this thread's struct thread_info.
 *
 * Repeatedly increments the first word of one page in the thread's slice,
 * timing each write with rdtsc(), until `finish` becomes true. The page is
 * picked with rand() when random_access is set, otherwise pages are visited
 * round-robin. Write count and accumulated ticks are added to the global
 * raw_writes_count / writes_time counters.
 *
 * Returns NULL.
 */
void *thread_proc(void *data)
{
	struct thread_info *info = data;
	volatile unsigned char *m = info->mem;
	unsigned long long t1, t2;
	size_t page = 0;

	/*
	 * `finish` is written by another thread; an atomic load keeps the
	 * compiler from hoisting the read out of the loop.
	 */
	while (!__atomic_load_n(&finish, __ATOMIC_RELAXED)) {
		/* NOTE(review): rand() state is shared by all writer threads - confirm this does not skew the timing. */
		if (random_access)
			page = rand() % info->npages;
		else
			page = (page + 1) % info->npages;

		/* Only the write itself is timed. */
		t1 = rdtsc();
		++*(volatile unsigned int *)(m + page * PAGE_SIZE);
		t2 = rdtsc();

		__atomic_add_fetch(&raw_writes_count, 1, __ATOMIC_RELAXED);
		__atomic_add_fetch(&writes_time, t2 - t1, __ATOMIC_RELAXED);
	}
	return NULL;
}
/* Return the current CLOCK_MONOTONIC reading converted to milliseconds. */
double curr_time_ms(void)
{
	struct timespec now;
	double ms;

	clock_gettime(CLOCK_MONOTONIC, &now);
	ms = now.tv_sec * 1000.0;	/* whole seconds -> ms */
	ms += now.tv_nsec / 1000000.0;	/* nanoseconds  -> ms */
	return ms;
}
/* Microseconds per rdtsc() tick; computed by main() once the test loop ends. */
static double rdtsc_c;

/*
 * userfaultfd feature bits used by this file. Each one is defined only when
 * <linux/userfaultfd.h> does not already provide it, so older headers still
 * build and newer headers are never redefined.
 */
#ifndef UFFD_FEATURE_WP_HUGETLBFS_SHMEM
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
#endif
#ifndef UFFD_FEATURE_WP_UNPOPULATED
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
#endif
#ifndef UFFD_FEATURE_WP_ASYNC
#define UFFD_FEATURE_WP_ASYNC (1<<15)
#endif
#ifndef PAGEMAP_SCAN
/*
 * Local copy of the PAGEMAP_SCAN ioctl definitions, compiled only when the
 * headers included above do not already define PAGEMAP_SCAN.
 * NOTE(review): the struct layouts must match the running kernel's uapi
 * header exactly - confirm against linux/fs.h before changing anything here.
 */
#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg)
/*
 * Page category bits. They appear in page_region.categories and in the
 * category_* / return_mask fields of struct pm_scan_arg.
 */
#define PAGE_IS_WPASYNC (1 << 0)
#define PAGE_IS_WRITTEN (1 << 1)
#define PAGE_IS_FILE (1 << 2)
#define PAGE_IS_PRESENT (1 << 3)
#define PAGE_IS_SWAPPED (1 << 4)
#define PAGE_IS_PFNZERO (1 << 5)
/*
 * struct page_region - Page region with flags
 * @start: Start of the region
 * @end: End of the region (exclusive)
 * @categories: PAGE_IS_* category bitmask for the region
 */
struct page_region {
__u64 start;
__u64 end;
__u64 categories;
};
/* Flags for PAGEMAP_SCAN ioctl (pm_scan_arg.flags) */
#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */
#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */
/*
 * struct pm_scan_arg - Pagemap ioctl argument
 * @size: Size of the structure
 * @flags: Flags for the IOCTL (PM_SCAN_*)
 * @start: Starting address of the region
 * @end: Ending address of the region
 * @walk_end: Not read anywhere in this file. Presumably the address where the
 *            scan stopped, written back by the kernel - confirm against the
 *            upstream uapi header.
 * @vec: Address of page_region struct array for output
 * @vec_len: Length of the page_region struct array
 * @max_pages: Optional limit for number of returned pages (0 = disabled)
 * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1
 * @category_mask: Skip pages for which any category doesn't match
 * @category_anyof_mask: Skip pages for which no category matches
 * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned
 */
struct pm_scan_arg {
__u64 size;
__u64 flags;
__u64 start;
__u64 end;
__u64 walk_end;
__u64 vec;
__u64 vec_len;
__u64 max_pages;
__u64 category_inverted;
__u64 category_mask;
__u64 category_anyof_mask;
__u64 return_mask;
};
#endif
/*
 * Syscall number passed to syscall() by init_uffd(). Prefer the value from the
 * system headers when one is visible; 323 is only a fallback.
 * NOTE(review): 323 is believed to be the x86_64 number - confirm before
 * relying on the fallback on another architecture.
 */
#ifndef __NR_userfaultfd
#define __NR_userfaultfd 323
#endif
/* File the PAGEMAP_SCAN ioctl is issued on. */
#define PAGEMAP "/proc/self/pagemap"
/* pagemap_fd is already defined with the other globals near the top of the file. */
/* userfaultfd descriptor, created by init_uffd(). */
int uffd;
static long pagemap_ioctl(void *start, size_t len, void *vec, int vec_len, int flag,
int max_pages, long required_mask, long anyof_mask, long excluded_mask,
long return_mask)
{
struct pm_scan_arg arg;
arg.start = (uintptr_t)start;
arg.end = (uintptr_t)start + len;
arg.vec = (uintptr_t)vec;
arg.vec_len = vec_len;
arg.flags = flag;
arg.size = sizeof(struct pm_scan_arg);
arg.max_pages = max_pages;
arg.category_mask = required_mask;
arg.category_anyof_mask = anyof_mask;
arg.category_inverted = excluded_mask;
arg.return_mask = return_mask;
return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
}
/*
 * init_uffd() - create the global userfaultfd and negotiate its features.
 *
 * Requests the unpopulated-WP, async-WP and hugetlbfs/shmem-WP features and
 * verifies that all of them were granted. On any failure a diagnostic is
 * written to stderr and the process exits with status 1.
 *
 * Returns 0 on success.
 */
int init_uffd(void)
{
	struct uffdio_api uffdio_api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC |
			    UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
	};
	/* Remember the request: the ioctl overwrites .features with its answer. */
	const __u64 wanted = uffdio_api.features;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd == -1) {
		perror("userfaultfd");
		exit(1);
	}

	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
		perror("UFFDIO_API");
		exit(1);
	}

	/*
	 * Availability of UFFDIO_REGISTER is reported in .ioctls, one bit per
	 * ioctl number; .api merely echoes the API version and must not be
	 * tested against register-mode flags.
	 */
	if (!(uffdio_api.ioctls & ((__u64)1 << _UFFDIO_REGISTER))) {
		fprintf(stderr, "userfaultfd: UFFDIO_REGISTER is not available\n");
		exit(1);
	}

	if ((uffdio_api.features & wanted) != wanted) {
		fprintf(stderr, "userfaultfd: required write-protect features are missing\n");
		exit(1);
	}
	return 0;
}
/*
 * wp_init() - register a range with the userfaultfd in write-protect mode
 * and write-protect all of it.
 * @start: first byte of the range
 * @size:  length of the range in bytes
 *
 * On any userfaultfd failure a diagnostic is written to stderr and the
 * process exits with status 1.
 *
 * Returns 0 on success.
 */
int wp_init(void *start, size_t size)
{
	struct uffdio_register uffdio_register = {
		.range = { .start = (uintptr_t)start, .len = size },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	struct uffdio_writeprotect wp = {
		.range = { .start = (uintptr_t)start, .len = size },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};

	/* Result deliberately ignored, as before: the hint is best effort. */
	madvise(start, size, MADV_NOHUGEPAGE);

	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
		perror("UFFDIO_REGISTER");
		exit(1);
	}

	/*
	 * .ioctls is a bitmask indexed by ioctl number (_UFFDIO_*); the
	 * UFFDIO_WRITEPROTECT request code itself is not a valid mask.
	 */
	if (!(uffdio_register.ioctls & ((__u64)1 << _UFFDIO_WRITEPROTECT))) {
		fprintf(stderr, "userfaultfd: UFFDIO_WRITEPROTECT is not available for this range\n");
		exit(1);
	}

	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) {
		perror("UFFDIO_WRITEPROTECT");
		exit(1);
	}
	return 0;
}
/*
 * wp_free() - unregister a range from the userfaultfd.
 * @start: first byte of the range
 * @size:  length of the range in bytes
 *
 * On failure a diagnostic is written to stderr and the process exits with
 * status 1.
 *
 * Returns 0 on success.
 */
int wp_free(void *start, size_t size)
{
	/* UFFDIO_UNREGISTER only takes a range, so no register struct is needed. */
	struct uffdio_range range = {
		.start = (uintptr_t)start,
		.len = size,
	};

	if (ioctl(uffd, UFFDIO_UNREGISTER, &range)) {
		perror("UFFDIO_UNREGISTER");
		exit(1);
	}
	return 0;
}
/*
 * wp_addr_range() - write-protect a range through the userfaultfd.
 * @lpBaseAddress: first byte of the range
 * @dwRegionSize:  length of the range in bytes; size_t so that ranges of
 *                 2 GiB and more are not truncated
 *
 * On failure a diagnostic is written to stderr and the process exits with
 * status 1.
 *
 * Returns 0 on success.
 */
int wp_addr_range(void *lpBaseAddress, size_t dwRegionSize)
{
	struct uffdio_writeprotect wp = {
		.range = { .start = (uintptr_t)lpBaseAddress, .len = dwRegionSize },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};

	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) {
		perror("UFFDIO_WRITEPROTECT");
		exit(1);
	}
	return 0;
}
/*
 * wp_addr_range_ioctl() - write-protect the written pages of a range using
 * PAGEMAP_SCAN with PM_SCAN_WP_MATCHING instead of the userfaultfd ioctl.
 * @start: first byte of the range
 * @size:  length of the range in bytes
 *
 * No result vector is requested. On failure the range and the errno text
 * are printed and the process exits with status 1.
 *
 * Returns 0 on success.
 */
int wp_addr_range_ioctl(void *start, size_t size)
{
	int saved_errno;
	long ret;

	ret = pagemap_ioctl(start, size, NULL, 0, PM_SCAN_WP_MATCHING,
			    0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN);
	if (ret < 0) {
		/* LOG() calls into libc and may overwrite errno before perror() reads it. */
		saved_errno = errno;
		LOG("%p, %p.\n", start, (void *)size);
		errno = saved_errno;
		perror("pagemap_ioctl");
		exit(1);
	}
	return 0;
}
struct page_region *buf;
int read_reset_dirty(int reset, char *start, size_t len, void **vec, unsigned int *ww_count,
unsigned int *granularity)
{
struct pm_scan_arg arg;
int i, ret;
uint64_t addr;
arg.start = (uintptr_t)start;
arg.end = (uintptr_t)start + len;
arg.vec = (uintptr_t)buf;
arg.vec_len = *ww_count;
arg.flags = 0;
if (reset)
arg.flags |= PM_SCAN_WP_MATCHING;
arg.size = sizeof(struct pm_scan_arg);
arg.max_pages = *ww_count;
arg.category_mask = PAGE_IS_WRITTEN;
arg.category_anyof_mask = 0;
arg.category_inverted = 0;
arg.return_mask = PAGE_IS_WRITTEN;
if (granularity)
*granularity = 4096;
ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
assert(ret >= 0);
*ww_count = 0;
for (i = 0; i < ret; i++) {
for (addr = buf[i].start; addr != buf[i].end; addr += 0x1000)
*vec++ = (void *)(uintptr_t)addr;
*ww_count += LEN(buf[i]);
}
ret = 0;
return ret;
}
/*
 * reset_dirty() - write-protect the written pages of [start, start + size)
 * again by delegating to wp_addr_range_ioctl(); returns its result.
 */
int reset_dirty(void *start, size_t size)
{
	int status = wp_addr_range_ioctl(start, size);

	return status;
}
int main(int argc, char *argv[])
{
unsigned int ww_count;
long long ww_total, cycle_count;
struct thread_info info[MAX_THREAD_COUNT];
unsigned long long t1, t2, rdtsc_start, rdtsc_end;
double start, curr, cycle_start;
static void **ww_addr;
long long writes_count;
unsigned long long wwtot_time, wwreset_time;
double rw_delay_ms;
unsigned int granularity;
unsigned int i;
int get_count;
size_t npages;
pthread_t th;
pagemap_fd = open(PAGEMAP, O_RDONLY);
if (pagemap_fd < 0) {
perror("pagemapfd");
return -EINVAL;
}
if (init_uffd())
return -1;
if (argc < 6) {
puts("Usage: win.exe <nthreads> <npages> <watch_delay_ms> <random_access> <read_reset>\n");
return -1;
}
nthreads = atoi(argv[1]);
if (nthreads > MAX_THREAD_COUNT) {
LOG("Maximum of %u threads supported.\n", MAX_THREAD_COUNT);
return -1;
}
npages = atoi(argv[2]);
if (npages < nthreads || npages % nthreads) {
LOG("npages should be > nthreads and evenly divisible by nthreads.\n");
return -1;
}
rw_delay_ms = atof(argv[3]);
random_access = atoi(argv[4]);
read_reset = atoi(argv[5]);
ww_addr = malloc(sizeof(*ww_addr) * npages);
buf = malloc(100000 * sizeof(struct page_region));
mem = mmap(NULL, npages * 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (!mem) {
perror("Error commit");
return -1;
}
wp_init(mem, npages * 0x1000);
wp_addr_range(mem, npages * 0x1000);
for (i = 0; i < nthreads; ++i) {
info[i].mem = mem + 0x1000 * i * npages / nthreads;
info[i].npages = npages / nthreads;
pthread_create(&th, NULL, thread_proc, &info[i]);
}
get_count = npages;
wwreset_time = wwtot_time = 0;
curr = start = curr_time_ms();
ww_total = 0;
cycle_count = 0;
rdtsc_start = rdtsc();
while (curr - start < TEST_TIME)
{
cycle_start = curr;
ww_count = get_count;
t1 = rdtsc();
if (read_reset_dirty(read_reset, mem, npages * PAGE_SIZE,
ww_addr, &ww_count, &granularity)) {
LOG("GetWriteWatch() failed, GetLastError() %d.\n", errno);
return -1;
}
assert((char *)ww_addr[0] >= mem);
ww_total += ww_count;
if (!read_reset)
{
unsigned long long t1, t2;
t1 = rdtsc();
reset_dirty(mem, npages * PAGE_SIZE);
t2 = rdtsc();
wwreset_time += t2 - t1;
}
t2 = rdtsc();
wwtot_time += t2 - t1;
curr = curr_time_ms();
while (curr - start < TEST_TIME && curr - cycle_start < rw_delay_ms) {
sched_yield();
curr = curr_time_ms();
}
++cycle_count;
}
rdtsc_end = rdtsc();
writes_count = raw_writes_count;
finish = true;
rdtsc_c = 1000.0 * (curr - start) / (rdtsc_end - rdtsc_start);
LOG("Elapsed %.1lfms, cycle_count %llu, writes_count %lld, writes watched %llu.\n",
curr - start, cycle_count, writes_count, ww_total);
LOG("writes per thread * msec %.3lf, avg. write time %.3lfns, GetWriteWatch() avg %.1lfmcs (reset %.1lf)\n",
writes_count / (TEST_TIME * nthreads), 1000.0 * writes_time * rdtsc_c / writes_count, wwtot_time * rdtsc_c / cycle_count, wwreset_time * rdtsc_c / cycle_count);
free(buf);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment