-
-
Save huyang531/28a31fc2d9f348fafb5b9d8e6c9493d5 to your computer and use it in GitHub Desktop.
cache-scratch: A Multi-threaded Memory Allocation Stress Test Ported to Unikraft
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
/* Import user configuration: */ | |
#ifdef __Unikraft__ | |
#include <uk/config.h> | |
#endif /* __Unikraft__ */ | |
///////////////////////////////////////////////////////////////////// | |
// | |
// Hoard: A Fast, Scalable, and Memory-Efficient Allocator | |
// for Shared-Memory Multiprocessors | |
// Contact author: Emery Berger, http://www.cs.umass.edu/~emery | |
// | |
// Copyright (c) 1998-2003, The University of Texas at Austin. | |
// | |
// This library is free software; you can redistribute it and/or modify | |
// it under the terms of the GNU Library General Public License as | |
// published by the Free Software Foundation, http://www.fsf.org. | |
// | |
// This library is distributed in the hope that it will be useful, but | |
// WITHOUT ANY WARRANTY; without even the implied warranty of | |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
// Library General Public License for more details. | |
// | |
////////////////////////////////////////////////////////////////////////////// | |
/** | |
* @file cache-scratch.c | |
* | |
* cache-scratch is a benchmark that exercises a heap's cache-locality. | |
* An allocator that allows multiple threads to re-use the same small | |
* object (possibly all in one cache-line) will scale poorly, while | |
* an allocator like Hoard will exhibit near-linear scaling. | |
* | |
* Try the following (on a P-processor machine): | |
* | |
* cache-scratch 1 1000 1 1000000 | |
* cache-scratch P 1000 1 1000000 | |
* | |
* cache-scratch-hoard 1 1000 1 1000000 | |
* cache-scratch-hoard P 1000 1 1000000 | |
* | |
* The ideal is a P-fold speedup. | |
*/ | |
#include <uk/assert.h> | |
#include <uk/print.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stddef.h> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <string.h> | |
#include <sys/syscall.h> | |
#include <sys/sysinfo.h> | |
#include <stdio.h> | |
#include <unistd.h> | |
#include <time.h> | |
#include <stdint.h> | |
#define _GNU_SOURCE | |
#include <sched.h> | |
#include <pthread.h> | |
/////////// Helper variables ///////////
/*
 * Capacity of the emulated data segment: 256 MiB.
 * The expansion is parenthesised so the macro stays a single value when it
 * appears next to operators such as %, / or unary minus.
 */
#define DSEG_MAX (256 * 1024 * 1024)
/*
 * Bounds of the emulated data segment. mem_init() sets dseg_lo to the
 * page-aligned start and dseg_hi to dseg_lo - 1; mem_sbrk() advances dseg_hi.
 */
char *dseg_lo = NULL, *dseg_hi = NULL;
long dseg_size; /* Maximum size of data segment */
/* Page size consulted by the alignment macros below; stored by mem_init(). */
static int page_size;
/*
 * The alignment macros round-trip the pointer through uintptr_t, the integer
 * type intended for pointer <-> integer conversions.
 */
/* Align pointer to closest page boundary downwards */
#define PAGE_ALIGN(p) \
	((void *)(((uintptr_t)(p) / (uintptr_t)page_size) * (uintptr_t)page_size))
/* Align pointer to closest page boundary upwards */
#define PAGE_ALIGN_UP(p) \
	((void *)((((uintptr_t)(p) + (uintptr_t)page_size - 1) / (uintptr_t)page_size) * (uintptr_t)page_size))
///////// Helper function declarations //////////
/* Elapsed time between *start and *end, expressed in seconds. */
double timespec_diff(struct timespec *start, struct timespec *end);
/* Initialise *attr and store the requested thread attributes in it. */
void initialize_pthread_attr(int detachstate, int schedpolicy, int priority,
			     int inheritsched, int scope, pthread_attr_t *attr);
/* Processor count as reported by sysconf(_SC_NPROCESSORS_ONLN). */
int getNumProcessors(void);
/* Restrict the calling thread to CPU n. */
void setCPU(int n);
/**
 * TODO: Use these custom memory management functions to collect more stats.
 */
/* Reserve the emulated data segment; 0 on success, -1 on failure. */
int mem_init(void);
/* Grow the emulated data segment by increment bytes; NULL when it is full. */
void *mem_sbrk(ptrdiff_t increment);
/* Page size recorded by mem_init(). */
int mem_pagesize(void);
/* Distance between dseg_hi and dseg_lo, in bytes. */
ptrdiff_t mem_usage(void);
/*
 * Argument bundle handed to each worker thread. main() heap-allocates one
 * instance per thread; worker() frees it before returning.
 */
struct workerArg {
	char *_object;    /* Object allocated by main(); the worker frees it first */
	int _objSize;     /* Size in bytes of every object the worker allocates */
	int _iterations;  /* Number of malloc / write / free rounds */
	int _repetitions; /* Number of write passes over the object per round */
	int _cpu;         /* CPU the worker pins itself to via setCPU() */
};
extern void * worker (void * arg) | |
{ | |
// free the object we were given. | |
// Then, repeatedly do the following: | |
// malloc a given-sized object, | |
// repeatedly write on it, | |
// then free it. | |
int i, j, k; /* Loop control variables */ | |
struct workerArg * w = (struct workerArg *) arg; | |
setCPU(w->_cpu); | |
free(w->_object); | |
uk_pr_debug("Worker thread %d starting to iterate %d times\n", w->_cpu, w->_iterations); | |
for (i = 0; i < w->_iterations; i++) { | |
// Allocate the object. | |
char * obj = (char *)malloc(w->_objSize); | |
// Write into it a bunch of times. | |
for (j = 0; j < w->_repetitions; j++) { | |
for (k = 0; k < w->_objSize; k++) { | |
obj[k] = (char) k; | |
volatile char ch = obj[k]; | |
ch++; | |
} | |
} | |
// Free the object. | |
free(obj); | |
} | |
uk_pr_debug("Worker thread %d finished iterations\n", w->_cpu); | |
free(w); | |
return NULL; | |
} | |
int main (int argc, char * argv[]) { | |
int nthreads; | |
int iterations; | |
int objSize; | |
int repetitions; | |
pthread_attr_t attr; | |
int numCPU; | |
int i; | |
char **objs; | |
struct timespec start_time; | |
struct timespec end_time; | |
if (argc > 4) { | |
nthreads = atoi(argv[1]); | |
iterations = atoi(argv[2]); | |
objSize = atoi(argv[3]); | |
repetitions = atoi(argv[4]); | |
} else { | |
fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]); | |
return 1; | |
} | |
/* Declare threads[] array here after nthreads is set, so we | |
* can use stack-allocated space for the array. | |
*/ | |
pthread_t threads[nthreads]; | |
numCPU = getNumProcessors(); | |
// Allocate nthreads objects and distribute them among the threads. | |
objs = (char **)malloc(nthreads * sizeof(char *)); | |
for (i = 0; i < nthreads; i++) { | |
objs[i] = (char *)malloc(objSize); | |
} | |
initialize_pthread_attr(PTHREAD_CREATE_JOINABLE, SCHED_RR, -10, PTHREAD_EXPLICIT_SCHED, | |
PTHREAD_SCOPE_SYSTEM, &attr); | |
/* Get the starting time */ | |
clock_gettime(CLOCK_MONOTONIC_RAW, &start_time); | |
for (i = 0; i < nthreads; i++) { | |
struct workerArg * w = (struct workerArg *)malloc(sizeof(struct workerArg)); | |
w->_object = objs[i]; | |
w->_objSize = objSize; | |
w->_repetitions = repetitions / nthreads; | |
w->_iterations = iterations; | |
w->_cpu = (i+1)%numCPU; | |
pthread_create(&threads[i], &attr, &worker, (void *)w); | |
uk_pr_debug("Created worker thread %d\n", i); | |
} | |
for (i = 0; i < nthreads; i++) { | |
uk_pr_debug("Waiting for worker thread %d\n", i); | |
pthread_join(threads[i], NULL); | |
uk_pr_debug("Worker thread %d finished\n", i); | |
} | |
/* Get the finish time */ | |
clock_gettime(CLOCK_MONOTONIC_RAW, &end_time); | |
double t = timespec_diff(&start_time, &end_time); | |
free(objs); | |
printf ("Time elapsed = %f seconds\n", t); | |
printf ("Memory used = %ld bytes\n",mem_usage()); | |
return 0; | |
} | |
/////////////// Following are helper functions //////////////////// | |
/*
 * Return the time elapsed from *start to *end, in seconds.
 *
 * When both stamps fall into the same second but end's nanosecond field is
 * smaller than start's, the result is 0.0.
 */
double timespec_diff(struct timespec *start, struct timespec *end)
{
	struct timespec delta = {
		.tv_sec = end->tv_sec - start->tv_sec,
		.tv_nsec = end->tv_nsec - start->tv_nsec,
	};

	if (delta.tv_nsec < 0) {
		if (delta.tv_sec == 0)
			return 0.0;

		/* Borrow one second so the nanosecond part becomes non-negative. */
		delta.tv_sec -= 1;
		delta.tv_nsec += 1000000000L;
	}

	double fraction = (double)delta.tv_nsec / 1000000000.0;

	return (double)(delta.tv_sec + fraction);
}
/*
 * Initialise *attr, then store the detach state and contention scope in it.
 * Scheduling policy and priority are stored only when inheritsched equals
 * PTHREAD_EXPLICIT_SCHED. Return codes of the pthread_attr_* calls are
 * ignored.
 *
 * NOTE(review): inheritsched itself is never written to *attr (there is no
 * pthread_attr_setinheritsched() call), so whether the stored policy and
 * priority take effect depends on the attribute object's default - confirm
 * that this is intended.
 */
void initialize_pthread_attr(int detachstate, int schedpolicy, int priority,
			     int inheritsched, int scope, pthread_attr_t *attr)
{
	const int explicit_sched = (inheritsched == PTHREAD_EXPLICIT_SCHED);

	pthread_attr_init(attr);
	pthread_attr_setdetachstate(attr, detachstate);

	if (explicit_sched) {
		struct sched_param param;

		param.sched_priority = priority;
		pthread_attr_setschedpolicy(attr, schedpolicy);
		pthread_attr_setschedparam(attr, &param);
	}

	pthread_attr_setscope(attr, scope);
}
/*
 * Return the number of online processors, always at least 1.
 *
 * This function should be more complicated to try and avoid a call to the
 * C library malloc() routine embedded in the Linux sysconf() call.
 * However, here we can allow a call to malloc() before the
 * main test starts.
 *
 * The value is queried once and cached. sysconf() returns -1 on error; that
 * (or any other non-positive answer) is replaced by 1 so callers can safely
 * use the result as a divisor.
 */
int getNumProcessors(void)
{
	static int np = 0;

	if (np <= 0) {
		long online = sysconf(_SC_NPROCESSORS_ONLN);

		np = (online > 0) ? (int)online : 1;
	}
	return np;
}
/*
 * Set CPU affinity of the calling thread to CPU n only.
 * A failing sched_setaffinity() is reported through perror() and otherwise
 * ignored.
 */
void setCPU(int n)
{
	cpu_set_t allowed;

	CPU_ZERO(&allowed);
	CPU_SET(n, &allowed);

	const pid_t self = gettid();
	const int rc = sched_setaffinity(self, sizeof(cpu_set_t), &allowed);

	if (rc != 0) {
		perror("sched_setaffinity failed");
	}
}
int mem_init (void) | |
{ | |
/* Get system page size */ | |
page_size = (int) getpagesize(); | |
/* Allocate heap */ | |
dseg_lo = (char *) malloc(DSEG_MAX + 2*page_size); | |
if (!dseg_lo) | |
return -1; | |
/* align heap to the next page boundary */ | |
dseg_lo = (char *) PAGE_ALIGN_UP(dseg_lo); | |
dseg_hi = dseg_lo-1; | |
dseg_size = DSEG_MAX; | |
return 0; | |
} | |
void *mem_sbrk (ptrdiff_t increment) | |
{ | |
char *new_hi = dseg_hi + increment; | |
char *old_hi = dseg_hi; | |
long dseg_cursize = dseg_hi - dseg_lo + 1; | |
UK_ASSERT(increment > 0); | |
/* Resize data segment, if the memory is available */ | |
if (new_hi > dseg_lo + dseg_size) | |
return NULL; | |
dseg_hi = new_hi; | |
dseg_cursize = dseg_hi - dseg_lo + 1; | |
return (void *)(old_hi + 1); | |
} | |
int mem_pagesize (void) | |
{ | |
return page_size; | |
} | |
ptrdiff_t mem_usage (void) | |
{ | |
/* hack for libc */ | |
if (dseg_lo != NULL && dseg_hi == NULL) { | |
dseg_hi = sbrk(0); | |
} | |
return dseg_hi - dseg_lo; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment