timo/gist:b8f32c09684ef4cda728d530b1844e85 Secret

## gistfile1.diff
diff --git a/src/core/frame.c b/src/core/frame.c
index 260c25e..6a19e70 100644
--- a/src/core/frame.c
+++ b/src/core/frame.c
@@ -85,15 +85,80 @@ static void instrumentation_level_barrier(MVMThreadContext *tc, MVMStaticFrame *
         MVM_profile_ensure_uninstrumented(tc, static_frame);
 }

+/* Offers a frame's work or env buffer to the per-thread two-slot cache
+ * instead of freeing it right away.
+ *
+ *   tc      - thread whose tc->fsa_cache / tc->fsa_cache_sizes slots are used
+ *   size    - size of the buffer in bytes, i.e. the very value that would
+ *             otherwise be handed to MVM_fixed_size_free
+ *   pointer - the buffer being given up
+ *
+ * Slot 0 is filled first and holds the larger buffer when both slots are in
+ * use. A buffer smaller than both cached ones is turned away; any other
+ * buffer pushes out the one in slot 1, which is then really freed.
+ * On equal sizes: with one buffer cached an equally big newcomer goes to
+ * slot 1; with two cached a newcomer as big as slot 0 takes over slot 0.
+ *
+ * Returns 1 if the cache took the buffer (it has been zeroed and must not
+ * be freed by the caller), or 0 if the caller still has to free it. */
+static MVMint8 return_to_fsa_cache(MVMThreadContext *tc, size_t size, MVMRegister *pointer) {
+    MVMint8 slot;
+
+    if (!tc->fsa_cache[0]) {
+        /* If the first slot is free, the second one is, too. Use this one. */
+        slot = 0;
+    }
+    else if (!tc->fsa_cache[1]) {
+        /* Only the first slot is taken. The bigger buffer goes in front. */
+        slot = size > tc->fsa_cache_sizes[0] ? 0 : 1;
+    }
+    else if (tc->fsa_cache_sizes[1] > size) {
+        /* Both slots already hold bigger buffers than ours, so we'll not
+         * put it in the cache. */
+        return 0;
+    }
+    else {
+        /* Both slots are filled and ours is not the smallest. Free the
+         * second slot so we don't leak, then pick the slot that keeps the
+         * bigger buffer in front. */
+        MVM_fixed_size_free(tc, tc->instance->fsa, tc->fsa_cache_sizes[1], tc->fsa_cache[1]);
+        tc->fsa_cache[1]       = NULL;
+        tc->fsa_cache_sizes[1] = 0;
+        slot = tc->fsa_cache_sizes[0] > size ? 1 : 0;
+    }
+
+    if (slot == 0 && tc->fsa_cache[0]) {
+        /* Move the first slot over into the second slot to make room. */
+        tc->fsa_cache[1]       = tc->fsa_cache[0];
+        tc->fsa_cache_sizes[1] = tc->fsa_cache_sizes[0];
+    }
+
+    /* `size` is already a byte count (it is the value the fixed size
+     * allocator gets), so it must not be scaled by sizeof(MVMRegister);
+     * that would zero far past the end of the buffer. Zeroing here lets a
+     * cached buffer stand in for a MVM_fixed_size_alloc_zeroed result. */
+    memset(pointer, 0, size);
+
+    /* Install our thing in the chosen slot. */
+    tc->fsa_cache[slot]       = pointer;
+    tc->fsa_cache_sizes[slot] = size;
+    return 1;
+}
+
 /* Destroys a frame. */
 void MVM_frame_destroy(MVMThreadContext *tc, MVMFrame *frame) {
     if (frame->work) {
         MVM_args_proc_cleanup(tc, &frame->params);
-        MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_work,
-            frame->work);
+        /* Offer the work registers to the per-thread cache; they only go back
+         * to the fixed size allocator if the cache turns them down. */
+        if (!return_to_fsa_cache(tc, frame->allocd_work, frame->work))
+            MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_work, frame->work);
+    }
+    if (frame->env) {
+        /* Likewise for the lexical environment. */
+        if (!return_to_fsa_cache(tc, frame->allocd_env, frame->env))
+            MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_env, frame->env);
     }
-    if (frame->env)
-        MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_env, frame->env);
     if (frame->continuation_tags)
         MVM_continuation_free_tags(tc, frame);
 }
@@ -199,6 +264,24 @@ static MVMFrame * autoclose(MVMThreadContext *tc, MVMStaticFrame *needed) {
     return result;
 }

+/* Looks for a cached buffer of exactly *size; on a hit stores it in *pointer
+ * and returns 1, else returns 0. Always 0 for now: the lookup is disabled. */
+static MVMint8 obtain_from_fsa_cache(MVMThreadContext *tc, MVMint32 *size, MVMRegister **pointer) {
+    MVMint8 slot;
+    return 0; /* XXX disabled the cache here */
+    for (slot = 0; slot < 2; slot++)
+        if (tc->fsa_cache[slot] && tc->fsa_cache_sizes[slot] == *size) {
+            *pointer = tc->fsa_cache[slot];
+            *size    = tc->fsa_cache_sizes[slot];
+            tc->fsa_cache[slot]       = tc->fsa_cache[1]; /* slot 1 slides down */
+            tc->fsa_cache_sizes[slot] = tc->fsa_cache_sizes[1];
+            tc->fsa_cache[1]          = NULL;
+            tc->fsa_cache_sizes[1]    = 0;
+            return 1;
+        }
+    return 0;
+}
+
 /* Obtains memory for a frame on the thread-local call stack. */
 static MVMFrame * allocate_frame(MVMThreadContext *tc, MVMStaticFrame *static_frame,
                                  MVMSpeshCandidate *spesh_cand) {
@@ -218,7 +301,9 @@ static MVMFrame * allocate_frame(MVMThreadContext *tc, MVMStaticFrame *static_fr
     static_frame_body = &(static_frame->body);
     env_size = spesh_cand ? spesh_cand->env_size : static_frame_body->env_size;
     if (env_size) {
-        frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
+        if (!obtain_from_fsa_cache(tc, &env_size, &frame->env)) {
+            frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
+        }
         frame->allocd_env = env_size;
     }
     work_size = spesh_cand ? spesh_cand->work_size : static_frame_body->work_size;
@@ -226,13 +311,17 @@ static MVMFrame * allocate_frame(MVMThreadContext *tc, MVMStaticFrame *static_fr
         if (spesh_cand) {
             /* Allocate zeroed memory. Spesh makes sure we have VMNull setup in
              * the places we need it. */
-            frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
+            if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
+                frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
+            }
         }
         else {
             /* Copy frame template with VMNulls in to place. */
-            frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
+            /* Cached buffers come back zeroed, so always copy the template. */
+            if (!obtain_from_fsa_cache(tc, &work_size, &frame->work))
+                frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
             memcpy(frame->work, static_frame_body->work_initial,
                 sizeof(MVMRegister) * static_frame_body->num_locals);
         }
         frame->allocd_work = work_size;

@@ -264,7 +353,9 @@ static MVMFrame * allocate_heap_frame(MVMThreadContext *tc, MVMStaticFrame *stat
     static_frame_body = &(static_frame->body);
     env_size = spesh_cand ? spesh_cand->env_size : static_frame_body->env_size;
     if (env_size) {
-        frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
+        if (!obtain_from_fsa_cache(tc, &env_size, &frame->env)) {
+            frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
+        }
         frame->allocd_env = env_size;
     }
     work_size = spesh_cand ? spesh_cand->work_size : static_frame_body->work_size;
@@ -274,13 +365,17 @@ static MVMFrame * allocate_heap_frame(MVMThreadContext *tc, MVMStaticFrame *stat
             MVMuint32 num_locals = spesh_cand->num_locals;
             MVMuint16 *local_types = spesh_cand->local_types;
             MVMuint32 i;
-            frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
+            if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
+                frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
+            }
             for (i = 0; i < num_locals; i++)
                 if (local_types[i] == MVM_reg_obj)
                     frame->work[i].o = tc->instance->VMNull;
         }
         else {
-            frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
+            if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
+                frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
+            }
             memcpy(frame->work, static_frame_body->work_initial,
                 sizeof(MVMRegister) * static_frame_body->num_locals);
         }
diff --git a/src/core/threadcontext.c b/src/core/threadcontext.c
index 9a4a7a4..32a7d6c 100644
--- a/src/core/threadcontext.c
+++ b/src/core/threadcontext.c
@@ -32,6 +32,11 @@ MVMThreadContext * MVM_tc_create(MVMInstance *instance) {
     /* Allocate an initial call stack region for the thread. */
     MVM_callstack_region_init(tc);

+    tc->fsa_cache[0] = NULL;
+    tc->fsa_cache[1] = NULL;
+    tc->fsa_cache_sizes[0] = 0;
+    tc->fsa_cache_sizes[1] = 0;
+
     /* Use default loop for main thread; create a new one for others. */
     tc->loop = instance->main_thread ? uv_loop_new() : uv_default_loop();

diff --git a/src/core/threadcontext.h b/src/core/threadcontext.h
index 3c422c4..5cd8d7c 100644
--- a/src/core/threadcontext.h
+++ b/src/core/threadcontext.h
@@ -117,6 +117,13 @@ struct MVMThreadContext {
     /* The second GC generation allocator. */
     MVMGen2Allocator *gen2;

+    /* Hitting the fixed size alloc over and over for a frame's lexical env
+     * and locals storage space can be costly. We can try to either re-use
+     * the last two blobs (and avoid the alloc altogether) or at least free
+     * two and alloc two in one go without releasing locks in-between. */
+    MVMRegister *fsa_cache[2];
+    MVMuint32    fsa_cache_sizes[2];
+
     /* Number of bytes promoted to gen2 in current GC run. */
     MVMuint32 gc_promoted_bytes;
	diff --git a/src/core/frame.c b/src/core/frame.c
	index 260c25e..6a19e70 100644
	--- a/src/core/frame.c
	+++ b/src/core/frame.c
	@@ -85,15 +85,80 @@ static void instrumentation_level_barrier(MVMThreadContext *tc, MVMStaticFrame *
	MVM_profile_ensure_uninstrumented(tc, static_frame);
	}

	+static MVMint8 return_to_fsa_cache(MVMThreadContext *tc, size_t size, MVMRegister *pointer) {
	+ MVMint8 use_cache = -1;
	+ if (!tc->fsa_cache[0]) {
	+ /* If the first slot is free, the second one is, too. Use this one. */
	+ tc->fsa_cache[0] = pointer;
	+ tc->fsa_cache_sizes[0] = size;
	+ memset(pointer, 0, size * sizeof(MVMRegister));
	+ return 1;
	+ } else {
	+ if (tc->fsa_cache[1]) {
	+ /* The second slot is already in use. */
	+ if (tc->fsa_cache_sizes[1] > size) {
	+ /* If the second slot is already bigger than our to-free data,
	+ * we'll not put it in the cache. */
	+ return 0;
	+ } else if (tc->fsa_cache_sizes[0] > size) {
	+ /* Both slots are filled, but only the second one is smaller
	+ * than our piece of data. We'll install it in the second slot. */
	+ MVM_fixed_size_free(tc, tc->instance->fsa, tc->fsa_cache_sizes[1], tc->fsa_cache[1]);
	+ tc->fsa_cache[1] = pointer;
	+ tc->fsa_cache_sizes[1] = size;
	+ memset(pointer, 0, size * sizeof(MVMRegister));
	+ return 1;
	+ } else {
	+ /* Both slots are filled, and we have the biggest thing.
	+ * First, free the second slot so we don't leak. Then
	+ * move the first slot over to the second slot and install
	+ * our thing in the first slot. */
	+ MVM_fixed_size_free(tc, tc->instance->fsa, tc->fsa_cache_sizes[1], tc->fsa_cache[1]);
	+ tc->fsa_cache[1] = tc->fsa_cache[0];
	+ tc->fsa_cache_sizes[1] = tc->fsa_cache_sizes[0];
	+ tc->fsa_cache[0] = pointer;
	+ tc->fsa_cache_sizes[0] = size;
	+ memset(pointer, 0, size * sizeof(MVMRegister));
	+ return 1;
	+ }
	+ } else {
	+ /* The second slot is free for us to use. */
	+ if (size > tc->fsa_cache_sizes[0]) {
	+ /* Move the first slot over into the second slot, then install
	+ * our thing in the first slot */
	+ tc->fsa_cache[1] = tc->fsa_cache[0];
	+ tc->fsa_cache_sizes[1] = tc->fsa_cache_sizes[0];
	+ tc->fsa_cache[0] = pointer;
	+ tc->fsa_cache_sizes[0] = size;
	+ memset(pointer, 0, size * sizeof(MVMRegister));
	+ return 1;
	+ } else {
	+ /* Our thing is smaller than the cached thing, so we'll
	+ * put it into the second slot. */
	+ tc->fsa_cache[1] = pointer;
	+ tc->fsa_cache_sizes[1] = size;
	+ memset(pointer, 0, size * sizeof(MVMRegister));
	+ return 1;
	+ }
	+ }
	+ }
	+ return 0;
	+}
	+
	/* Destroys a frame. */
	void MVM_frame_destroy(MVMThreadContext *tc, MVMFrame *frame) {
	if (frame->work) {
	MVM_args_proc_cleanup(tc, &frame->params);
	- MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_work,
	- frame->work);
	+ if (!return_to_fsa_cache(tc, frame->allocd_work, frame->work)) {
	+ MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_work,
	+ frame->work);
	+ }
	+ }
	+ if (frame->env) {
	+ if (!return_to_fsa_cache(tc, frame->allocd_env, frame->env)) {
	+ MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_env, frame->env);
	+ }
	}
	- if (frame->env)
	- MVM_fixed_size_free(tc, tc->instance->fsa, frame->allocd_env, frame->env);
	if (frame->continuation_tags)
	MVM_continuation_free_tags(tc, frame);
	}
	@@ -199,6 +264,24 @@ static MVMFrame * autoclose(MVMThreadContext *tc, MVMStaticFrame *needed) {
	return result;
	}

	+static MVMint8 obtain_from_fsa_cache(MVMThreadContext *tc, MVMint32 *size, MVMRegister **pointer) {
	+ MVMint8 use_cache = -1;
	+ return 0; /* XXX disabled the cache here */
	+ if ((tc->fsa_cache[0] && tc->fsa_cache_sizes[0] == *size && ((use_cache = 0), 1))
	+ || (tc->fsa_cache[1] && tc->fsa_cache_sizes[1] == *size && (use_cache = 1))) {
	+ *pointer = tc->fsa_cache[use_cache];
	+ *size = tc->fsa_cache_sizes[use_cache];
	+ if (use_cache == 0) {
	+ tc->fsa_cache[0] = tc->fsa_cache[1];
	+ tc->fsa_cache_sizes[0] = tc->fsa_cache_sizes[1];
	+ }
	+ tc->fsa_cache[1] = NULL;
	+ tc->fsa_cache_sizes[1] = 0;
	+ return 1;
	+ }
	+ return 0;
	+}
	+
	/* Obtains memory for a frame on the thread-local call stack. */
	static MVMFrame * allocate_frame(MVMThreadContext *tc, MVMStaticFrame *static_frame,
	MVMSpeshCandidate *spesh_cand) {
	@@ -218,7 +301,9 @@ static MVMFrame * allocate_frame(MVMThreadContext *tc, MVMStaticFrame *static_fr
	static_frame_body = &(static_frame->body);
	env_size = spesh_cand ? spesh_cand->env_size : static_frame_body->env_size;
	if (env_size) {
	- frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
	+ if (!obtain_from_fsa_cache(tc, &env_size, &frame->env)) {
	+ frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
	+ }
	frame->allocd_env = env_size;
	}
	work_size = spesh_cand ? spesh_cand->work_size : static_frame_body->work_size;
	@@ -226,13 +311,17 @@ static MVMFrame * allocate_frame(MVMThreadContext *tc, MVMStaticFrame *static_fr
	if (spesh_cand) {
	/* Allocate zeroed memory. Spesh makes sure we have VMNull setup in
	* the places we need it. */
	- frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
	+ if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
	+ frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
	+ }
	}
	else {
	/* Copy frame template with VMNulls in to place. */
	- frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
	- memcpy(frame->work, static_frame_body->work_initial,
	- sizeof(MVMRegister) * static_frame_body->num_locals);
	+ if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
	+ frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
	+ memcpy(frame->work, static_frame_body->work_initial,
	+ sizeof(MVMRegister) * static_frame_body->num_locals);
	+ }
	}
	frame->allocd_work = work_size;

	@@ -264,7 +353,9 @@ static MVMFrame * allocate_heap_frame(MVMThreadContext *tc, MVMStaticFrame *stat
	static_frame_body = &(static_frame->body);
	env_size = spesh_cand ? spesh_cand->env_size : static_frame_body->env_size;
	if (env_size) {
	- frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
	+ if (!obtain_from_fsa_cache(tc, &env_size, &frame->env)) {
	+ frame->env = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, env_size);
	+ }
	frame->allocd_env = env_size;
	}
	work_size = spesh_cand ? spesh_cand->work_size : static_frame_body->work_size;
	@@ -274,13 +365,17 @@ static MVMFrame * allocate_heap_frame(MVMThreadContext *tc, MVMStaticFrame *stat
	MVMuint32 num_locals = spesh_cand->num_locals;
	MVMuint16 *local_types = spesh_cand->local_types;
	MVMuint32 i;
	- frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
	+ if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
	+ frame->work = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, work_size);
	+ }
	for (i = 0; i < num_locals; i++)
	if (local_types[i] == MVM_reg_obj)
	frame->work[i].o = tc->instance->VMNull;
	}
	else {
	- frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
	+ if (!obtain_from_fsa_cache(tc, &work_size, &frame->work)) {
	+ frame->work = MVM_fixed_size_alloc(tc, tc->instance->fsa, work_size);
	+ }
	memcpy(frame->work, static_frame_body->work_initial,
	sizeof(MVMRegister) * static_frame_body->num_locals);
	}
	diff --git a/src/core/threadcontext.c b/src/core/threadcontext.c
	index 9a4a7a4..32a7d6c 100644
	--- a/src/core/threadcontext.c
	+++ b/src/core/threadcontext.c
	@@ -32,6 +32,11 @@ MVMThreadContext * MVM_tc_create(MVMInstance *instance) {
	/* Allocate an initial call stack region for the thread. */
	MVM_callstack_region_init(tc);

	+ tc->fsa_cache[0] = NULL;
	+ tc->fsa_cache[1] = NULL;
	+ tc->fsa_cache_sizes[0] = 0;
	+ tc->fsa_cache_sizes[1] = 0;
	+
	/* Use default loop for main thread; create a new one for others. */
	tc->loop = instance->main_thread ? uv_loop_new() : uv_default_loop();

	diff --git a/src/core/threadcontext.h b/src/core/threadcontext.h
	index 3c422c4..5cd8d7c 100644
	--- a/src/core/threadcontext.h
	+++ b/src/core/threadcontext.h
	@@ -117,6 +117,13 @@ struct MVMThreadContext {
	/* The second GC generation allocator. */
	MVMGen2Allocator *gen2;

	+ /* Hitting the fixed size alloc over and over for a frame's lexical env
	+ * and locals storage space can be costly. We can try to either re-use
	+ * the last two blobs (and avoid the alloc altogether) or at least free
	+ * two and alloc two in one go without releasing locks in-between. */
	+ MVMRegister *fsa_cache[2];
	+ MVMuint32 fsa_cache_sizes[2];
	+
	/* Number of bytes promoted to gen2 in current GC run. */
	MVMuint32 gc_promoted_bytes;