flaviut/-

## -
Data structures for the GC
==========================

search( key, t ) typekey key; patricia t; {
  if ( t==NULL ) notfound( key );
  else {
    while ( !IsData(t) )
      t = bit(t->level,key) ? t->right : t->left;
    if ( key == t->k ) found( t ); else notfound( key );
  }
};

patricia insert( key, t ) typekey key; patricia t; {
  patricia p; patricia InsBetween(); int i;
  if (t==NULL) return( NewDataNode(key) );
  for( p=t; !IsData(p); )
    p = bit( p->level, key ) ? p->right : p->left ; /* find first different bit */
    for (i=1; i<=D && bit(i,key)==bit(i,p->k); i++);
    if (i>D) { Error /* Key already in table */; return(t); }
    else return( InsBetween( key, t, i ) );
}

patricia InsBetween( key, t, i ) typekey key; patricia t; int i; {
  patricia p;
  if ( IsData(t) || i < t->level ) {
    /* create a new internal node */
    p = NewDataNode( key );
    return( bit(i,key) ? NewIntNode(i,t,p) : NewIntNode(i,p,t) );
  }
  if (bit(t->level,key)==1)
    t->right = InsBetween( key, t->right, i );
  else
    t->left = InsBetween( key, t->left, i );
  return( t );
};


The current data structures are hard to follow, depend on a working malloc()
implementation, and use more memory than necessary. Not controlling the data
structures requires an AT.

Requirements:
- merging blocks
- splitting blocks
- give pages back to the OS
- identifying whether a pointer is the start of a cell O(1)

Each allocated memory object has a size that is a multiple of 8. This assures
proper alignment.
We manage memory in chunks of size 16K. The chunk header must be a multiple of
16 bytes to assure proper alignment!
There are fixed-size chunks that contain only objects of the same size. This is
done for up to 8 * 256 = 2K bytes sized objects.

There are also var-size chunks that manage a multiple of 16K. It follows that
each object in a v-chunk is at least 2K bytes. So a block of 16K may only have
at most 8 objects!


function fpmmap(adr:pointer;len,prot,flags,fd,off:sizeint):pointer;external name 'FPC_SYSC_MMAP';
function fpmunmap(adr:pointer;len:sizeint):pointer;external name 'FPC_SYSC_MUNMAP';
function geterrno:longint;external name 'FPC_SYS_GETERRNO';

const   PROT_READ  = $1;             { page can be read }
        PROT_WRITE = $2;             { page can be written }
        PROT_EXEC  = $4;             { page can be executed }
        PROT_NONE  = $0;             { page can not be accessed }

        MAP_SHARED    = $1;          { Share changes }
        MAP_PRIVATE   = $2;          { Changes are private }
        MAP_TYPE      = $f;          { Mask for type of mapping }
        MAP_FIXED     = $10;         { Interpret addr exactly }
        MAP_ANONYMOUS = $20;         { don't use a file }

        MAP_GROWSDOWN  = $100;       { stack-like segment }
        MAP_DENYWRITE  = $800;       { ETXTBSY }
        MAP_EXECUTABLE = $1000;      { mark it as an executable }
        MAP_LOCKED     = $2000;      { pages are locked }
        MAP_NORESERVE  = $4000;      { don't check for reservations }

function req_pages(count:cardinal):pointer;
{Requests count consecutive pages from the OS.}
begin
  req_pages:=fpmmap(nil,count shl page_shift,PROT_READ or PROT_WRITE,
                    MAP_PRIVATE or MAP_ANONYMOUS,0,0);
  if geterrno<>0 then
    req_pages:=nil; {This one can fail, so we can handle an out of memory
                     situation.}
end;

procedure sack_pages(p:pointer;count:cardinal);
begin
  fpmunmap(p,count shl page_shift);
  if geterrno<>0 then
    runerror(204); {This one should succees.}
end;

-------------------------------------------------------------------------------
/* Initialize critical section */
void csinitialize (CRITICAL_SECTION *cs) {
    InitializeCriticalSection (cs);
}
/* Delete critical section */
void csdelete (CRITICAL_SECTION *cs) {
    DeleteCriticalSection (cs);
}
/* Enter critical section */
int csenter (CRITICAL_SECTION *cs) {
    EnterCriticalSection (cs);
    return 0;
}
/* Leave critical section */
int csleave (CRITICAL_SECTION *cs) {
    LeaveCriticalSection (cs);
    return 0;
}

static int g_cs_initialized;
static CRITICAL_SECTION g_cs;

/* mmap for windows */
void *mmap (void *ptr, long size, long prot, long type, long handle, long arg) {
    static long g_pagesize;
    static long g_regionsize;
    /* Wait for spin lock */
    slwait (&g_sl);
    /* First time initialization */
    if (! g_pagesize)
        g_pagesize = getpagesize ();
    if (! g_regionsize)
        g_regionsize = getregionsize ();
    /* Allocate this */
    ptr = VirtualAlloc (ptr, size,
			MEM_RESERVE | MEM_COMMIT | MEM_TOP_DOWN, PAGE_READWRITE);
    if (! ptr) {
        ptr = MMAP_FAILURE;
        goto mmap_exit;
    }
mmap_exit:
    /* Release spin lock */
    slrelease (&g_sl);
    return ptr;
}
/* munmap for windows */
long munmap (void *ptr, long size) {
    static long g_pagesize;
    static long g_regionsize;
    int rc = MUNMAP_FAILURE;
    /* Wait for spin lock */
    slwait (&g_sl);
    /* First time initialization */
    if (! g_pagesize)
        g_pagesize = getpagesize ();
    if (! g_regionsize)
        g_regionsize = getregionsize ();
    /* Free this */
    if (! VirtualFree (ptr, 0,
                       MEM_RELEASE))
        goto munmap_exit;
    rc = 0;
munmap_exit:
    /* Release spin lock */
    slrelease (&g_sl);
    return rc;
}

uint __fastcall FindHighestBit(uint32 n)
{
   __asm
   {
      sub eax, eax
      sub ecx, ecx
      bsr eax, n
      setnz cl
      add eax, ecx
   }
}

char msb_k[256] =
   {
     0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
     5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
     6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
     8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
   };

 int msb(uint32_t x)
 {
   if (x > 0x0000FFFF)
     if (x > 0x00FFFFFF) return 24 + msb_k[x >> 24];
     else return 16 + msb_k[x >> 16];
   else
     if (x > 0x000000FF) return 8 + msb_k[x >> 8];
                         return msb_k[x ];
 }

The following (more generic) loop may be quicker though...

 int msb(untype x)
 {
   int c = 0;
   while (x > 0xFF) { c += 8; x >>= 8; }
   return c + msb_k[x];
 }


Counting bits set, Brian Kernighan's way

unsigned int v; // count the number of bits set in v
unsigned int c; // c accumulates the total bits set in v
for (c = 0; v; c++)
{
  v &= v - 1; // clear the least significant bit set
}


The best method for counting bits in a 32-bit integer v is the following:

v = v - ((v >> 1) & 0x55555555);                    // reuse input as temporary
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);     // temp
c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count

The best bit counting method takes only 12 operations, which is the same as the
lookup-table method, but avoids the memory and potential cache misses of a
table. It is a hybrid between the purely parallel method above and the earlier
methods using multiplies (in the section on counting bits with 64-bit
instructions), though it doesn't use 64-bit instructions. The counts of bits
set in the bytes is done in parallel, and the sum total of the bits set in the
bytes is computed by multiplying by 0x1010101 and shifting right 24 bits.

A generalization of the best bit counting method to integers of bit widths
upto 128 (parameterized by type T) is this:

v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
c = (T)(v * ((T)~(T)0/255)) >> (sizeof(v) - 1) * CHAR_BIT; // count

-------------------------------------------------------------------------------

We manage the memory in *pages*. For allocations smaller than a page, the page
is reserved for objects of the same size. This wastes up to PageSize div 2
bytes in the worst case! Thus the pageSize should be smaller than the operating
system pagesize; we use 512 Bytes.

type
  PPageDesc = ptr TPageDesc

  TBitIndex = range[0..UnitsPerPage-1]

  TPageDesc = record
    next: PPageDesc # all nodes are connected with this pointer
    key: TAddress   # start address at bit 0
    bits: array[TBitIndex, int] # a bit vector

  PPageDescArray = ptr array[0..1000_000, PPageDesc]
  TCellSet = record
    counter, max: int
    head: PPageDesc
    data: PPageDescArray


-------------------------------------------------------------------------------

We use manage *chunks*. There are fixed size chunks and variable sized chunks.

type
  TChunk = object
    startAddr: int # key in a table; lowest 3 bits are used for flags
    size: int  # chunk manages the interval [startAddr..startAddr+size-1]
    le, ri: ptr TChunk # we make a tree of it!

  TFixedChunk = object of TChunk
    elemsize: int  # size of each cell in this chunk (multiple of 8)
                   # indexed by an array
    ptr: int       # cyclic check
    counter: int   # counts the number of set bits in "used"
    next: ptr TFixedChunk # next fixed chunk with the same size
    used: set[range[0..511]]


  TVarChunk = object of TChunk
    prev, next: ptr TVarChunk


const
  startMask = (-1) shl 3
  chunkMask = 1
  usedMask = 2

var
  fixedChunks: array[0..255, PFixedChunk]
  root: PChunk

proc allocFixed(size: int): pointer =
  var c = fixedChunks[size shr 3]
  # we search for free bits in an intelligent fashion:
  # k:=high(c.used)-c.counter bits are free, so we should try only k times
  # to find a free slot
  block newChunk:
    while c != nil:
      for i in 0..high(c.used)-c.counter-1:
        if c.ptr in c.used: c.ptr = (c.ptr + 1) mod (high(c.used)+1)
        else: break newChunk # found a free place!
      c = c.next
    c = newFixedChunk(fixedChunks[size shr 3], size)
  incl(c.used, c.ptr)
  result = cast[pointer](c.startAddr and startMask +% c.ptr*%align(size))
  inc(c.ptr)
  inc(c.counter)

proc getChunk(p: Pointer) =
  # use a splay tree for this!

proc dealloc(p: pointer) =
  var c = getChunk(p)
  if c == nil:
    raise newException(Exception, "invalid pointer")
  if (c.startAddr and usedMask) == 0:
    raise newException(Exception, "attempt to free a chunk twice")
  if (c.startAddr and chunkMask) == 0:
    # fixed chunk:
    var f = PFixedChunk(c)
    var pos = (cast[int](p) -%
                (c.startAddr and startMask)) div (f.size div high(f.used))
    excl(f.used, pos)
    dec(f.counter)
    f.ptr = min(f.ptr, pos)
    if f.counter == 0:
      setChunkToFree(c)
  else:
    # variable chunk:
    var v = PVarChunk(c)


C vs LLVM for code generation
=============================

Advantages of C:
* easier bootstrap; especially becuase the RTL depends on C's structures and
  constants which may change
* the compiler will port to the LEGO mindstorm easily
* interfacing with C is easier
* code generator for it already written
* works better on Windows than LLVM

Advantages of LLVM:
* easier to generate code for it
* compilation times will be better when linking directly to LLVM
* better support for debug information
* only one assembler format to support
* we get code generators for C and CLI for free
* better exception support
* atomic instructions; transactional memory project is available
* may allow easy interfacing with Pypy, Ocaml, etc. (though of not much use)
* generated code is checked deeply; assurance that the generated code does not
  depend on implicit type conversions
* better support for merging of generics

Disadvantages of C:
* chars are flawed horribly
* cannot return arrays or sets
* arrays are fucked up
* generating correct C types is hard, but the algorithm works now
* exceptions are inefficent to support (move to C++?)

Disadvantages of LLVM:
* code generation is not that much easier...
* code generation for the getelementptr is hard
* code generation for offsetof and sizeof is not that clear
* generated code is even harder to read than C
* code generation for exceptions is hard
* bootstrapping is MUCH harder
* dependency to a project that may become "closed source" by Apple
* assembler not integrated
* code generation for coroutines is not any easier than in C


Ideas for the code generator
============================

The C code generator's code is hard to maintain and - what's worse - hard to
reason about. Is it *correct*? That's especially hard because:

* Chars are either signed or unsigned. This means endless suffering... We
  cannot avoid it because we really want to use string literals which have
  the type ``char*``. For interoperability this is a must.
* Checks for range types are hard to take account into.
* Whether any place takes the tyGenericInst into account is unknown. Bugs will
  be hard to find. Code inspection seems the only way.
* Generating code for set constructors is a mess.
* Generating code for case statements is a mess.
* Generating type information has been a mess, but now isn't.
* Generating C types has been hard, but the algorithm is now correct.
* Things would be easier if just one huge C file had been generated, but this
  is clearly unfeasable.
* Efficient code has to be generated.
* The code generation for the GC does not make it any easier.
* Porting the code generator is difficult.

The solution is to extend the transformation pass, so that it eliminates
tyGenericInst and tyRange. All range checks are explicit. All other checks too.
String case statements are transformed into if statements. Assignments are
split into asgnRef(), etc. Statements are **not** transformed into gotos.
The initialization section is generated (as is the type information).


Writeln macro
=============

macro writeln(args: expr): stmt =
  result = newNode(nkStmtList)
  for i in 2..n.len-1:
    result.add(newCall("real_write", n[1], n[i]))
  template nl = real_write("\n")
  result.add(nl)


ROR and ROL for int and int64
=============================

int WORD_LENGTH;
int rol(int value, int places)
{
  places %= WORD_LENGTH;
  return (value<<places)|(value>>(WORD_LENGTH-places);
}

int ror(int value, int places)
{
  return (value>>places)|(value<<WORD_LENGTH-places);
}


Ideas about the macro system
============================

Nimrod has generics, templates and macros. Generics are hardest to implement
and I still don't know whether they will work in practice as in C++. Which is
the goal anyway. There should be simple mechanisms that replace the generic
functionality. There has to be a way to escape within macros.

But does this []-operator allow tables?

type
  TBitset = record
    a: sequence of int

const
  intBitSize = sizeof(int)*8

proc init(s: var TBitset, capacity: natural = intBitSize) =
  init(s.a, capacity div intBitSize)

proc len(s: TBitset): natural =
  if s.a != nil: result = s.a.len
  else: result = 0

proc == (a, b: TBitSet): bool =
  var i = 0
  while i < a.len and i < b.len:
    if a.a[i] != b.a[i]: return false
    ++i
  if i < a.len: # at the end of b
    while i < a.len:
      if a.a[i] != 0: return false
      ++i
  elif i < b.len:
    while i < b.len:
      if b.a[i] != 0: return false
      ++i
  return true

proc hash(s: TBitset): int =
  var h = 0
  for i in 0 .. s.len-1:
    h = h +% s.a[i]
    h = h +% h shl 10
    h = h xor (h shr 6)
  h = h +% h shl 3
  h = h xor (h shr 11)
  h = h +% h shl 15
  result = h

proc [] (s: TBitset, index: natural): bool =
  if index < s.a.len * intBitSize:
    result = s.a[index div intBitSize] and 1 shl (index mod intBitSize) != 0
  else:
    result = false
  # getter

proc [] (s: var TBitset, index: natural, bit: bool) =
  # setter
  if index >= s.a.len * intBitSize:
    s.a.len = index div intBitSize + 1
  if bit:
    s.a[index div intBitSize] = s.a[index div intBitSize]
      or 1 shl index mod intBitSize
  else:
    s.a[index div intBitSize] = s.a[index div intBitSize]
      and not (1 shl index mod intBitSize)

proc reset(s: var TBitset, index: natural) =
  if index < s.a.len * intBitSize:
    s.a[index div intBitSize] = s.a[index div intBitSize]
      and not (1 shl index mod intBitSize)

template excl: expr = reset
proc incl(s: acc TBitset, index: natural) = s[index] = true

proc flip(s: TBitset, index: natural) =
  if index < s.a.len * intBitSize:
    s.a[index div intBitSize] = s.a[index div intBitSize]
      bitxor (1 shl index mod intBitSize)

iterator elements(s: TBitset): natural =
  for i in 0 .. s.len-1:
    var x = s.a[i]
    var j = 0
    while x != 0:
      if x bitand 1:
        yield i * intBitSize + j
      x = x shr 1
      ++j

# example 1: generic minimum and maximum procs:
generic {type TComparable,
         with proc cmp(a, b: TComparable): int }:
         # ``cmp`` is used if this parameter is not provided; this is symbolized
         # by the "with"
  proc max(a, b: TComparable): TComparable =
    if cmp(a, b) <= 0: result = b
    else: result = a

  proc min(a, b: TComparable): TComparable =
    if cmp(a, b) <= 0: result = a
    else: result = b

  # but beware!
  proc max(a, b: TComparable): var TComparable =
    if cmp(a, b) <= 0: result = a # do not allow this!
    else: result = b              # one may not take the address of a local!
                                  # if it is the return value!

max(8, 9.9) should be possible! Conversions are needed. Algorithm for
overloading resolution:

First we have to get generic parameters right: Unification over any order is
quite expensive! Thus only unification from first to last is tried. It is
only needed for generic procs. The only problem is that this rules is not
applied for constructor expressions like {} or [].

# -----------------------------------------------------------------------------
# tables have a few problems: if TKey has nil as valid value, no "FUsed" bitset
# is needed! If the hash value is easily recomputed, no explicit hash field is
# needed. Thus there are 4 versions:

# 1. table with FUsed and with hash values cached
# 2. table with FUsed and with no hash values cached (int, ref, ptr)
# 3. table without FUsed (requires nil-type!) and with hash values cached
# 4. table without FUsed (requires nil-type!) and without hash caching

# But, often low(int) for int is an invalid value too! So what is needed is a
# valid() proc that can be overwritten!

# What I want is:

# ----------------------- abbreviation with "generic" -------------------------
template TTable(TKey: typedesc, TValue: typedesc,
                valid: expr = isNil): typedesc =
  type
    TTableType = record
      counter: int
      buckets: seq[(Tkey, TValue)]

  proc init^ (t: var TTableType) =
    # ^ means ``add to outer scope``

  proc put(t: var TTableType, k: TKey, v: TValue) =
    ...

But: p(x: table[int, int]) should be possible without new instantiation!


signature KeyValue = {
  type TKey,
  type TValue,
  const cheapHash = false,
  with proc hash(k: TKey): int
}

generic {type TKey , # type parameters may be prefixed with "type"
         type TValue,
         # whether FUsed is needed is determined by static checks with TKey
         const hashingCheap: bool = false,
         with proc hash(k: TKey): int # contraints
         }:
  type
    TTable = record
      counter: int
      buckets: seq[(TKey, TValue)] # () is a tuple type
      when not (TKey with nil):
        FUsed: TBitset

    TDummyType{} = int # explicit {} override "generic" section

  proc used(t: var TTable{TKey, TValue}, h: int): bool = # here {} are needed!
    when not (TKey with nil):
      result = t.FUsed[h]
    else:
      result = t.buckets[h] != nil

  proc init(t: var TTable{TKey, TValue}, capacity: natural = 32) =
    t.counter = 0
    init(buckets, capacity)
    when not (TKey with nil):
      init(FUsed, capacity)

  proc len(t: TTable{TKey, TValue}): natural =
    result = t.counter

  iterator pairs(t: TTable{TKey, TValue}): (TKey, var TValue) =
    for i in 0 .. t.buckets.len - 1:
      if used(t, i):
        yield t.buckets[i].key, t.buckets[i].val

  proc get(t: var TTable{TKey, TValue}, key: TKey): var TValue =
    # raises an exception if not in buckets:
    assert(t.buckets != nil)


  proc [] (t: var TTable{TKey, TValue}, key: TKey): var TValue =
    # ``t[key] = val`` works if we never raise an exception.
    # check if the location already exists:
    assert(t.buckets != nil)


Ideas about a high level data structure
=======================================

A high level rope structure
---------------------------

The operations a high level string type should provide:

- concat in O(1) (this means implementation as balanced trees)
- slice in O(1)
- iterating over characters in O(m) where m is the depth of the tree
- equality comparison should be in O(1); this is hard!
- suitable for inserting in a table (hashing)
- nil is the empty rope

type
  PRope* = ref TBaseRope
  TBaseRope = object of TObject
    kind: TRopeKind
    len: int
  TConcRope = object of TBaseRope
    le, ri: PRope
  TSubRope = object of TBaseRope
    start: int
    sub: PRope
  TLeafRope = object of TBaseRope
    a: array [0..31, char]


Ideas about a GUI toolkit
-------------------------

Even though GTK2 is the way to go, I am quite disatisfied with the available
solutions: They are bloated, complicated, yet not powerful enough. Ideally
the GUI is described via a set of Nimrod macros. I18n requires that only
abstract constants should be used. However, this is given for free! A callback
system sucks, a generic message passing mechanism seems much better.

The GUI should contain *priorities*, so that on small devices the system can
change a list to a combobox, for example. Each widget could correspond to a
basic Nimrod type:

=========================  ========================
Widget                     Type
=========================  ========================
Radio-Buttons              Enum
Fixed Combobox             Enum
Flexible Combobox          list[T]
Checkbox                   bool
Textfield                  string
Counter-box                int/float
Button                     Action (proc)
-------------------------  ------------------------
Complex widgets:
-------------------------  ------------------------
TextEditor, Canvas,        - no common type -
Browser, etc.
=========================  ========================

*Constraints* are directly supported by the DSL. A dialog represents a data
structure. When aborting, no changes are copied to the real structure.