From 6a40cbf5959226bf1e6ec7bdb6c7e03253072c28 Mon Sep 17 00:00:00 2001
From: Leandro Lucarella
Date: Thu, 9 Sep 2010 00:17:16 -0300
Subject: [PATCH] Add eager allocation support when fork()ing

Eager allocation consists of allocating a new pool when a collection is
triggered (because an allocation failed to find enough free space in the
current pools). This enables the mutator to keep running while the mark
phase runs in a fork()ed process in parallel.

This completes the concurrent GC, decreasing the maximum pause time (not
only the stop-the-world time) dramatically (almost by 2 orders of
magnitude). As a side effect, the total run-time is greatly reduced too,
because the extra allocated pools reduce the GC pressure. The number of
collections needed by a program can be reduced 3 to 9 times depending on
the load, which explains the total run-time reduction, even in single-core
environments.

To allow the mutator to run in parallel with the mark phase, the freebits
of free pages must be set for *all* the possible block sizes, not only for
the start of the page, because the freebits are used as a starting point
for the mark bits and the bin size of a freed page can be changed *after*
the mark phase is started, resulting in an inconsistency between mark bits
and free bits. Pools allocated during a parallel mark set all their mark
bits to avoid the sweep phase freeing those pages after the mark is done.
---
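Two illustrative sketches for reviewers (placed below the ---, so they are
not part of the change itself).

First, the bit arithmetic behind set_group(): freebits keeps one bit per
16-byte chunk, so a page always spans PAGESIZE / 16 bits, and since both
that count and every page-aligned base are multiples of 8, a whole page's
worth of bits can be set with a single memset. A minimal stand-alone D
sketch (simplified to flat byte addressing, ignoring GCBits' leading word;
all names here are invented for illustration):

    import cstring = tango.stdc.string;

    const size_t PAGESIZE = 4096;

    // Set every free bit covering one page, whatever bin size the page
    // ends up with (one bit per 16-byte chunk => 256 bits per page).
    void set_page_bits(byte* bitmap, size_t page_number)
    {
        size_t base = page_number * (PAGESIZE / 16); // first bit of the page
        size_t nbits = PAGESIZE / 16;                // bits covering the page
        assert ((base % 8) == 0 && (nbits % 8) == 0); // byte-aligned group
        cstring.memset(bitmap + base / 8, 0xff, nbits / 8);
    }

Setting the bits for every possible block size at once is what lets the bin
size of a freed page change while a fork()ed mark is running, without the
mark bits (which are seeded from the freebits) going stale.

Second, the eager allocation fast path added to fullcollect() boils down to
this fragment (a sketch of the code in the diff below, assuming a mark
process was started by an earlier collection):

    os.WRes r = os.wait_pid(gc.mark_proc_pid, false); // non-blocking poll
    if (r == os.WRes.RUNNING)
        return 0;       // no memory recovered: caller allocates a new pool
    if (r == os.WRes.DONE) {
        gc.mark_proc_pid = 0;
        return sweep(); // mark finished: reclaim what it found
    }
    disable_fork();     // ERROR: fall back to stop-the-world collection

The new behaviour can be turned off at run time with eager_alloc=0 in the
GC options string (parsed in the same place as no_fork and friends), which
restores the old blocking wait on the mark process.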
 rt/gc/cdgc/bits.d |  17 +++++++
 rt/gc/cdgc/gc.d   | 117 ++++++++++++++++++++++++++++++++++++----------
 rt/gc/cdgc/opts.d |  13 +++++-
 rt/gc/cdgc/os.d   |  35 ++++++++++++--
 4 files changed, 154 insertions(+), 28 deletions(-)

diff --git a/rt/gc/cdgc/bits.d b/rt/gc/cdgc/bits.d
index 6bfb073..7c06c83 100644
--- a/rt/gc/cdgc/bits.d
+++ b/rt/gc/cdgc/bits.d
@@ -165,6 +165,23 @@ struct GCBits
         cstring.memset(data + 1, 0, nwords * uint.sizeof);
     }
 
+    void set_all()
+    {
+        cstring.memset(data + 1, 0xff, nwords * uint.sizeof);
+    }
+
+    void set_group(size_t base, size_t nbits)
+    in
+    {
+        assert ((base % 8) == 0);
+        assert ((nbits % 8) == 0);
+    }
+    body
+    {
+        size_t nbytes = nbits / 8;
+        cstring.memset(data + 1 + (base >> BITS_SHIFT), 0xff, nbytes);
+    }
+
     void copy(GCBits *f)
     in
     {
diff --git a/rt/gc/cdgc/gc.d b/rt/gc/cdgc/gc.d
index 5f65ff9..0d9e927 100644
--- a/rt/gc/cdgc/gc.d
+++ b/rt/gc/cdgc/gc.d
@@ -51,6 +51,7 @@ import opts = rt.gc.cdgc.opts;
 import cstdlib = tango.stdc.stdlib;
 import cstring = tango.stdc.string;
 import cstdio = tango.stdc.stdio;
+debug(COLLECT_PRINTF) alias cstdio.printf printf;
 
 /*
  * This is a small optimization that proved its usefulness. For small chunks
@@ -202,6 +203,9 @@ struct GC
     /// Turn off collections if > 0
     int disabled;
 
+    // PID of the fork()ed process doing the mark() (0 if it is not running)
+    int mark_proc_pid;
+
     /// min(pool.baseAddr)
     byte *min_addr;
     /// max(pool.topAddr)
@@ -402,6 +406,11 @@ size_t reserve(size_t size)
  */
void minimize()
 {
+    // Disabled if a parallel collection is in progress because the shared
+    // mark bits of the freed pool might be used by the mark process
+    if (gc.mark_proc_pid != 0)
+        return;
+
     size_t n;
     size_t pn;
     Pool* pool;
@@ -596,15 +605,13 @@ int allocPage(Bins bin)
     byte* p = pool.baseAddr + pn * PAGESIZE;
     byte* ptop = p + PAGESIZE;
     size_t bit_i = pn * (PAGESIZE / 16);
-    size_t bit_stride = size / 16;
-    for (; p < ptop; p += size, bit_i += bit_stride)
+    pool.freebits.set_group(bit_i, PAGESIZE / 16);
+    for (; p < ptop; p += size)
     {
         List* l = cast(List *) p;
         l.next = *list_head;
         l.pool = pool;
         *list_head = l;
-        // TODO: maybe this can be optimized to be set in chunks
-        pool.freebits.set(bit_i);
     }
     return 1;
 }
@@ -787,18 +794,56 @@ size_t fullcollect(void *stackTop)
 {
     debug(COLLECT_PRINTF) printf("Gcx.fullcollect()\n");
 
-    // we always need to stop the world to make threads save the CPU registers
+    // If eager allocation is used, we need to check first if there is a mark
+    // process running. If there isn't, we start a new one (see the next code
+    // block). If there is, we check if it's still running or already finished.
+    // If it's still running, we tell the caller process that no memory has
+    // been recovered (it will allocate more to fulfill the current request).
+    // If the mark process is done, we launch the sweep phase and hope enough
+    // memory is freed (if that is not the case, the caller will allocate more
+    // memory and the next time it's exhausted it will run a new collection).
+    if (opts.options.eager_alloc) {
+        if (gc.mark_proc_pid != 0) { // there is a mark process in progress
+            os.WRes r = os.wait_pid(gc.mark_proc_pid, false); // don't block
+            assert (r != os.WRes.ERROR);
+            switch (r) {
+            case os.WRes.DONE:
+                debug(COLLECT_PRINTF) printf("\t\tmark proc DONE\n");
+                gc.mark_proc_pid = 0;
+                return sweep();
+            case os.WRes.RUNNING:
+                debug(COLLECT_PRINTF) printf("\t\tmark proc RUNNING\n");
+                return 0;
+            case os.WRes.ERROR:
+                debug(COLLECT_PRINTF) printf("\t\tmark proc ERROR\n");
+                disable_fork(); // Try to keep going without forking
+                break;
+            }
+        }
+    }
+
+    // We always need to stop the world to make threads save the CPU registers
     // in the stack and prepare themselves for thread_scanAll()
     thread_suspendAll();
     gc.stats.world_stopped();
 
+    // If forking is enabled, we fork() and start a new mark phase in the
+    // child. The parent process will tell the caller that no memory could be
+    // recycled if eager allocation is used, allowing the mutator to keep going
+    // almost instantly (at the expense of more memory consumption because
+    // a new allocation will be triggered to fulfill the current request). If
+    // no eager allocation is used, the parent will wait for the mark phase to
+    // finish before returning control to the mutator, but other threads are
+    // restarted and may run in parallel with the mark phase (unless they
+    // allocate or use the GC themselves, in which case the global GC lock will
+    // stop them).
     if (opts.options.fork) {
         cstdio.fflush(null); // avoid duplicated FILE* output
         os.pid_t child_pid = os.fork();
         assert (child_pid != -1); // don't accept errors in non-release mode
         switch (child_pid) {
-        case -1: // if fork() fails, fallback to stop-the-world
-            opts.options.fork = false;
+        case -1: // if fork() fails, fall back to stop-the-world
+            disable_fork();
             break;
         case 0: // child process (i.e. the collector's mark phase)
             mark(stackTop);
@@ -808,15 +853,28 @@ size_t fullcollect(void *stackTop)
             // start the world again and wait for the mark phase to finish
             thread_resumeAll();
             gc.stats.world_started();
-            int status = void;
-            os.pid_t wait_pid = os.waitpid(child_pid, &status, 0);
-            assert (wait_pid == child_pid);
-            return sweep();
+            if (opts.options.eager_alloc) {
+                gc.mark_proc_pid = child_pid;
+                return 0;
+            }
+            os.WRes r = os.wait_pid(child_pid); // block until it finishes
+            assert (r == os.WRes.DONE);
+            if (r == os.WRes.DONE) {
+                debug(COLLECT_PRINTF) printf("\t\tmark proc DONE (block)\n");
+                return sweep();
+            }
+            debug(COLLECT_PRINTF) printf("\tmark() proc ERROR\n");
+            // If there was some error, try to keep going without forking
+            disable_fork();
+            // Re-suspend the threads to do the marking in this process
+            thread_suspendAll();
+            gc.stats.world_stopped();
         }
     }
 
-    // if we reach here, we are using the standard stop-the-world collection
+    // If we reach here, we are using the standard stop-the-world collection,
+    // either because fork was disabled in the first place, or because it was
+    // disabled after some error.
     mark(stackTop);
     thread_resumeAll();
     gc.stats.world_started();
@@ -1043,9 +1101,9 @@ version(none) // BUG: doesn't work because freebits() must also be cleared
                 }
                 clrAttr(pool, bit_i, BlkAttr.ALL_BITS);
 
-                debug(COLLECT_PRINTF) printf("\tcollecting big %x\n", p);
+                debug(COLLECT_PRINTF) printf("\tcollecting big %p\n", p);
                 pool.pagetable[pn] = B_FREE;
-                pool.freebits.set(bit_i);
+                pool.freebits.set_group(bit_i, PAGESIZE / 16);
                 freedpages++;
                 if (opts.options.mem_stomp)
                     memset(p, 0xF3, PAGESIZE);
@@ -1054,7 +1112,7 @@ version(none) // BUG: doesn't work because freebits() must also be cleared
                     pn++;
                     pool.pagetable[pn] = B_FREE;
                     bit_i += bit_stride;
-                    pool.freebits.set(bit_i);
+                    pool.freebits.set_group(bit_i, PAGESIZE / 16);
                     freedpages++;
 
                     if (opts.options.mem_stomp)
@@ -1097,10 +1155,8 @@ version(none) // BUG: doesn't work because freebits() must also be cleared
                     if (!pool.freebits.test(bit_i))
                         goto Lnotfree;
                 }
-                // we don't need to explicitly set the freebit here because all
-                // freebits were already set, including the bit used for the
-                // whole freed page (bit_base).
                 pool.pagetable[pn] = B_FREE;
+                pool.freebits.set_group(bit_base, PAGESIZE / 16);
                 recoveredpages++;
                 continue;
 
@@ -1200,6 +1256,13 @@ body
 }
 
 
+void disable_fork()
+{
+    // we have to disable both options, as eager_alloc assumes fork is enabled
+    opts.options.fork = false;
+    opts.options.eager_alloc = false;
+}
+
 void initialize()
 {
@@ -1209,6 +1272,9 @@ void initialize()
     // If we are going to fork, make sure we have the needed OS support
     if (opts.options.fork)
         opts.options.fork = os.HAVE_SHARED && os.HAVE_FORK;
+    // Eager allocation is only possible when forking
+    if (!opts.options.fork)
+        opts.options.eager_alloc = false;
     gc.lock = GCLock.classinfo;
     gc.inited = 1;
     setStackBottom(rt_stackBottom());
@@ -1612,13 +1678,9 @@ private void free(void *p)
         // Free pages
         size_t npages = 1;
         size_t n = pagenum;
-        pool.freebits.set(bit_i);
-        size_t bit_stride = PAGESIZE / 16;
-        while (++n < pool.npages && pool.pagetable[n] == B_PAGEPLUS) {
+        pool.freebits.set_group(bit_i, PAGESIZE / 16);
+        while (++n < pool.npages && pool.pagetable[n] == B_PAGEPLUS)
            npages++;
-            bit_i += bit_stride;
-            pool.freebits.set(bit_i);
-        }
         if (opts.options.mem_stomp)
             memset(p, 0xF2, npages * PAGESIZE);
         pool.freePages(pagenum, npages);
@@ -1946,6 +2008,13 @@ struct Pool
         finals.alloc(nbits); // not used by the mark phase
         noscan.alloc(nbits); // mark phase *MUST* have a snapshot
 
+        // all is free when we start
+        freebits.set_all();
+
+        // avoid accidental sweeping of new pools while using eager allocation
+        if (gc.mark_proc_pid)
+            mark.set_all();
+
         pagetable = cast(ubyte*) cstdlib.malloc(npages);
         if (!pagetable)
             onOutOfMemoryError();
diff --git a/rt/gc/cdgc/opts.d b/rt/gc/cdgc/opts.d
index 042cc01..dc2c8de 100644
--- a/rt/gc/cdgc/opts.d
+++ b/rt/gc/cdgc/opts.d
@@ -53,6 +53,7 @@ struct Options
     bool mem_stomp = false;
     bool conservative = false;
     bool fork = true;
+    bool eager_alloc = true;
 }
 
 package Options options;
@@ -90,6 +91,8 @@ void process_option(char* opt_name, char* opt_value)
         options.conservative = parse_bool(opt_value);
     else if (cstr_eq(opt_name, "no_fork"))
         options.fork = !parse_bool(opt_value);
+    else if (cstr_eq(opt_name, "eager_alloc"))
+        options.eager_alloc = parse_bool(opt_value);
 }
 
 
@@ -149,6 +152,7 @@ unittest
         assert (mem_stomp == false);
         assert (conservative == false);
         assert (fork == true);
+        assert (eager_alloc == true);
     }
     parse("mem_stomp");
     with (options) {
@@ -158,8 +162,9 @@ unittest
         assert (mem_stomp == true);
         assert (conservative == false);
         assert (fork == true);
+        assert (eager_alloc == true);
     }
-    parse("mem_stomp=0:verbose=2:conservative:no_fork=10");
+    parse("mem_stomp=0:verbose=2:conservative:no_fork=10:eager_alloc=0");
     with (options) {
         assert (verbose == 2);
         assert (log_file[0] == '\0');
@@ -167,6 +172,7 @@ unittest
         assert (mem_stomp == false);
         assert (conservative == true);
         assert (fork == false);
+        assert (eager_alloc == false);
     }
     parse("log_file=12345 67890:verbose=1:sentinel=4:mem_stomp=1");
     with (options) {
@@ -176,6 +182,7 @@ unittest
         assert (mem_stomp == true);
         assert (conservative == true);
         assert (fork == false);
+        assert (eager_alloc == false);
     }
     parse(null);
     with (options) {
@@ -185,6 +192,7 @@ unittest
         assert (mem_stomp == true);
         assert (conservative == true);
         assert (fork == false);
+        assert (eager_alloc == false);
     }
     parse("");
     with (options) {
@@ -194,6 +202,7 @@ unittest
         assert (mem_stomp == true);
         assert (conservative == true);
         assert (fork == false);
+        assert (eager_alloc == false);
     }
     parse(":");
     with (options) {
@@ -203,6 +212,7 @@ unittest
         assert (mem_stomp == true);
         assert (conservative == true);
         assert (fork == false);
+        assert (eager_alloc == false);
     }
     parse("::::");
     with (options) {
@@ -212,6 +222,7 @@ unittest
         assert (mem_stomp == true);
         assert (conservative == true);
         assert (fork == false);
+        assert (eager_alloc == false);
     }
 }
diff --git a/rt/gc/cdgc/os.d b/rt/gc/cdgc/os.d
index 81939d2..62e247a 100644
--- a/rt/gc/cdgc/os.d
+++ b/rt/gc/cdgc/os.d
@@ -32,6 +32,16 @@ module rt.gc.cdgc.os;
 
 // Public interface/Documentation
 
+/**
+ * Possible results for the wait_pid() function.
+ */
+enum WRes
+{
+    DONE,    /// The process has finished successfully
+    RUNNING, /// The process is still running
+    ERROR    /// There was an error waiting for the process
+}
+
 version (D_Ddoc) {
 
 /**
@@ -42,8 +52,16 @@ version (D_Ddoc) {
  */
 const HAVE_FORK = true;
 
+/**
+ * Wait for a process with PID pid to finish.
+ *
+ * If block is false, this function will not block and will return
+ * WRes.RUNNING if the process is still running. Otherwise it will always
+ * return WRes.DONE (unless there is an error, in which case WRes.ERROR is
+ * returned).
+ */
+WRes wait_pid(pid_t pid, bool block = true);
+
 public import tango.stdc.posix.unistd: pid_t, fork;
-public import tango.stdc.posix.sys.wait: waitpid;
 
 }
 
@@ -51,14 +69,25 @@ public import tango.stdc.posix.sys.wait: waitpid;
 else version (Posix) {
     enum { HAVE_FORK = true }
     public import tango.stdc.posix.unistd: pid_t, fork;
-    public import tango.stdc.posix.sys.wait: waitpid;
+    import tango.stdc.posix.sys.wait: waitpid, WNOHANG;
+    public WRes wait_pid(pid_t pid, bool block = true) {
+        int status = void;
+        pid_t waited_pid = waitpid(pid, &status, block ? 0 : WNOHANG);
+        if (waited_pid == 0)
+            return WRes.RUNNING;
+        assert (waited_pid == pid);
+        assert (status == 0);
+        if (waited_pid != pid || status != 0)
+            return WRes.ERROR;
+        return WRes.DONE;
+    }
 }
 
 else {
     enum { HAVE_FORK = false }
     alias int pid_t;
     pid_t fork() { assert (false); return -1; }
-    pid_t waitpid(pid_t, int*, int) { assert (false); return -1; }
+    WRes wait_pid(pid_t, bool = true) { assert (false); return WRes.ERROR; }
 }
-- 
2.43.0