Fix cost-centre-stacks bug (#5654)
authorSimon Marlow <marlowsd@gmail.com>
Thu, 15 Dec 2016 16:17:19 +0000 (11:17 -0500)
committerBen Gamari <ben@smart-cactus.org>
Thu, 15 Dec 2016 16:17:29 +0000 (11:17 -0500)
This fixes some cases of wrong stacks being generated by the profiler.
For background and details on the fix see
`Note [Evaluating functions with profiling]` in `rts/Apply.cmm`.

This does have an impact on allocations for some programs when
profiling.  nofib results:

```
   k-nucleotide          +0.0%     +8.8%    +11.0%    +11.0%      0.0%
         puzzle          +0.0%    +12.5%     0.244     0.246      0.0%
      typecheck           0.0%     +8.7%    +16.1%    +16.2%      0.0%
------------------------------------------------------------------------
--------
            Min          -0.0%     -0.0%    -34.4%    -35.5%    -25.0%
            Max          +0.0%    +12.5%    +48.9%    +49.4%    +10.6%
 Geometric Mean          +0.0%     +0.6%     +2.0%     +1.8%     -0.3%

```

But runtimes don't seem to be affected much, and the examples I looked
at were completely legitimate.  For example, in puzzle we have this:

```
position :: ItemType -> StateType ->  BankType
position Bono = bonoPos
position Edge = edgePos
position Larry = larryPos
position Adam = adamPos
```

where the identifiers on the rhs are all record selectors.  Previously
the profiler gave a stack that looked like

```
  position
  bonoPos
  ...
```

i.e. `bonoPos` was at the same level of the call stack as `position`,
but now it looks like

```
  position
   bonoPos
   ...
```

I used the normaliser from the testsuite to diff the profiling output
from other nofib programs and they all looked better.

Test Plan:
* the broken test passes
* validate
* compiled and ran all of nofib, measured perf, diff'd several .prof
files

Reviewers: niteria, erikd, austin, scpmw, bgamari

Reviewed By: bgamari

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2804

GHC Trac Issues: #5654, #10007

compiler/codeGen/StgCmmClosure.hs
includes/Cmm.h
rts/Apply.cmm
testsuite/tests/profiling/should_run/T5654-O0.hs [moved from testsuite/tests/profiling/should_run/T5654.hs with 100% similarity]
testsuite/tests/profiling/should_run/T5654-O0.prof.sample [new file with mode: 0644]
testsuite/tests/profiling/should_run/T5654-O1.hs [new file with mode: 0644]
testsuite/tests/profiling/should_run/T5654-O1.prof.sample [new file with mode: 0644]
testsuite/tests/profiling/should_run/T5654.prof.sample [deleted file]
testsuite/tests/profiling/should_run/T680.prof.sample
testsuite/tests/profiling/should_run/all.T

index 0ce119b..1943dc4 100644 (file)
@@ -565,8 +565,10 @@ getCallMethod dflags _ id _ n_args v_args _cg_loc
 
 getCallMethod dflags name id (LFReEntrant _ _ arity _ _) n_args _v_args _cg_loc
               _self_loop_info
-  | n_args == 0    = ASSERT( arity /= 0 )
-                     ReturnIt        -- No args at all
+  | n_args == 0 -- No args at all
+  && not (gopt Opt_SccProfilingOn dflags)
+     -- See Note [Evaluating functions with profiling] in rts/Apply.cmm
+  = ASSERT( arity /= 0 ) ReturnIt
   | n_args < arity = SlowCall        -- Not enough args
   | otherwise      = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity
 
index 0623c3e..276348a 100644 (file)
 //    explicit jumps, for use when we are doing the stack management
 //    ourselves.
 
+#if defined(PROFILING)
+// See Note [Evaluating functions with profiling] in rts/Apply.cmm
+#define ENTER(x) jump stg_ap_0_fast(x);
+#else
 #define ENTER(x) ENTER_(return,x)
+#endif
+
 #define ENTER_R1() ENTER_(RET_R1,R1)
 
 #define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
index 149a320..3a73ce0 100644 (file)
@@ -30,7 +30,114 @@ stg_ap_0_fast ( P_ fun )
     IF_DEBUG(sanity,
         ccall checkStackFrame(Sp "ptr"));
 
+#if !defined(PROFILING)
+
     ENTER(fun);
+
+#else
+
+/*
+  Note [Evaluating functions with profiling]
+
+  If we evaluate something like
+
+    let f = {-# SCC "f" #-} g
+
+  where g is a function, then updating the thunk for f to point to g
+  would be incorrect: we've lost the SCC annotation.  In general, when
+  we evaluate a function and the current CCS is different from the one
+  stored in the function, we need to return a function with the
+  correct CCS in it.
+
+  The mechanism we use to wrap the function is to create a
+  zero-argument PAP as a proxy object to hold the new CCS, and return
+  that.
+*/
+
+again:
+    W_  info;
+    W_ untaggedfun;
+    untaggedfun = UNTAG(fun);
+    info = %INFO_PTR(untaggedfun);
+    switch [INVALID_OBJECT .. N_CLOSURE_TYPES]
+        (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {
+        case
+            IND,
+            IND_STATIC:
+        {
+            fun = StgInd_indirectee(fun);
+            goto again;
+        }
+        case
+            FUN,
+            FUN_1_0,
+            FUN_0_1,
+            FUN_2_0,
+            FUN_1_1,
+            FUN_0_2,
+            FUN_STATIC,
+            BCO:
+        {
+            if (CCCS == StgHeader_ccs(untaggedfun)) {
+                return (fun);
+            } else {
+                // We're going to build a new PAP, with zero extra
+                // arguments and therefore the same arity as the
+                // original function.  In other words, we're using a
+                // zero-argument PAP as an indirection to the
+                // function, so that we can attach a different CCS to
+                // it.
+                HP_CHK_GEN(SIZEOF_StgPAP);
+                TICK_ALLOC_PAP(SIZEOF_StgPAP, 0);
+                // attribute this allocation to the "overhead of profiling"
+                CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
+                P_ pap;
+                W_ arity;
+                pap = Hp - SIZEOF_StgPAP + WDS(1);
+                SET_HDR(pap, stg_PAP_info, CCCS);
+                arity = TO_W_(StgFunInfoExtra_arity(%FUN_INFO(info)));
+                StgPAP_arity(pap) = arity;
+                StgPAP_fun(pap)   = fun;
+                StgPAP_n_args(pap) = 0;
+                return (pap);
+            }
+        }
+        case PAP:
+        {
+            if (CCCS == StgHeader_ccs(untaggedfun)) {
+                return (fun);
+            } else {
+                // We're going to copy this PAP, and put the new CCS in it
+                fun = untaggedfun;
+                W_ size;
+                size = SIZEOF_StgPAP + WDS(TO_W_(StgPAP_n_args(fun)));
+                HP_CHK_GEN(size);
+                TICK_ALLOC_PAP(size, 0);
+                // attribute this allocation to the "overhead of profiling"
+                CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
+                P_ pap;
+                pap = Hp - size + WDS(1);
+                SET_HDR(pap, stg_PAP_info, CCCS);
+                StgPAP_arity(pap) = StgPAP_arity(fun);
+                StgPAP_n_args(pap) = StgPAP_n_args(fun);
+                StgPAP_fun(pap)   = StgPAP_fun(fun);
+                W_ i;
+                i = TO_W_(StgPAP_n_args(fun));
+            loop:
+                if (i == 0) {
+                    return (pap);
+                }
+                i = i - 1;
+                StgPAP_payload(pap,i) = StgPAP_payload(fun,i);
+                goto loop;
+            }
+        }
+        default:
+        {
+            jump %ENTRY_CODE(info) (UNTAG(fun));
+        }
+    }
+#endif
 }
 
 /* -----------------------------------------------------------------------------
diff --git a/testsuite/tests/profiling/should_run/T5654-O0.prof.sample b/testsuite/tests/profiling/should_run/T5654-O0.prof.sample
new file mode 100644 (file)
index 0000000..2344d26
--- /dev/null
@@ -0,0 +1,28 @@
+       Thu Dec  8 11:35 2016 Time and Allocation Profiling Report  (Final)
+
+          T5654-O0 +RTS -p -RTS
+
+       total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
+       total alloc =      39,184 bytes  (excludes profiling overheads)
+
+COST CENTRE MODULE           SRC              %time %alloc
+
+MAIN        MAIN             <built-in>         0.0    1.0
+CAF         GHC.IO.Handle.FD <entire-module>    0.0   88.3
+CAF         GHC.IO.Encoding  <entire-module>    0.0    7.1
+CAF         GHC.Conc.Signal  <entire-module>    0.0    1.6
+
+
+                                                                         individual      inherited
+COST CENTRE MODULE                SRC                 no.     entries  %time %alloc   %time %alloc
+
+MAIN        MAIN                  <built-in>          104          0    0.0    1.0     0.0  100.0
+ CAF        Main                  <entire-module>     207          0    0.0    0.9     0.0    1.4
+  f         Main                  T5654-O0.hs:7:1-5   209          1    0.0    0.1     0.0    0.1
+  main      Main                  T5654-O0.hs:13:1-21 208          1    0.0    0.3     0.0    0.4
+   f        Main                  T5654-O0.hs:7:1-5   210          0    0.0    0.0     0.0    0.1
+    g       Main                  T5654-O0.hs:11:1-11 211          1    0.0    0.1     0.0    0.1
+ CAF        GHC.Conc.Signal       <entire-module>     201          0    0.0    1.6     0.0    1.6
+ CAF        GHC.IO.Encoding       <entire-module>     191          0    0.0    7.1     0.0    7.1
+ CAF        GHC.IO.Encoding.Iconv <entire-module>     189          0    0.0    0.6     0.0    0.6
+ CAF        GHC.IO.Handle.FD      <entire-module>     181          0    0.0   88.3     0.0   88.3
diff --git a/testsuite/tests/profiling/should_run/T5654-O1.hs b/testsuite/tests/profiling/should_run/T5654-O1.hs
new file mode 100644 (file)
index 0000000..d7f83bf
--- /dev/null
@@ -0,0 +1,14 @@
+-- Tests for a bug in the handling of cost-centre stacks in the
+-- runtime, where we lose the current cost-centre stack when
+-- evaluating a function.
+
+{-# NOINLINE f #-}
+f :: Int -> Int
+f = g   -- here we should remember the stack under which g was evaluated
+
+{-# NOINLINE g #-}
+g :: Int -> Int
+g x = x + 1
+
+main =  return $! f 3
+
diff --git a/testsuite/tests/profiling/should_run/T5654-O1.prof.sample b/testsuite/tests/profiling/should_run/T5654-O1.prof.sample
new file mode 100644 (file)
index 0000000..0e65631
--- /dev/null
@@ -0,0 +1,27 @@
+       Thu Dec  8 11:34 2016 Time and Allocation Profiling Report  (Final)
+
+          T5654-O1 +RTS -p -RTS
+
+       total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
+       total alloc =      39,064 bytes  (excludes profiling overheads)
+
+COST CENTRE MODULE           SRC              %time %alloc
+
+MAIN        MAIN             <built-in>         0.0    1.9
+CAF         GHC.IO.Handle.FD <entire-module>    0.0   88.6
+CAF         GHC.IO.Encoding  <entire-module>    0.0    7.1
+CAF         GHC.Conc.Signal  <entire-module>    0.0    1.6
+
+
+                                                                         individual      inherited
+COST CENTRE MODULE                SRC                 no.     entries  %time %alloc   %time %alloc
+
+MAIN        MAIN                  <built-in>          104          0    0.0    1.9     0.0  100.0
+ CAF        Main                  <entire-module>     207          0    0.0    0.0     0.0    0.2
+  main      Main                  T5654-O1.hs:13:1-21 208          1    0.0    0.1     0.0    0.2
+   f        Main                  T5654-O1.hs:7:1-5   209          1    0.0    0.0     0.0    0.0
+    g       Main                  T5654-O1.hs:11:1-11 210          1    0.0    0.0     0.0    0.0
+ CAF        GHC.Conc.Signal       <entire-module>     201          0    0.0    1.6     0.0    1.6
+ CAF        GHC.IO.Encoding       <entire-module>     191          0    0.0    7.1     0.0    7.1
+ CAF        GHC.IO.Encoding.Iconv <entire-module>     189          0    0.0    0.6     0.0    0.6
+ CAF        GHC.IO.Handle.FD      <entire-module>     181          0    0.0   88.6     0.0   88.6
diff --git a/testsuite/tests/profiling/should_run/T5654.prof.sample b/testsuite/tests/profiling/should_run/T5654.prof.sample
deleted file mode 100644 (file)
index 7d12acd..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-       Wed Nov 23 11:52 2011 Time and Allocation Profiling Report  (Final)
-
-          scc004 +RTS -p -RTS
-
-       total time  =        0.00 secs   (0 ticks @ 20 ms)
-       total alloc =      39,400 bytes  (excludes profiling overheads)
-
-COST CENTRE MODULE           %time %alloc
-
-MAIN        MAIN               0.0    1.7
-CAF         GHC.Conc.Signal    0.0    1.7
-CAF         GHC.IO.Encoding    0.0    7.1
-CAF         GHC.IO.Handle.FD   0.0   88.7
-
-
-                                                      individual     inherited
-COST CENTRE MODULE                  no.     entries  %time %alloc   %time %alloc
-
-MAIN        MAIN                     98           0    0.0    1.7     0.0  100.0
- CAF        GHC.IO.Handle.FD        124           0    0.0   88.7     0.0   88.7
- CAF        GHC.IO.Encoding.Iconv   116           0    0.0    0.6     0.0    0.6
- CAF        GHC.IO.Encoding         114           0    0.0    7.1     0.0    7.1
- CAF        GHC.Conc.Signal         106           0    0.0    1.7     0.0    1.7
- CAF        Main                    104           0    0.0    0.1     0.0    0.2
-  f         Main                    197           1    0.0    0.0     0.0    0.0
-  main      Main                    196           1    0.0    0.0     0.0    0.1
-   f        Main                    198           0    0.0    0.0     0.0    0.0
-    g       Main                    199           1    0.0    0.0     0.0    0.0
index d14bad9..1d52d71 100644 (file)
@@ -1,9 +1,9 @@
-       Sat Jun  4 11:59 2016 Time and Allocation Profiling Report  (Final)
+       Thu Dec  8 15:23 2016 Time and Allocation Profiling Report  (Final)
 
           T680 +RTS -hc -p -RTS
 
        total time  =        0.00 secs   (0 ticks @ 1000 us, 1 processor)
-       total alloc =     752,952 bytes  (excludes profiling overheads)
+       total alloc =     753,032 bytes  (excludes profiling overheads)
 
 COST CENTRE MODULE           SRC                    %time %alloc
 
@@ -17,26 +17,26 @@ foo.bar.\   Main             T680.hs:(8,11)-(9,38)    0.0   38.2
                                                                                     individual      inherited
 COST CENTRE          MODULE                SRC                   no.     entries  %time %alloc   %time %alloc
 
-MAIN                 MAIN                  <built-in>             46          0    0.0    0.1     0.0  100.0
- CAF                 Main                  <entire-module>        91          0    0.0    0.0     0.0   93.6
-  k                  Main                  T680.hs:12:1-17        95          1    0.0    0.0     0.0    0.0
-   foo               Main                  T680.hs:(2,1)-(9,38)   96          1    0.0    0.0     0.0    0.0
-    foo.bar          Main                  T680.hs:(5,3)-(9,38)  100          1    0.0    0.0     0.0    0.0
-     foo.bar.k'      Main                  T680.hs:6:9-34        104          1    0.0    0.0     0.0    0.0
-      k.\            Main                  T680.hs:12:16         105          1    0.0    0.0     0.0    0.0
-  main               Main                  T680.hs:20:1-14        92          1    0.0    0.0     0.0    0.0
-  r                  Main                  T680.hs:18:1-26        94          1    0.0    0.0     0.0   93.5
-   k                 Main                  T680.hs:12:1-17        97          0    0.0    0.0     0.0   93.5
-    foo              Main                  T680.hs:(2,1)-(9,38)   98          0    0.0    0.0     0.0   93.5
-     foo.\           Main                  T680.hs:3:12-40        99       4001    0.0   25.5     0.0   25.5
-     foo.bar         Main                  T680.hs:(5,3)-(9,38)  101          0    0.0   29.8     0.0   68.0
-      foo.bar.\      Main                  T680.hs:(8,11)-(9,38) 102       4001    0.0   38.2     0.0   38.2
-       foo.bar.\.k'' Main                  T680.hs:8:15-27       103       4000    0.0    0.0     0.0    0.0
-      foo.bar.k'     Main                  T680.hs:6:9-34        106          0    0.0    0.0     0.0    0.0
- CAF                 GHC.IO.Handle.FD      <entire-module>        84          0    0.0    4.6     0.0    4.6
- CAF                 GHC.IO.Handle.Text    <entire-module>        83          0    0.0    0.0     0.0    0.0
- CAF                 GHC.Conc.Signal       <entire-module>        80          0    0.0    0.1     0.0    0.1
- CAF                 GHC.IO.Encoding       <entire-module>        78          0    0.0    0.4     0.0    0.4
- CAF                 GHC.Show              <entire-module>        76          0    0.0    0.0     0.0    0.0
- CAF                 GHC.IO.Encoding.Iconv <entire-module>        62          0    0.0    0.0     0.0    0.0
- main                Main                  T680.hs:20:1-14        93          0    0.0    1.2     0.0    1.2
+MAIN                 MAIN                  <built-in>            104          0    0.0    0.1     0.0  100.0
+ CAF                 Main                  <entire-module>       207          0    0.0    0.0     0.0   93.5
+  k                  Main                  T680.hs:12:1-17       211          1    0.0    0.0     0.0    0.0
+   foo               Main                  T680.hs:(2,1)-(9,38)  212          1    0.0    0.0     0.0    0.0
+    foo.bar          Main                  T680.hs:(5,3)-(9,38)  216          1    0.0    0.0     0.0    0.0
+     foo.bar.k'      Main                  T680.hs:6:9-34        220          1    0.0    0.0     0.0    0.0
+      k.\            Main                  T680.hs:12:16         221          1    0.0    0.0     0.0    0.0
+  main               Main                  T680.hs:20:1-14       208          1    0.0    0.0     0.0    0.0
+  r                  Main                  T680.hs:18:1-26       210          1    0.0    0.0     0.0   93.5
+   k                 Main                  T680.hs:12:1-17       213          0    0.0    0.0     0.0   93.5
+    foo              Main                  T680.hs:(2,1)-(9,38)  214          0    0.0    0.0     0.0   93.5
+     foo.\           Main                  T680.hs:3:12-40       215       4001    0.0   25.5     0.0   25.5
+     foo.bar         Main                  T680.hs:(5,3)-(9,38)  217          0    0.0   29.8     0.0   68.0
+      foo.bar.\      Main                  T680.hs:(8,11)-(9,38) 218       4001    0.0   38.2     0.0   38.2
+       foo.bar.\.k'' Main                  T680.hs:8:15-27       219       4000    0.0    0.0     0.0    0.0
+      foo.bar.k'     Main                  T680.hs:6:9-34        222          0    0.0    0.0     0.0    0.0
+ CAF                 GHC.Conc.Signal       <entire-module>       201          0    0.0    0.1     0.0    0.1
+ CAF                 GHC.IO.Encoding       <entire-module>       191          0    0.0    0.4     0.0    0.4
+ CAF                 GHC.IO.Encoding.Iconv <entire-module>       189          0    0.0    0.0     0.0    0.0
+ CAF                 GHC.IO.Handle.FD      <entire-module>       181          0    0.0    4.6     0.0    4.6
+ CAF                 GHC.IO.Handle.Text    <entire-module>       179          0    0.0    0.0     0.0    0.0
+ CAF                 GHC.Show              <entire-module>       165          0    0.0    0.0     0.0    0.0
+ main                Main                  T680.hs:20:1-14       209          0    0.0    1.2     0.0    1.2
index 151b75b..5faca29 100644 (file)
@@ -47,7 +47,9 @@ test('scc002', [], compile_and_run, [''])
 test('scc003', [], compile_and_run,
      ['-fno-state-hack']) # Note [consistent stacks]
 
-test('T5654', [expect_broken(5654)], compile_and_run, [''])
+test('T5654-O0', [only_ways(['prof'])], compile_and_run, [''])
+
+test('T5654-O1', [only_ways(['profasm'])], compile_and_run, [''])
 
 test('T5654b-O0', [only_ways(['prof'])], compile_and_run, [''])