Make profiling work with multiple capabilities (+RTS -N)
authorSimon Marlow <marlowsd@gmail.com>
Mon, 28 Nov 2011 16:48:43 +0000 (16:48 +0000)
committerSimon Marlow <marlowsd@gmail.com>
Tue, 29 Nov 2011 12:21:18 +0000 (12:21 +0000)
This means that both time and heap profiling work for parallel
programs.  Main internal changes:

  - CCCS is no longer a global variable; it is now another
    pseudo-register in the StgRegTable struct.  Thus every
    Capability has its own CCCS.

  - There is a new built-in CCS called "IDLE", which records ticks for
    Capabilities in the idle state.  If you profile a single-threaded
    program with +RTS -N2, you'll see about 50% of time in "IDLE".

  - There is appropriate locking in rts/Profiling.c to protect the
    shared cost-centre-stack data structures.

This patch does enough to get it working, I have cut one big corner:
the cost-centre-stack data structure is still shared amongst all
Capabilities, which means that multiple Capabilities will race when
updating the "allocations" and "entries" fields of a CCS.  Not only
does this give unpredictable results, but it runs very slowly due to
cache line bouncing.

It is strongly recommended that you use -fno-prof-count-entries to
disable the "entries" count when profiling parallel programs. (I shall
add a note to this effect to the docs).

34 files changed:
compiler/cmm/CmmExpr.hs
compiler/cmm/CmmLex.x
compiler/cmm/CmmParse.y
compiler/cmm/PprCmmExpr.hs
compiler/codeGen/CgCase.lhs
compiler/codeGen/CgClosure.lhs
compiler/codeGen/CgForeignCall.hs
compiler/codeGen/CgProf.hs
compiler/codeGen/CgUtils.hs
compiler/codeGen/StgCmmForeign.hs
compiler/codeGen/StgCmmProf.hs
compiler/codeGen/StgCmmUtils.hs
includes/Cmm.h
includes/RtsAPI.h
includes/mkDerivedConstants.c
includes/rts/prof/CCS.h
includes/stg/MiscClosures.h
includes/stg/Regs.h
rts/Apply.cmm
rts/AutoApply.h
rts/Capability.c
rts/Exception.cmm
rts/Interpreter.c
rts/PrimOps.cmm
rts/Profiling.c
rts/Proftimer.c
rts/RetainerProfile.h
rts/RtsFlags.c
rts/Schedule.c
rts/StgMiscClosures.cmm
rts/StgStdThunks.cmm
rts/sm/GC.c
rts/sm/Storage.c
utils/genapply/GenApply.hs

index ef97a82..885639b 100644 (file)
@@ -343,7 +343,8 @@ data GlobalReg
   | SpLim              -- Stack limit
   | Hp                 -- Heap ptr; points to last occupied heap location.
   | HpLim              -- Heap limit register
-  | CurrentTSO         -- pointer to current thread's TSO
+  | CCCS                -- Current cost-centre stack
+  | CurrentTSO          -- pointer to current thread's TSO
   | CurrentNursery     -- pointer to allocation area
   | HpAlloc            -- allocation count for heap check failure
 
@@ -395,6 +396,7 @@ instance Ord GlobalReg where
    compare SpLim SpLim = EQ
    compare Hp Hp = EQ
    compare HpLim HpLim = EQ
+   compare CCCS CCCS = EQ
    compare CurrentTSO CurrentTSO = EQ
    compare CurrentNursery CurrentNursery = EQ
    compare HpAlloc HpAlloc = EQ
@@ -419,6 +421,8 @@ instance Ord GlobalReg where
    compare _ Hp = GT
    compare HpLim _ = LT
    compare _ HpLim = GT
+   compare CCCS _ = LT
+   compare _ CCCS = GT
    compare CurrentTSO _ = LT
    compare _ CurrentTSO = GT
    compare CurrentNursery _ = LT
index 1e2b20d..ddd681d 100644 (file)
@@ -106,8 +106,9 @@ $white_no_nl+               ;
   SpLim                        { global_reg SpLim }
   Hp                   { global_reg Hp }
   HpLim                        { global_reg HpLim }
-  CurrentTSO           { global_reg CurrentTSO }
-  CurrentNursery       { global_reg CurrentNursery }
+  CCCS                  { global_reg CCCS }
+  CurrentTSO            { global_reg CurrentTSO }
+  CurrentNursery        { global_reg CurrentNursery }
   HpAlloc              { global_reg HpAlloc }
   BaseReg              { global_reg BaseReg }
   
index 0a50f60..4e315dd 100644 (file)
@@ -775,8 +775,9 @@ isPtrGlobalReg Sp                = True
 isPtrGlobalReg SpLim                = True
 isPtrGlobalReg Hp                   = True
 isPtrGlobalReg HpLim                = True
-isPtrGlobalReg CurrentTSO           = True
-isPtrGlobalReg CurrentNursery       = True
+isPtrGlobalReg CCCS                  = True
+isPtrGlobalReg CurrentTSO            = True
+isPtrGlobalReg CurrentNursery        = True
 isPtrGlobalReg (VanillaReg _ VGcPtr) = True
 isPtrGlobalReg _                    = False
 
index 76fbdce..81ce84c 100644 (file)
@@ -271,6 +271,7 @@ pprGlobalReg gr
         SpLim          -> ptext (sLit "SpLim")
         Hp             -> ptext (sLit "Hp")
         HpLim          -> ptext (sLit "HpLim")
+        CCCS           -> ptext (sLit "CCCS")
         CurrentTSO     -> ptext (sLit "CurrentTSO")
         CurrentNursery -> ptext (sLit "CurrentNursery")
         HpAlloc        -> ptext (sLit "HpAlloc")
index e4fe386..a36621b 100644 (file)
@@ -670,6 +670,6 @@ restoreCurrentCostCentre Nothing     _freeit = nopC
 restoreCurrentCostCentre (Just slot) freeit
  = do  { sp_rel <- getSpRelOffset slot
        ; whenC freeit (freeStackSlots [slot])
-       ; stmtC (CmmStore curCCSAddr (CmmLoad sp_rel bWord)) }
+        ; stmtC (storeCurCCS (CmmLoad sp_rel bWord)) }
 \end{code}
 
index 243d59f..7bad851 100644 (file)
@@ -316,9 +316,10 @@ mkFunEntryCode cl_info cc reg_args stk_args sp_top reg_save_code body = do
         -- Do the business
   ; funWrapper cl_info reg_args reg_save_code $ do
        { tickyEnterFun cl_info
-        ; enterCostCentreFun cc $
-              CmmMachOp mo_wordSub [ CmmReg nodeReg
-                                   , CmmLit (mkIntCLit (funTag cl_info)) ]
+        ; enterCostCentreFun cc
+              (CmmMachOp mo_wordSub [ CmmReg nodeReg
+                                    , CmmLit (mkIntCLit (funTag cl_info)) ])
+              (node : map snd reg_args) -- live regs
 
         ; cgExpr body }
   }
index 7d67132..8d8b97d 100644 (file)
@@ -240,8 +240,8 @@ emitLoadThreadState = do
   emitOpenNursery
   -- and load the current cost centre stack from the TSO when profiling:
   when opt_SccProfilingOn $
-        stmtC (CmmStore curCCSAddr
-                (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_CCCS) bWord))
+        stmtC $ storeCurCCS $
+                  CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_CCCS) bWord
 
 emitOpenNursery :: Code
 emitOpenNursery = stmtsC [
index 13667c3..3e247ff 100644 (file)
@@ -21,7 +21,7 @@ module CgProf (
         enterCostCentreThunk,
         enterCostCentreFun,
         costCentreFrom,
-       curCCS, curCCSAddr,
+        curCCS, storeCurCCS,
        emitCostCentreDecl, emitCostCentreStackDecl, 
         emitSetCCC,
 
@@ -66,11 +66,10 @@ import Control.Monad
 
 -- Expression representing the current cost centre stack
 curCCS :: CmmExpr
-curCCS = CmmLoad curCCSAddr bWord
+curCCS = CmmReg (CmmGlobal CCCS)
 
--- Address of current CCS variable, for storing into
-curCCSAddr :: CmmExpr
-curCCSAddr = CmmLit (CmmLabel (mkCmmDataLabel rtsPackageId (fsLit "CCCS")))
+storeCurCCS :: CmmExpr -> CmmStmt
+storeCurCCS e = CmmAssign (CmmGlobal CCCS) e
 
 mkCCostCentre :: CostCentre -> CmmLit
 mkCCostCentre cc = CmmLabel (mkCCLabel cc)
@@ -135,14 +134,15 @@ profAlloc words ccs
 enterCostCentreThunk :: CmmExpr -> Code
 enterCostCentreThunk closure = 
   ifProfiling $ do 
-    stmtC $ CmmStore curCCSAddr (costCentreFrom closure)
+    stmtC $ storeCurCCS (costCentreFrom closure)
 
-enterCostCentreFun :: CostCentreStack -> CmmExpr -> Code
-enterCostCentreFun ccs closure =
+enterCostCentreFun :: CostCentreStack -> CmmExpr -> [GlobalReg] -> Code
+enterCostCentreFun ccs closure vols =
   ifProfiling $ do
     if isCurrentCCS ccs
-       then emitRtsCall rtsPackageId (fsLit "enterFunCCS")
-               [CmmHinted (costCentreFrom closure) AddrHint]
+       then emitRtsCallWithVols rtsPackageId (fsLit "enterFunCCS")
+               [CmmHinted (CmmReg (CmmGlobal BaseReg)) AddrHint,
+                CmmHinted (costCentreFrom closure) AddrHint] vols
        else return () -- top-level function, nothing to do
 
 ifProfiling :: Code -> Code
@@ -226,7 +226,7 @@ emitSetCCC cc tick push
     tmp <- newTemp bWord -- TODO FIXME NOW
     pushCostCentre tmp curCCS cc
     when tick $ stmtC (bumpSccCount (CmmReg (CmmLocal tmp)))
-    when push $ stmtC (CmmStore curCCSAddr (CmmReg (CmmLocal tmp)))
+    when push $ stmtC (storeCurCCS (CmmReg (CmmLocal tmp)))
 
 pushCostCentre :: LocalReg -> CmmExpr -> CostCentre -> Code
 pushCostCentre result ccs cc
index 85957e8..5274a17 100644 (file)
@@ -286,7 +286,7 @@ callerSaveVolatileRegs vols = (caller_save, caller_load)
     caller_save = foldr ($!) [] (map callerSaveGlobalReg    regs_to_save)
     caller_load = foldr ($!) [] (map callerRestoreGlobalReg regs_to_save)
 
-    system_regs = [Sp,SpLim,Hp,HpLim,CurrentTSO,CurrentNursery,
+    system_regs = [Sp,SpLim,Hp,HpLim,CCCS,CurrentTSO,CurrentNursery,
                    {-SparkHd,SparkTl,SparkBase,SparkLim,-}BaseReg ]
 
     regs_to_save = system_regs ++ vol_list
@@ -384,6 +384,9 @@ callerSaves Hp                  = True
 #ifdef CALLER_SAVES_HpLim
 callerSaves HpLim               = True
 #endif
+#ifdef CALLER_SAVES_CCCS
+callerSaves CCCS                = True
+#endif
 #ifdef CALLER_SAVES_CurrentTSO
 callerSaves CurrentTSO          = True
 #endif
@@ -423,6 +426,7 @@ baseRegOffset (LongReg 1)         = oFFSET_StgRegTable_rL1
 baseRegOffset (LongReg n)         = panic ("Registers above L1 are not supported (tried to use L" ++ show n ++ ")")
 baseRegOffset Hp                  = oFFSET_StgRegTable_rHp
 baseRegOffset HpLim               = oFFSET_StgRegTable_rHpLim
+baseRegOffset CCCS                = oFFSET_StgRegTable_rCCCS
 baseRegOffset CurrentTSO          = oFFSET_StgRegTable_rCurrentTSO
 baseRegOffset CurrentNursery      = oFFSET_StgRegTable_rCurrentNursery
 baseRegOffset HpAlloc             = oFFSET_StgRegTable_rHpAlloc
index 78aabd8..7c739c6 100644 (file)
@@ -210,8 +210,8 @@ loadThreadState tso stack = do
         openNursery,
         -- and load the current cost centre stack from the TSO when profiling:
         if opt_SccProfilingOn then
-          mkStore curCCSAddr
-                  (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_CCCS) ccsType)
+          storeCurCCS
+            (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_CCCS) ccsType)
         else mkNop]
 emitLoadThreadState :: LocalReg -> LocalReg -> FCode ()
 emitLoadThreadState tso stack = emit $ loadThreadState tso stack
index 13c1be7..d9b3583 100644 (file)
@@ -21,7 +21,7 @@ module StgCmmProf (
        dynProfHdr, profDynAlloc, profAlloc, staticProfHdr, initUpdFrameProf,
         enterCostCentreThunk,
         costCentreFrom,
-       curCCS, curCCSAddr,
+        curCCS, storeCurCCS,
         emitSetCCC,
 
        saveCurrentCostCentre, restoreCurrentCostCentre,
@@ -73,11 +73,10 @@ ccType :: CmmType   -- Type of a cost centre
 ccType = bWord
 
 curCCS :: CmmExpr
-curCCS = CmmLoad curCCSAddr ccsType
+curCCS = CmmReg (CmmGlobal CCCS)
 
--- Address of current CCS variable, for storing into
-curCCSAddr :: CmmExpr
-curCCSAddr = CmmLit (CmmLabel (mkCmmDataLabel rtsPackageId (fsLit "CCCS")))
+storeCurCCS :: CmmExpr -> CmmAGraph
+storeCurCCS e = mkAssign (CmmGlobal CCCS) e
 
 mkCCostCentre :: CostCentre -> CmmLit
 mkCCostCentre cc = CmmLabel (mkCCLabel cc)
@@ -150,7 +149,7 @@ restoreCurrentCostCentre :: Maybe LocalReg -> FCode ()
 restoreCurrentCostCentre Nothing 
   = return ()
 restoreCurrentCostCentre (Just local_cc)
-  = emit (mkStore curCCSAddr (CmmReg (CmmLocal local_cc)))
+  = emit (storeCurCCS (CmmReg (CmmLocal local_cc)))
 
 
 -------------------------------------------------------------------------------
@@ -186,7 +185,7 @@ profAlloc words ccs
 enterCostCentreThunk :: CmmExpr -> FCode ()
 enterCostCentreThunk closure = 
   ifProfiling $ do 
-    emit $ mkStore curCCSAddr (costCentreFrom closure)
+    emit $ storeCurCCS (costCentreFrom closure)
 
 ifProfiling :: FCode () -> FCode ()
 ifProfiling code
@@ -269,7 +268,7 @@ emitSetCCC cc tick push
     tmp <- newTemp ccsType -- TODO FIXME NOW
     pushCostCentre tmp curCCS cc
     when tick $ emit (bumpSccCount (CmmReg (CmmLocal tmp)))
-    when push $ emit (mkStore curCCSAddr (CmmReg (CmmLocal tmp)))
+    when push $ emit (storeCurCCS (CmmReg (CmmLocal tmp)))
 
 pushCostCentre :: LocalReg -> CmmExpr -> CostCentre -> FCode ()
 pushCostCentre result ccs cc
index f209005..c332713 100644 (file)
@@ -253,7 +253,7 @@ callerSaveVolatileRegs = (caller_save, caller_load)
     caller_save = catAGraphs (map callerSaveGlobalReg    regs_to_save)
     caller_load = catAGraphs (map callerRestoreGlobalReg regs_to_save)
 
-    system_regs = [ Sp,SpLim,Hp,HpLim,CurrentTSO,CurrentNursery
+    system_regs = [ Sp,SpLim,Hp,HpLim,CCCS,CurrentTSO,CurrentNursery
                    {- ,SparkHd,SparkTl,SparkBase,SparkLim -}
                  , BaseReg ]
 
@@ -366,6 +366,9 @@ callerSaves Hp                      = True
 #ifdef CALLER_SAVES_HpLim
 callerSaves HpLim              = True
 #endif
+#ifdef CALLER_SAVES_CCCS
+callerSaves CCCS                = True
+#endif
 #ifdef CALLER_SAVES_CurrentTSO
 callerSaves CurrentTSO         = True
 #endif
@@ -385,7 +388,8 @@ baseRegOffset SpLim           = oFFSET_StgRegTable_rSpLim
 baseRegOffset (LongReg 1)         = oFFSET_StgRegTable_rL1
 baseRegOffset Hp                 = oFFSET_StgRegTable_rHp
 baseRegOffset HpLim              = oFFSET_StgRegTable_rHpLim
-baseRegOffset CurrentTSO         = oFFSET_StgRegTable_rCurrentTSO
+baseRegOffset CCCS                = oFFSET_StgRegTable_rCCCS
+baseRegOffset CurrentTSO          = oFFSET_StgRegTable_rCurrentTSO
 baseRegOffset CurrentNursery     = oFFSET_StgRegTable_rCurrentNursery
 baseRegOffset HpAlloc            = oFFSET_StgRegTable_rHpAlloc
 baseRegOffset GCEnter1           = oFFSET_stgGCEnter1
index 641faa2..11c02b4 100644 (file)
    CCCS_ALLOC(bytes);
 
 /* CCS_ALLOC wants the size in words, because ccs->mem_alloc is in words */
-#define CCCS_ALLOC(__alloc) CCS_ALLOC(BYTES_TO_WDS(__alloc), W_[CCCS])
+#define CCCS_ALLOC(__alloc) CCS_ALLOC(BYTES_TO_WDS(__alloc), CCCS)
 
 #define HP_CHK_GEN_TICKY(alloc,liveness,reentry)       \
    HP_CHK_GEN(alloc,liveness,reentry);                 \
index 329b156..8d948f9 100644 (file)
@@ -37,6 +37,15 @@ typedef struct StgClosure_ *HaskellObj;
  */
 typedef struct Capability_ Capability;
 
+/*
+ * The public view of a Capability: we can be sure it starts with
+ * these two components (but it may have more private fields).
+ */
+typedef struct CapabilityPublic_ {
+    StgFunTable f;
+    StgRegTable r;
+} CapabilityPublic;
+
 /* ----------------------------------------------------------------------------
    RTS configuration settings, for passing to hs_init_ghc()
    ------------------------------------------------------------------------- */
index b02b6c8..a2c9160 100644 (file)
@@ -222,6 +222,7 @@ main(int argc, char *argv[])
     field_offset(StgRegTable, rSpLim);
     field_offset(StgRegTable, rHp);
     field_offset(StgRegTable, rHpLim);
+    field_offset(StgRegTable, rCCCS);
     field_offset(StgRegTable, rCurrentTSO);
     field_offset(StgRegTable, rCurrentNursery);
     field_offset(StgRegTable, rHpAlloc);
index 4692d16..9737fc9 100644 (file)
@@ -114,8 +114,6 @@ typedef struct _IndexTable {
    Pre-defined cost centres and cost centre stacks
    -------------------------------------------------------------------------- */
 
-extern CostCentreStack * RTS_VAR(CCCS);                /* current CCS */
 #if IN_STG_CODE
 
 extern StgWord CC_MAIN[];      
@@ -153,6 +151,9 @@ extern CostCentreStack CCS_DONT_CARE[];  // shouldn't ever get set
 extern CostCentre      CC_PINNED[];
 extern CostCentreStack CCS_PINNED[];     // pinned memory
 
+extern CostCentre      CC_IDLE[];
+extern CostCentreStack CCS_IDLE[];       // capability is idle
+
 #endif /* IN_STG_CODE */
 
 extern unsigned int RTS_VAR(CC_ID);     // global ids
@@ -165,7 +166,7 @@ extern unsigned int RTS_VAR(era);
  * ---------------------------------------------------------------------------*/
 
 CostCentreStack * pushCostCentre (CostCentreStack *, CostCentre *);
-void              enterFunCCS    (CostCentreStack *);
+void              enterFunCCS    (StgRegTable *reg, CostCentreStack *);
 
 /* -----------------------------------------------------------------------------
    Registering CCs and CCSs
index 45dc836..fcfdede 100644 (file)
@@ -488,7 +488,6 @@ extern StgWord RTS_VAR(stable_ptr_table);
 
 // Profiling.c
 extern unsigned int RTS_VAR(era);
-extern StgWord      RTS_VAR(CCCS);             /* current CCS */
 extern unsigned int RTS_VAR(entering_PAP);
 extern StgWord      RTS_VAR(CC_LIST);               /* registered CC list */
 extern StgWord      RTS_VAR(CCS_LIST);         /* registered CCS list */
index 1d0c00c..b7f0abd 100644 (file)
@@ -80,6 +80,7 @@ typedef struct StgRegTable_ {
   StgPtr         rSpLim;
   StgPtr         rHp;
   StgPtr         rHpLim;
+  struct _CostCentreStack * rCCCS;  // current cost-centre-stack
   struct StgTSO_ *     rCurrentTSO;
   struct nursery_ *    rNursery;
   struct bdescr_ *     rCurrentNursery; /* Hp/HpLim point into this block */
index 5397fc5..a2d4a7e 100644 (file)
@@ -86,7 +86,7 @@ stg_PAP_apply
   TICK_ENT_PAP();
   LDV_ENTER(pap);
 #ifdef PROFILING
-  foreign "C" enterFunCCS(StgHeader_ccs(pap));
+  foreign "C" enterFunCCS(BaseReg "ptr", StgHeader_ccs(pap) "ptr");
 #endif
 
   // Reload the stack 
index 547c5d2..d0c5c3f 100644 (file)
@@ -22,7 +22,7 @@
     TICK_ALLOC_HEAP_NOCTR(BYTES_TO_WDS(size));         \
     TICK_ALLOC_PAP(n+1 /* +1 for the FUN */, 0);       \
     pap = Hp + WDS(1) - size;                          \
-    SET_HDR(pap, stg_PAP_info, W_[CCCS]);              \
+    SET_HDR(pap, stg_PAP_info, CCCS);                   \
     StgPAP_arity(pap) = HALF_W_(arity - m);            \
     StgPAP_fun(pap)   = R1;                            \
     StgPAP_n_args(pap) = HALF_W_(n);                   \
@@ -52,7 +52,7 @@
      TICK_ALLOC_HEAP_NOCTR(BYTES_TO_WDS(size));                        \
      TICK_ALLOC_PAP(n+1 /* +1 for the FUN */, 0);              \
      new_pap = Hp + WDS(1) - size;                             \
-     SET_HDR(new_pap, stg_PAP_info, W_[CCCS]);                 \
+     SET_HDR(new_pap, stg_PAP_info, CCCS);                      \
      StgPAP_arity(new_pap) = HALF_W_(arity - m);               \
      W_ n_args;                                                        \
      n_args = TO_W_(StgPAP_n_args(pap));                       \
 
 // Jump to target, saving CCCS and restoring it on return
 #if defined(PROFILING)
-#define jump_SAVE_CCCS(target) \
-    Sp(-1) = W_[CCCS]; \
-    Sp(-2) = stg_restore_cccs_info; \
-    Sp_adj(-2); \
+#define jump_SAVE_CCCS(target)                  \
+    Sp(-1) = CCCS;                              \
+    Sp(-2) = stg_restore_cccs_info;             \
+    Sp_adj(-2);                                 \
     jump (target)
 #else
 #define jump_SAVE_CCCS(target) jump (target)
index 3b45dec..fd9f64f 100644 (file)
@@ -46,7 +46,7 @@ volatile StgWord waiting_for_gc = 0;
 /* Let foreign code get the current Capability -- assuming there is one!
  * This is useful for unsafe foreign calls because they are called with
  * the current Capability held, but they are not passed it. For example,
- * see see the integer-gmp package which calls allocateLocal() in its
+ * see see the integer-gmp package which calls allocate() in its
  * stgAllocForGMP() function (which gets called by gmp functions).
  * */
 Capability * rts_unsafeGetMyCapability (void)
@@ -265,6 +265,10 @@ initCapability( Capability *cap, nat i )
     cap->context_switch = 0;
     cap->pinned_object_block = NULL;
 
+#ifdef PROFILING
+    cap->r.rCCCS = CCS_SYSTEM;
+#endif
+
     traceCapsetAssignCap(CAPSET_OSPROCESS_DEFAULT, i);
     traceCapsetAssignCap(CAPSET_CLOCKDOMAIN_DEFAULT, i);
 #if defined(THREADED_RTS)
@@ -453,6 +457,9 @@ releaseCapability_ (Capability* cap,
        }
     }
 
+#ifdef PROFILING
+    cap->r.rCCCS = CCS_IDLE;
+#endif
     last_free_capability = cap;
     debugTrace(DEBUG_sched, "freeing capability %d", cap->no);
 }
@@ -604,6 +611,9 @@ waitForReturnCapability (Capability **pCap, Task *task)
 
     }
 
+#ifdef PROFILING
+    cap->r.rCCCS = CCS_SYSTEM;
+#endif
     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
 
     debugTrace(DEBUG_sched, "resuming capability %d", cap->no);
@@ -676,14 +686,19 @@ yieldCapability (Capability** pCap, Task *task)
                task->next = NULL;
                 cap->n_spare_workers--;
             }
-           cap->running_task = task;
+
+            cap->running_task = task;
            RELEASE_LOCK(&cap->lock);
            break;
        }
 
-       debugTrace(DEBUG_sched, "resuming capability %d", cap->no);
+        debugTrace(DEBUG_sched, "resuming capability %d", cap->no);
        ASSERT(cap->running_task == task);
 
+#ifdef PROFILING
+        cap->r.rCCCS = CCS_SYSTEM;
+#endif
+
     *pCap = cap;
 
     ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
index 1192db7..78907c4 100644 (file)
@@ -369,7 +369,7 @@ stg_catchzh
   
     /* Set up the catch frame */
     Sp = Sp - SIZEOF_StgCatchFrame;
-    SET_HDR(Sp,stg_catch_frame_info,W_[CCCS]);
+    SET_HDR(Sp,stg_catch_frame_info,CCCS);
     
     StgCatchFrame_handler(Sp) = R2;
     StgCatchFrame_exceptions_blocked(Sp) = 
@@ -427,7 +427,7 @@ stg_raisezh
      */
     if (RtsFlags_ProfFlags_showCCSOnException(RtsFlags) != 0::I32) {
         SAVE_THREAD_STATE();
-        foreign "C" fprintCCS_stderr(W_[CCCS] "ptr",
+        foreign "C" fprintCCS_stderr(CCCS "ptr",
                                      exception "ptr",
                                      CurrentTSO "ptr") [];
         LOAD_THREAD_STATE();
index da151e1..2eac1cd 100644 (file)
@@ -614,7 +614,7 @@ do_apply:
                // build a new PAP and return it.
                StgPAP *new_pap;
                new_pap = (StgPAP *)allocate(cap, PAP_sizeW(pap->n_args + m));
-               SET_HDR(new_pap,&stg_PAP_info,CCCS);
+                SET_HDR(new_pap,&stg_PAP_info,cap->r.rCCCS);
                new_pap->arity = pap->arity - n;
                new_pap->n_args = pap->n_args + m;
                new_pap->fun = pap->fun;
@@ -659,7 +659,7 @@ do_apply:
                StgPAP *pap;
                nat i;
                pap = (StgPAP *)allocate(cap, PAP_sizeW(m));
-               SET_HDR(pap, &stg_PAP_info,CCCS);
+                SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS);
                pap->arity = arity - n;
                pap->fun = obj;
                pap->n_args = m;
index 8836d3b..2ca347e 100644 (file)
@@ -63,7 +63,7 @@ stg_newByteArrayzh
     words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
     ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) [];
     TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
-    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(p) = n;
     RET_P(p);
 }
@@ -96,7 +96,7 @@ stg_newPinnedByteArrayzh
        to BA_ALIGN bytes: */
     p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
 
-    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(p) = n;
     RET_P(p);
 }
@@ -136,7 +136,7 @@ stg_newAlignedPinnedByteArrayzh
        <alignment> is a power of 2, which is technically not guaranteed */
     p = p + ((-p - SIZEOF_StgArrWords) & (alignment - 1));
 
-    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(p) = n;
     RET_P(p);
 }
@@ -157,7 +157,7 @@ stg_newArrayzh
     ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2];
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
 
-    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
+    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
     StgMutArrPtrs_ptrs(arr) = n;
     StgMutArrPtrs_size(arr) = size;
 
@@ -225,7 +225,7 @@ stg_newMutVarzh
     ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
-    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
+    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS);
     StgMutVar_var(mv) = R1;
     
     RET_P(mv);
@@ -297,21 +297,21 @@ stg_atomicModifyMutVarzh
    TICK_ALLOC_THUNK_2();
    CCCS_ALLOC(THUNK_2_SIZE);
    z = Hp - THUNK_2_SIZE + WDS(1);
-   SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
+   SET_HDR(z, stg_ap_2_upd_info, CCCS);
    LDV_RECORD_CREATE(z);
    StgThunk_payload(z,0) = f;
 
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
    y = z - THUNK_1_SIZE;
-   SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
+   SET_HDR(y, stg_sel_0_upd_info, CCCS);
    LDV_RECORD_CREATE(y);
    StgThunk_payload(y,0) = z;
 
    TICK_ALLOC_THUNK_1();
    CCCS_ALLOC(THUNK_1_SIZE);
    r = y - THUNK_1_SIZE;
-   SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
+   SET_HDR(r, stg_sel_1_upd_info, CCCS);
    LDV_RECORD_CREATE(r);
    StgThunk_payload(r,0) = z;
 
@@ -353,7 +353,7 @@ stg_mkWeakzh
   ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh );
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
-  SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+  SET_HDR(w, stg_WEAK_info, CCCS);
 
   // We don't care about cfinalizer here.
   // Should StgWeak_cfinalizer(w) be stg_NO_FINALIZER_closure or
@@ -397,14 +397,14 @@ stg_mkWeakForeignEnvzh
   ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh );
 
   w = Hp - SIZEOF_StgWeak + WDS(1);
-  SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+  SET_HDR(w, stg_WEAK_info, CCCS);
 
   payload_words = 4;
   words         = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
   ("ptr" p)     = foreign "C" allocate(MyCapability() "ptr", words) [];
 
   TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
-  SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+  SET_HDR(p, stg_ARR_WORDS_info, CCCS);
 
   StgArrWords_bytes(p)     = WDS(payload_words);
   StgArrWords_payload(p,0) = fptr;
@@ -877,7 +877,7 @@ stg_atomicallyzh
   Sp = Sp - SIZEOF_StgAtomicallyFrame;
   frame = Sp;
 
-  SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
+  SET_HDR(frame,stg_atomically_frame_info, CCCS);
   StgAtomicallyFrame_code(frame) = R1;
   StgAtomicallyFrame_result(frame) = NO_TREC;
   StgAtomicallyFrame_next_invariant_to_check(frame) = END_INVARIANT_CHECK_QUEUE;
@@ -903,7 +903,7 @@ stg_catchSTMzh
   Sp = Sp - SIZEOF_StgCatchSTMFrame;
   frame = Sp;
 
-  SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
+  SET_HDR(frame, stg_catch_stm_frame_info, CCCS);
   StgCatchSTMFrame_handler(frame) = R2;
   StgCatchSTMFrame_code(frame) = R1;
 
@@ -941,7 +941,7 @@ stg_catchRetryzh
   Sp = Sp - SIZEOF_StgCatchRetryFrame;
   frame = Sp;
   
-  SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
+  SET_HDR(frame, stg_catch_retry_frame_info, CCCS);
   StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
   StgCatchRetryFrame_first_code(frame) = R1;
   StgCatchRetryFrame_alt_code(frame) = R2;
@@ -1153,7 +1153,7 @@ stg_newMVarzh
     ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh );
   
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
-    SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]);
+    SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
         // MVARs start dirty: generation 0 has no mutable list
     StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
@@ -1527,7 +1527,7 @@ stg_makeStableNamezh
      */
     if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
        sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
-       SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
+       SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS);
        StgStableName_sn(sn_obj) = index;
        snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
     } else {
@@ -1578,7 +1578,7 @@ stg_newBCOzh
     ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh );
 
     bco = Hp - bytes + WDS(1);
-    SET_HDR(bco, stg_BCO_info, W_[CCCS]);
+    SET_HDR(bco, stg_BCO_info, CCCS);
     
     StgBCO_instrs(bco)     = R1;
     StgBCO_literals(bco)   = R2;
@@ -1617,7 +1617,7 @@ stg_mkApUpd0zh
     CCCS_ALLOC(SIZEOF_StgAP);
 
     ap = Hp - SIZEOF_StgAP + WDS(1);
-    SET_HDR(ap, stg_AP_info, W_[CCCS]);
+    SET_HDR(ap, stg_AP_info, CCCS);
     
     StgAP_n_args(ap) = HALF_W_(0);
     StgAP_fun(ap) = R1;
@@ -1668,7 +1668,7 @@ out:
     ptrs_arr  = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1);
     nptrs_arr = Hp - nptrs_arr_sz + WDS(1);
 
-    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]);
+    SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, CCCS);
     StgMutArrPtrs_ptrs(ptrs_arr) = ptrs;
     StgMutArrPtrs_size(ptrs_arr) = ptrs + ptrs_arr_cards;
 
@@ -1683,7 +1683,7 @@ for:
        allocated in the nursery.  The GC will fill it in if/when the array
        is promoted. */
     
-    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, W_[CCCS]);
+    SET_HDR(nptrs_arr, stg_ARR_WORDS_info, CCCS);
     StgArrWords_bytes(nptrs_arr) = WDS(nptrs);
     p = 0;
 for2:
index c393c8f..ac2708e 100644 (file)
@@ -18,6 +18,7 @@
 #include "Arena.h"
 #include "RetainerProfile.h"
 #include "Printer.h"
+#include "Capability.h"
 
 #include <string.h>
 
@@ -51,16 +52,16 @@ FILE *prof_file;
 static char *hp_filename;      /* heap profile (hp2ps style) log file */
 FILE *hp_file;
 
-/* The Current Cost Centre Stack (for attributing costs)
- */
-CostCentreStack *CCCS;
-
 /* Linked lists to keep track of CCs and CCSs that haven't
  * been declared in the log file yet
  */
 CostCentre      *CC_LIST  = NULL;
 CostCentreStack *CCS_LIST = NULL;
 
+#ifdef THREADED_RTS
+Mutex ccs_mutex;
+#endif
+
 /*
  * Built-in cost centres and cost-centre stacks:
  *
@@ -92,6 +93,7 @@ CC_DECLARE(CC_GC,        "GC",          "GC",        CC_NOT_CAF, );
 CC_DECLARE(CC_OVERHEAD,  "OVERHEAD_of", "PROFILING", CC_NOT_CAF, );
 CC_DECLARE(CC_DONT_CARE, "DONT_CARE",   "MAIN",      CC_NOT_CAF, );
 CC_DECLARE(CC_PINNED,    "PINNED",      "SYSTEM",    CC_NOT_CAF, );
+CC_DECLARE(CC_IDLE,      "IDLE",        "IDLE",      CC_NOT_CAF, );
 
 CCS_DECLARE(CCS_MAIN,      CC_MAIN,       );
 CCS_DECLARE(CCS_SYSTEM,            CC_SYSTEM,     );
@@ -99,6 +101,7 @@ CCS_DECLARE(CCS_GC,         CC_GC,         );
 CCS_DECLARE(CCS_OVERHEAD,   CC_OVERHEAD,   );
 CCS_DECLARE(CCS_DONT_CARE,  CC_DONT_CARE,  );
 CCS_DECLARE(CCS_PINNED,     CC_PINNED,     );
+CCS_DECLARE(CCS_IDLE,       CC_IDLE,       );
 
 /*
  * Static Functions
@@ -143,7 +146,12 @@ initProfiling1 (void)
     prof_arena = newArena();
 
     /* for the benefit of allocate()... */
-    CCCS = CCS_SYSTEM;
+    {
+        nat n;
+        for (n=0; n < n_capabilities; n++) {
+            capabilities[n].r.rCCCS = CCS_SYSTEM;
+        }
+    }
 }
 
 void
@@ -157,8 +165,6 @@ initProfiling2 (void)
 {
     CostCentreStack *ccs, *next;
 
-    CCCS = CCS_SYSTEM;
-
     /* Set up the log file, and dump the header and cost centre
      * information into it.
      */
@@ -173,12 +179,14 @@ initProfiling2 (void)
     REGISTER_CC(CC_OVERHEAD);
     REGISTER_CC(CC_DONT_CARE);
     REGISTER_CC(CC_PINNED);
+    REGISTER_CC(CC_IDLE);
 
     REGISTER_CCS(CCS_SYSTEM);
     REGISTER_CCS(CCS_GC);
     REGISTER_CCS(CCS_OVERHEAD);
     REGISTER_CCS(CCS_DONT_CARE);
     REGISTER_CCS(CCS_PINNED);
+    REGISTER_CCS(CCS_IDLE);
     REGISTER_CCS(CCS_MAIN);
 
     /* find all the registered cost centre stacks, and make them
@@ -310,12 +318,17 @@ endProfiling ( void )
 
 // implements  c1 ++> c2,  where c1 and c2 are equal depth
 //
-static void enterFunEqualStacks (CostCentreStack *ccs, CostCentreStack *ccsfn)
+static CostCentreStack *
+enterFunEqualStacks (CostCentreStack *ccs0,
+                     CostCentreStack *ccsapp,
+                     CostCentreStack *ccsfn)
 {
-    ASSERT(ccs->depth == ccsfn->depth);
-    if (ccs == ccsfn) return;
-    enterFunEqualStacks(ccs->prevStack, ccsfn->prevStack);
-    CCCS = pushCostCentre(CCCS, ccsfn->cc);
+    ASSERT(ccsapp->depth == ccsfn->depth);
+    if (ccsapp == ccsfn) return ccs0;
+    return pushCostCentre(enterFunEqualStacks(ccs0,
+                                              ccsapp->prevStack,
+                                              ccsfn->prevStack),
+                          ccsfn->cc);
 }
 
 // implements  c1 ++> c2,  where c2 is deeper than c1.
@@ -323,21 +336,25 @@ static void enterFunEqualStacks (CostCentreStack *ccs, CostCentreStack *ccsfn)
 // enterFunEqualStacks(), and then push on the elements that we
 // dropped in reverse order.
 //
-static void enterFunCurShorter (CostCentreStack *ccsfn, StgWord n)
+static CostCentreStack *
+enterFunCurShorter (CostCentreStack *ccsapp, CostCentreStack *ccsfn, StgWord n)
 {
     if (n == 0) {
-        ASSERT(ccsfn->depth == CCCS->depth);
-        enterFunEqualStacks(CCCS,ccsfn);
-        return;
+        ASSERT(ccsfn->depth == ccsapp->depth);
+        return enterFunEqualStacks(ccsapp,ccsapp,ccsfn);;
+    } else {
+        ASSERT(ccsfn->depth > ccsapp->depth);
+        return pushCostCentre(enterFunCurShorter(ccsapp, ccsfn->prevStack, n-1),
+                              ccsfn->cc);
     }
-    enterFunCurShorter(ccsfn->prevStack, n-1);
-    CCCS = pushCostCentre(CCCS, ccsfn->cc);
 }
 
-void enterFunCCS ( CostCentreStack *ccsfn )
+void enterFunCCS (StgRegTable *reg, CostCentreStack *ccsfn)
 {
+    CostCentreStack *ccsapp;
+
     // common case 1: both stacks are the same
-    if (ccsfn == CCCS) {
+    if (ccsfn == reg->rCCCS) {
         return;
     }
 
@@ -346,34 +363,38 @@ void enterFunCCS ( CostCentreStack *ccsfn )
         return;
     }
 
+    ccsapp = reg->rCCCS;
+    reg->rCCCS = CCS_OVERHEAD;
+
     // common case 3: the stacks are completely different (e.g. one is a
     // descendent of MAIN and the other of a CAF): we append the whole
     // of the function stack to the current CCS.
-    if (ccsfn->root != CCCS->root) {
-        CCCS = appendCCS(CCCS,ccsfn);
+    if (ccsfn->root != ccsapp->root) {
+        reg->rCCCS = appendCCS(ccsapp,ccsfn);
         return;
     }
 
-    // uncommon case 4: CCCS is deeper than ccsfn
-    if (CCCS->depth > ccsfn->depth) {
+    // uncommon case 4: ccsapp is deeper than ccsfn
+    if (ccsapp->depth > ccsfn->depth) {
         nat i, n;
-        CostCentreStack *tmp = CCCS;
-        n = CCCS->depth - ccsfn->depth;
+        CostCentreStack *tmp = ccsapp;
+        n = ccsapp->depth - ccsfn->depth;
         for (i = 0; i < n; i++) {
             tmp = tmp->prevStack;
         }
-        enterFunEqualStacks(tmp,ccsfn);
+        reg->rCCCS = enterFunEqualStacks(ccsapp,tmp,ccsfn);
         return;
     }
 
     // uncommon case 5: ccsfn is deeper than CCCS
-    if (ccsfn->depth > CCCS->depth) {
-        enterFunCurShorter(ccsfn, ccsfn->depth - CCCS->depth);
+    if (ccsfn->depth > ccsapp->depth) {
+        reg->rCCCS = enterFunCurShorter(ccsapp, ccsfn,
+                                        ccsfn->depth - ccsapp->depth);
         return;
     }
 
     // uncommon case 6: stacks are equal depth, but different
-    enterFunEqualStacks(CCCS,ccsfn);
+    reg->rCCCS = enterFunEqualStacks(ccsapp,ccsapp,ccsfn);
 }
 
 /* -----------------------------------------------------------------------------
@@ -477,20 +498,41 @@ appendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 )
 CostCentreStack *
 pushCostCentre (CostCentreStack *ccs, CostCentre *cc)
 {
-    CostCentreStack *temp_ccs;
-  
-    if (ccs == EMPTY_STACK)
-        return actualPush(ccs,cc);
-    else {
-        if (ccs->cc == cc)
+    CostCentreStack *temp_ccs, *ret;
+    IndexTable *ixtable;
+
+    if (ccs == EMPTY_STACK) {
+        ACQUIRE_LOCK(&ccs_mutex);
+        ret = actualPush(ccs,cc);
+    }
+    else
+    {
+        if (ccs->cc == cc) {
             return ccs;
-        else {
+        else {
             // check if we've already memoized this stack
-            temp_ccs = isInIndexTable(ccs->indexTable,cc);
+            ixtable = ccs->indexTable;
+            temp_ccs = isInIndexTable(ixtable,cc);
       
-            if (temp_ccs != EMPTY_STACK)
+            if (temp_ccs != EMPTY_STACK) {
                 return temp_ccs;
-            else {
+            } else {
+
+                // not in the IndexTable, now we take the lock:
+                ACQUIRE_LOCK(&ccs_mutex);
+
+                if (ccs->indexTable != ixtable)
+                {
+                    // someone modified ccs->indexTable while
+                    // we did not hold the lock, so we must
+                    // check it again:
+                    temp_ccs = isInIndexTable(ixtable,cc);
+                    if (temp_ccs != EMPTY_STACK)
+                    {
+                        RELEASE_LOCK(&ccs_mutex);
+                        return temp_ccs;
+                    }
+                }
                 temp_ccs = checkLoop(ccs,cc);
                 if (temp_ccs != NULL) {
                     // This CC is already in the stack somewhere.
@@ -510,13 +552,16 @@ pushCostCentre (CostCentreStack *ccs, CostCentre *cc)
 #endif
                     ccs->indexTable = addToIndexTable (ccs->indexTable,
                                                        new_ccs, cc, 1);
-                    return new_ccs;
+                    ret = new_ccs;
                 } else {
-                    return actualPush (ccs,cc);
+                    ret = actualPush (ccs,cc);
                 }
             }
         }
     }
+
+    RELEASE_LOCK(&ccs_mutex);
+    return ret;
 }
 
 static CostCentreStack *
@@ -801,11 +846,12 @@ reportCCSProfiling( void )
        fprintf(prof_file, " %s", prog_argv[count]);
     fprintf(prof_file, "\n\n");
 
-    fprintf(prof_file, "\ttotal time  = %11.2f secs   (%lu ticks @ %d us)\n",
+    fprintf(prof_file, "\ttotal time  = %11.2f secs   (%lu ticks @ %d us, %d processor%s)\n",
             ((double) total_prof_ticks *
-             (double) RtsFlags.MiscFlags.tickInterval) / TIME_RESOLUTION,
+             (double) RtsFlags.MiscFlags.tickInterval) / (TIME_RESOLUTION * n_capabilities),
            (unsigned long) total_prof_ticks,
-            (int) TimeToUS(RtsFlags.MiscFlags.tickInterval));
+            (int) TimeToUS(RtsFlags.MiscFlags.tickInterval),
+            n_capabilities, n_capabilities > 1 ? "s" : "");
 
     fprintf(prof_file, "\ttotal alloc = %11s bytes",
            showStgWord64(total_alloc * sizeof(W_),
index 76d7679..569f087 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "Profiling.h"
 #include "Proftimer.h"
+#include "Capability.h"
 
 #ifdef PROFILING
 static rtsBool do_prof_ticks = rtsFalse;       // enable profiling ticks
@@ -73,7 +74,10 @@ handleProfTick(void)
 #ifdef PROFILING
     total_ticks++;
     if (do_prof_ticks) {
-       CCCS->time_ticks++;
+        nat n;
+        for (n=0; n < n_capabilities; n++) {
+            capabilities[n].r.rCCCS->time_ticks++;
+        }
     }
 #endif
 
index b2adf71..0e75327 100644 (file)
@@ -21,6 +21,9 @@ void endRetainerProfiling  ( void );
 void retainerProfile       ( void );
 void resetStaticObjectForRetainerProfiling( StgClosure *static_objects );
 
+// flip is either 1 or 0, changed at the beginning of retainerProfile()
+// It is used to tell whether a retainer set has been touched so far
+// during this pass.
 extern StgWord flip;
 
 // extract the retainer set field from c
index 3e3290d..650c4f9 100644 (file)
@@ -1147,12 +1147,6 @@ error = rtsTrue;
                       errorBelch("Using large values for -N is not allowed by default. Link with -rtsopts to allow full control.");
                       stg_exit(EXIT_FAILURE);
                     }
-#if defined(PROFILING)
-                    if (nNodes > 1) {
-                        errorBelch("bad option %s: only -N1 is supported with profiling", rts_argv[arg]);
-                     error = rtsTrue;
-                    }
-#endif
                     RtsFlags.ParFlags.nNodes = (nat)nNodes;
                }
                ) break;
index 8c30500..04a66e3 100644 (file)
@@ -501,7 +501,7 @@ run_thread:
     // Costs for the scheduler are assigned to CCS_SYSTEM
     stopHeapProfTimer();
 #if defined(PROFILING)
-    CCCS = CCS_SYSTEM;
+    cap->r.rCCCS = CCS_SYSTEM;
 #endif
     
     schedulePostRunThread(cap,t);
@@ -2262,7 +2262,7 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
            if (raise_closure == NULL) {
                raise_closure = 
                    (StgThunk *)allocate(cap,sizeofW(StgThunk)+1);
-               SET_HDR(raise_closure, &stg_raise_info, CCCS);
+                SET_HDR(raise_closure, &stg_raise_info, cap->r.rCCCS);
                raise_closure->payload[0] = exception;
            }
             updateThunk(cap, tso, ((StgUpdateFrame *)p)->updatee,
index 5ddc1ac..e4b128f 100644 (file)
@@ -42,7 +42,7 @@ INFO_TABLE_RET (stg_stack_underflow_frame, UNDERFLOW_FRAME, P_ unused)
 INFO_TABLE_RET (stg_restore_cccs, RET_SMALL, W_ cccs)
 {
 #if defined(PROFILING)
-    W_[CCCS] = Sp(1);
+    CCCS = Sp(1);
 #endif
     Sp_adj(2);
     jump %ENTRY_CODE(Sp(0));
index be85999..171ab52 100644 (file)
@@ -30,8 +30,8 @@
 #define NOUPD_FRAME_SIZE    (SIZEOF_StgHeader)
 
 #ifdef PROFILING
-#define SAVE_CCCS(fs)          StgHeader_ccs(Sp-fs) = W_[CCCS]
-#define GET_SAVED_CCCS  W_[CCCS] = StgHeader_ccs(Sp)
+#define SAVE_CCCS(fs)   StgHeader_ccs(Sp-fs) = CCCS
+#define GET_SAVED_CCCS  CCCS = StgHeader_ccs(Sp)
 #define RET_PARAMS      W_ unused1, W_ unused2
 #else
 #define SAVE_CCCS(fs)   /* empty */
index 7892280..733c2d6 100644 (file)
@@ -189,7 +189,7 @@ GarbageCollect (rtsBool force_major_gc,
 #endif
 
 #ifdef PROFILING
-  CostCentreStack *prev_CCS;
+  CostCentreStack *save_CCS[n_capabilities];
 #endif
 
   ACQUIRE_SM_LOCK;
@@ -221,8 +221,10 @@ GarbageCollect (rtsBool force_major_gc,
 
   // attribute any costs to CCS_GC 
 #ifdef PROFILING
-  prev_CCS = CCCS;
-  CCCS = CCS_GC;
+  for (n = 0; n < n_capabilities; n++) {
+      save_CCS[n] = capabilities[n].r.rCCCS;
+      capabilities[n].r.rCCCS = CCS_GC;
+  }
 #endif
 
   /* Approximate how much we allocated.  
@@ -626,10 +628,8 @@ GarbageCollect (rtsBool force_major_gc,
 #ifdef PROFILING
   // resetStaticObjectForRetainerProfiling() must be called before
   // zeroing below.
-  if (n_gc_threads > 1) {
-      barf("profiling is currently broken with multi-threaded GC");
-      // ToDo: fix the gct->scavenged_static_objects below
-  }
+
+  // ToDo: fix the gct->scavenged_static_objects below
   resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects);
 #endif
 
@@ -704,7 +704,9 @@ GarbageCollect (rtsBool force_major_gc,
 
   // restore enclosing cost centre 
 #ifdef PROFILING
-  CCCS = prev_CCS;
+  for (n = 0; n < n_capabilities; n++) {
+      capabilities[n].r.rCCCS = save_CCS[n];
+  }
 #endif
 
 #ifdef DEBUG
index 1dad6c8..be3badf 100644 (file)
@@ -608,7 +608,7 @@ allocate (Capability *cap, lnat n)
     StgPtr p;
 
     TICK_ALLOC_HEAP_NOCTR(n);
-    CCS_ALLOC(CCCS,n);
+    CCS_ALLOC(cap->r.rCCCS,n);
     
     if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
        lnat req_blocks =  (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
@@ -719,7 +719,7 @@ allocatePinned (Capability *cap, lnat n)
     }
 
     TICK_ALLOC_HEAP_NOCTR(n);
-    CCS_ALLOC(CCCS,n);
+    CCS_ALLOC(cap->r.rCCCS,n);
 
     bd = cap->pinned_object_block;
     
index 2ffa81b..b255b92 100644 (file)
@@ -301,7 +301,7 @@ genMkPAP regstatus macro jump ticker disamb
                       loadSpWordOff "W_" (sp_stk_args+stack_args_size-3)
                         <> text " = stg_restore_cccs_info;" $$
                       loadSpWordOff "W_" (sp_stk_args+stack_args_size-2)
-                        <> text " = W_[CCCS];"
+                        <> text " = CCCS;"
                     else empty) $$
                   loadSpWordOff "W_" (sp_stk_args+stack_args_size-1)
                         <> text " = "