Fix haddocks for marker events in Debug.Trace
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import OccurAnal ( occurAnalyseExpr_NoBinderSwap )
50 import CoreOpt
51 import CoreArity ( manifestArity )
52 import CoreUtils
53 import Id
54 import Demand ( isBottomingSig )
55 import DataCon
56 import Literal
57 import PrimOp
58 import IdInfo
59 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
60 import Type
61 import PrelNames
62 import TysPrim ( realWorldStatePrimTy )
63 import Bag
64 import Util
65 import Outputable
66 import ForeignCall
67 import Name
68
69 import qualified Data.ByteString as BS
70 import Data.List
71
72 {-
73 ************************************************************************
74 * *
75 \subsection{Making unfoldings}
76 * *
77 ************************************************************************
78 -}
79
80 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
81 mkTopUnfolding dflags is_bottoming rhs
82 = mkUnfolding dflags InlineRhs True is_bottoming rhs
83
84 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
85 -- For implicit Ids, do a tiny bit of optimising first
86 mkImplicitUnfolding dflags expr
87 = mkTopUnfolding dflags False (simpleOptExpr dflags expr)
88
89 -- Note [Top-level flag on inline rules]
90 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
91 -- Slight hack: note that mk_inline_rules conservatively sets the
92 -- top-level flag to True. It gets set more accurately by the simplifier
93 -- Simplify.simplUnfolding.
94
95 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
96 mkSimpleUnfolding dflags rhs
97 = mkUnfolding dflags InlineRhs False False rhs
98
99 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
100 mkDFunUnfolding bndrs con ops
101 = DFunUnfolding { df_bndrs = bndrs
102 , df_con = con
103 , df_args = map occurAnalyseExpr_NoBinderSwap ops }
104 -- See Note [Occurrence analysis of unfoldings]
105
106 mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
107 mkWwInlineRule dflags expr arity
108 = mkCoreUnfolding InlineStable True
109 (simpleOptExpr dflags expr)
110 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
111 , ug_boring_ok = boringCxtNotOk })
112
113 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
114 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
115 = mkCoreUnfolding InlineCompulsory True
116 (simpleOptExpr unsafeGlobalDynFlags expr)
117 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
118 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
119
120 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
121 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
122 mkWorkerUnfolding dflags work_fn
123 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
124 , uf_is_top = top_lvl })
125 | isStableSource src
126 = mkCoreUnfolding src top_lvl new_tmpl guidance
127 where
128 new_tmpl = simpleOptExpr dflags (work_fn tmpl)
129 guidance = calcUnfoldingGuidance dflags False new_tmpl
130
131 mkWorkerUnfolding _ _ _ = noUnfolding
132
133 -- | Make an unfolding that may be used unsaturated
134 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
135 -- manifest arity (the number of outer lambdas applications will
136 -- resolve before doing any work).
137 mkInlineUnfolding :: CoreExpr -> Unfolding
138 mkInlineUnfolding expr
139 = mkCoreUnfolding InlineStable
140 True -- Note [Top-level flag on inline rules]
141 expr' guide
142 where
143 expr' = simpleOptExpr unsafeGlobalDynFlags expr
144 guide = UnfWhen { ug_arity = manifestArity expr'
145 , ug_unsat_ok = unSaturatedOk
146 , ug_boring_ok = boring_ok }
147 boring_ok = inlineBoringOk expr'
148
149 -- | Make an unfolding that will be used once the RHS has been saturated
150 -- to the given arity.
151 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
152 mkInlineUnfoldingWithArity arity expr
153 = mkCoreUnfolding InlineStable
154 True -- Note [Top-level flag on inline rules]
155 expr' guide
156 where
157 expr' = simpleOptExpr unsafeGlobalDynFlags expr
158 guide = UnfWhen { ug_arity = arity
159 , ug_unsat_ok = needSaturated
160 , ug_boring_ok = boring_ok }
161 -- See Note [INLINE pragmas and boring contexts] as to why we need to look
162 -- at the arity here.
163 boring_ok | arity == 0 = True
164 | otherwise = inlineBoringOk expr'
165
166 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
167 mkInlinableUnfolding dflags expr
168 = mkUnfolding dflags InlineStable False False expr'
169 where
170 expr' = simpleOptExpr dflags expr
171
172 specUnfolding :: DynFlags -> [Var] -> (CoreExpr -> CoreExpr) -> Arity
173 -> Unfolding -> Unfolding
174 -- See Note [Specialising unfoldings]
175 -- specUnfolding spec_bndrs spec_app arity_decrease unf
176 -- = \spec_bndrs. spec_app( unf )
177 --
178 specUnfolding dflags spec_bndrs spec_app arity_decrease
179 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
180 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
181 mkDFunUnfolding spec_bndrs con (map spec_arg args)
182 -- There is a hard-to-check assumption here that the spec_app has
183 -- enough applications to exactly saturate the old_bndrs
184 -- For DFunUnfoldings we transform
185 -- \old_bndrs. MkD <op1> ... <opn>
186 -- to
187 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
188 -- The ASSERT checks the value part of that
189 where
190 spec_arg arg = simpleOptExpr dflags (spec_app (mkLams old_bndrs arg))
191 -- The beta-redexes created by spec_app will be
192 -- simplified away by simplOptExpr
193
194 specUnfolding dflags spec_bndrs spec_app arity_decrease
195 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
196 , uf_is_top = top_lvl
197 , uf_guidance = old_guidance })
198 | isStableSource src -- See Note [Specialising unfoldings]
199 , UnfWhen { ug_arity = old_arity
200 , ug_unsat_ok = unsat_ok
201 , ug_boring_ok = boring_ok } <- old_guidance
202 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
203 , ug_unsat_ok = unsat_ok
204 , ug_boring_ok = boring_ok }
205 new_tmpl = simpleOptExpr dflags (mkLams spec_bndrs (spec_app tmpl))
206 -- The beta-redexes created by spec_app will be
207 -- simplified away by simplOptExpr
208
209 in mkCoreUnfolding src top_lvl new_tmpl guidance
210
211 specUnfolding _ _ _ _ _ = noUnfolding
212
213 {- Note [Specialising unfoldings]
214 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
215 When we specialise a function for some given type-class arguments, we use
216 specUnfolding to specialise its unfolding. Some important points:
217
218 * If the original function has a DFunUnfolding, the specialised one
219 must do so too! Otherwise we lose the magic rules that make it
220 interact with ClassOps
221
222 * There is a bit of hack for INLINABLE functions:
223 f :: Ord a => ....
224 f = <big-rhs>
225 {- INLINABLE f #-}
226 Now if we specialise f, should the specialised version still have
227 an INLINABLE pragma? If it does, we'll capture a specialised copy
228 of <big-rhs> as its unfolding, and that probaby won't inline. But
229 if we don't, the specialised version of <big-rhs> might be small
230 enough to inline at a call site. This happens with Control.Monad.liftM3,
231 and can cause a lot more allocation as a result (nofib n-body shows this).
232
233 Moreover, keeping the INLINABLE thing isn't much help, because
234 the specialised function (probaby) isn't overloaded any more.
235
236 Conclusion: drop the INLINEALE pragma. In practice what this means is:
237 if a stable unfolding has UnfoldingGuidance of UnfWhen,
238 we keep it (so the specialised thing too will always inline)
239 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
240 (which arises from INLINABLE), we discard it
241
242 Note [Honour INLINE on 0-ary bindings]
243 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
244 Consider
245
246 x = <expensive>
247 {-# INLINE x #-}
248
249 f y = ...x...
250
251 The semantics of an INLINE pragma is
252
253 inline x at every call site, provided it is saturated;
254 that is, applied to at least as many arguments as appear
255 on the LHS of the Haskell source definition.
256
257 (This soure-code-derived arity is stored in the `ug_arity` field of
258 the `UnfoldingGuidance`.)
259
260 In the example, x's ug_arity is 0, so we should inline it at every use
261 site. It's rare to have such an INLINE pragma (usually INLINE Is on
262 functions), but it's occasionally very important (#15578, #15519).
263 In #15519 we had something like
264 x = case (g a b) of I# r -> T r
265 {-# INLINE x #-}
266 f y = ...(h x)....
267
268 where h is strict. So we got
269 f y = ...(case g a b of I# r -> h (T r))...
270
271 and that in turn allowed SpecConstr to ramp up performance.
272
273 How do we deliver on this? By adjusting the ug_boring_ok
274 flag in mkInlineUnfoldingWithArity; see
275 Note [INLINE pragmas and boring contexts]
276
277 NB: there is a real risk that full laziness will float it right back
278 out again. Consider again
279 x = factorial 200
280 {-# INLINE x #-}
281 f y = ...x...
282
283 After inlining we get
284 f y = ...(factorial 200)...
285
286 but it's entirely possible that full laziness will do
287 lvl23 = factorial 200
288 f y = ...lvl23...
289
290 That's a problem for another day.
291
292 Note [INLINE pragmas and boring contexts]
293 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
294 An INLINE pragma uses mkInlineUnfoldingWithArity to build the
295 unfolding. That sets the ug_boring_ok flag to False if the function
296 is not tiny (inlineBoringOK), so that even INLINE functions are not
297 inlined in an utterly boring context. E.g.
298 \x y. Just (f y x)
299 Nothing is gained by inlining f here, even if it has an INLINE
300 pragma.
301
302 But for 0-ary bindings, we want to inline regardless; see
303 Note [Honour INLINE on 0-ary bindings].
304
305 I'm a bit worried that it's possible for the same kind of problem
306 to arise for non-0-ary functions too, but let's wait and see.
307 -}
308
309 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
310 -> UnfoldingGuidance -> Unfolding
311 -- Occurrence-analyses the expression before capturing it
312 mkCoreUnfolding src top_lvl expr guidance
313 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr,
314 -- See Note [Occurrence analysis of unfoldings]
315 uf_src = src,
316 uf_is_top = top_lvl,
317 uf_is_value = exprIsHNF expr,
318 uf_is_conlike = exprIsConLike expr,
319 uf_is_work_free = exprIsWorkFree expr,
320 uf_expandable = exprIsExpandable expr,
321 uf_guidance = guidance }
322
323 mkUnfolding :: DynFlags -> UnfoldingSource
324 -> Bool -- Is top-level
325 -> Bool -- Definitely a bottoming binding
326 -- (only relevant for top-level bindings)
327 -> CoreExpr
328 -> Unfolding
329 -- Calculates unfolding guidance
330 -- Occurrence-analyses the expression before capturing it
331 mkUnfolding dflags src is_top_lvl is_bottoming expr
332 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr,
333 -- See Note [Occurrence analysis of unfoldings]
334 uf_src = src,
335 uf_is_top = is_top_lvl,
336 uf_is_value = exprIsHNF expr,
337 uf_is_conlike = exprIsConLike expr,
338 uf_expandable = exprIsExpandable expr,
339 uf_is_work_free = exprIsWorkFree expr,
340 uf_guidance = guidance }
341 where
342 is_top_bottoming = is_top_lvl && is_bottoming
343 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
344 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr_NoBinderSwap expr))!
345 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
346
347 {-
348 Note [Occurrence analysis of unfoldings]
349 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
350 We do occurrence-analysis of unfoldings once and for all, when the
351 unfolding is built, rather than each time we inline them.
352
353 But given this decision it's vital that we do
354 *always* do it. Consider this unfolding
355 \x -> letrec { f = ...g...; g* = f } in body
356 where g* is (for some strange reason) the loop breaker. If we don't
357 occ-anal it when reading it in, we won't mark g as a loop breaker, and
358 we may inline g entirely in body, dropping its binding, and leaving
359 the occurrence in f out of scope. This happened in #8892, where
360 the unfolding in question was a DFun unfolding.
361
362 But more generally, the simplifier is designed on the
363 basis that it is looking at occurrence-analysed expressions, so better
364 ensure that they acutally are.
365
366 We use occurAnalyseExpr_NoBinderSwap instead of occurAnalyseExpr;
367 see Note [No binder swap in unfoldings].
368
369 Note [No binder swap in unfoldings]
370 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
371 The binder swap can temporarily violate Core Lint, by assinging
372 a LocalId binding to a GlobalId. For example, if A.foo{r872}
373 is a GlobalId with unique r872, then
374
375 case A.foo{r872} of bar {
376 K x -> ...(A.foo{r872})...
377 }
378
379 gets transformed to
380
381 case A.foo{r872} of bar {
382 K x -> let foo{r872} = bar
383 in ...(A.foo{r872})...
384
385 This is usually not a problem, because the simplifier will transform
386 this to:
387
388 case A.foo{r872} of bar {
389 K x -> ...(bar)...
390
391 However, after occurrence analysis but before simplification, this extra 'let'
392 violates the Core Lint invariant that we do not have local 'let' bindings for
393 GlobalIds. That seems (just) tolerable for the occurrence analysis that happens
394 just before the Simplifier, but not for unfoldings, which are Linted
395 independently.
396 As a quick workaround, we disable binder swap in this module.
397 See #16288 and #16296 for further plans.
398
399 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
400 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
401 Notice that we give the non-occur-analysed expression to
402 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
403 first; for example, sometimes during simplification, there's a large
404 let-bound thing which has been substituted, and so is now dead; so
405 'expr' contains two copies of the thing while the occurrence-analysed
406 expression doesn't.
407
408 Nevertheless, we *don't* and *must not* occ-analyse before computing
409 the size because
410
411 a) The size computation bales out after a while, whereas occurrence
412 analysis does not.
413
414 b) Residency increases sharply if you occ-anal first. I'm not
415 100% sure why, but it's a large effect. Compiling Cabal went
416 from residency of 534M to over 800M with this one change.
417
418 This can occasionally mean that the guidance is very pessimistic;
419 it gets fixed up next round. And it should be rare, because large
420 let-bound things that are dead are usually caught by preInlineUnconditionally
421
422
423 ************************************************************************
424 * *
425 \subsection{The UnfoldingGuidance type}
426 * *
427 ************************************************************************
428 -}
429
430 inlineBoringOk :: CoreExpr -> Bool
431 -- See Note [INLINE for small functions]
432 -- True => the result of inlining the expression is
433 -- no bigger than the expression itself
434 -- eg (\x y -> f y x)
435 -- This is a quick and dirty version. It doesn't attempt
436 -- to deal with (\x y z -> x (y z))
437 -- The really important one is (x `cast` c)
438 inlineBoringOk e
439 = go 0 e
440 where
441 go :: Int -> CoreExpr -> Bool
442 go credit (Lam x e) | isId x = go (credit+1) e
443 | otherwise = go credit e
444 go credit (App f (Type {})) = go credit f
445 go credit (App f a) | credit > 0
446 , exprIsTrivial a = go (credit-1) f
447 go credit (Tick _ e) = go credit e -- dubious
448 go credit (Cast e _) = go credit e
449 go _ (Var {}) = boringCxtOk
450 go _ _ = boringCxtNotOk
451
452 calcUnfoldingGuidance
453 :: DynFlags
454 -> Bool -- Definitely a top-level, bottoming binding
455 -> CoreExpr -- Expression to look at
456 -> UnfoldingGuidance
457 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
458 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
459 = calcUnfoldingGuidance dflags is_top_bottoming expr
460 calcUnfoldingGuidance dflags is_top_bottoming expr
461 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
462 TooBig -> UnfNever
463 SizeIs size cased_bndrs scrut_discount
464 | uncondInline expr n_val_bndrs size
465 -> UnfWhen { ug_unsat_ok = unSaturatedOk
466 , ug_boring_ok = boringCxtOk
467 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
468
469 | is_top_bottoming
470 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
471
472 | otherwise
473 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
474 , ug_size = size
475 , ug_res = scrut_discount }
476
477 where
478 (bndrs, body) = collectBinders expr
479 bOMB_OUT_SIZE = ufCreationThreshold dflags
480 -- Bomb out if size gets bigger than this
481 val_bndrs = filter isId bndrs
482 n_val_bndrs = length val_bndrs
483
484 mk_discount :: Bag (Id,Int) -> Id -> Int
485 mk_discount cbs bndr = foldl' combine 0 cbs
486 where
487 combine acc (bndr', disc)
488 | bndr == bndr' = acc `plus_disc` disc
489 | otherwise = acc
490
491 plus_disc :: Int -> Int -> Int
492 plus_disc | isFunTy (idType bndr) = max
493 | otherwise = (+)
494 -- See Note [Function and non-function discounts]
495
496 {-
497 Note [Computing the size of an expression]
498 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
499 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
500 heuristics right has taken a long time. Here's the basic strategy:
501
502 * Variables, literals: 0
503 (Exception for string literals, see litSize.)
504
505 * Function applications (f e1 .. en): 1 + #value args
506
507 * Constructor applications: 1, regardless of #args
508
509 * Let(rec): 1 + size of components
510
511 * Note, cast: 0
512
513 Examples
514
515 Size Term
516 --------------
517 0 42#
518 0 x
519 0 True
520 2 f x
521 1 Just x
522 4 f (g x)
523
524 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
525 a function call to account for. Notice also that constructor applications
526 are very cheap, because exposing them to a caller is so valuable.
527
528 [25/5/11] All sizes are now multiplied by 10, except for primops
529 (which have sizes like 1 or 4. This makes primops look fantastically
530 cheap, and seems to be almost unversally beneficial. Done partly as a
531 result of #4978.
532
533 Note [Do not inline top-level bottoming functions]
534 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
535 The FloatOut pass has gone to some trouble to float out calls to 'error'
536 and similar friends. See Note [Bottoming floats] in SetLevels.
537 Do not re-inline them! But we *do* still inline if they are very small
538 (the uncondInline stuff).
539
540 Note [INLINE for small functions]
541 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
542 Consider {-# INLINE f #-}
543 f x = Just x
544 g y = f y
545 Then f's RHS is no larger than its LHS, so we should inline it into
546 even the most boring context. In general, f the function is
547 sufficiently small that its body is as small as the call itself, the
548 inline unconditionally, regardless of how boring the context is.
549
550 Things to note:
551
552 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
553 than the thing it's replacing. Notice that
554 (f x) --> (g 3) -- YES, unconditionally
555 (f x) --> x : [] -- YES, *even though* there are two
556 -- arguments to the cons
557 x --> g 3 -- NO
558 x --> Just v -- NO
559
560 It's very important not to unconditionally replace a variable by
561 a non-atomic term.
562
563 (2) We do this even if the thing isn't saturated, else we end up with the
564 silly situation that
565 f x y = x
566 ...map (f 3)...
567 doesn't inline. Even in a boring context, inlining without being
568 saturated will give a lambda instead of a PAP, and will be more
569 efficient at runtime.
570
571 (3) However, when the function's arity > 0, we do insist that it
572 has at least one value argument at the call site. (This check is
573 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
574 f = /\a \x:a. x
575 d = /\b. MkD (f b)
576 If we inline f here we get
577 d = /\b. MkD (\x:b. x)
578 and then prepareRhs floats out the argument, abstracting the type
579 variables, so we end up with the original again!
580
581 (4) We must be much more cautious about arity-zero things. Consider
582 let x = y +# z in ...
583 In *size* terms primops look very small, because the generate a
584 single instruction, but we do not want to unconditionally replace
585 every occurrence of x with (y +# z). So we only do the
586 unconditional-inline thing for *trivial* expressions.
587
588 NB: you might think that PostInlineUnconditionally would do this
589 but it doesn't fire for top-level things; see SimplUtils
590 Note [Top level and postInlineUnconditionally]
591 -}
592
593 uncondInline :: CoreExpr -> Arity -> Int -> Bool
594 -- Inline unconditionally if there no size increase
595 -- Size of call is arity (+1 for the function)
596 -- See Note [INLINE for small functions]
597 uncondInline rhs arity size
598 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
599 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
600
601 sizeExpr :: DynFlags
602 -> Int -- Bomb out if it gets bigger than this
603 -> [Id] -- Arguments; we're interested in which of these
604 -- get case'd
605 -> CoreExpr
606 -> ExprSize
607
608 -- Note [Computing the size of an expression]
609
610 sizeExpr dflags bOMB_OUT_SIZE top_args expr
611 = size_up expr
612 where
613 size_up (Cast e _) = size_up e
614 size_up (Tick _ e) = size_up e
615 size_up (Type _) = sizeZero -- Types cost nothing
616 size_up (Coercion _) = sizeZero
617 size_up (Lit lit) = sizeN (litSize lit)
618 size_up (Var f) | isRealWorldId f = sizeZero
619 -- Make sure we get constructor discounts even
620 -- on nullary constructors
621 | otherwise = size_up_call f [] 0
622
623 size_up (App fun arg)
624 | isTyCoArg arg = size_up fun
625 | otherwise = size_up arg `addSizeNSD`
626 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
627
628 size_up (Lam b e)
629 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
630 | otherwise = size_up e
631
632 size_up (Let (NonRec binder rhs) body)
633 = size_up_rhs (binder, rhs) `addSizeNSD`
634 size_up body `addSizeN`
635 size_up_alloc binder
636
637 size_up (Let (Rec pairs) body)
638 = foldr (addSizeNSD . size_up_rhs)
639 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
640 pairs
641
642 size_up (Case e _ _ alts)
643 | null alts
644 = size_up e -- case e of {} never returns, so take size of scrutinee
645
646 size_up (Case e _ _ alts)
647 -- Now alts is non-empty
648 | Just v <- is_top_arg e -- We are scrutinising an argument variable
649 = let
650 alt_sizes = map size_up_alt alts
651
652 -- alts_size tries to compute a good discount for
653 -- the case when we are scrutinising an argument variable
654 alts_size (SizeIs tot tot_disc tot_scrut)
655 -- Size of all alternatives
656 (SizeIs max _ _)
657 -- Size of biggest alternative
658 = SizeIs tot (unitBag (v, 20 + tot - max)
659 `unionBags` tot_disc) tot_scrut
660 -- If the variable is known, we produce a
661 -- discount that will take us back to 'max',
662 -- the size of the largest alternative The
663 -- 1+ is a little discount for reduced
664 -- allocation in the caller
665 --
666 -- Notice though, that we return tot_disc,
667 -- the total discount from all branches. I
668 -- think that's right.
669
670 alts_size tot_size _ = tot_size
671 in
672 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
673 (foldr1 maxSize alt_sizes)
674 -- Good to inline if an arg is scrutinised, because
675 -- that may eliminate allocation in the caller
676 -- And it eliminates the case itself
677 where
678 is_top_arg (Var v) | v `elem` top_args = Just v
679 is_top_arg (Cast e _) = is_top_arg e
680 is_top_arg _ = Nothing
681
682
683 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
684 foldr (addAltSize . size_up_alt) case_size alts
685 where
686 case_size
687 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
688 | otherwise = sizeZero
689 -- Normally we don't charge for the case itself, but
690 -- we charge one per alternative (see size_up_alt,
691 -- below) to account for the cost of the info table
692 -- and comparisons.
693 --
694 -- However, in certain cases (see is_inline_scrut
695 -- below), no code is generated for the case unless
696 -- there are multiple alts. In these cases we
697 -- subtract one, making the first alt free.
698 -- e.g. case x# +# y# of _ -> ... should cost 1
699 -- case touch# x# of _ -> ... should cost 0
700 -- (see #4978)
701 --
702 -- I would like to not have the "lengthAtMost alts 1"
703 -- condition above, but without that some programs got worse
704 -- (spectral/hartel/event and spectral/para). I don't fully
705 -- understand why. (SDM 24/5/11)
706
707 -- unboxed variables, inline primops and unsafe foreign calls
708 -- are all "inline" things:
709 is_inline_scrut (Var v) = isUnliftedType (idType v)
710 is_inline_scrut scrut
711 | (Var f, _) <- collectArgs scrut
712 = case idDetails f of
713 FCallId fc -> not (isSafeForeignCall fc)
714 PrimOpId op -> not (primOpOutOfLine op)
715 _other -> False
716 | otherwise
717 = False
718
719 size_up_rhs (bndr, rhs)
720 | Just join_arity <- isJoinId_maybe bndr
721 -- Skip arguments to join point
722 , (_bndrs, body) <- collectNBinders join_arity rhs
723 = size_up body
724 | otherwise
725 = size_up rhs
726
727 ------------
728 -- size_up_app is used when there's ONE OR MORE value args
729 size_up_app (App fun arg) args voids
730 | isTyCoArg arg = size_up_app fun args voids
731 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
732 | otherwise = size_up arg `addSizeNSD`
733 size_up_app fun (arg:args) voids
734 size_up_app (Var fun) args voids = size_up_call fun args voids
735 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
736 size_up_app (Cast expr _) args voids = size_up_app expr args voids
737 size_up_app other args voids = size_up other `addSizeN`
738 callSize (length args) voids
739 -- if the lhs is not an App or a Var, or an invisible thing like a
740 -- Tick or Cast, then we should charge for a complete call plus the
741 -- size of the lhs itself.
742
743 ------------
744 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
745 size_up_call fun val_args voids
746 = case idDetails fun of
747 FCallId _ -> sizeN (callSize (length val_args) voids)
748 DataConWorkId dc -> conSize dc (length val_args)
749 PrimOpId op -> primOpSize op (length val_args)
750 ClassOpId _ -> classOpSize dflags top_args val_args
751 _ -> funSize dflags top_args fun (length val_args) voids
752
753 ------------
754 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
755 -- Don't charge for args, so that wrappers look cheap
756 -- (See comments about wrappers with Case)
757 --
758 -- IMPORTANT: *do* charge 1 for the alternative, else we
759 -- find that giant case nests are treated as practically free
760 -- A good example is Foreign.C.Error.errnoToIOError
761
762 ------------
763 -- Cost to allocate binding with given binder
764 size_up_alloc bndr
765 | isTyVar bndr -- Doesn't exist at runtime
766 || isJoinId bndr -- Not allocated at all
767 || isUnliftedType (idType bndr) -- Doesn't live in heap
768 = 0
769 | otherwise
770 = 10
771
772 ------------
773 -- These addSize things have to be here because
774 -- I don't want to give them bOMB_OUT_SIZE as an argument
775 addSizeN TooBig _ = TooBig
776 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
777
778 -- addAltSize is used to add the sizes of case alternatives
779 addAltSize TooBig _ = TooBig
780 addAltSize _ TooBig = TooBig
781 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
782 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
783 (xs `unionBags` ys)
784 (d1 + d2) -- Note [addAltSize result discounts]
785
786 -- This variant ignores the result discount from its LEFT argument
787 -- It's used when the second argument isn't part of the result
788 addSizeNSD TooBig _ = TooBig
789 addSizeNSD _ TooBig = TooBig
790 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
791 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
792 (xs `unionBags` ys)
793 d2 -- Ignore d1
794
795 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
796
797 -- an expression of type State# RealWorld must be a variable
798 isRealWorldExpr (Var id) = isRealWorldId id
799 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
800 isRealWorldExpr _ = False
801
802 -- | Finds a nominal size of a string literal.
803 litSize :: Literal -> Int
804 -- Used by CoreUnfold.sizeExpr
805 litSize (LitNumber LitNumInteger _ _) = 100 -- Note [Size of literal integers]
806 litSize (LitNumber LitNumNatural _ _) = 100
807 litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
808 -- If size could be 0 then @f "x"@ might be too small
809 -- [Sept03: make literal strings a bit bigger to avoid fruitless
810 -- duplication of little strings]
811 litSize _other = 0 -- Must match size of nullary constructors
812 -- Key point: if x |-> 4, then x must inline unconditionally
813 -- (eg via case binding)
814
815 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
816 -- See Note [Conlike is interesting]
817 classOpSize _ _ []
818 = sizeZero
819 classOpSize dflags top_args (arg1 : other_args)
820 = SizeIs size arg_discount 0
821 where
822 size = 20 + (10 * length other_args)
823 -- If the class op is scrutinising a lambda bound dictionary then
824 -- give it a discount, to encourage the inlining of this function
825 -- The actual discount is rather arbitrarily chosen
826 arg_discount = case arg1 of
827 Var dict | dict `elem` top_args
828 -> unitBag (dict, ufDictDiscount dflags)
829 _other -> emptyBag
830
831 -- | The size of a function call
832 callSize
833 :: Int -- ^ number of value args
834 -> Int -- ^ number of value args that are void
835 -> Int
836 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
837 -- The 1+ is for the function itself
838 -- Add 1 for each non-trivial arg;
839 -- the allocation cost, as in let(rec)
840
841 -- | The size of a jump to a join point
842 jumpSize
843 :: Int -- ^ number of value args
844 -> Int -- ^ number of value args that are void
845 -> Int
846 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
847 -- A jump is 20% the size of a function call. Making jumps free reopens
848 -- bug #6048, but making them any more expensive loses a 21% improvement in
849 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
850 -- better solution?
851
852 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
853 -- Size for functions that are not constructors or primops
854 -- Note [Function applications]
855 funSize dflags top_args fun n_val_args voids
856 | fun `hasKey` buildIdKey = buildSize
857 | fun `hasKey` augmentIdKey = augmentSize
858 | otherwise = SizeIs size arg_discount res_discount
859 where
860 some_val_args = n_val_args > 0
861 is_join = isJoinId fun
862
863 size | is_join = jumpSize n_val_args voids
864 | not some_val_args = 0
865 | otherwise = callSize n_val_args voids
866
867 -- DISCOUNTS
868 -- See Note [Function and non-function discounts]
869 arg_discount | some_val_args && fun `elem` top_args
870 = unitBag (fun, ufFunAppDiscount dflags)
871 | otherwise = emptyBag
872 -- If the function is an argument and is applied
873 -- to some values, give it an arg-discount
874
875 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
876 | otherwise = 0
877 -- If the function is partially applied, show a result discount
878 -- XXX maybe behave like ConSize for eval'd variable
879
880 conSize :: DataCon -> Int -> ExprSize
881 conSize dc n_val_args
882 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
883
884 -- See Note [Unboxed tuple size and result discount]
885 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
886
887 -- See Note [Constructor size and result discount]
888 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
889
890 -- XXX still looks to large to me
891
892 {-
893 Note [Constructor size and result discount]
894 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
895 Treat a constructors application as size 10, regardless of how many
896 arguments it has; we are keen to expose them (and we charge separately
897 for their args). We can't treat them as size zero, else we find that
898 (Just x) has size 0, which is the same as a lone variable; and hence
899 'v' will always be replaced by (Just x), where v is bound to Just x.
900
901 The "result discount" is applied if the result of the call is
902 scrutinised (say by a case). For a constructor application that will
903 mean the constructor application will disappear, so we don't need to
904 charge it to the function. So the discount should at least match the
905 cost of the constructor application, namely 10. But to give a bit
906 of extra incentive we give a discount of 10*(1 + n_val_args).
907
908 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
909 and said it was an "unambiguous win", but its terribly dangerous
910 because a function with many many case branches, each finishing with
911 a constructor, can have an arbitrarily large discount. This led to
912 terrible code bloat: see #6099.
913
914 Note [Unboxed tuple size and result discount]
915 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
916 However, unboxed tuples count as size zero. I found occasions where we had
917 f x y z = case op# x y z of { s -> (# s, () #) }
918 and f wasn't getting inlined.
919
920 I tried giving unboxed tuples a *result discount* of zero (see the
921 commented-out line). Why? When returned as a result they do not
922 allocate, so maybe we don't want to charge so much for them If you
923 have a non-zero discount here, we find that workers often get inlined
924 back into wrappers, because it look like
925 f x = case $wf x of (# a,b #) -> (a,b)
926 and we are keener because of the case. However while this change
927 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
928 more. All other changes were very small. So it's not a big deal but I
929 didn't adopt the idea.
930
931 Note [Function and non-function discounts]
932 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
933 We want a discount if the function is applied. A good example is
934 monadic combinators with continuation arguments, where inlining is
935 quite important.
936
937 But we don't want a big discount when a function is called many times
938 (see the detailed comments with #6048) because if the function is
939 big it won't be inlined at its many call sites and no benefit results.
940 Indeed, we can get exponentially big inlinings this way; that is what
941 #6048 is about.
942
943 On the other hand, for data-valued arguments, if there are lots of
944 case expressions in the body, each one will get smaller if we apply
945 the function to a constructor application, so we *want* a big discount
946 if the argument is scrutinised by many case expressions.
947
948 Conclusion:
949 - For functions, take the max of the discounts
950 - For data values, take the sum of the discounts
951
952
953 Note [Literal integer size]
954 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
955 Literal integers *can* be big (mkInteger [...coefficients...]), but
956 need not be (S# n). We just use an arbitrary big-ish constant here
957 so that, in particular, we don't inline top-level defns like
958 n = S# 5
959 There's no point in doing so -- any optimisations will see the S#
960 through n's unfolding. Nor will a big size inhibit unfoldings functions
961 that mention a literal Integer, because the float-out pass will float
962 all those constants to top level.
963 -}
964
965 primOpSize :: PrimOp -> Int -> ExprSize
966 primOpSize op n_val_args
967 = if primOpOutOfLine op
968 then sizeN (op_size + n_val_args)
969 else sizeN op_size
970 where
971 op_size = primOpCodeSize op
972
973
974 buildSize :: ExprSize
975 buildSize = SizeIs 0 emptyBag 40
976 -- We really want to inline applications of build
977 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
978 -- Indeed, we should add a result_discount because build is
979 -- very like a constructor. We don't bother to check that the
980 -- build is saturated (it usually is). The "-2" discounts for the \c n,
981 -- The "4" is rather arbitrary.
982
983 augmentSize :: ExprSize
984 augmentSize = SizeIs 0 emptyBag 40
985 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
986 -- e plus ys. The -2 accounts for the \cn
987
988 -- When we return a lambda, give a discount if it's used (applied)
989 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
990 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
991 lamScrutDiscount _ TooBig = TooBig
992
993 {-
994 Note [addAltSize result discounts]
995 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
996 When adding the size of alternatives, we *add* the result discounts
997 too, rather than take the *maximum*. For a multi-branch case, this
998 gives a discount for each branch that returns a constructor, making us
999 keener to inline. I did try using 'max' instead, but it makes nofib
1000 'rewrite' and 'puzzle' allocate significantly more, and didn't make
1001 binary sizes shrink significantly either.
1002
1003 Note [Discounts and thresholds]
1004 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1005 Constants for discounts and thesholds are defined in main/DynFlags,
1006 all of form ufXxxx. They are:
1007
1008 ufCreationThreshold
1009 At a definition site, if the unfolding is bigger than this, we
1010 may discard it altogether
1011
1012 ufUseThreshold
1013 At a call site, if the unfolding, less discounts, is smaller than
1014 this, then it's small enough inline
1015
1016 ufKeenessFactor
1017 Factor by which the discounts are multiplied before
1018 subtracting from size
1019
1020 ufDictDiscount
1021 The discount for each occurrence of a dictionary argument
1022 as an argument of a class method. Should be pretty small
1023 else big functions may get inlined
1024
1025 ufFunAppDiscount
1026 Discount for a function argument that is applied. Quite
1027 large, because if we inline we avoid the higher-order call.
1028
1029 ufDearOp
1030 The size of a foreign call or not-dupable PrimOp
1031
1032 ufVeryAggressive
1033 If True, the compiler ignores all the thresholds and inlines very
1034 aggressively. It still adheres to arity, simplifier phase control and
1035 loop breakers.
1036
1037
1038 Note [Function applications]
1039 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1040 In a function application (f a b)
1041
1042 - If 'f' is an argument to the function being analysed,
1043 and there's at least one value arg, record a FunAppDiscount for f
1044
1045 - If the application if a PAP (arity > 2 in this example)
1046 record a *result* discount (because inlining
1047 with "extra" args in the call may mean that we now
1048 get a saturated application)
1049
1050 Code for manipulating sizes
1051 -}
1052
1053 -- | The size of a candidate expression for unfolding
1054 data ExprSize
1055 = TooBig
1056 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
1057 , _es_args :: !(Bag (Id,Int))
1058 -- ^ Arguments cased herein, and discount for each such
1059 , _es_discount :: {-# UNPACK #-} !Int
1060 -- ^ Size to subtract if result is scrutinised by a case
1061 -- expression
1062 }
1063
1064 instance Outputable ExprSize where
1065 ppr TooBig = text "TooBig"
1066 ppr (SizeIs a _ c) = brackets (int a <+> int c)
1067
1068 -- subtract the discount before deciding whether to bale out. eg. we
1069 -- want to inline a large constructor application into a selector:
1070 -- tup = (a_1, ..., a_99)
1071 -- x = case tup of ...
1072 --
1073 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
1074 mkSizeIs max n xs d | (n - d) > max = TooBig
1075 | otherwise = SizeIs n xs d
1076
1077 maxSize :: ExprSize -> ExprSize -> ExprSize
1078 maxSize TooBig _ = TooBig
1079 maxSize _ TooBig = TooBig
1080 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
1081 | otherwise = s2
1082
1083 sizeZero :: ExprSize
1084 sizeN :: Int -> ExprSize
1085
1086 sizeZero = SizeIs 0 emptyBag 0
1087 sizeN n = SizeIs n emptyBag 0
1088
1089 {-
1090 ************************************************************************
1091 * *
1092 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
1093 * *
1094 ************************************************************************
1095
1096 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
1097 we ``couldn't possibly use'' on the other side. Can be overridden w/
1098 flaggery. Just the same as smallEnoughToInline, except that it has no
1099 actual arguments.
1100 -}
1101
1102 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1103 couldBeSmallEnoughToInline dflags threshold rhs
1104 = case sizeExpr dflags threshold [] body of
1105 TooBig -> False
1106 _ -> True
1107 where
1108 (_, body) = collectBinders rhs
1109
1110 ----------------
1111 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1112 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1113 = size <= ufUseThreshold dflags
1114 smallEnoughToInline _ _
1115 = False
1116
1117 ----------------
1118
1119 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1120 -- ^ Sees if the unfolding is pretty certain to inline.
1121 -- If so, return a *stable* unfolding for it, that will always inline.
1122 certainlyWillInline dflags fn_info
1123 = case unfoldingInfo fn_info of
1124 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1125 | loop_breaker -> Nothing -- Won't inline, so try w/w
1126 | noinline -> Nothing -- See Note [Worker-wrapper for NOINLINE functions]
1127 | otherwise -> do_cunf e g -- Depends on size, so look at that
1128
1129 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1130 -- to do so, and even if it is currently a
1131 -- loop breaker, it may not be later
1132
1133 _other_unf -> Nothing
1134
1135 where
1136 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1137 noinline = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline
1138 fn_unf = unfoldingInfo fn_info
1139
1140 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1141 do_cunf _ UnfNever = Nothing
1142 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1143 -- INLINE functions have UnfWhen
1144
1145 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1146 -- binary sizes go up by 10%! (This is with SplitObjs.)
1147 -- I'm not totally sure why.
1148 -- INLINABLE functions come via this path
1149 -- See Note [certainlyWillInline: INLINABLE]
1150 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1151 | arityInfo fn_info > 0 -- See Note [certainlyWillInline: be careful of thunks]
1152 , not (isBottomingSig (strictnessInfo fn_info))
1153 -- Do not unconditionally inline a bottoming functions even if
1154 -- it seems smallish. We've carefully lifted it out to top level,
1155 -- so we don't want to re-inline it.
1156 , let unf_arity = length args
1157 , size - (10 * (unf_arity + 1)) <= ufUseThreshold dflags
1158 = Just (fn_unf { uf_src = InlineStable
1159 , uf_guidance = UnfWhen { ug_arity = unf_arity
1160 , ug_unsat_ok = unSaturatedOk
1161 , ug_boring_ok = inlineBoringOk expr } })
1162 -- Note the "unsaturatedOk". A function like f = \ab. a
1163 -- will certainly inline, even if partially applied (f e), so we'd
1164 -- better make sure that the transformed inlining has the same property
1165 | otherwise
1166 = Nothing
1167
1168 {- Note [certainlyWillInline: be careful of thunks]
1169 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1170 Don't claim that thunks will certainly inline, because that risks work
1171 duplication. Even if the work duplication is not great (eg is_cheap
1172 holds), it can make a big difference in an inner loop In #5623 we
1173 found that the WorkWrap phase thought that
1174 y = case x of F# v -> F# (v +# v)
1175 was certainlyWillInline, so the addition got duplicated.
1176
1177 Note that we check arityInfo instead of the arity of the unfolding to detect
1178 this case. This is so that we don't accidentally fail to inline small partial
1179 applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2
1180 (say). Here there is no risk of work duplication, and the RHS is tiny, so
1181 certainlyWillInline should return True. But `unf_arity` is zero! However f's
1182 arity, gotten from `arityInfo fn_info`, is 1.
1183
1184 Failing to say that `f` will inline forces W/W to generate a potentially huge
1185 worker for f that will immediately cancel with `g`'s wrapper anyway, causing
1186 unnecessary churn in the Simplifier while arriving at the same result.
1187
1188 Note [certainlyWillInline: INLINABLE]
1189 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1190 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1191 even though we have a stable inlining, so that strictness w/w takes
1192 place. It makes a big difference to efficiency, and the w/w pass knows
1193 how to transfer the INLINABLE info to the worker; see WorkWrap
1194 Note [Worker-wrapper for INLINABLE functions]
1195
1196 ************************************************************************
1197 * *
1198 \subsection{callSiteInline}
1199 * *
1200 ************************************************************************
1201
1202 This is the key function. It decides whether to inline a variable at a call site
1203
1204 callSiteInline is used at call sites, so it is a bit more generous.
1205 It's a very important function that embodies lots of heuristics.
1206 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1207 and occurs exactly once or
1208 occurs once in each branch of a case and is small
1209
1210 If the thing is in WHNF, there's no danger of duplicating work,
1211 so we can inline if it occurs once, or is small
1212
1213 NOTE: we don't want to inline top-level functions that always diverge.
1214 It just makes the code bigger. Tt turns out that the convenient way to prevent
1215 them inlining is to give them a NOINLINE pragma, which we do in
1216 StrictAnal.addStrictnessInfoToTopId
1217 -}
1218
1219 callSiteInline :: DynFlags
1220 -> Id -- The Id
1221 -> Bool -- True <=> unfolding is active
1222 -> Bool -- True if there are no arguments at all (incl type args)
1223 -> [ArgSummary] -- One for each value arg; True if it is interesting
1224 -> CallCtxt -- True <=> continuation is interesting
1225 -> Maybe CoreExpr -- Unfolding, if any
1226
1227 data ArgSummary = TrivArg -- Nothing interesting
1228 | NonTrivArg -- Arg has structure
1229 | ValueArg -- Arg is a con-app or PAP
1230 -- ..or con-like. Note [Conlike is interesting]
1231
1232 instance Outputable ArgSummary where
1233 ppr TrivArg = text "TrivArg"
1234 ppr NonTrivArg = text "NonTrivArg"
1235 ppr ValueArg = text "ValueArg"
1236
1237 nonTriv :: ArgSummary -> Bool
1238 nonTriv TrivArg = False
1239 nonTriv _ = True
1240
1241 data CallCtxt
1242 = BoringCtxt
1243 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1244 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1245 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1246
1247 | ValAppCtxt -- We're applied to at least one value arg
1248 -- This arises when we have ((f x |> co) y)
1249 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1250
1251 | CaseCtxt -- We're the scrutinee of a case
1252 -- that decomposes its scrutinee
1253
1254 instance Outputable CallCtxt where
1255 ppr CaseCtxt = text "CaseCtxt"
1256 ppr ValAppCtxt = text "ValAppCtxt"
1257 ppr BoringCtxt = text "BoringCtxt"
1258 ppr RhsCtxt = text "RhsCtxt"
1259 ppr DiscArgCtxt = text "DiscArgCtxt"
1260 ppr RuleArgCtxt = text "RuleArgCtxt"
1261
1262 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1263 = case idUnfolding id of
1264 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1265 -- Things with an INLINE pragma may have an unfolding *and*
1266 -- be a loop breaker (maybe the knot is not yet untied)
1267 CoreUnfolding { uf_tmpl = unf_template
1268 , uf_is_work_free = is_wf
1269 , uf_guidance = guidance, uf_expandable = is_exp }
1270 | active_unfolding -> tryUnfolding dflags id lone_variable
1271 arg_infos cont_info unf_template
1272 is_wf is_exp guidance
1273 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1274 NoUnfolding -> Nothing
1275 BootUnfolding -> Nothing
1276 OtherCon {} -> Nothing
1277 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1278
1279 traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1280 traceInline dflags inline_id str doc result
1281 | Just prefix <- inlineCheck dflags
1282 = if prefix `isPrefixOf` occNameString (getOccName inline_id)
1283 then pprTrace str doc result
1284 else result
1285 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1286 = pprTrace str doc result
1287 | otherwise
1288 = result
1289
1290 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1291 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1292 -> Maybe CoreExpr
1293 tryUnfolding dflags id lone_variable
1294 arg_infos cont_info unf_template
1295 is_wf is_exp guidance
1296 = case guidance of
1297 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1298
1299 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1300 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1301 -- See Note [INLINE for small functions (3)]
1302 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1303 | otherwise
1304 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1305 where
1306 some_benefit = calc_some_benefit uf_arity
1307 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1308
1309 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1310 | ufVeryAggressive dflags
1311 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1312 | is_wf && some_benefit && small_enough
1313 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1314 | otherwise
1315 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1316 where
1317 some_benefit = calc_some_benefit (length arg_discounts)
1318 extra_doc = text "discounted size =" <+> int discounted_size
1319 discounted_size = size - discount
1320 small_enough = discounted_size <= ufUseThreshold dflags
1321 discount = computeDiscount dflags arg_discounts
1322 res_discount arg_infos cont_info
1323
1324 where
1325 mk_doc some_benefit extra_doc yes_or_no
1326 = vcat [ text "arg infos" <+> ppr arg_infos
1327 , text "interesting continuation" <+> ppr cont_info
1328 , text "some_benefit" <+> ppr some_benefit
1329 , text "is exp:" <+> ppr is_exp
1330 , text "is work-free:" <+> ppr is_wf
1331 , text "guidance" <+> ppr guidance
1332 , extra_doc
1333 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1334
1335 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1336 n_val_args = length arg_infos
1337
1338 -- some_benefit is used when the RHS is small enough
1339 -- and the call has enough (or too many) value
1340 -- arguments (ie n_val_args >= arity). But there must
1341 -- be *something* interesting about some argument, or the
1342 -- result context, to make it worth inlining
1343 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1344 -- expected by the unfolding
1345 calc_some_benefit uf_arity
1346 | not saturated = interesting_args -- Under-saturated
1347 -- Note [Unsaturated applications]
1348 | otherwise = interesting_args -- Saturated or over-saturated
1349 || interesting_call
1350 where
1351 saturated = n_val_args >= uf_arity
1352 over_saturated = n_val_args > uf_arity
1353 interesting_args = any nonTriv arg_infos
1354 -- NB: (any nonTriv arg_infos) looks at the
1355 -- over-saturated args too which is "wrong";
1356 -- but if over-saturated we inline anyway.
1357
1358 interesting_call
1359 | over_saturated
1360 = True
1361 | otherwise
1362 = case cont_info of
1363 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1364 ValAppCtxt -> True -- Note [Cast then apply]
1365 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1366 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt]
1367 RhsCtxt -> uf_arity > 0 --
1368 _other -> False -- See Note [Nested functions]
1369
1370
1371 {-
1372 Note [Unfold into lazy contexts], Note [RHS of lets]
1373 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1374 When the call is the argument of a function with a RULE, or the RHS of a let,
1375 we are a little bit keener to inline. For example
1376 f y = (y,y,y)
1377 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1378 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1379 only we can't see it. Also
1380 x = f v
1381 could be expensive whereas
1382 x = case v of (a,b) -> a
1383 is patently cheap and may allow more eta expansion.
1384 So we treat the RHS of a let as not-totally-boring.
1385
1386 Note [Unsaturated applications]
1387 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1388 When a call is not saturated, we *still* inline if one of the
1389 arguments has interesting structure. That's sometimes very important.
1390 A good example is the Ord instance for Bool in Base:
1391
1392 Rec {
1393 $fOrdBool =GHC.Classes.D:Ord
1394 @ Bool
1395 ...
1396 $cmin_ajX
1397
1398 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1399 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1400 }
1401
1402 But the defn of GHC.Classes.$dmmin is:
1403
1404 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1405 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1406 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1407 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1408 GHC.Types.False -> y GHC.Types.True -> x }) -}
1409
1410 We *really* want to inline $dmmin, even though it has arity 3, in
1411 order to unravel the recursion.
1412
1413
1414 Note [Things to watch]
1415 ~~~~~~~~~~~~~~~~~~~~~~
1416 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1417 Assume x is exported, so not inlined unconditionally.
1418 Then we want x to inline unconditionally; no reason for it
1419 not to, and doing so avoids an indirection.
1420
1421 * { x = I# 3; ....f x.... }
1422 Make sure that x does not inline unconditionally!
1423 Lest we get extra allocation.
1424
1425 Note [Inlining an InlineRule]
1426 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1427 An InlineRules is used for
1428 (a) programmer INLINE pragmas
1429 (b) inlinings from worker/wrapper
1430
1431 For (a) the RHS may be large, and our contract is that we *only* inline
1432 when the function is applied to all the arguments on the LHS of the
1433 source-code defn. (The uf_arity in the rule.)
1434
1435 However for worker/wrapper it may be worth inlining even if the
1436 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1437 require saturation.
1438
1439 Note [Nested functions]
1440 ~~~~~~~~~~~~~~~~~~~~~~~
1441 At one time we treated a call of a non-top-level function as
1442 "interesting" (regardless of how boring the context) in the hope
1443 that inlining it would eliminate the binding, and its allocation.
1444 Specifically, in the default case of interesting_call we had
1445 _other -> not is_top && uf_arity > 0
1446
1447 But actually postInlineUnconditionally does some of this and overall
1448 it makes virtually no difference to nofib. So I simplified away this
1449 special case
1450
1451 Note [Cast then apply]
1452 ~~~~~~~~~~~~~~~~~~~~~~
1453 Consider
1454 myIndex = __inline_me ( (/\a. <blah>) |> co )
1455 co :: (forall a. a -> a) ~ (forall a. T a)
1456 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1457
1458 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1459 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1460
1461 Note [Inlining in ArgCtxt]
1462 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1463 The condition (arity > 0) here is very important, because otherwise
1464 we end up inlining top-level stuff into useless places; eg
1465 x = I# 3#
1466 f = \y. g x
1467 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1468 and 20% to 'power'.
1469
1470 At one stage I replaced this condition by 'True' (leading to the above
1471 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1472 to work ok now.
1473
1474 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1475 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1476 for the RHS of a 'let', we only profit from the inlining if we get a
1477 CONLIKE thing (modulo lets).
1478
1479 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1480 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1481 The "lone-variable" case is important. I spent ages messing about
1482 with unsatisfactory variants, but this is nice. The idea is that if a
1483 variable appears all alone
1484
1485 as an arg of lazy fn, or rhs BoringCtxt
1486 as scrutinee of a case CaseCtxt
1487 as arg of a fn ArgCtxt
1488 AND
1489 it is bound to a cheap expression
1490
1491 then we should not inline it (unless there is some other reason,
1492 e.g. it is the sole occurrence). That is what is happening at
1493 the use of 'lone_variable' in 'interesting_call'.
1494
1495 Why? At least in the case-scrutinee situation, turning
1496 let x = (a,b) in case x of y -> ...
1497 into
1498 let x = (a,b) in case (a,b) of y -> ...
1499 and thence to
1500 let x = (a,b) in let y = (a,b) in ...
1501 is bad if the binding for x will remain.
1502
1503 Another example: I discovered that strings
1504 were getting inlined straight back into applications of 'error'
1505 because the latter is strict.
1506 s = "foo"
1507 f = \x -> ...(error s)...
1508
1509 Fundamentally such contexts should not encourage inlining because, provided
1510 the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1511 context can ``see'' the unfolding of the variable (e.g. case or a
1512 RULE) so there's no gain.
1513
1514 However, watch out:
1515
1516 * Consider this:
1517 foo = _inline_ (\n. [n])
1518 bar = _inline_ (foo 20)
1519 baz = \n. case bar of { (m:_) -> m + n }
1520 Here we really want to inline 'bar' so that we can inline 'foo'
1521 and the whole thing unravels as it should obviously do. This is
1522 important: in the NDP project, 'bar' generates a closure data
1523 structure rather than a list.
1524
1525 So the non-inlining of lone_variables should only apply if the
1526 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1527 looks through the unfolding. Hence the "&& is_wf" in the
1528 InlineRule branch.
1529
1530 * Even a type application or coercion isn't a lone variable.
1531 Consider
1532 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1533 We had better inline that sucker! The case won't see through it.
1534
1535 For now, I'm treating treating a variable applied to types
1536 in a *lazy* context "lone". The motivating example was
1537 f = /\a. \x. BIG
1538 g = /\a. \y. h (f a)
1539 There's no advantage in inlining f here, and perhaps
1540 a significant disadvantage. Hence some_val_args in the Stop case
1541
1542 Note [Interaction of exprIsWorkFree and lone variables]
1543 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1544 The lone-variable test says "don't inline if a case expression
1545 scrutinises a lone variable whose unfolding is cheap". It's very
1546 important that, under these circumstances, exprIsConApp_maybe
1547 can spot a constructor application. So, for example, we don't
1548 consider
1549 let x = e in (x,x)
1550 to be cheap, and that's good because exprIsConApp_maybe doesn't
1551 think that expression is a constructor application.
1552
1553 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1554 rather than is_wf, which was utterly wrong, because the above
1555 expression responds True to exprIsHNF, which is what sets is_value.
1556
1557 This kind of thing can occur if you have
1558
1559 {-# INLINE foo #-}
1560 foo = let x = e in (x,x)
1561
1562 which Roman did.
1563
1564
1565 -}
1566
1567 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1568 -> Int
1569 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1570 -- We multiple the raw discounts (args_discount and result_discount)
1571 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1572 -- *size* whereas the discounts imply that there's some extra
1573 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1574 -- by inlining.
1575
1576 = 10 -- Discount of 10 because the result replaces the call
1577 -- so we count 10 for the function itself
1578
1579 + 10 * length actual_arg_discounts
1580 -- Discount of 10 for each arg supplied,
1581 -- because the result replaces the call
1582
1583 + round (ufKeenessFactor dflags *
1584 fromIntegral (total_arg_discount + res_discount'))
1585 where
1586 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1587 total_arg_discount = sum actual_arg_discounts
1588
1589 mk_arg_discount _ TrivArg = 0
1590 mk_arg_discount _ NonTrivArg = 10
1591 mk_arg_discount discount ValueArg = discount
1592
1593 res_discount'
1594 | LT <- arg_discounts `compareLength` arg_infos
1595 = res_discount -- Over-saturated
1596 | otherwise
1597 = case cont_info of
1598 BoringCtxt -> 0
1599 CaseCtxt -> res_discount -- Presumably a constructor
1600 ValAppCtxt -> res_discount -- Presumably a function
1601 _ -> 40 `min` res_discount
1602 -- ToDo: this 40 `min` res_discount doesn't seem right
1603 -- for DiscArgCtxt it shouldn't matter because the function will
1604 -- get the arg discount for any non-triv arg
1605 -- for RuleArgCtxt we do want to be keener to inline; but not only
1606 -- constructor results
1607 -- for RhsCtxt I suppose that exposing a data con is good in general
1608 -- And 40 seems very arbitrary
1609 --
1610 -- res_discount can be very large when a function returns
1611 -- constructors; but we only want to invoke that large discount
1612 -- when there's a case continuation.
1613 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1614 -- But we want to aovid inlining large functions that return
1615 -- constructors into contexts that are simply "interesting"