Always do the worker/wrapper split for NOINLINEs
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import PprCore () -- Instances
50 import OccurAnal ( occurAnalyseExpr_NoBinderSwap )
51 import CoreOpt
52 import CoreArity ( manifestArity )
53 import CoreUtils
54 import Id
55 import Demand ( isBottomingSig )
56 import DataCon
57 import Literal
58 import PrimOp
59 import IdInfo
60 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
61 import Type
62 import PrelNames
63 import TysPrim ( realWorldStatePrimTy )
64 import Bag
65 import Util
66 import Outputable
67 import ForeignCall
68 import Name
69
70 import qualified Data.ByteString as BS
71 import Data.List
72
73 {-
74 ************************************************************************
75 * *
76 \subsection{Making unfoldings}
77 * *
78 ************************************************************************
79 -}
80
81 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
82 mkTopUnfolding dflags is_bottoming rhs
83 = mkUnfolding dflags InlineRhs True is_bottoming rhs
84
85 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
86 -- For implicit Ids, do a tiny bit of optimising first
87 mkImplicitUnfolding dflags expr
88 = mkTopUnfolding dflags False (simpleOptExpr dflags expr)
89
90 -- Note [Top-level flag on inline rules]
91 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 -- Slight hack: note that mk_inline_rules conservatively sets the
93 -- top-level flag to True. It gets set more accurately by the simplifier
94 -- Simplify.simplUnfolding.
95
96 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
97 mkSimpleUnfolding dflags rhs
98 = mkUnfolding dflags InlineRhs False False rhs
99
100 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
101 mkDFunUnfolding bndrs con ops
102 = DFunUnfolding { df_bndrs = bndrs
103 , df_con = con
104 , df_args = map occurAnalyseExpr_NoBinderSwap ops }
105 -- See Note [Occurrence analysis of unfoldings]
106
107 mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
108 mkWwInlineRule dflags expr arity
109 = mkCoreUnfolding InlineStable True
110 (simpleOptExpr dflags expr)
111 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
112 , ug_boring_ok = boringCxtNotOk })
113
114 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
115 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
116 = mkCoreUnfolding InlineCompulsory True
117 (simpleOptExpr unsafeGlobalDynFlags expr)
118 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
119 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
120
121 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
122 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
123 mkWorkerUnfolding dflags work_fn
124 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
125 , uf_is_top = top_lvl })
126 | isStableSource src
127 = mkCoreUnfolding src top_lvl new_tmpl guidance
128 where
129 new_tmpl = simpleOptExpr dflags (work_fn tmpl)
130 guidance = calcUnfoldingGuidance dflags False new_tmpl
131
132 mkWorkerUnfolding _ _ _ = noUnfolding
133
134 -- | Make an unfolding that may be used unsaturated
135 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
136 -- manifest arity (the number of outer lambdas applications will
137 -- resolve before doing any work).
138 mkInlineUnfolding :: CoreExpr -> Unfolding
139 mkInlineUnfolding expr
140 = mkCoreUnfolding InlineStable
141 True -- Note [Top-level flag on inline rules]
142 expr' guide
143 where
144 expr' = simpleOptExpr unsafeGlobalDynFlags expr
145 guide = UnfWhen { ug_arity = manifestArity expr'
146 , ug_unsat_ok = unSaturatedOk
147 , ug_boring_ok = boring_ok }
148 boring_ok = inlineBoringOk expr'
149
150 -- | Make an unfolding that will be used once the RHS has been saturated
151 -- to the given arity.
152 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
153 mkInlineUnfoldingWithArity arity expr
154 = mkCoreUnfolding InlineStable
155 True -- Note [Top-level flag on inline rules]
156 expr' guide
157 where
158 expr' = simpleOptExpr unsafeGlobalDynFlags expr
159 guide = UnfWhen { ug_arity = arity
160 , ug_unsat_ok = needSaturated
161 , ug_boring_ok = boring_ok }
162 -- See Note [INLINE pragmas and boring contexts] as to why we need to look
163 -- at the arity here.
164 boring_ok | arity == 0 = True
165 | otherwise = inlineBoringOk expr'
166
167 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
168 mkInlinableUnfolding dflags expr
169 = mkUnfolding dflags InlineStable False False expr'
170 where
171 expr' = simpleOptExpr dflags expr
172
173 specUnfolding :: DynFlags -> [Var] -> (CoreExpr -> CoreExpr) -> Arity
174 -> Unfolding -> Unfolding
175 -- See Note [Specialising unfoldings]
176 -- specUnfolding spec_bndrs spec_app arity_decrease unf
177 -- = \spec_bndrs. spec_app( unf )
178 --
179 specUnfolding dflags spec_bndrs spec_app arity_decrease
180 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
181 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
182 mkDFunUnfolding spec_bndrs con (map spec_arg args)
183 -- There is a hard-to-check assumption here that the spec_app has
184 -- enough applications to exactly saturate the old_bndrs
185 -- For DFunUnfoldings we transform
186 -- \old_bndrs. MkD <op1> ... <opn>
187 -- to
188 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
189 -- The ASSERT checks the value part of that
190 where
191 spec_arg arg = simpleOptExpr dflags (spec_app (mkLams old_bndrs arg))
192 -- The beta-redexes created by spec_app will be
193 -- simplified away by simplOptExpr
194
195 specUnfolding dflags spec_bndrs spec_app arity_decrease
196 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
197 , uf_is_top = top_lvl
198 , uf_guidance = old_guidance })
199 | isStableSource src -- See Note [Specialising unfoldings]
200 , UnfWhen { ug_arity = old_arity
201 , ug_unsat_ok = unsat_ok
202 , ug_boring_ok = boring_ok } <- old_guidance
203 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
204 , ug_unsat_ok = unsat_ok
205 , ug_boring_ok = boring_ok }
206 new_tmpl = simpleOptExpr dflags (mkLams spec_bndrs (spec_app tmpl))
207 -- The beta-redexes created by spec_app will be
208 -- simplified away by simplOptExpr
209
210 in mkCoreUnfolding src top_lvl new_tmpl guidance
211
212 specUnfolding _ _ _ _ _ = noUnfolding
213
214 {- Note [Specialising unfoldings]
215 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
216 When we specialise a function for some given type-class arguments, we use
217 specUnfolding to specialise its unfolding. Some important points:
218
219 * If the original function has a DFunUnfolding, the specialised one
220 must do so too! Otherwise we lose the magic rules that make it
221 interact with ClassOps
222
223 * There is a bit of hack for INLINABLE functions:
224 f :: Ord a => ....
225 f = <big-rhs>
226 {- INLINABLE f #-}
227 Now if we specialise f, should the specialised version still have
228 an INLINABLE pragma? If it does, we'll capture a specialised copy
229 of <big-rhs> as its unfolding, and that probaby won't inline. But
230 if we don't, the specialised version of <big-rhs> might be small
231 enough to inline at a call site. This happens with Control.Monad.liftM3,
232 and can cause a lot more allocation as a result (nofib n-body shows this).
233
234 Moreover, keeping the INLINABLE thing isn't much help, because
235 the specialised function (probaby) isn't overloaded any more.
236
237 Conclusion: drop the INLINEALE pragma. In practice what this means is:
238 if a stable unfolding has UnfoldingGuidance of UnfWhen,
239 we keep it (so the specialised thing too will always inline)
240 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
241 (which arises from INLINABLE), we discard it
242
243 Note [Honour INLINE on 0-ary bindings]
244 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
245 Consider
246
247 x = <expensive>
248 {-# INLINE x #-}
249
250 f y = ...x...
251
252 The semantics of an INLINE pragma is
253
254 inline x at every call site, provided it is saturated;
255 that is, applied to at least as many arguments as appear
256 on the LHS of the Haskell source definition.
257
258 (This soure-code-derived arity is stored in the `ug_arity` field of
259 the `UnfoldingGuidance`.)
260
261 In the example, x's ug_arity is 0, so we should inline it at every use
262 site. It's rare to have such an INLINE pragma (usually INLINE Is on
263 functions), but it's occasionally very important (Trac #15578, #15519).
264 In #15519 we had something like
265 x = case (g a b) of I# r -> T r
266 {-# INLINE x #-}
267 f y = ...(h x)....
268
269 where h is strict. So we got
270 f y = ...(case g a b of I# r -> h (T r))...
271
272 and that in turn allowed SpecConstr to ramp up performance.
273
274 How do we deliver on this? By adjusting the ug_boring_ok
275 flag in mkInlineUnfoldingWithArity; see
276 Note [INLINE pragmas and boring contexts]
277
278 NB: there is a real risk that full laziness will float it right back
279 out again. Consider again
280 x = factorial 200
281 {-# INLINE x #-}
282 f y = ...x...
283
284 After inlining we get
285 f y = ...(factorial 200)...
286
287 but it's entirely possible that full laziness will do
288 lvl23 = factorial 200
289 f y = ...lvl23...
290
291 That's a problem for another day.
292
293 Note [INLINE pragmas and boring contexts]
294 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
295 An INLINE pragma uses mkInlineUnfoldingWithArity to build the
296 unfolding. That sets the ug_boring_ok flag to False if the function
297 is not tiny (inlineBoringOK), so that even INLINE functions are not
298 inlined in an utterly boring context. E.g.
299 \x y. Just (f y x)
300 Nothing is gained by inlining f here, even if it has an INLINE
301 pragma.
302
303 But for 0-ary bindings, we want to inline regardless; see
304 Note [Honour INLINE on 0-ary bindings].
305
306 I'm a bit worried that it's possible for the same kind of problem
307 to arise for non-0-ary functions too, but let's wait and see.
308 -}
309
310 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
311 -> UnfoldingGuidance -> Unfolding
312 -- Occurrence-analyses the expression before capturing it
313 mkCoreUnfolding src top_lvl expr guidance
314 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr,
315 -- See Note [Occurrence analysis of unfoldings]
316 uf_src = src,
317 uf_is_top = top_lvl,
318 uf_is_value = exprIsHNF expr,
319 uf_is_conlike = exprIsConLike expr,
320 uf_is_work_free = exprIsWorkFree expr,
321 uf_expandable = exprIsExpandable expr,
322 uf_guidance = guidance }
323
324 mkUnfolding :: DynFlags -> UnfoldingSource
325 -> Bool -- Is top-level
326 -> Bool -- Definitely a bottoming binding
327 -- (only relevant for top-level bindings)
328 -> CoreExpr
329 -> Unfolding
330 -- Calculates unfolding guidance
331 -- Occurrence-analyses the expression before capturing it
332 mkUnfolding dflags src is_top_lvl is_bottoming expr
333 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr,
334 -- See Note [Occurrence analysis of unfoldings]
335 uf_src = src,
336 uf_is_top = is_top_lvl,
337 uf_is_value = exprIsHNF expr,
338 uf_is_conlike = exprIsConLike expr,
339 uf_expandable = exprIsExpandable expr,
340 uf_is_work_free = exprIsWorkFree expr,
341 uf_guidance = guidance }
342 where
343 is_top_bottoming = is_top_lvl && is_bottoming
344 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
345 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr_NoBinderSwap expr))!
346 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
347
348 {-
349 Note [Occurrence analysis of unfoldings]
350 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
351 We do occurrence-analysis of unfoldings once and for all, when the
352 unfolding is built, rather than each time we inline them.
353
354 But given this decision it's vital that we do
355 *always* do it. Consider this unfolding
356 \x -> letrec { f = ...g...; g* = f } in body
357 where g* is (for some strange reason) the loop breaker. If we don't
358 occ-anal it when reading it in, we won't mark g as a loop breaker, and
359 we may inline g entirely in body, dropping its binding, and leaving
360 the occurrence in f out of scope. This happened in Trac #8892, where
361 the unfolding in question was a DFun unfolding.
362
363 But more generally, the simplifier is designed on the
364 basis that it is looking at occurrence-analysed expressions, so better
365 ensure that they acutally are.
366
367 We use occurAnalyseExpr_NoBinderSwap instead of occurAnalyseExpr;
368 see Note [No binder swap in unfoldings].
369
370 Note [No binder swap in unfoldings]
371 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
372 The binder swap can temporarily violate Core Lint, by assinging
373 a LocalId binding to a GlobalId. For example, if A.foo{r872}
374 is a GlobalId with unique r872, then
375
376 case A.foo{r872} of bar {
377 K x -> ...(A.foo{r872})...
378 }
379
380 gets transformed to
381
382 case A.foo{r872} of bar {
383 K x -> let foo{r872} = bar
384 in ...(A.foo{r872})...
385
386 This is usually not a problem, because the simplifier will transform
387 this to:
388
389 case A.foo{r872} of bar {
390 K x -> ...(bar)...
391
392 However, after occurrence analysis but before simplification, this extra 'let'
393 violates the Core Lint invariant that we do not have local 'let' bindings for
394 GlobalIds. That seems (just) tolerable for the occurrence analysis that happens
395 just before the Simplifier, but not for unfoldings, which are Linted
396 independently.
397 As a quick workaround, we disable binder swap in this module.
398 See Trac #16288 and #16296 for further plans.
399
400 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
401 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
402 Notice that we give the non-occur-analysed expression to
403 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
404 first; for example, sometimes during simplification, there's a large
405 let-bound thing which has been substituted, and so is now dead; so
406 'expr' contains two copies of the thing while the occurrence-analysed
407 expression doesn't.
408
409 Nevertheless, we *don't* and *must not* occ-analyse before computing
410 the size because
411
412 a) The size computation bales out after a while, whereas occurrence
413 analysis does not.
414
415 b) Residency increases sharply if you occ-anal first. I'm not
416 100% sure why, but it's a large effect. Compiling Cabal went
417 from residency of 534M to over 800M with this one change.
418
419 This can occasionally mean that the guidance is very pessimistic;
420 it gets fixed up next round. And it should be rare, because large
421 let-bound things that are dead are usually caught by preInlineUnconditionally
422
423
424 ************************************************************************
425 * *
426 \subsection{The UnfoldingGuidance type}
427 * *
428 ************************************************************************
429 -}
430
431 inlineBoringOk :: CoreExpr -> Bool
432 -- See Note [INLINE for small functions]
433 -- True => the result of inlining the expression is
434 -- no bigger than the expression itself
435 -- eg (\x y -> f y x)
436 -- This is a quick and dirty version. It doesn't attempt
437 -- to deal with (\x y z -> x (y z))
438 -- The really important one is (x `cast` c)
439 inlineBoringOk e
440 = go 0 e
441 where
442 go :: Int -> CoreExpr -> Bool
443 go credit (Lam x e) | isId x = go (credit+1) e
444 | otherwise = go credit e
445 go credit (App f (Type {})) = go credit f
446 go credit (App f a) | credit > 0
447 , exprIsTrivial a = go (credit-1) f
448 go credit (Tick _ e) = go credit e -- dubious
449 go credit (Cast e _) = go credit e
450 go _ (Var {}) = boringCxtOk
451 go _ _ = boringCxtNotOk
452
453 calcUnfoldingGuidance
454 :: DynFlags
455 -> Bool -- Definitely a top-level, bottoming binding
456 -> CoreExpr -- Expression to look at
457 -> UnfoldingGuidance
458 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
459 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
460 = calcUnfoldingGuidance dflags is_top_bottoming expr
461 calcUnfoldingGuidance dflags is_top_bottoming expr
462 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
463 TooBig -> UnfNever
464 SizeIs size cased_bndrs scrut_discount
465 | uncondInline expr n_val_bndrs size
466 -> UnfWhen { ug_unsat_ok = unSaturatedOk
467 , ug_boring_ok = boringCxtOk
468 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
469
470 | is_top_bottoming
471 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
472
473 | otherwise
474 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
475 , ug_size = size
476 , ug_res = scrut_discount }
477
478 where
479 (bndrs, body) = collectBinders expr
480 bOMB_OUT_SIZE = ufCreationThreshold dflags
481 -- Bomb out if size gets bigger than this
482 val_bndrs = filter isId bndrs
483 n_val_bndrs = length val_bndrs
484
485 mk_discount :: Bag (Id,Int) -> Id -> Int
486 mk_discount cbs bndr = foldlBag combine 0 cbs
487 where
488 combine acc (bndr', disc)
489 | bndr == bndr' = acc `plus_disc` disc
490 | otherwise = acc
491
492 plus_disc :: Int -> Int -> Int
493 plus_disc | isFunTy (idType bndr) = max
494 | otherwise = (+)
495 -- See Note [Function and non-function discounts]
496
497 {-
498 Note [Computing the size of an expression]
499 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
500 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
501 heuristics right has taken a long time. Here's the basic strategy:
502
503 * Variables, literals: 0
504 (Exception for string literals, see litSize.)
505
506 * Function applications (f e1 .. en): 1 + #value args
507
508 * Constructor applications: 1, regardless of #args
509
510 * Let(rec): 1 + size of components
511
512 * Note, cast: 0
513
514 Examples
515
516 Size Term
517 --------------
518 0 42#
519 0 x
520 0 True
521 2 f x
522 1 Just x
523 4 f (g x)
524
525 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
526 a function call to account for. Notice also that constructor applications
527 are very cheap, because exposing them to a caller is so valuable.
528
529 [25/5/11] All sizes are now multiplied by 10, except for primops
530 (which have sizes like 1 or 4. This makes primops look fantastically
531 cheap, and seems to be almost unversally beneficial. Done partly as a
532 result of #4978.
533
534 Note [Do not inline top-level bottoming functions]
535 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
536 The FloatOut pass has gone to some trouble to float out calls to 'error'
537 and similar friends. See Note [Bottoming floats] in SetLevels.
538 Do not re-inline them! But we *do* still inline if they are very small
539 (the uncondInline stuff).
540
541 Note [INLINE for small functions]
542 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
543 Consider {-# INLINE f #-}
544 f x = Just x
545 g y = f y
546 Then f's RHS is no larger than its LHS, so we should inline it into
547 even the most boring context. In general, f the function is
548 sufficiently small that its body is as small as the call itself, the
549 inline unconditionally, regardless of how boring the context is.
550
551 Things to note:
552
553 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
554 than the thing it's replacing. Notice that
555 (f x) --> (g 3) -- YES, unconditionally
556 (f x) --> x : [] -- YES, *even though* there are two
557 -- arguments to the cons
558 x --> g 3 -- NO
559 x --> Just v -- NO
560
561 It's very important not to unconditionally replace a variable by
562 a non-atomic term.
563
564 (2) We do this even if the thing isn't saturated, else we end up with the
565 silly situation that
566 f x y = x
567 ...map (f 3)...
568 doesn't inline. Even in a boring context, inlining without being
569 saturated will give a lambda instead of a PAP, and will be more
570 efficient at runtime.
571
572 (3) However, when the function's arity > 0, we do insist that it
573 has at least one value argument at the call site. (This check is
574 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
575 f = /\a \x:a. x
576 d = /\b. MkD (f b)
577 If we inline f here we get
578 d = /\b. MkD (\x:b. x)
579 and then prepareRhs floats out the argument, abstracting the type
580 variables, so we end up with the original again!
581
582 (4) We must be much more cautious about arity-zero things. Consider
583 let x = y +# z in ...
584 In *size* terms primops look very small, because the generate a
585 single instruction, but we do not want to unconditionally replace
586 every occurrence of x with (y +# z). So we only do the
587 unconditional-inline thing for *trivial* expressions.
588
589 NB: you might think that PostInlineUnconditionally would do this
590 but it doesn't fire for top-level things; see SimplUtils
591 Note [Top level and postInlineUnconditionally]
592 -}
593
594 uncondInline :: CoreExpr -> Arity -> Int -> Bool
595 -- Inline unconditionally if there no size increase
596 -- Size of call is arity (+1 for the function)
597 -- See Note [INLINE for small functions]
598 uncondInline rhs arity size
599 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
600 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
601
602 sizeExpr :: DynFlags
603 -> Int -- Bomb out if it gets bigger than this
604 -> [Id] -- Arguments; we're interested in which of these
605 -- get case'd
606 -> CoreExpr
607 -> ExprSize
608
609 -- Note [Computing the size of an expression]
610
611 sizeExpr dflags bOMB_OUT_SIZE top_args expr
612 = size_up expr
613 where
614 size_up (Cast e _) = size_up e
615 size_up (Tick _ e) = size_up e
616 size_up (Type _) = sizeZero -- Types cost nothing
617 size_up (Coercion _) = sizeZero
618 size_up (Lit lit) = sizeN (litSize lit)
619 size_up (Var f) | isRealWorldId f = sizeZero
620 -- Make sure we get constructor discounts even
621 -- on nullary constructors
622 | otherwise = size_up_call f [] 0
623
624 size_up (App fun arg)
625 | isTyCoArg arg = size_up fun
626 | otherwise = size_up arg `addSizeNSD`
627 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
628
629 size_up (Lam b e)
630 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
631 | otherwise = size_up e
632
633 size_up (Let (NonRec binder rhs) body)
634 = size_up_rhs (binder, rhs) `addSizeNSD`
635 size_up body `addSizeN`
636 size_up_alloc binder
637
638 size_up (Let (Rec pairs) body)
639 = foldr (addSizeNSD . size_up_rhs)
640 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
641 pairs
642
643 size_up (Case e _ _ alts)
644 | null alts
645 = size_up e -- case e of {} never returns, so take size of scrutinee
646
647 size_up (Case e _ _ alts)
648 -- Now alts is non-empty
649 | Just v <- is_top_arg e -- We are scrutinising an argument variable
650 = let
651 alt_sizes = map size_up_alt alts
652
653 -- alts_size tries to compute a good discount for
654 -- the case when we are scrutinising an argument variable
655 alts_size (SizeIs tot tot_disc tot_scrut)
656 -- Size of all alternatives
657 (SizeIs max _ _)
658 -- Size of biggest alternative
659 = SizeIs tot (unitBag (v, 20 + tot - max)
660 `unionBags` tot_disc) tot_scrut
661 -- If the variable is known, we produce a
662 -- discount that will take us back to 'max',
663 -- the size of the largest alternative The
664 -- 1+ is a little discount for reduced
665 -- allocation in the caller
666 --
667 -- Notice though, that we return tot_disc,
668 -- the total discount from all branches. I
669 -- think that's right.
670
671 alts_size tot_size _ = tot_size
672 in
673 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
674 (foldr1 maxSize alt_sizes)
675 -- Good to inline if an arg is scrutinised, because
676 -- that may eliminate allocation in the caller
677 -- And it eliminates the case itself
678 where
679 is_top_arg (Var v) | v `elem` top_args = Just v
680 is_top_arg (Cast e _) = is_top_arg e
681 is_top_arg _ = Nothing
682
683
684 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
685 foldr (addAltSize . size_up_alt) case_size alts
686 where
687 case_size
688 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
689 | otherwise = sizeZero
690 -- Normally we don't charge for the case itself, but
691 -- we charge one per alternative (see size_up_alt,
692 -- below) to account for the cost of the info table
693 -- and comparisons.
694 --
695 -- However, in certain cases (see is_inline_scrut
696 -- below), no code is generated for the case unless
697 -- there are multiple alts. In these cases we
698 -- subtract one, making the first alt free.
699 -- e.g. case x# +# y# of _ -> ... should cost 1
700 -- case touch# x# of _ -> ... should cost 0
701 -- (see #4978)
702 --
703 -- I would like to not have the "lengthAtMost alts 1"
704 -- condition above, but without that some programs got worse
705 -- (spectral/hartel/event and spectral/para). I don't fully
706 -- understand why. (SDM 24/5/11)
707
708 -- unboxed variables, inline primops and unsafe foreign calls
709 -- are all "inline" things:
710 is_inline_scrut (Var v) = isUnliftedType (idType v)
711 is_inline_scrut scrut
712 | (Var f, _) <- collectArgs scrut
713 = case idDetails f of
714 FCallId fc -> not (isSafeForeignCall fc)
715 PrimOpId op -> not (primOpOutOfLine op)
716 _other -> False
717 | otherwise
718 = False
719
720 size_up_rhs (bndr, rhs)
721 | Just join_arity <- isJoinId_maybe bndr
722 -- Skip arguments to join point
723 , (_bndrs, body) <- collectNBinders join_arity rhs
724 = size_up body
725 | otherwise
726 = size_up rhs
727
728 ------------
729 -- size_up_app is used when there's ONE OR MORE value args
730 size_up_app (App fun arg) args voids
731 | isTyCoArg arg = size_up_app fun args voids
732 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
733 | otherwise = size_up arg `addSizeNSD`
734 size_up_app fun (arg:args) voids
735 size_up_app (Var fun) args voids = size_up_call fun args voids
736 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
737 size_up_app (Cast expr _) args voids = size_up_app expr args voids
738 size_up_app other args voids = size_up other `addSizeN`
739 callSize (length args) voids
740 -- if the lhs is not an App or a Var, or an invisible thing like a
741 -- Tick or Cast, then we should charge for a complete call plus the
742 -- size of the lhs itself.
743
744 ------------
745 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
746 size_up_call fun val_args voids
747 = case idDetails fun of
748 FCallId _ -> sizeN (callSize (length val_args) voids)
749 DataConWorkId dc -> conSize dc (length val_args)
750 PrimOpId op -> primOpSize op (length val_args)
751 ClassOpId _ -> classOpSize dflags top_args val_args
752 _ -> funSize dflags top_args fun (length val_args) voids
753
754 ------------
755 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
756 -- Don't charge for args, so that wrappers look cheap
757 -- (See comments about wrappers with Case)
758 --
759 -- IMPORTANT: *do* charge 1 for the alternative, else we
760 -- find that giant case nests are treated as practically free
761 -- A good example is Foreign.C.Error.errnoToIOError
762
763 ------------
764 -- Cost to allocate binding with given binder
765 size_up_alloc bndr
766 | isTyVar bndr -- Doesn't exist at runtime
767 || isJoinId bndr -- Not allocated at all
768 || isUnliftedType (idType bndr) -- Doesn't live in heap
769 = 0
770 | otherwise
771 = 10
772
773 ------------
774 -- These addSize things have to be here because
775 -- I don't want to give them bOMB_OUT_SIZE as an argument
776 addSizeN TooBig _ = TooBig
777 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
778
779 -- addAltSize is used to add the sizes of case alternatives
780 addAltSize TooBig _ = TooBig
781 addAltSize _ TooBig = TooBig
782 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
783 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
784 (xs `unionBags` ys)
785 (d1 + d2) -- Note [addAltSize result discounts]
786
787 -- This variant ignores the result discount from its LEFT argument
788 -- It's used when the second argument isn't part of the result
789 addSizeNSD TooBig _ = TooBig
790 addSizeNSD _ TooBig = TooBig
791 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
792 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
793 (xs `unionBags` ys)
794 d2 -- Ignore d1
795
796 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
797
798 -- an expression of type State# RealWorld must be a variable
799 isRealWorldExpr (Var id) = isRealWorldId id
800 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
801 isRealWorldExpr _ = False
802
803 -- | Finds a nominal size of a string literal.
804 litSize :: Literal -> Int
805 -- Used by CoreUnfold.sizeExpr
806 litSize (LitNumber LitNumInteger _ _) = 100 -- Note [Size of literal integers]
807 litSize (LitNumber LitNumNatural _ _) = 100
808 litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
809 -- If size could be 0 then @f "x"@ might be too small
810 -- [Sept03: make literal strings a bit bigger to avoid fruitless
811 -- duplication of little strings]
812 litSize _other = 0 -- Must match size of nullary constructors
813 -- Key point: if x |-> 4, then x must inline unconditionally
814 -- (eg via case binding)
815
816 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
817 -- See Note [Conlike is interesting]
818 classOpSize _ _ []
819 = sizeZero
820 classOpSize dflags top_args (arg1 : other_args)
821 = SizeIs size arg_discount 0
822 where
823 size = 20 + (10 * length other_args)
824 -- If the class op is scrutinising a lambda bound dictionary then
825 -- give it a discount, to encourage the inlining of this function
826 -- The actual discount is rather arbitrarily chosen
827 arg_discount = case arg1 of
828 Var dict | dict `elem` top_args
829 -> unitBag (dict, ufDictDiscount dflags)
830 _other -> emptyBag
831
832 -- | The size of a function call
833 callSize
834 :: Int -- ^ number of value args
835 -> Int -- ^ number of value args that are void
836 -> Int
837 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
838 -- The 1+ is for the function itself
839 -- Add 1 for each non-trivial arg;
840 -- the allocation cost, as in let(rec)
841
842 -- | The size of a jump to a join point
843 jumpSize
844 :: Int -- ^ number of value args
845 -> Int -- ^ number of value args that are void
846 -> Int
847 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
848 -- A jump is 20% the size of a function call. Making jumps free reopens
849 -- bug #6048, but making them any more expensive loses a 21% improvement in
850 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
851 -- better solution?
852
853 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
854 -- Size for functions that are not constructors or primops
855 -- Note [Function applications]
856 funSize dflags top_args fun n_val_args voids
857 | fun `hasKey` buildIdKey = buildSize
858 | fun `hasKey` augmentIdKey = augmentSize
859 | otherwise = SizeIs size arg_discount res_discount
860 where
861 some_val_args = n_val_args > 0
862 is_join = isJoinId fun
863
864 size | is_join = jumpSize n_val_args voids
865 | not some_val_args = 0
866 | otherwise = callSize n_val_args voids
867
868 -- DISCOUNTS
869 -- See Note [Function and non-function discounts]
870 arg_discount | some_val_args && fun `elem` top_args
871 = unitBag (fun, ufFunAppDiscount dflags)
872 | otherwise = emptyBag
873 -- If the function is an argument and is applied
874 -- to some values, give it an arg-discount
875
876 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
877 | otherwise = 0
878 -- If the function is partially applied, show a result discount
879 -- XXX maybe behave like ConSize for eval'd variable
880
881 conSize :: DataCon -> Int -> ExprSize
882 conSize dc n_val_args
883 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
884
885 -- See Note [Unboxed tuple size and result discount]
886 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
887
888 -- See Note [Constructor size and result discount]
889 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
890
891 -- XXX still looks to large to me
892
893 {-
894 Note [Constructor size and result discount]
895 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
896 Treat a constructors application as size 10, regardless of how many
897 arguments it has; we are keen to expose them (and we charge separately
898 for their args). We can't treat them as size zero, else we find that
899 (Just x) has size 0, which is the same as a lone variable; and hence
900 'v' will always be replaced by (Just x), where v is bound to Just x.
901
902 The "result discount" is applied if the result of the call is
903 scrutinised (say by a case). For a constructor application that will
904 mean the constructor application will disappear, so we don't need to
905 charge it to the function. So the discount should at least match the
906 cost of the constructor application, namely 10. But to give a bit
907 of extra incentive we give a discount of 10*(1 + n_val_args).
908
909 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
910 and said it was an "unambiguous win", but its terribly dangerous
911 because a function with many many case branches, each finishing with
912 a constructor, can have an arbitrarily large discount. This led to
913 terrible code bloat: see Trac #6099.
914
915 Note [Unboxed tuple size and result discount]
916 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
917 However, unboxed tuples count as size zero. I found occasions where we had
918 f x y z = case op# x y z of { s -> (# s, () #) }
919 and f wasn't getting inlined.
920
921 I tried giving unboxed tuples a *result discount* of zero (see the
922 commented-out line). Why? When returned as a result they do not
923 allocate, so maybe we don't want to charge so much for them If you
924 have a non-zero discount here, we find that workers often get inlined
925 back into wrappers, because it look like
926 f x = case $wf x of (# a,b #) -> (a,b)
927 and we are keener because of the case. However while this change
928 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
929 more. All other changes were very small. So it's not a big deal but I
930 didn't adopt the idea.
931
932 Note [Function and non-function discounts]
933 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
934 We want a discount if the function is applied. A good example is
935 monadic combinators with continuation arguments, where inlining is
936 quite important.
937
938 But we don't want a big discount when a function is called many times
939 (see the detailed comments with Trac #6048) because if the function is
940 big it won't be inlined at its many call sites and no benefit results.
941 Indeed, we can get exponentially big inlinings this way; that is what
942 Trac #6048 is about.
943
944 On the other hand, for data-valued arguments, if there are lots of
945 case expressions in the body, each one will get smaller if we apply
946 the function to a constructor application, so we *want* a big discount
947 if the argument is scrutinised by many case expressions.
948
949 Conclusion:
950 - For functions, take the max of the discounts
951 - For data values, take the sum of the discounts
952
953
954 Note [Literal integer size]
955 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
956 Literal integers *can* be big (mkInteger [...coefficients...]), but
957 need not be (S# n). We just use an arbitrary big-ish constant here
958 so that, in particular, we don't inline top-level defns like
959 n = S# 5
960 There's no point in doing so -- any optimisations will see the S#
961 through n's unfolding. Nor will a big size inhibit unfoldings functions
962 that mention a literal Integer, because the float-out pass will float
963 all those constants to top level.
964 -}
965
966 primOpSize :: PrimOp -> Int -> ExprSize
967 primOpSize op n_val_args
968 = if primOpOutOfLine op
969 then sizeN (op_size + n_val_args)
970 else sizeN op_size
971 where
972 op_size = primOpCodeSize op
973
974
975 buildSize :: ExprSize
976 buildSize = SizeIs 0 emptyBag 40
977 -- We really want to inline applications of build
978 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
979 -- Indeed, we should add a result_discount because build is
980 -- very like a constructor. We don't bother to check that the
981 -- build is saturated (it usually is). The "-2" discounts for the \c n,
982 -- The "4" is rather arbitrary.
983
984 augmentSize :: ExprSize
985 augmentSize = SizeIs 0 emptyBag 40
986 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
987 -- e plus ys. The -2 accounts for the \cn
988
989 -- When we return a lambda, give a discount if it's used (applied)
990 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
991 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
992 lamScrutDiscount _ TooBig = TooBig
993
994 {-
995 Note [addAltSize result discounts]
996 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
997 When adding the size of alternatives, we *add* the result discounts
998 too, rather than take the *maximum*. For a multi-branch case, this
999 gives a discount for each branch that returns a constructor, making us
1000 keener to inline. I did try using 'max' instead, but it makes nofib
1001 'rewrite' and 'puzzle' allocate significantly more, and didn't make
1002 binary sizes shrink significantly either.
1003
1004 Note [Discounts and thresholds]
1005 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1006 Constants for discounts and thesholds are defined in main/DynFlags,
1007 all of form ufXxxx. They are:
1008
1009 ufCreationThreshold
1010 At a definition site, if the unfolding is bigger than this, we
1011 may discard it altogether
1012
1013 ufUseThreshold
1014 At a call site, if the unfolding, less discounts, is smaller than
1015 this, then it's small enough inline
1016
1017 ufKeenessFactor
1018 Factor by which the discounts are multiplied before
1019 subtracting from size
1020
1021 ufDictDiscount
1022 The discount for each occurrence of a dictionary argument
1023 as an argument of a class method. Should be pretty small
1024 else big functions may get inlined
1025
1026 ufFunAppDiscount
1027 Discount for a function argument that is applied. Quite
1028 large, because if we inline we avoid the higher-order call.
1029
1030 ufDearOp
1031 The size of a foreign call or not-dupable PrimOp
1032
1033 ufVeryAggressive
1034 If True, the compiler ignores all the thresholds and inlines very
1035 aggressively. It still adheres to arity, simplifier phase control and
1036 loop breakers.
1037
1038
1039 Note [Function applications]
1040 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1041 In a function application (f a b)
1042
1043 - If 'f' is an argument to the function being analysed,
1044 and there's at least one value arg, record a FunAppDiscount for f
1045
1046 - If the application if a PAP (arity > 2 in this example)
1047 record a *result* discount (because inlining
1048 with "extra" args in the call may mean that we now
1049 get a saturated application)
1050
1051 Code for manipulating sizes
1052 -}
1053
1054 -- | The size of a candidate expression for unfolding
1055 data ExprSize
1056 = TooBig
1057 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
1058 , _es_args :: !(Bag (Id,Int))
1059 -- ^ Arguments cased herein, and discount for each such
1060 , _es_discount :: {-# UNPACK #-} !Int
1061 -- ^ Size to subtract if result is scrutinised by a case
1062 -- expression
1063 }
1064
1065 instance Outputable ExprSize where
1066 ppr TooBig = text "TooBig"
1067 ppr (SizeIs a _ c) = brackets (int a <+> int c)
1068
1069 -- subtract the discount before deciding whether to bale out. eg. we
1070 -- want to inline a large constructor application into a selector:
1071 -- tup = (a_1, ..., a_99)
1072 -- x = case tup of ...
1073 --
1074 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
1075 mkSizeIs max n xs d | (n - d) > max = TooBig
1076 | otherwise = SizeIs n xs d
1077
1078 maxSize :: ExprSize -> ExprSize -> ExprSize
1079 maxSize TooBig _ = TooBig
1080 maxSize _ TooBig = TooBig
1081 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
1082 | otherwise = s2
1083
1084 sizeZero :: ExprSize
1085 sizeN :: Int -> ExprSize
1086
1087 sizeZero = SizeIs 0 emptyBag 0
1088 sizeN n = SizeIs n emptyBag 0
1089
1090 {-
1091 ************************************************************************
1092 * *
1093 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
1094 * *
1095 ************************************************************************
1096
1097 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
1098 we ``couldn't possibly use'' on the other side. Can be overridden w/
1099 flaggery. Just the same as smallEnoughToInline, except that it has no
1100 actual arguments.
1101 -}
1102
1103 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1104 couldBeSmallEnoughToInline dflags threshold rhs
1105 = case sizeExpr dflags threshold [] body of
1106 TooBig -> False
1107 _ -> True
1108 where
1109 (_, body) = collectBinders rhs
1110
1111 ----------------
1112 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1113 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1114 = size <= ufUseThreshold dflags
1115 smallEnoughToInline _ _
1116 = False
1117
1118 ----------------
1119
1120 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1121 -- ^ Sees if the unfolding is pretty certain to inline.
1122 -- If so, return a *stable* unfolding for it, that will always inline.
1123 certainlyWillInline dflags fn_info
1124 = case unfoldingInfo fn_info of
1125 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1126 | loop_breaker -> Nothing -- Won't inline, so try w/w
1127 | noinline -> Nothing -- See Note [Worker-wrapper for NOINLINE functions]
1128 | otherwise -> do_cunf e g -- Depends on size, so look at that
1129
1130 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1131 -- to do so, and even if it is currently a
1132 -- loop breaker, it may not be later
1133
1134 _other_unf -> Nothing
1135
1136 where
1137 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1138 noinline = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline
1139 fn_unf = unfoldingInfo fn_info
1140
1141 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1142 do_cunf _ UnfNever = Nothing
1143 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1144 -- INLINE functions have UnfWhen
1145
1146 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1147 -- binary sizes go up by 10%! (This is with SplitObjs.)
1148 -- I'm not totally sure why.
1149 -- INLINABLE functions come via this path
1150 -- See Note [certainlyWillInline: INLINABLE]
1151 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1152 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1153 , not (isBottomingSig (strictnessInfo fn_info))
1154 -- Do not unconditionally inline a bottoming functions even if
1155 -- it seems smallish. We've carefully lifted it out to top level,
1156 -- so we don't want to re-inline it.
1157 , let arity = length args
1158 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1159 = Just (fn_unf { uf_src = InlineStable
1160 , uf_guidance = UnfWhen { ug_arity = arity
1161 , ug_unsat_ok = unSaturatedOk
1162 , ug_boring_ok = inlineBoringOk expr } })
1163 -- Note the "unsaturatedOk". A function like f = \ab. a
1164 -- will certainly inline, even if partially applied (f e), so we'd
1165 -- better make sure that the transformed inlining has the same property
1166 | otherwise
1167 = Nothing
1168
1169 {- Note [certainlyWillInline: be careful of thunks]
1170 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1171 Don't claim that thunks will certainly inline, because that risks work
1172 duplication. Even if the work duplication is not great (eg is_cheap
1173 holds), it can make a big difference in an inner loop In Trac #5623 we
1174 found that the WorkWrap phase thought that
1175 y = case x of F# v -> F# (v +# v)
1176 was certainlyWillInline, so the addition got duplicated.
1177
1178 Note [certainlyWillInline: INLINABLE]
1179 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1180 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1181 even though we have a stable inlining, so that strictness w/w takes
1182 place. It makes a big difference to efficiency, and the w/w pass knows
1183 how to transfer the INLINABLE info to the worker; see WorkWrap
1184 Note [Worker-wrapper for INLINABLE functions]
1185
1186 ************************************************************************
1187 * *
1188 \subsection{callSiteInline}
1189 * *
1190 ************************************************************************
1191
1192 This is the key function. It decides whether to inline a variable at a call site
1193
1194 callSiteInline is used at call sites, so it is a bit more generous.
1195 It's a very important function that embodies lots of heuristics.
1196 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1197 and occurs exactly once or
1198 occurs once in each branch of a case and is small
1199
1200 If the thing is in WHNF, there's no danger of duplicating work,
1201 so we can inline if it occurs once, or is small
1202
1203 NOTE: we don't want to inline top-level functions that always diverge.
1204 It just makes the code bigger. Tt turns out that the convenient way to prevent
1205 them inlining is to give them a NOINLINE pragma, which we do in
1206 StrictAnal.addStrictnessInfoToTopId
1207 -}
1208
1209 callSiteInline :: DynFlags
1210 -> Id -- The Id
1211 -> Bool -- True <=> unfolding is active
1212 -> Bool -- True if there are no arguments at all (incl type args)
1213 -> [ArgSummary] -- One for each value arg; True if it is interesting
1214 -> CallCtxt -- True <=> continuation is interesting
1215 -> Maybe CoreExpr -- Unfolding, if any
1216
1217 data ArgSummary = TrivArg -- Nothing interesting
1218 | NonTrivArg -- Arg has structure
1219 | ValueArg -- Arg is a con-app or PAP
1220 -- ..or con-like. Note [Conlike is interesting]
1221
1222 instance Outputable ArgSummary where
1223 ppr TrivArg = text "TrivArg"
1224 ppr NonTrivArg = text "NonTrivArg"
1225 ppr ValueArg = text "ValueArg"
1226
1227 nonTriv :: ArgSummary -> Bool
1228 nonTriv TrivArg = False
1229 nonTriv _ = True
1230
1231 data CallCtxt
1232 = BoringCtxt
1233 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1234 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1235 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1236
1237 | ValAppCtxt -- We're applied to at least one value arg
1238 -- This arises when we have ((f x |> co) y)
1239 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1240
1241 | CaseCtxt -- We're the scrutinee of a case
1242 -- that decomposes its scrutinee
1243
1244 instance Outputable CallCtxt where
1245 ppr CaseCtxt = text "CaseCtxt"
1246 ppr ValAppCtxt = text "ValAppCtxt"
1247 ppr BoringCtxt = text "BoringCtxt"
1248 ppr RhsCtxt = text "RhsCtxt"
1249 ppr DiscArgCtxt = text "DiscArgCtxt"
1250 ppr RuleArgCtxt = text "RuleArgCtxt"
1251
1252 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1253 = case idUnfolding id of
1254 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1255 -- Things with an INLINE pragma may have an unfolding *and*
1256 -- be a loop breaker (maybe the knot is not yet untied)
1257 CoreUnfolding { uf_tmpl = unf_template
1258 , uf_is_work_free = is_wf
1259 , uf_guidance = guidance, uf_expandable = is_exp }
1260 | active_unfolding -> tryUnfolding dflags id lone_variable
1261 arg_infos cont_info unf_template
1262 is_wf is_exp guidance
1263 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1264 NoUnfolding -> Nothing
1265 BootUnfolding -> Nothing
1266 OtherCon {} -> Nothing
1267 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1268
1269 traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1270 traceInline dflags inline_id str doc result
1271 | Just prefix <- inlineCheck dflags
1272 = if prefix `isPrefixOf` occNameString (getOccName inline_id)
1273 then pprTrace str doc result
1274 else result
1275 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1276 = pprTrace str doc result
1277 | otherwise
1278 = result
1279
1280 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1281 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1282 -> Maybe CoreExpr
1283 tryUnfolding dflags id lone_variable
1284 arg_infos cont_info unf_template
1285 is_wf is_exp guidance
1286 = case guidance of
1287 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1288
1289 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1290 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1291 -- See Note [INLINE for small functions (3)]
1292 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1293 | otherwise
1294 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1295 where
1296 some_benefit = calc_some_benefit uf_arity
1297 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1298
1299 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1300 | ufVeryAggressive dflags
1301 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1302 | is_wf && some_benefit && small_enough
1303 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1304 | otherwise
1305 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1306 where
1307 some_benefit = calc_some_benefit (length arg_discounts)
1308 extra_doc = text "discounted size =" <+> int discounted_size
1309 discounted_size = size - discount
1310 small_enough = discounted_size <= ufUseThreshold dflags
1311 discount = computeDiscount dflags arg_discounts
1312 res_discount arg_infos cont_info
1313
1314 where
1315 mk_doc some_benefit extra_doc yes_or_no
1316 = vcat [ text "arg infos" <+> ppr arg_infos
1317 , text "interesting continuation" <+> ppr cont_info
1318 , text "some_benefit" <+> ppr some_benefit
1319 , text "is exp:" <+> ppr is_exp
1320 , text "is work-free:" <+> ppr is_wf
1321 , text "guidance" <+> ppr guidance
1322 , extra_doc
1323 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1324
1325 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1326 n_val_args = length arg_infos
1327
1328 -- some_benefit is used when the RHS is small enough
1329 -- and the call has enough (or too many) value
1330 -- arguments (ie n_val_args >= arity). But there must
1331 -- be *something* interesting about some argument, or the
1332 -- result context, to make it worth inlining
1333 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1334 -- expected by the unfolding
1335 calc_some_benefit uf_arity
1336 | not saturated = interesting_args -- Under-saturated
1337 -- Note [Unsaturated applications]
1338 | otherwise = interesting_args -- Saturated or over-saturated
1339 || interesting_call
1340 where
1341 saturated = n_val_args >= uf_arity
1342 over_saturated = n_val_args > uf_arity
1343 interesting_args = any nonTriv arg_infos
1344 -- NB: (any nonTriv arg_infos) looks at the
1345 -- over-saturated args too which is "wrong";
1346 -- but if over-saturated we inline anyway.
1347
1348 interesting_call
1349 | over_saturated
1350 = True
1351 | otherwise
1352 = case cont_info of
1353 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1354 ValAppCtxt -> True -- Note [Cast then apply]
1355 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1356 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt]
1357 RhsCtxt -> uf_arity > 0 --
1358 _other -> False -- See Note [Nested functions]
1359
1360
1361 {-
1362 Note [Unfold into lazy contexts], Note [RHS of lets]
1363 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1364 When the call is the argument of a function with a RULE, or the RHS of a let,
1365 we are a little bit keener to inline. For example
1366 f y = (y,y,y)
1367 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1368 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1369 only we can't see it. Also
1370 x = f v
1371 could be expensive whereas
1372 x = case v of (a,b) -> a
1373 is patently cheap and may allow more eta expansion.
1374 So we treat the RHS of a let as not-totally-boring.
1375
1376 Note [Unsaturated applications]
1377 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1378 When a call is not saturated, we *still* inline if one of the
1379 arguments has interesting structure. That's sometimes very important.
1380 A good example is the Ord instance for Bool in Base:
1381
1382 Rec {
1383 $fOrdBool =GHC.Classes.D:Ord
1384 @ Bool
1385 ...
1386 $cmin_ajX
1387
1388 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1389 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1390 }
1391
1392 But the defn of GHC.Classes.$dmmin is:
1393
1394 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1395 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1396 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1397 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1398 GHC.Types.False -> y GHC.Types.True -> x }) -}
1399
1400 We *really* want to inline $dmmin, even though it has arity 3, in
1401 order to unravel the recursion.
1402
1403
1404 Note [Things to watch]
1405 ~~~~~~~~~~~~~~~~~~~~~~
1406 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1407 Assume x is exported, so not inlined unconditionally.
1408 Then we want x to inline unconditionally; no reason for it
1409 not to, and doing so avoids an indirection.
1410
1411 * { x = I# 3; ....f x.... }
1412 Make sure that x does not inline unconditionally!
1413 Lest we get extra allocation.
1414
1415 Note [Inlining an InlineRule]
1416 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1417 An InlineRules is used for
1418 (a) programmer INLINE pragmas
1419 (b) inlinings from worker/wrapper
1420
1421 For (a) the RHS may be large, and our contract is that we *only* inline
1422 when the function is applied to all the arguments on the LHS of the
1423 source-code defn. (The uf_arity in the rule.)
1424
1425 However for worker/wrapper it may be worth inlining even if the
1426 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1427 require saturation.
1428
1429 Note [Nested functions]
1430 ~~~~~~~~~~~~~~~~~~~~~~~
1431 At one time we treated a call of a non-top-level function as
1432 "interesting" (regardless of how boring the context) in the hope
1433 that inlining it would eliminate the binding, and its allocation.
1434 Specifically, in the default case of interesting_call we had
1435 _other -> not is_top && uf_arity > 0
1436
1437 But actually postInlineUnconditionally does some of this and overall
1438 it makes virtually no difference to nofib. So I simplified away this
1439 special case
1440
1441 Note [Cast then apply]
1442 ~~~~~~~~~~~~~~~~~~~~~~
1443 Consider
1444 myIndex = __inline_me ( (/\a. <blah>) |> co )
1445 co :: (forall a. a -> a) ~ (forall a. T a)
1446 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1447
1448 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1449 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1450
1451 Note [Inlining in ArgCtxt]
1452 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1453 The condition (arity > 0) here is very important, because otherwise
1454 we end up inlining top-level stuff into useless places; eg
1455 x = I# 3#
1456 f = \y. g x
1457 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1458 and 20% to 'power'.
1459
1460 At one stage I replaced this condition by 'True' (leading to the above
1461 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1462 to work ok now.
1463
1464 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1465 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1466 for the RHS of a 'let', we only profit from the inlining if we get a
1467 CONLIKE thing (modulo lets).
1468
1469 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1470 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1471 The "lone-variable" case is important. I spent ages messing about
1472 with unsatisfactory variants, but this is nice. The idea is that if a
1473 variable appears all alone
1474
1475 as an arg of lazy fn, or rhs BoringCtxt
1476 as scrutinee of a case CaseCtxt
1477 as arg of a fn ArgCtxt
1478 AND
1479 it is bound to a cheap expression
1480
1481 then we should not inline it (unless there is some other reason,
1482 e.g. it is the sole occurrence). That is what is happening at
1483 the use of 'lone_variable' in 'interesting_call'.
1484
1485 Why? At least in the case-scrutinee situation, turning
1486 let x = (a,b) in case x of y -> ...
1487 into
1488 let x = (a,b) in case (a,b) of y -> ...
1489 and thence to
1490 let x = (a,b) in let y = (a,b) in ...
1491 is bad if the binding for x will remain.
1492
1493 Another example: I discovered that strings
1494 were getting inlined straight back into applications of 'error'
1495 because the latter is strict.
1496 s = "foo"
1497 f = \x -> ...(error s)...
1498
1499 Fundamentally such contexts should not encourage inlining because, provided
1500 the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1501 context can ``see'' the unfolding of the variable (e.g. case or a
1502 RULE) so there's no gain.
1503
1504 However, watch out:
1505
1506 * Consider this:
1507 foo = _inline_ (\n. [n])
1508 bar = _inline_ (foo 20)
1509 baz = \n. case bar of { (m:_) -> m + n }
1510 Here we really want to inline 'bar' so that we can inline 'foo'
1511 and the whole thing unravels as it should obviously do. This is
1512 important: in the NDP project, 'bar' generates a closure data
1513 structure rather than a list.
1514
1515 So the non-inlining of lone_variables should only apply if the
1516 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1517 looks through the unfolding. Hence the "&& is_wf" in the
1518 InlineRule branch.
1519
1520 * Even a type application or coercion isn't a lone variable.
1521 Consider
1522 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1523 We had better inline that sucker! The case won't see through it.
1524
1525 For now, I'm treating treating a variable applied to types
1526 in a *lazy* context "lone". The motivating example was
1527 f = /\a. \x. BIG
1528 g = /\a. \y. h (f a)
1529 There's no advantage in inlining f here, and perhaps
1530 a significant disadvantage. Hence some_val_args in the Stop case
1531
1532 Note [Interaction of exprIsWorkFree and lone variables]
1533 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1534 The lone-variable test says "don't inline if a case expression
1535 scrutinises a lone variable whose unfolding is cheap". It's very
1536 important that, under these circumstances, exprIsConApp_maybe
1537 can spot a constructor application. So, for example, we don't
1538 consider
1539 let x = e in (x,x)
1540 to be cheap, and that's good because exprIsConApp_maybe doesn't
1541 think that expression is a constructor application.
1542
1543 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1544 rather than is_wf, which was utterly wrong, because the above
1545 expression responds True to exprIsHNF, which is what sets is_value.
1546
1547 This kind of thing can occur if you have
1548
1549 {-# INLINE foo #-}
1550 foo = let x = e in (x,x)
1551
1552 which Roman did.
1553
1554
1555 -}
1556
1557 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1558 -> Int
1559 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1560 -- We multiple the raw discounts (args_discount and result_discount)
1561 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1562 -- *size* whereas the discounts imply that there's some extra
1563 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1564 -- by inlining.
1565
1566 = 10 -- Discount of 10 because the result replaces the call
1567 -- so we count 10 for the function itself
1568
1569 + 10 * length actual_arg_discounts
1570 -- Discount of 10 for each arg supplied,
1571 -- because the result replaces the call
1572
1573 + round (ufKeenessFactor dflags *
1574 fromIntegral (total_arg_discount + res_discount'))
1575 where
1576 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1577 total_arg_discount = sum actual_arg_discounts
1578
1579 mk_arg_discount _ TrivArg = 0
1580 mk_arg_discount _ NonTrivArg = 10
1581 mk_arg_discount discount ValueArg = discount
1582
1583 res_discount'
1584 | LT <- arg_discounts `compareLength` arg_infos
1585 = res_discount -- Over-saturated
1586 | otherwise
1587 = case cont_info of
1588 BoringCtxt -> 0
1589 CaseCtxt -> res_discount -- Presumably a constructor
1590 ValAppCtxt -> res_discount -- Presumably a function
1591 _ -> 40 `min` res_discount
1592 -- ToDo: this 40 `min` res_discount doesn't seem right
1593 -- for DiscArgCtxt it shouldn't matter because the function will
1594 -- get the arg discount for any non-triv arg
1595 -- for RuleArgCtxt we do want to be keener to inline; but not only
1596 -- constructor results
1597 -- for RhsCtxt I suppose that exposing a data con is good in general
1598 -- And 40 seems very arbitrary
1599 --
1600 -- res_discount can be very large when a function returns
1601 -- constructors; but we only want to invoke that large discount
1602 -- when there's a case continuation.
1603 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1604 -- But we want to aovid inlining large functions that return
1605 -- constructors into contexts that are simply "interesting"