3ac35c98484b2726c35febec32d10a458fa15ed9
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import PprCore () -- Instances
50 import OccurAnal ( occurAnalyseExpr_NoBinderSwap )
51 import CoreOpt
52 import CoreArity ( manifestArity )
53 import CoreUtils
54 import Id
55 import Demand ( isBottomingSig )
56 import DataCon
57 import Literal
58 import PrimOp
59 import IdInfo
60 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
61 import Type
62 import PrelNames
63 import TysPrim ( realWorldStatePrimTy )
64 import Bag
65 import Util
66 import Outputable
67 import ForeignCall
68 import Name
69
70 import qualified Data.ByteString as BS
71 import Data.List
72
73 {-
74 ************************************************************************
75 * *
76 \subsection{Making unfoldings}
77 * *
78 ************************************************************************
79 -}
80
81 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
82 mkTopUnfolding dflags is_bottoming rhs
83 = mkUnfolding dflags InlineRhs True is_bottoming rhs
84
85 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
86 -- For implicit Ids, do a tiny bit of optimising first
87 mkImplicitUnfolding dflags expr
88 = mkTopUnfolding dflags False (simpleOptExpr dflags expr)
89
90 -- Note [Top-level flag on inline rules]
91 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 -- Slight hack: note that mk_inline_rules conservatively sets the
93 -- top-level flag to True. It gets set more accurately by the simplifier
94 -- Simplify.simplUnfolding.
95
96 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
97 mkSimpleUnfolding dflags rhs
98 = mkUnfolding dflags InlineRhs False False rhs
99
100 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
101 mkDFunUnfolding bndrs con ops
102 = DFunUnfolding { df_bndrs = bndrs
103 , df_con = con
104 , df_args = map occurAnalyseExpr_NoBinderSwap ops }
105 -- See Note [Occurrence analysis of unfoldings]
106
107 mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
108 mkWwInlineRule dflags expr arity
109 = mkCoreUnfolding InlineStable True
110 (simpleOptExpr dflags expr)
111 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
112 , ug_boring_ok = boringCxtNotOk })
113
114 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
115 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
116 = mkCoreUnfolding InlineCompulsory True
117 (simpleOptExpr unsafeGlobalDynFlags expr)
118 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
119 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
120
121 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
122 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
123 mkWorkerUnfolding dflags work_fn
124 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
125 , uf_is_top = top_lvl })
126 | isStableSource src
127 = mkCoreUnfolding src top_lvl new_tmpl guidance
128 where
129 new_tmpl = simpleOptExpr dflags (work_fn tmpl)
130 guidance = calcUnfoldingGuidance dflags False new_tmpl
131
132 mkWorkerUnfolding _ _ _ = noUnfolding
133
134 -- | Make an unfolding that may be used unsaturated
135 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
136 -- manifest arity (the number of outer lambdas applications will
137 -- resolve before doing any work).
138 mkInlineUnfolding :: CoreExpr -> Unfolding
139 mkInlineUnfolding expr
140 = mkCoreUnfolding InlineStable
141 True -- Note [Top-level flag on inline rules]
142 expr' guide
143 where
144 expr' = simpleOptExpr unsafeGlobalDynFlags expr
145 guide = UnfWhen { ug_arity = manifestArity expr'
146 , ug_unsat_ok = unSaturatedOk
147 , ug_boring_ok = boring_ok }
148 boring_ok = inlineBoringOk expr'
149
150 -- | Make an unfolding that will be used once the RHS has been saturated
151 -- to the given arity.
152 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
153 mkInlineUnfoldingWithArity arity expr
154 = mkCoreUnfolding InlineStable
155 True -- Note [Top-level flag on inline rules]
156 expr' guide
157 where
158 expr' = simpleOptExpr unsafeGlobalDynFlags expr
159 guide = UnfWhen { ug_arity = arity
160 , ug_unsat_ok = needSaturated
161 , ug_boring_ok = boring_ok }
162 -- See Note [INLINE pragmas and boring contexts] as to why we need to look
163 -- at the arity here.
164 boring_ok | arity == 0 = True
165 | otherwise = inlineBoringOk expr'
166
167 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
168 mkInlinableUnfolding dflags expr
169 = mkUnfolding dflags InlineStable False False expr'
170 where
171 expr' = simpleOptExpr dflags expr
172
173 specUnfolding :: DynFlags -> [Var] -> (CoreExpr -> CoreExpr) -> Arity
174 -> Unfolding -> Unfolding
175 -- See Note [Specialising unfoldings]
176 -- specUnfolding spec_bndrs spec_app arity_decrease unf
177 -- = \spec_bndrs. spec_app( unf )
178 --
179 specUnfolding dflags spec_bndrs spec_app arity_decrease
180 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
181 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
182 mkDFunUnfolding spec_bndrs con (map spec_arg args)
183 -- There is a hard-to-check assumption here that the spec_app has
184 -- enough applications to exactly saturate the old_bndrs
185 -- For DFunUnfoldings we transform
186 -- \old_bndrs. MkD <op1> ... <opn>
187 -- to
188 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
189 -- The ASSERT checks the value part of that
190 where
191 spec_arg arg = simpleOptExpr dflags (spec_app (mkLams old_bndrs arg))
192 -- The beta-redexes created by spec_app will be
193 -- simplified away by simplOptExpr
194
195 specUnfolding dflags spec_bndrs spec_app arity_decrease
196 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
197 , uf_is_top = top_lvl
198 , uf_guidance = old_guidance })
199 | isStableSource src -- See Note [Specialising unfoldings]
200 , UnfWhen { ug_arity = old_arity
201 , ug_unsat_ok = unsat_ok
202 , ug_boring_ok = boring_ok } <- old_guidance
203 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
204 , ug_unsat_ok = unsat_ok
205 , ug_boring_ok = boring_ok }
206 new_tmpl = simpleOptExpr dflags (mkLams spec_bndrs (spec_app tmpl))
207 -- The beta-redexes created by spec_app will be
208 -- simplified away by simplOptExpr
209
210 in mkCoreUnfolding src top_lvl new_tmpl guidance
211
212 specUnfolding _ _ _ _ _ = noUnfolding
213
214 {- Note [Specialising unfoldings]
215 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
216 When we specialise a function for some given type-class arguments, we use
217 specUnfolding to specialise its unfolding. Some important points:
218
219 * If the original function has a DFunUnfolding, the specialised one
220 must do so too! Otherwise we lose the magic rules that make it
221 interact with ClassOps
222
223 * There is a bit of hack for INLINABLE functions:
224 f :: Ord a => ....
225 f = <big-rhs>
226 {- INLINABLE f #-}
227 Now if we specialise f, should the specialised version still have
228 an INLINABLE pragma? If it does, we'll capture a specialised copy
229 of <big-rhs> as its unfolding, and that probaby won't inline. But
230 if we don't, the specialised version of <big-rhs> might be small
231 enough to inline at a call site. This happens with Control.Monad.liftM3,
232 and can cause a lot more allocation as a result (nofib n-body shows this).
233
234 Moreover, keeping the INLINABLE thing isn't much help, because
235 the specialised function (probaby) isn't overloaded any more.
236
237 Conclusion: drop the INLINEALE pragma. In practice what this means is:
238 if a stable unfolding has UnfoldingGuidance of UnfWhen,
239 we keep it (so the specialised thing too will always inline)
240 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
241 (which arises from INLINABLE), we discard it
242
243 Note [Honour INLINE on 0-ary bindings]
244 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
245 Consider
246
247 x = <expensive>
248 {-# INLINE x #-}
249
250 f y = ...x...
251
252 The semantics of an INLINE pragma is
253
254 inline x at every call site, provided it is saturated;
255 that is, applied to at least as many arguments as appear
256 on the LHS of the Haskell source definition.
257
258 (This soure-code-derived arity is stored in the `ug_arity` field of
259 the `UnfoldingGuidance`.)
260
261 In the example, x's ug_arity is 0, so we should inline it at every use
262 site. It's rare to have such an INLINE pragma (usually INLINE Is on
263 functions), but it's occasionally very important (Trac #15578, #15519).
264 In #15519 we had something like
265 x = case (g a b) of I# r -> T r
266 {-# INLINE x #-}
267 f y = ...(h x)....
268
269 where h is strict. So we got
270 f y = ...(case g a b of I# r -> h (T r))...
271
272 and that in turn allowed SpecConstr to ramp up performance.
273
274 How do we deliver on this? By adjusting the ug_boring_ok
275 flag in mkInlineUnfoldingWithArity; see
276 Note [INLINE pragmas and boring contexts]
277
278 NB: there is a real risk that full laziness will float it right back
279 out again. Consider again
280 x = factorial 200
281 {-# INLINE x #-}
282 f y = ...x...
283
284 After inlining we get
285 f y = ...(factorial 200)...
286
287 but it's entirely possible that full laziness will do
288 lvl23 = factorial 200
289 f y = ...lvl23...
290
291 That's a problem for another day.
292
293 Note [INLINE pragmas and boring contexts]
294 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
295 An INLINE pragma uses mkInlineUnfoldingWithArity to build the
296 unfolding. That sets the ug_boring_ok flag to False if the function
297 is not tiny (inlineBoringOK), so that even INLINE functions are not
298 inlined in an utterly boring context. E.g.
299 \x y. Just (f y x)
300 Nothing is gained by inlining f here, even if it has an INLINE
301 pragma.
302
303 But for 0-ary bindings, we want to inline regardless; see
304 Note [Honour INLINE on 0-ary bindings].
305
306 I'm a bit worried that it's possible for the same kind of problem
307 to arise for non-0-ary functions too, but let's wait and see.
308 -}
309
310 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
311 -> UnfoldingGuidance -> Unfolding
312 -- Occurrence-analyses the expression before capturing it
313 mkCoreUnfolding src top_lvl expr guidance
314 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr,
315 -- See Note [Occurrence analysis of unfoldings]
316 uf_src = src,
317 uf_is_top = top_lvl,
318 uf_is_value = exprIsHNF expr,
319 uf_is_conlike = exprIsConLike expr,
320 uf_is_work_free = exprIsWorkFree expr,
321 uf_expandable = exprIsExpandable expr,
322 uf_guidance = guidance }
323
324 mkUnfolding :: DynFlags -> UnfoldingSource
325 -> Bool -- Is top-level
326 -> Bool -- Definitely a bottoming binding
327 -- (only relevant for top-level bindings)
328 -> CoreExpr
329 -> Unfolding
330 -- Calculates unfolding guidance
331 -- Occurrence-analyses the expression before capturing it
332 mkUnfolding dflags src is_top_lvl is_bottoming expr
333 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr,
334 -- See Note [Occurrence analysis of unfoldings]
335 uf_src = src,
336 uf_is_top = is_top_lvl,
337 uf_is_value = exprIsHNF expr,
338 uf_is_conlike = exprIsConLike expr,
339 uf_expandable = exprIsExpandable expr,
340 uf_is_work_free = exprIsWorkFree expr,
341 uf_guidance = guidance }
342 where
343 is_top_bottoming = is_top_lvl && is_bottoming
344 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
345 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr_NoBinderSwap expr))!
346 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
347
348 {-
349 Note [Occurrence analysis of unfoldings]
350 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
351 We do occurrence-analysis of unfoldings once and for all, when the
352 unfolding is built, rather than each time we inline them.
353
354 But given this decision it's vital that we do
355 *always* do it. Consider this unfolding
356 \x -> letrec { f = ...g...; g* = f } in body
357 where g* is (for some strange reason) the loop breaker. If we don't
358 occ-anal it when reading it in, we won't mark g as a loop breaker, and
359 we may inline g entirely in body, dropping its binding, and leaving
360 the occurrence in f out of scope. This happened in Trac #8892, where
361 the unfolding in question was a DFun unfolding.
362
363 But more generally, the simplifier is designed on the
364 basis that it is looking at occurrence-analysed expressions, so better
365 ensure that they acutally are.
366
367 We use occurAnalyseExpr_NoBinderSwap instead of occurAnalyseExpr;
368 see Note [No binder swap in unfoldings].
369
370 Note [No binder swap in unfoldings]
371 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
372 The binder swap can temporarily violate Core Lint, by assinging
373 a LocalId binding to a GlobalId. For example, if A.foo{r872}
374 is a GlobalId with unique r872, then
375
376 case A.foo{r872} of bar {
377 K x -> ...(A.foo{r872})...
378 }
379
380 gets transformed to
381
382 case A.foo{r872} of bar {
383 K x -> let foo{r872} = bar
384 in ...(A.foo{r872})...
385
386 This is usually not a problem, because the simplifier will transform
387 this to:
388
389 case A.foo{r872} of bar {
390 K x -> ...(bar)...
391
392 However, after occurrence analysis but before simplification, this extra 'let'
393 violates the Core Lint invariant that we do not have local 'let' bindings for
394 GlobalIds. That seems (just) tolerable for the occurrence analysis that happens
395 just before the Simplifier, but not for unfoldings, which are Linted
396 independently.
397 As a quick workaround, we disable binder swap in this module.
398 See Trac #16288 and #16296 for further plans.
399
400 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
401 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
402 Notice that we give the non-occur-analysed expression to
403 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
404 first; for example, sometimes during simplification, there's a large
405 let-bound thing which has been substituted, and so is now dead; so
406 'expr' contains two copies of the thing while the occurrence-analysed
407 expression doesn't.
408
409 Nevertheless, we *don't* and *must not* occ-analyse before computing
410 the size because
411
412 a) The size computation bales out after a while, whereas occurrence
413 analysis does not.
414
415 b) Residency increases sharply if you occ-anal first. I'm not
416 100% sure why, but it's a large effect. Compiling Cabal went
417 from residency of 534M to over 800M with this one change.
418
419 This can occasionally mean that the guidance is very pessimistic;
420 it gets fixed up next round. And it should be rare, because large
421 let-bound things that are dead are usually caught by preInlineUnconditionally
422
423
424 ************************************************************************
425 * *
426 \subsection{The UnfoldingGuidance type}
427 * *
428 ************************************************************************
429 -}
430
431 inlineBoringOk :: CoreExpr -> Bool
432 -- See Note [INLINE for small functions]
433 -- True => the result of inlining the expression is
434 -- no bigger than the expression itself
435 -- eg (\x y -> f y x)
436 -- This is a quick and dirty version. It doesn't attempt
437 -- to deal with (\x y z -> x (y z))
438 -- The really important one is (x `cast` c)
439 inlineBoringOk e
440 = go 0 e
441 where
442 go :: Int -> CoreExpr -> Bool
443 go credit (Lam x e) | isId x = go (credit+1) e
444 | otherwise = go credit e
445 go credit (App f (Type {})) = go credit f
446 go credit (App f a) | credit > 0
447 , exprIsTrivial a = go (credit-1) f
448 go credit (Tick _ e) = go credit e -- dubious
449 go credit (Cast e _) = go credit e
450 go _ (Var {}) = boringCxtOk
451 go _ _ = boringCxtNotOk
452
453 calcUnfoldingGuidance
454 :: DynFlags
455 -> Bool -- Definitely a top-level, bottoming binding
456 -> CoreExpr -- Expression to look at
457 -> UnfoldingGuidance
458 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
459 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
460 = calcUnfoldingGuidance dflags is_top_bottoming expr
461 calcUnfoldingGuidance dflags is_top_bottoming expr
462 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
463 TooBig -> UnfNever
464 SizeIs size cased_bndrs scrut_discount
465 | uncondInline expr n_val_bndrs size
466 -> UnfWhen { ug_unsat_ok = unSaturatedOk
467 , ug_boring_ok = boringCxtOk
468 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
469
470 | is_top_bottoming
471 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
472
473 | otherwise
474 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
475 , ug_size = size
476 , ug_res = scrut_discount }
477
478 where
479 (bndrs, body) = collectBinders expr
480 bOMB_OUT_SIZE = ufCreationThreshold dflags
481 -- Bomb out if size gets bigger than this
482 val_bndrs = filter isId bndrs
483 n_val_bndrs = length val_bndrs
484
485 mk_discount :: Bag (Id,Int) -> Id -> Int
486 mk_discount cbs bndr = foldlBag combine 0 cbs
487 where
488 combine acc (bndr', disc)
489 | bndr == bndr' = acc `plus_disc` disc
490 | otherwise = acc
491
492 plus_disc :: Int -> Int -> Int
493 plus_disc | isFunTy (idType bndr) = max
494 | otherwise = (+)
495 -- See Note [Function and non-function discounts]
496
497 {-
498 Note [Computing the size of an expression]
499 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
500 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
501 heuristics right has taken a long time. Here's the basic strategy:
502
503 * Variables, literals: 0
504 (Exception for string literals, see litSize.)
505
506 * Function applications (f e1 .. en): 1 + #value args
507
508 * Constructor applications: 1, regardless of #args
509
510 * Let(rec): 1 + size of components
511
512 * Note, cast: 0
513
514 Examples
515
516 Size Term
517 --------------
518 0 42#
519 0 x
520 0 True
521 2 f x
522 1 Just x
523 4 f (g x)
524
525 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
526 a function call to account for. Notice also that constructor applications
527 are very cheap, because exposing them to a caller is so valuable.
528
529 [25/5/11] All sizes are now multiplied by 10, except for primops
530 (which have sizes like 1 or 4. This makes primops look fantastically
531 cheap, and seems to be almost unversally beneficial. Done partly as a
532 result of #4978.
533
534 Note [Do not inline top-level bottoming functions]
535 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
536 The FloatOut pass has gone to some trouble to float out calls to 'error'
537 and similar friends. See Note [Bottoming floats] in SetLevels.
538 Do not re-inline them! But we *do* still inline if they are very small
539 (the uncondInline stuff).
540
541 Note [INLINE for small functions]
542 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
543 Consider {-# INLINE f #-}
544 f x = Just x
545 g y = f y
546 Then f's RHS is no larger than its LHS, so we should inline it into
547 even the most boring context. In general, f the function is
548 sufficiently small that its body is as small as the call itself, the
549 inline unconditionally, regardless of how boring the context is.
550
551 Things to note:
552
553 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
554 than the thing it's replacing. Notice that
555 (f x) --> (g 3) -- YES, unconditionally
556 (f x) --> x : [] -- YES, *even though* there are two
557 -- arguments to the cons
558 x --> g 3 -- NO
559 x --> Just v -- NO
560
561 It's very important not to unconditionally replace a variable by
562 a non-atomic term.
563
564 (2) We do this even if the thing isn't saturated, else we end up with the
565 silly situation that
566 f x y = x
567 ...map (f 3)...
568 doesn't inline. Even in a boring context, inlining without being
569 saturated will give a lambda instead of a PAP, and will be more
570 efficient at runtime.
571
572 (3) However, when the function's arity > 0, we do insist that it
573 has at least one value argument at the call site. (This check is
574 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
575 f = /\a \x:a. x
576 d = /\b. MkD (f b)
577 If we inline f here we get
578 d = /\b. MkD (\x:b. x)
579 and then prepareRhs floats out the argument, abstracting the type
580 variables, so we end up with the original again!
581
582 (4) We must be much more cautious about arity-zero things. Consider
583 let x = y +# z in ...
584 In *size* terms primops look very small, because the generate a
585 single instruction, but we do not want to unconditionally replace
586 every occurrence of x with (y +# z). So we only do the
587 unconditional-inline thing for *trivial* expressions.
588
589 NB: you might think that PostInlineUnconditionally would do this
590 but it doesn't fire for top-level things; see SimplUtils
591 Note [Top level and postInlineUnconditionally]
592 -}
593
594 uncondInline :: CoreExpr -> Arity -> Int -> Bool
595 -- Inline unconditionally if there no size increase
596 -- Size of call is arity (+1 for the function)
597 -- See Note [INLINE for small functions]
598 uncondInline rhs arity size
599 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
600 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
601
602 sizeExpr :: DynFlags
603 -> Int -- Bomb out if it gets bigger than this
604 -> [Id] -- Arguments; we're interested in which of these
605 -- get case'd
606 -> CoreExpr
607 -> ExprSize
608
609 -- Note [Computing the size of an expression]
610
611 sizeExpr dflags bOMB_OUT_SIZE top_args expr
612 = size_up expr
613 where
614 size_up (Cast e _) = size_up e
615 size_up (Tick _ e) = size_up e
616 size_up (Type _) = sizeZero -- Types cost nothing
617 size_up (Coercion _) = sizeZero
618 size_up (Lit lit) = sizeN (litSize lit)
619 size_up (Var f) | isRealWorldId f = sizeZero
620 -- Make sure we get constructor discounts even
621 -- on nullary constructors
622 | otherwise = size_up_call f [] 0
623
624 size_up (App fun arg)
625 | isTyCoArg arg = size_up fun
626 | otherwise = size_up arg `addSizeNSD`
627 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
628
629 size_up (Lam b e)
630 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
631 | otherwise = size_up e
632
633 size_up (Let (NonRec binder rhs) body)
634 = size_up_rhs (binder, rhs) `addSizeNSD`
635 size_up body `addSizeN`
636 size_up_alloc binder
637
638 size_up (Let (Rec pairs) body)
639 = foldr (addSizeNSD . size_up_rhs)
640 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
641 pairs
642
643 size_up (Case e _ _ alts)
644 | null alts
645 = size_up e -- case e of {} never returns, so take size of scrutinee
646
647 size_up (Case e _ _ alts)
648 -- Now alts is non-empty
649 | Just v <- is_top_arg e -- We are scrutinising an argument variable
650 = let
651 alt_sizes = map size_up_alt alts
652
653 -- alts_size tries to compute a good discount for
654 -- the case when we are scrutinising an argument variable
655 alts_size (SizeIs tot tot_disc tot_scrut)
656 -- Size of all alternatives
657 (SizeIs max _ _)
658 -- Size of biggest alternative
659 = SizeIs tot (unitBag (v, 20 + tot - max)
660 `unionBags` tot_disc) tot_scrut
661 -- If the variable is known, we produce a
662 -- discount that will take us back to 'max',
663 -- the size of the largest alternative The
664 -- 1+ is a little discount for reduced
665 -- allocation in the caller
666 --
667 -- Notice though, that we return tot_disc,
668 -- the total discount from all branches. I
669 -- think that's right.
670
671 alts_size tot_size _ = tot_size
672 in
673 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
674 (foldr1 maxSize alt_sizes)
675 -- Good to inline if an arg is scrutinised, because
676 -- that may eliminate allocation in the caller
677 -- And it eliminates the case itself
678 where
679 is_top_arg (Var v) | v `elem` top_args = Just v
680 is_top_arg (Cast e _) = is_top_arg e
681 is_top_arg _ = Nothing
682
683
684 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
685 foldr (addAltSize . size_up_alt) case_size alts
686 where
687 case_size
688 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
689 | otherwise = sizeZero
690 -- Normally we don't charge for the case itself, but
691 -- we charge one per alternative (see size_up_alt,
692 -- below) to account for the cost of the info table
693 -- and comparisons.
694 --
695 -- However, in certain cases (see is_inline_scrut
696 -- below), no code is generated for the case unless
697 -- there are multiple alts. In these cases we
698 -- subtract one, making the first alt free.
699 -- e.g. case x# +# y# of _ -> ... should cost 1
700 -- case touch# x# of _ -> ... should cost 0
701 -- (see #4978)
702 --
703 -- I would like to not have the "lengthAtMost alts 1"
704 -- condition above, but without that some programs got worse
705 -- (spectral/hartel/event and spectral/para). I don't fully
706 -- understand why. (SDM 24/5/11)
707
708 -- unboxed variables, inline primops and unsafe foreign calls
709 -- are all "inline" things:
710 is_inline_scrut (Var v) = isUnliftedType (idType v)
711 is_inline_scrut scrut
712 | (Var f, _) <- collectArgs scrut
713 = case idDetails f of
714 FCallId fc -> not (isSafeForeignCall fc)
715 PrimOpId op -> not (primOpOutOfLine op)
716 _other -> False
717 | otherwise
718 = False
719
720 size_up_rhs (bndr, rhs)
721 | Just join_arity <- isJoinId_maybe bndr
722 -- Skip arguments to join point
723 , (_bndrs, body) <- collectNBinders join_arity rhs
724 = size_up body
725 | otherwise
726 = size_up rhs
727
728 ------------
729 -- size_up_app is used when there's ONE OR MORE value args
730 size_up_app (App fun arg) args voids
731 | isTyCoArg arg = size_up_app fun args voids
732 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
733 | otherwise = size_up arg `addSizeNSD`
734 size_up_app fun (arg:args) voids
735 size_up_app (Var fun) args voids = size_up_call fun args voids
736 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
737 size_up_app (Cast expr _) args voids = size_up_app expr args voids
738 size_up_app other args voids = size_up other `addSizeN`
739 callSize (length args) voids
740 -- if the lhs is not an App or a Var, or an invisible thing like a
741 -- Tick or Cast, then we should charge for a complete call plus the
742 -- size of the lhs itself.
743
744 ------------
745 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
746 size_up_call fun val_args voids
747 = case idDetails fun of
748 FCallId _ -> sizeN (callSize (length val_args) voids)
749 DataConWorkId dc -> conSize dc (length val_args)
750 PrimOpId op -> primOpSize op (length val_args)
751 ClassOpId _ -> classOpSize dflags top_args val_args
752 _ -> funSize dflags top_args fun (length val_args) voids
753
754 ------------
755 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
756 -- Don't charge for args, so that wrappers look cheap
757 -- (See comments about wrappers with Case)
758 --
759 -- IMPORTANT: *do* charge 1 for the alternative, else we
760 -- find that giant case nests are treated as practically free
761 -- A good example is Foreign.C.Error.errnoToIOError
762
763 ------------
764 -- Cost to allocate binding with given binder
765 size_up_alloc bndr
766 | isTyVar bndr -- Doesn't exist at runtime
767 || isJoinId bndr -- Not allocated at all
768 || isUnliftedType (idType bndr) -- Doesn't live in heap
769 = 0
770 | otherwise
771 = 10
772
773 ------------
774 -- These addSize things have to be here because
775 -- I don't want to give them bOMB_OUT_SIZE as an argument
776 addSizeN TooBig _ = TooBig
777 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
778
779 -- addAltSize is used to add the sizes of case alternatives
780 addAltSize TooBig _ = TooBig
781 addAltSize _ TooBig = TooBig
782 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
783 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
784 (xs `unionBags` ys)
785 (d1 + d2) -- Note [addAltSize result discounts]
786
787 -- This variant ignores the result discount from its LEFT argument
788 -- It's used when the second argument isn't part of the result
789 addSizeNSD TooBig _ = TooBig
790 addSizeNSD _ TooBig = TooBig
791 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
792 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
793 (xs `unionBags` ys)
794 d2 -- Ignore d1
795
796 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
797
798 -- an expression of type State# RealWorld must be a variable
799 isRealWorldExpr (Var id) = isRealWorldId id
800 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
801 isRealWorldExpr _ = False
802
803 -- | Finds a nominal size of a string literal.
804 litSize :: Literal -> Int
805 -- Used by CoreUnfold.sizeExpr
806 litSize (LitNumber LitNumInteger _ _) = 100 -- Note [Size of literal integers]
807 litSize (LitNumber LitNumNatural _ _) = 100
808 litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
809 -- If size could be 0 then @f "x"@ might be too small
810 -- [Sept03: make literal strings a bit bigger to avoid fruitless
811 -- duplication of little strings]
812 litSize _other = 0 -- Must match size of nullary constructors
813 -- Key point: if x |-> 4, then x must inline unconditionally
814 -- (eg via case binding)
815
816 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
817 -- See Note [Conlike is interesting]
818 classOpSize _ _ []
819 = sizeZero
820 classOpSize dflags top_args (arg1 : other_args)
821 = SizeIs size arg_discount 0
822 where
823 size = 20 + (10 * length other_args)
824 -- If the class op is scrutinising a lambda bound dictionary then
825 -- give it a discount, to encourage the inlining of this function
826 -- The actual discount is rather arbitrarily chosen
827 arg_discount = case arg1 of
828 Var dict | dict `elem` top_args
829 -> unitBag (dict, ufDictDiscount dflags)
830 _other -> emptyBag
831
832 -- | The size of a function call
833 callSize
834 :: Int -- ^ number of value args
835 -> Int -- ^ number of value args that are void
836 -> Int
837 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
838 -- The 1+ is for the function itself
839 -- Add 1 for each non-trivial arg;
840 -- the allocation cost, as in let(rec)
841
842 -- | The size of a jump to a join point
843 jumpSize
844 :: Int -- ^ number of value args
845 -> Int -- ^ number of value args that are void
846 -> Int
847 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
848 -- A jump is 20% the size of a function call. Making jumps free reopens
849 -- bug #6048, but making them any more expensive loses a 21% improvement in
850 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
851 -- better solution?
852
853 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
854 -- Size for functions that are not constructors or primops
855 -- Note [Function applications]
856 funSize dflags top_args fun n_val_args voids
857 | fun `hasKey` buildIdKey = buildSize
858 | fun `hasKey` augmentIdKey = augmentSize
859 | otherwise = SizeIs size arg_discount res_discount
860 where
861 some_val_args = n_val_args > 0
862 is_join = isJoinId fun
863
864 size | is_join = jumpSize n_val_args voids
865 | not some_val_args = 0
866 | otherwise = callSize n_val_args voids
867
868 -- DISCOUNTS
869 -- See Note [Function and non-function discounts]
870 arg_discount | some_val_args && fun `elem` top_args
871 = unitBag (fun, ufFunAppDiscount dflags)
872 | otherwise = emptyBag
873 -- If the function is an argument and is applied
874 -- to some values, give it an arg-discount
875
876 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
877 | otherwise = 0
878 -- If the function is partially applied, show a result discount
879 -- XXX maybe behave like ConSize for eval'd variable
880
881 conSize :: DataCon -> Int -> ExprSize
882 conSize dc n_val_args
883 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
884
885 -- See Note [Unboxed tuple size and result discount]
886 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
887
888 -- See Note [Constructor size and result discount]
889 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
890
891 -- XXX still looks to large to me
892
893 {-
894 Note [Constructor size and result discount]
895 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
896 Treat a constructors application as size 10, regardless of how many
897 arguments it has; we are keen to expose them (and we charge separately
898 for their args). We can't treat them as size zero, else we find that
899 (Just x) has size 0, which is the same as a lone variable; and hence
900 'v' will always be replaced by (Just x), where v is bound to Just x.
901
902 The "result discount" is applied if the result of the call is
903 scrutinised (say by a case). For a constructor application that will
904 mean the constructor application will disappear, so we don't need to
905 charge it to the function. So the discount should at least match the
906 cost of the constructor application, namely 10. But to give a bit
907 of extra incentive we give a discount of 10*(1 + n_val_args).
908
909 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
910 and said it was an "unambiguous win", but its terribly dangerous
911 because a function with many many case branches, each finishing with
912 a constructor, can have an arbitrarily large discount. This led to
913 terrible code bloat: see Trac #6099.
914
915 Note [Unboxed tuple size and result discount]
916 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
917 However, unboxed tuples count as size zero. I found occasions where we had
918 f x y z = case op# x y z of { s -> (# s, () #) }
919 and f wasn't getting inlined.
920
921 I tried giving unboxed tuples a *result discount* of zero (see the
922 commented-out line). Why? When returned as a result they do not
923 allocate, so maybe we don't want to charge so much for them If you
924 have a non-zero discount here, we find that workers often get inlined
925 back into wrappers, because it look like
926 f x = case $wf x of (# a,b #) -> (a,b)
927 and we are keener because of the case. However while this change
928 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
929 more. All other changes were very small. So it's not a big deal but I
930 didn't adopt the idea.
931
932 Note [Function and non-function discounts]
933 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
934 We want a discount if the function is applied. A good example is
935 monadic combinators with continuation arguments, where inlining is
936 quite important.
937
938 But we don't want a big discount when a function is called many times
939 (see the detailed comments with Trac #6048) because if the function is
940 big it won't be inlined at its many call sites and no benefit results.
941 Indeed, we can get exponentially big inlinings this way; that is what
942 Trac #6048 is about.
943
944 On the other hand, for data-valued arguments, if there are lots of
945 case expressions in the body, each one will get smaller if we apply
946 the function to a constructor application, so we *want* a big discount
947 if the argument is scrutinised by many case expressions.
948
949 Conclusion:
950 - For functions, take the max of the discounts
951 - For data values, take the sum of the discounts
952
953
954 Note [Literal integer size]
955 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
956 Literal integers *can* be big (mkInteger [...coefficients...]), but
957 need not be (S# n). We just use an arbitrary big-ish constant here
958 so that, in particular, we don't inline top-level defns like
959 n = S# 5
960 There's no point in doing so -- any optimisations will see the S#
961 through n's unfolding. Nor will a big size inhibit unfoldings functions
962 that mention a literal Integer, because the float-out pass will float
963 all those constants to top level.
964 -}
965
966 primOpSize :: PrimOp -> Int -> ExprSize
967 primOpSize op n_val_args
968 = if primOpOutOfLine op
969 then sizeN (op_size + n_val_args)
970 else sizeN op_size
971 where
972 op_size = primOpCodeSize op
973
974
975 buildSize :: ExprSize
976 buildSize = SizeIs 0 emptyBag 40
977 -- We really want to inline applications of build
978 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
979 -- Indeed, we should add a result_discount because build is
980 -- very like a constructor. We don't bother to check that the
981 -- build is saturated (it usually is). The "-2" discounts for the \c n,
982 -- The "4" is rather arbitrary.
983
984 augmentSize :: ExprSize
985 augmentSize = SizeIs 0 emptyBag 40
986 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
987 -- e plus ys. The -2 accounts for the \cn
988
989 -- When we return a lambda, give a discount if it's used (applied)
990 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
991 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
992 lamScrutDiscount _ TooBig = TooBig
993
994 {-
995 Note [addAltSize result discounts]
996 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
997 When adding the size of alternatives, we *add* the result discounts
998 too, rather than take the *maximum*. For a multi-branch case, this
999 gives a discount for each branch that returns a constructor, making us
1000 keener to inline. I did try using 'max' instead, but it makes nofib
1001 'rewrite' and 'puzzle' allocate significantly more, and didn't make
1002 binary sizes shrink significantly either.
1003
1004 Note [Discounts and thresholds]
1005 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1006 Constants for discounts and thesholds are defined in main/DynFlags,
1007 all of form ufXxxx. They are:
1008
1009 ufCreationThreshold
1010 At a definition site, if the unfolding is bigger than this, we
1011 may discard it altogether
1012
1013 ufUseThreshold
1014 At a call site, if the unfolding, less discounts, is smaller than
1015 this, then it's small enough inline
1016
1017 ufKeenessFactor
1018 Factor by which the discounts are multiplied before
1019 subtracting from size
1020
1021 ufDictDiscount
1022 The discount for each occurrence of a dictionary argument
1023 as an argument of a class method. Should be pretty small
1024 else big functions may get inlined
1025
1026 ufFunAppDiscount
1027 Discount for a function argument that is applied. Quite
1028 large, because if we inline we avoid the higher-order call.
1029
1030 ufDearOp
1031 The size of a foreign call or not-dupable PrimOp
1032
1033 ufVeryAggressive
1034 If True, the compiler ignores all the thresholds and inlines very
1035 aggressively. It still adheres to arity, simplifier phase control and
1036 loop breakers.
1037
1038
1039 Note [Function applications]
1040 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1041 In a function application (f a b)
1042
1043 - If 'f' is an argument to the function being analysed,
1044 and there's at least one value arg, record a FunAppDiscount for f
1045
1046 - If the application if a PAP (arity > 2 in this example)
1047 record a *result* discount (because inlining
1048 with "extra" args in the call may mean that we now
1049 get a saturated application)
1050
1051 Code for manipulating sizes
1052 -}
1053
1054 -- | The size of a candidate expression for unfolding
1055 data ExprSize
1056 = TooBig
1057 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
1058 , _es_args :: !(Bag (Id,Int))
1059 -- ^ Arguments cased herein, and discount for each such
1060 , _es_discount :: {-# UNPACK #-} !Int
1061 -- ^ Size to subtract if result is scrutinised by a case
1062 -- expression
1063 }
1064
1065 instance Outputable ExprSize where
1066 ppr TooBig = text "TooBig"
1067 ppr (SizeIs a _ c) = brackets (int a <+> int c)
1068
1069 -- subtract the discount before deciding whether to bale out. eg. we
1070 -- want to inline a large constructor application into a selector:
1071 -- tup = (a_1, ..., a_99)
1072 -- x = case tup of ...
1073 --
1074 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
1075 mkSizeIs max n xs d | (n - d) > max = TooBig
1076 | otherwise = SizeIs n xs d
1077
1078 maxSize :: ExprSize -> ExprSize -> ExprSize
1079 maxSize TooBig _ = TooBig
1080 maxSize _ TooBig = TooBig
1081 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
1082 | otherwise = s2
1083
1084 sizeZero :: ExprSize
1085 sizeN :: Int -> ExprSize
1086
1087 sizeZero = SizeIs 0 emptyBag 0
1088 sizeN n = SizeIs n emptyBag 0
1089
1090 {-
1091 ************************************************************************
1092 * *
1093 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
1094 * *
1095 ************************************************************************
1096
1097 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
1098 we ``couldn't possibly use'' on the other side. Can be overridden w/
1099 flaggery. Just the same as smallEnoughToInline, except that it has no
1100 actual arguments.
1101 -}
1102
1103 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1104 couldBeSmallEnoughToInline dflags threshold rhs
1105 = case sizeExpr dflags threshold [] body of
1106 TooBig -> False
1107 _ -> True
1108 where
1109 (_, body) = collectBinders rhs
1110
1111 ----------------
1112 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1113 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1114 = size <= ufUseThreshold dflags
1115 smallEnoughToInline _ _
1116 = False
1117
1118 ----------------
1119
1120 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1121 -- Sees if the unfolding is pretty certain to inline
1122 -- If so, return a *stable* unfolding for it, that will always inline
1123 certainlyWillInline dflags fn_info
1124 = case unfoldingInfo fn_info of
1125 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1126 | loop_breaker -> Nothing -- Won't inline, so try w/w
1127 | otherwise -> do_cunf e g -- Depends on size, so look at that
1128
1129 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1130 -- to do so, and even if it is currently a
1131 -- loop breaker, it may not be later
1132
1133 _other_unf -> Nothing
1134
1135 where
1136 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1137 fn_unf = unfoldingInfo fn_info
1138
1139 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1140 do_cunf _ UnfNever = Nothing
1141 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1142 -- INLINE functions have UnfWhen
1143
1144 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1145 -- binary sizes go up by 10%! (This is with SplitObjs.)
1146 -- I'm not totally sure why.
1147 -- INLINABLE functions come via this path
1148 -- See Note [certainlyWillInline: INLINABLE]
1149 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1150 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1151 , case inlinePragmaSpec (inlinePragInfo fn_info) of
1152 NoInline -> False -- NOINLINE; do not say certainlyWillInline!
1153 _ -> True -- INLINE, INLINABLE, or nothing
1154 , not (isBottomingSig (strictnessInfo fn_info))
1155 -- Do not unconditionally inline a bottoming functions even if
1156 -- it seems smallish. We've carefully lifted it out to top level,
1157 -- so we don't want to re-inline it.
1158 , let arity = length args
1159 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1160 = Just (fn_unf { uf_src = InlineStable
1161 , uf_guidance = UnfWhen { ug_arity = arity
1162 , ug_unsat_ok = unSaturatedOk
1163 , ug_boring_ok = inlineBoringOk expr } })
1164 -- Note the "unsaturatedOk". A function like f = \ab. a
1165 -- will certainly inline, even if partially applied (f e), so we'd
1166 -- better make sure that the transformed inlining has the same property
1167 | otherwise
1168 = Nothing
1169
1170 {- Note [certainlyWillInline: be careful of thunks]
1171 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1172 Don't claim that thunks will certainly inline, because that risks work
1173 duplication. Even if the work duplication is not great (eg is_cheap
1174 holds), it can make a big difference in an inner loop In Trac #5623 we
1175 found that the WorkWrap phase thought that
1176 y = case x of F# v -> F# (v +# v)
1177 was certainlyWillInline, so the addition got duplicated.
1178
1179 Note [certainlyWillInline: INLINABLE]
1180 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1181 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1182 even though we have a stable inlining, so that strictness w/w takes
1183 place. It makes a big difference to efficiency, and the w/w pass knows
1184 how to transfer the INLINABLE info to the worker; see WorkWrap
1185 Note [Worker-wrapper for INLINABLE functions]
1186
1187 ************************************************************************
1188 * *
1189 \subsection{callSiteInline}
1190 * *
1191 ************************************************************************
1192
1193 This is the key function. It decides whether to inline a variable at a call site
1194
1195 callSiteInline is used at call sites, so it is a bit more generous.
1196 It's a very important function that embodies lots of heuristics.
1197 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1198 and occurs exactly once or
1199 occurs once in each branch of a case and is small
1200
1201 If the thing is in WHNF, there's no danger of duplicating work,
1202 so we can inline if it occurs once, or is small
1203
1204 NOTE: we don't want to inline top-level functions that always diverge.
1205 It just makes the code bigger. Tt turns out that the convenient way to prevent
1206 them inlining is to give them a NOINLINE pragma, which we do in
1207 StrictAnal.addStrictnessInfoToTopId
1208 -}
1209
1210 callSiteInline :: DynFlags
1211 -> Id -- The Id
1212 -> Bool -- True <=> unfolding is active
1213 -> Bool -- True if there are no arguments at all (incl type args)
1214 -> [ArgSummary] -- One for each value arg; True if it is interesting
1215 -> CallCtxt -- True <=> continuation is interesting
1216 -> Maybe CoreExpr -- Unfolding, if any
1217
1218 data ArgSummary = TrivArg -- Nothing interesting
1219 | NonTrivArg -- Arg has structure
1220 | ValueArg -- Arg is a con-app or PAP
1221 -- ..or con-like. Note [Conlike is interesting]
1222
1223 instance Outputable ArgSummary where
1224 ppr TrivArg = text "TrivArg"
1225 ppr NonTrivArg = text "NonTrivArg"
1226 ppr ValueArg = text "ValueArg"
1227
1228 nonTriv :: ArgSummary -> Bool
1229 nonTriv TrivArg = False
1230 nonTriv _ = True
1231
1232 data CallCtxt
1233 = BoringCtxt
1234 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1235 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1236 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1237
1238 | ValAppCtxt -- We're applied to at least one value arg
1239 -- This arises when we have ((f x |> co) y)
1240 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1241
1242 | CaseCtxt -- We're the scrutinee of a case
1243 -- that decomposes its scrutinee
1244
1245 instance Outputable CallCtxt where
1246 ppr CaseCtxt = text "CaseCtxt"
1247 ppr ValAppCtxt = text "ValAppCtxt"
1248 ppr BoringCtxt = text "BoringCtxt"
1249 ppr RhsCtxt = text "RhsCtxt"
1250 ppr DiscArgCtxt = text "DiscArgCtxt"
1251 ppr RuleArgCtxt = text "RuleArgCtxt"
1252
1253 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1254 = case idUnfolding id of
1255 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1256 -- Things with an INLINE pragma may have an unfolding *and*
1257 -- be a loop breaker (maybe the knot is not yet untied)
1258 CoreUnfolding { uf_tmpl = unf_template
1259 , uf_is_work_free = is_wf
1260 , uf_guidance = guidance, uf_expandable = is_exp }
1261 | active_unfolding -> tryUnfolding dflags id lone_variable
1262 arg_infos cont_info unf_template
1263 is_wf is_exp guidance
1264 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1265 NoUnfolding -> Nothing
1266 BootUnfolding -> Nothing
1267 OtherCon {} -> Nothing
1268 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1269
1270 traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1271 traceInline dflags inline_id str doc result
1272 | Just prefix <- inlineCheck dflags
1273 = if prefix `isPrefixOf` occNameString (getOccName inline_id)
1274 then pprTrace str doc result
1275 else result
1276 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1277 = pprTrace str doc result
1278 | otherwise
1279 = result
1280
1281 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1282 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1283 -> Maybe CoreExpr
1284 tryUnfolding dflags id lone_variable
1285 arg_infos cont_info unf_template
1286 is_wf is_exp guidance
1287 = case guidance of
1288 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1289
1290 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1291 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1292 -- See Note [INLINE for small functions (3)]
1293 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1294 | otherwise
1295 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1296 where
1297 some_benefit = calc_some_benefit uf_arity
1298 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1299
1300 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1301 | ufVeryAggressive dflags
1302 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1303 | is_wf && some_benefit && small_enough
1304 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1305 | otherwise
1306 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1307 where
1308 some_benefit = calc_some_benefit (length arg_discounts)
1309 extra_doc = text "discounted size =" <+> int discounted_size
1310 discounted_size = size - discount
1311 small_enough = discounted_size <= ufUseThreshold dflags
1312 discount = computeDiscount dflags arg_discounts
1313 res_discount arg_infos cont_info
1314
1315 where
1316 mk_doc some_benefit extra_doc yes_or_no
1317 = vcat [ text "arg infos" <+> ppr arg_infos
1318 , text "interesting continuation" <+> ppr cont_info
1319 , text "some_benefit" <+> ppr some_benefit
1320 , text "is exp:" <+> ppr is_exp
1321 , text "is work-free:" <+> ppr is_wf
1322 , text "guidance" <+> ppr guidance
1323 , extra_doc
1324 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1325
1326 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1327 n_val_args = length arg_infos
1328
1329 -- some_benefit is used when the RHS is small enough
1330 -- and the call has enough (or too many) value
1331 -- arguments (ie n_val_args >= arity). But there must
1332 -- be *something* interesting about some argument, or the
1333 -- result context, to make it worth inlining
1334 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1335 -- expected by the unfolding
1336 calc_some_benefit uf_arity
1337 | not saturated = interesting_args -- Under-saturated
1338 -- Note [Unsaturated applications]
1339 | otherwise = interesting_args -- Saturated or over-saturated
1340 || interesting_call
1341 where
1342 saturated = n_val_args >= uf_arity
1343 over_saturated = n_val_args > uf_arity
1344 interesting_args = any nonTriv arg_infos
1345 -- NB: (any nonTriv arg_infos) looks at the
1346 -- over-saturated args too which is "wrong";
1347 -- but if over-saturated we inline anyway.
1348
1349 interesting_call
1350 | over_saturated
1351 = True
1352 | otherwise
1353 = case cont_info of
1354 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1355 ValAppCtxt -> True -- Note [Cast then apply]
1356 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1357 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt]
1358 RhsCtxt -> uf_arity > 0 --
1359 _other -> False -- See Note [Nested functions]
1360
1361
1362 {-
1363 Note [Unfold into lazy contexts], Note [RHS of lets]
1364 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1365 When the call is the argument of a function with a RULE, or the RHS of a let,
1366 we are a little bit keener to inline. For example
1367 f y = (y,y,y)
1368 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1369 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1370 only we can't see it. Also
1371 x = f v
1372 could be expensive whereas
1373 x = case v of (a,b) -> a
1374 is patently cheap and may allow more eta expansion.
1375 So we treat the RHS of a let as not-totally-boring.
1376
1377 Note [Unsaturated applications]
1378 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1379 When a call is not saturated, we *still* inline if one of the
1380 arguments has interesting structure. That's sometimes very important.
1381 A good example is the Ord instance for Bool in Base:
1382
1383 Rec {
1384 $fOrdBool =GHC.Classes.D:Ord
1385 @ Bool
1386 ...
1387 $cmin_ajX
1388
1389 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1390 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1391 }
1392
1393 But the defn of GHC.Classes.$dmmin is:
1394
1395 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1396 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1397 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1398 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1399 GHC.Types.False -> y GHC.Types.True -> x }) -}
1400
1401 We *really* want to inline $dmmin, even though it has arity 3, in
1402 order to unravel the recursion.
1403
1404
1405 Note [Things to watch]
1406 ~~~~~~~~~~~~~~~~~~~~~~
1407 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1408 Assume x is exported, so not inlined unconditionally.
1409 Then we want x to inline unconditionally; no reason for it
1410 not to, and doing so avoids an indirection.
1411
1412 * { x = I# 3; ....f x.... }
1413 Make sure that x does not inline unconditionally!
1414 Lest we get extra allocation.
1415
1416 Note [Inlining an InlineRule]
1417 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1418 An InlineRules is used for
1419 (a) programmer INLINE pragmas
1420 (b) inlinings from worker/wrapper
1421
1422 For (a) the RHS may be large, and our contract is that we *only* inline
1423 when the function is applied to all the arguments on the LHS of the
1424 source-code defn. (The uf_arity in the rule.)
1425
1426 However for worker/wrapper it may be worth inlining even if the
1427 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1428 require saturation.
1429
1430 Note [Nested functions]
1431 ~~~~~~~~~~~~~~~~~~~~~~~
1432 At one time we treated a call of a non-top-level function as
1433 "interesting" (regardless of how boring the context) in the hope
1434 that inlining it would eliminate the binding, and its allocation.
1435 Specifically, in the default case of interesting_call we had
1436 _other -> not is_top && uf_arity > 0
1437
1438 But actually postInlineUnconditionally does some of this and overall
1439 it makes virtually no difference to nofib. So I simplified away this
1440 special case
1441
1442 Note [Cast then apply]
1443 ~~~~~~~~~~~~~~~~~~~~~~
1444 Consider
1445 myIndex = __inline_me ( (/\a. <blah>) |> co )
1446 co :: (forall a. a -> a) ~ (forall a. T a)
1447 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1448
1449 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1450 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1451
1452 Note [Inlining in ArgCtxt]
1453 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1454 The condition (arity > 0) here is very important, because otherwise
1455 we end up inlining top-level stuff into useless places; eg
1456 x = I# 3#
1457 f = \y. g x
1458 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1459 and 20% to 'power'.
1460
1461 At one stage I replaced this condition by 'True' (leading to the above
1462 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1463 to work ok now.
1464
1465 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1466 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1467 for the RHS of a 'let', we only profit from the inlining if we get a
1468 CONLIKE thing (modulo lets).
1469
1470 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1471 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1472 The "lone-variable" case is important. I spent ages messing about
1473 with unsatisfactory variants, but this is nice. The idea is that if a
1474 variable appears all alone
1475
1476 as an arg of lazy fn, or rhs BoringCtxt
1477 as scrutinee of a case CaseCtxt
1478 as arg of a fn ArgCtxt
1479 AND
1480 it is bound to a cheap expression
1481
1482 then we should not inline it (unless there is some other reason,
1483 e.g. it is the sole occurrence). That is what is happening at
1484 the use of 'lone_variable' in 'interesting_call'.
1485
1486 Why? At least in the case-scrutinee situation, turning
1487 let x = (a,b) in case x of y -> ...
1488 into
1489 let x = (a,b) in case (a,b) of y -> ...
1490 and thence to
1491 let x = (a,b) in let y = (a,b) in ...
1492 is bad if the binding for x will remain.
1493
1494 Another example: I discovered that strings
1495 were getting inlined straight back into applications of 'error'
1496 because the latter is strict.
1497 s = "foo"
1498 f = \x -> ...(error s)...
1499
1500 Fundamentally such contexts should not encourage inlining because, provided
1501 the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1502 context can ``see'' the unfolding of the variable (e.g. case or a
1503 RULE) so there's no gain.
1504
1505 However, watch out:
1506
1507 * Consider this:
1508 foo = _inline_ (\n. [n])
1509 bar = _inline_ (foo 20)
1510 baz = \n. case bar of { (m:_) -> m + n }
1511 Here we really want to inline 'bar' so that we can inline 'foo'
1512 and the whole thing unravels as it should obviously do. This is
1513 important: in the NDP project, 'bar' generates a closure data
1514 structure rather than a list.
1515
1516 So the non-inlining of lone_variables should only apply if the
1517 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1518 looks through the unfolding. Hence the "&& is_wf" in the
1519 InlineRule branch.
1520
1521 * Even a type application or coercion isn't a lone variable.
1522 Consider
1523 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1524 We had better inline that sucker! The case won't see through it.
1525
1526 For now, I'm treating treating a variable applied to types
1527 in a *lazy* context "lone". The motivating example was
1528 f = /\a. \x. BIG
1529 g = /\a. \y. h (f a)
1530 There's no advantage in inlining f here, and perhaps
1531 a significant disadvantage. Hence some_val_args in the Stop case
1532
1533 Note [Interaction of exprIsWorkFree and lone variables]
1534 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1535 The lone-variable test says "don't inline if a case expression
1536 scrutinises a lone variable whose unfolding is cheap". It's very
1537 important that, under these circumstances, exprIsConApp_maybe
1538 can spot a constructor application. So, for example, we don't
1539 consider
1540 let x = e in (x,x)
1541 to be cheap, and that's good because exprIsConApp_maybe doesn't
1542 think that expression is a constructor application.
1543
1544 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1545 rather than is_wf, which was utterly wrong, because the above
1546 expression responds True to exprIsHNF, which is what sets is_value.
1547
1548 This kind of thing can occur if you have
1549
1550 {-# INLINE foo #-}
1551 foo = let x = e in (x,x)
1552
1553 which Roman did.
1554
1555
1556 -}
1557
1558 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1559 -> Int
1560 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1561 -- We multiple the raw discounts (args_discount and result_discount)
1562 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1563 -- *size* whereas the discounts imply that there's some extra
1564 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1565 -- by inlining.
1566
1567 = 10 -- Discount of 10 because the result replaces the call
1568 -- so we count 10 for the function itself
1569
1570 + 10 * length actual_arg_discounts
1571 -- Discount of 10 for each arg supplied,
1572 -- because the result replaces the call
1573
1574 + round (ufKeenessFactor dflags *
1575 fromIntegral (total_arg_discount + res_discount'))
1576 where
1577 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1578 total_arg_discount = sum actual_arg_discounts
1579
1580 mk_arg_discount _ TrivArg = 0
1581 mk_arg_discount _ NonTrivArg = 10
1582 mk_arg_discount discount ValueArg = discount
1583
1584 res_discount'
1585 | LT <- arg_discounts `compareLength` arg_infos
1586 = res_discount -- Over-saturated
1587 | otherwise
1588 = case cont_info of
1589 BoringCtxt -> 0
1590 CaseCtxt -> res_discount -- Presumably a constructor
1591 ValAppCtxt -> res_discount -- Presumably a function
1592 _ -> 40 `min` res_discount
1593 -- ToDo: this 40 `min` res_discount doesn't seem right
1594 -- for DiscArgCtxt it shouldn't matter because the function will
1595 -- get the arg discount for any non-triv arg
1596 -- for RuleArgCtxt we do want to be keener to inline; but not only
1597 -- constructor results
1598 -- for RhsCtxt I suppose that exposing a data con is good in general
1599 -- And 40 seems very arbitrary
1600 --
1601 -- res_discount can be very large when a function returns
1602 -- constructors; but we only want to invoke that large discount
1603 -- when there's a case continuation.
1604 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1605 -- But we want to aovid inlining large functions that return
1606 -- constructors into contexts that are simply "interesting"