Honor INLINE on 0-arity bindings (#15578)
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import PprCore () -- Instances
50 import OccurAnal ( occurAnalyseExpr )
51 import CoreOpt
52 import CoreArity ( manifestArity )
53 import CoreUtils
54 import Id
55 import Demand ( isBottomingSig )
56 import DataCon
57 import Literal
58 import PrimOp
59 import IdInfo
60 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
61 import Type
62 import PrelNames
63 import TysPrim ( realWorldStatePrimTy )
64 import Bag
65 import Util
66 import Outputable
67 import ForeignCall
68 import Name
69
70 import qualified Data.ByteString as BS
71 import Data.List
72
73 {-
74 ************************************************************************
75 * *
76 \subsection{Making unfoldings}
77 * *
78 ************************************************************************
79 -}
80
81 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
82 mkTopUnfolding dflags is_bottoming rhs
83 = mkUnfolding dflags InlineRhs True is_bottoming rhs
84
85 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
86 -- For implicit Ids, do a tiny bit of optimising first
87 mkImplicitUnfolding dflags expr
88 = mkTopUnfolding dflags False (simpleOptExpr dflags expr)
89
90 -- Note [Top-level flag on inline rules]
91 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 -- Slight hack: note that mk_inline_rules conservatively sets the
93 -- top-level flag to True. It gets set more accurately by the simplifier
94 -- Simplify.simplUnfolding.
95
96 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
97 mkSimpleUnfolding dflags rhs
98 = mkUnfolding dflags InlineRhs False False rhs
99
100 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
101 mkDFunUnfolding bndrs con ops
102 = DFunUnfolding { df_bndrs = bndrs
103 , df_con = con
104 , df_args = map occurAnalyseExpr ops }
105 -- See Note [Occurrence analysis of unfoldings]
106
107 mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
108 mkWwInlineRule dflags expr arity
109 = mkCoreUnfolding InlineStable True
110 (simpleOptExpr dflags expr)
111 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
112 , ug_boring_ok = boringCxtNotOk })
113
114 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
115 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
116 = mkCoreUnfolding InlineCompulsory True
117 (simpleOptExpr unsafeGlobalDynFlags expr)
118 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
119 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
120
121 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
122 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
123 mkWorkerUnfolding dflags work_fn
124 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
125 , uf_is_top = top_lvl })
126 | isStableSource src
127 = mkCoreUnfolding src top_lvl new_tmpl guidance
128 where
129 new_tmpl = simpleOptExpr dflags (work_fn tmpl)
130 guidance = calcUnfoldingGuidance dflags False new_tmpl
131
132 mkWorkerUnfolding _ _ _ = noUnfolding
133
134 -- | Make an unfolding that may be used unsaturated
135 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
136 -- manifest arity (the number of outer lambdas applications will
137 -- resolve before doing any work).
138 mkInlineUnfolding :: CoreExpr -> Unfolding
139 mkInlineUnfolding expr
140 = mkCoreUnfolding InlineStable
141 True -- Note [Top-level flag on inline rules]
142 expr' guide
143 where
144 expr' = simpleOptExpr unsafeGlobalDynFlags expr
145 guide = UnfWhen { ug_arity = manifestArity expr'
146 , ug_unsat_ok = unSaturatedOk
147 , ug_boring_ok = boring_ok }
148 boring_ok = inlineBoringOk expr'
149
150 -- | Make an unfolding that will be used once the RHS has been saturated
151 -- to the given arity.
152 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
153 mkInlineUnfoldingWithArity arity expr
154 = mkCoreUnfolding InlineStable
155 True -- Note [Top-level flag on inline rules]
156 expr' guide
157 where
158 expr' = simpleOptExpr unsafeGlobalDynFlags expr
159 guide = UnfWhen { ug_arity = arity
160 , ug_unsat_ok = needSaturated
161 , ug_boring_ok = boring_ok }
162 -- See Note [INLINE pragmas and boring contexts] as to why we need to look
163 -- at the arity here.
164 boring_ok | arity == 0 = True
165 | otherwise = inlineBoringOk expr'
166
167 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
168 mkInlinableUnfolding dflags expr
169 = mkUnfolding dflags InlineStable False False expr'
170 where
171 expr' = simpleOptExpr dflags expr
172
173 specUnfolding :: DynFlags -> [Var] -> (CoreExpr -> CoreExpr) -> Arity
174 -> Unfolding -> Unfolding
175 -- See Note [Specialising unfoldings]
176 -- specUnfolding spec_bndrs spec_app arity_decrease unf
177 -- = \spec_bndrs. spec_app( unf )
178 --
179 specUnfolding dflags spec_bndrs spec_app arity_decrease
180 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
181 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
182 mkDFunUnfolding spec_bndrs con (map spec_arg args)
183 -- There is a hard-to-check assumption here that the spec_app has
184 -- enough applications to exactly saturate the old_bndrs
185 -- For DFunUnfoldings we transform
186 -- \old_bndrs. MkD <op1> ... <opn>
187 -- to
188 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
189 -- The ASSERT checks the value part of that
190 where
191 spec_arg arg = simpleOptExpr dflags (spec_app (mkLams old_bndrs arg))
192 -- The beta-redexes created by spec_app will be
193 -- simplified away by simplOptExpr
194
195 specUnfolding dflags spec_bndrs spec_app arity_decrease
196 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
197 , uf_is_top = top_lvl
198 , uf_guidance = old_guidance })
199 | isStableSource src -- See Note [Specialising unfoldings]
200 , UnfWhen { ug_arity = old_arity
201 , ug_unsat_ok = unsat_ok
202 , ug_boring_ok = boring_ok } <- old_guidance
203 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
204 , ug_unsat_ok = unsat_ok
205 , ug_boring_ok = boring_ok }
206 new_tmpl = simpleOptExpr dflags (mkLams spec_bndrs (spec_app tmpl))
207 -- The beta-redexes created by spec_app will be
208 -- simplified away by simplOptExpr
209
210 in mkCoreUnfolding src top_lvl new_tmpl guidance
211
212 specUnfolding _ _ _ _ _ = noUnfolding
213
214 {- Note [Specialising unfoldings]
215 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
216 When we specialise a function for some given type-class arguments, we use
217 specUnfolding to specialise its unfolding. Some important points:
218
219 * If the original function has a DFunUnfolding, the specialised one
220 must do so too! Otherwise we lose the magic rules that make it
221 interact with ClassOps
222
223 * There is a bit of hack for INLINABLE functions:
224 f :: Ord a => ....
225 f = <big-rhs>
226 {- INLINABLE f #-}
227 Now if we specialise f, should the specialised version still have
228 an INLINABLE pragma? If it does, we'll capture a specialised copy
229 of <big-rhs> as its unfolding, and that probaby won't inline. But
230 if we don't, the specialised version of <big-rhs> might be small
231 enough to inline at a call site. This happens with Control.Monad.liftM3,
232 and can cause a lot more allocation as a result (nofib n-body shows this).
233
234 Moreover, keeping the INLINABLE thing isn't much help, because
235 the specialised function (probaby) isn't overloaded any more.
236
237 Conclusion: drop the INLINEALE pragma. In practice what this means is:
238 if a stable unfolding has UnfoldingGuidance of UnfWhen,
239 we keep it (so the specialised thing too will always inline)
240 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
241 (which arises from INLINABLE), we discard it
242
243 Note [Honour INLINE on 0-ary bindings]
244 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
245 Consider
246
247 x = <expensive>
248 {-# INLINE x #-}
249
250 f y = ...x...
251
252 The semantics of an INLINE pragma is
253
254 inline x at every call site, provided it is saturated;
255 that is, applied to at least as many arguments as appear
256 on the LHS of the Haskell source definition.
257
258 (This soure-code-derived arity is stored in the `ug_arity` field of
259 the `UnfoldingGuidance`.)
260
261 In the example, x's ug_arity is 0, so we should inline it at every use
262 site. It's rare to have such an INLINE pragma (usually INLINE Is on
263 functions), but it's occasionally very important (Trac #15578, #15519).
264 In #15519 we had something like
265 x = case (g a b) of I# r -> T r
266 {-# INLINE x #-}
267 f y = ...(h x)....
268
269 where h is strict. So we got
270 f y = ...(case g a b of I# r -> h (T r))...
271
272 and that in turn allowed SpecConstr to ramp up performance.
273
274 How do we deliver on this? By adjusting the ug_boring_ok
275 flag in mkInlineUnfoldingWithArity; see
276 Note [INLINE pragmas and boring contexts]
277
278 NB: there is a real risk that full laziness will float it right back
279 out again. Consider again
280 x = factorial 200
281 {-# INLINE x #-}
282 f y = ...x...
283
284 After inlining we get
285 f y = ...(factorial 200)...
286
287 but it's entirely possible that full laziness will do
288 lvl23 = factorial 200
289 f y = ...lvl23...
290
291 That's a problem for another day.
292
293 Note [INLINE pragmas and boring contexts]
294 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
295 An INLINE pragma uses mkInlineUnfoldingWithArity to build the
296 unfolding. That sets the ug_boring_ok flag to False if the function
297 is not tiny (inlineBorkingOK), so that even INLINE functions are not
298 inlined in an utterly boring context. E.g.
299 \x y. Just (f y x)
300 Nothing is gained by inlining f here, even if it has an INLINE
301 pragma.
302
303 But for 0-ary bindings, we want to inline regardless; see
304 Note [Honour INLINE on 0-ary bindings].
305
306 I'm a bit worried that it's possible for the same kind of problem
307 to arise for non-0-ary functions too, but let's wait and see.
308 -}
309
310 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
311 -> UnfoldingGuidance -> Unfolding
312 -- Occurrence-analyses the expression before capturing it
313 mkCoreUnfolding src top_lvl expr guidance
314 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
315 -- See Note [Occurrence analysis of unfoldings]
316 uf_src = src,
317 uf_is_top = top_lvl,
318 uf_is_value = exprIsHNF expr,
319 uf_is_conlike = exprIsConLike expr,
320 uf_is_work_free = exprIsWorkFree expr,
321 uf_expandable = exprIsExpandable expr,
322 uf_guidance = guidance }
323
324 mkUnfolding :: DynFlags -> UnfoldingSource
325 -> Bool -- Is top-level
326 -> Bool -- Definitely a bottoming binding
327 -- (only relevant for top-level bindings)
328 -> CoreExpr
329 -> Unfolding
330 -- Calculates unfolding guidance
331 -- Occurrence-analyses the expression before capturing it
332 mkUnfolding dflags src is_top_lvl is_bottoming expr
333 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
334 -- See Note [Occurrence analysis of unfoldings]
335 uf_src = src,
336 uf_is_top = is_top_lvl,
337 uf_is_value = exprIsHNF expr,
338 uf_is_conlike = exprIsConLike expr,
339 uf_expandable = exprIsExpandable expr,
340 uf_is_work_free = exprIsWorkFree expr,
341 uf_guidance = guidance }
342 where
343 is_top_bottoming = is_top_lvl && is_bottoming
344 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
345 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
346 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
347
348 {-
349 Note [Occurrence analysis of unfoldings]
350 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
351 We do occurrence-analysis of unfoldings once and for all, when the
352 unfolding is built, rather than each time we inline them.
353
354 But given this decision it's vital that we do
355 *always* do it. Consider this unfolding
356 \x -> letrec { f = ...g...; g* = f } in body
357 where g* is (for some strange reason) the loop breaker. If we don't
358 occ-anal it when reading it in, we won't mark g as a loop breaker, and
359 we may inline g entirely in body, dropping its binding, and leaving
360 the occurrence in f out of scope. This happened in Trac #8892, where
361 the unfolding in question was a DFun unfolding.
362
363 But more generally, the simplifier is designed on the
364 basis that it is looking at occurrence-analysed expressions, so better
365 ensure that they acutally are.
366
367 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
368 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
369 Notice that we give the non-occur-analysed expression to
370 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
371 first; for example, sometimes during simplification, there's a large
372 let-bound thing which has been substituted, and so is now dead; so
373 'expr' contains two copies of the thing while the occurrence-analysed
374 expression doesn't.
375
376 Nevertheless, we *don't* and *must not* occ-analyse before computing
377 the size because
378
379 a) The size computation bales out after a while, whereas occurrence
380 analysis does not.
381
382 b) Residency increases sharply if you occ-anal first. I'm not
383 100% sure why, but it's a large effect. Compiling Cabal went
384 from residency of 534M to over 800M with this one change.
385
386 This can occasionally mean that the guidance is very pessimistic;
387 it gets fixed up next round. And it should be rare, because large
388 let-bound things that are dead are usually caught by preInlineUnconditionally
389
390
391 ************************************************************************
392 * *
393 \subsection{The UnfoldingGuidance type}
394 * *
395 ************************************************************************
396 -}
397
398 inlineBoringOk :: CoreExpr -> Bool
399 -- See Note [INLINE for small functions]
400 -- True => the result of inlining the expression is
401 -- no bigger than the expression itself
402 -- eg (\x y -> f y x)
403 -- This is a quick and dirty version. It doesn't attempt
404 -- to deal with (\x y z -> x (y z))
405 -- The really important one is (x `cast` c)
406 inlineBoringOk e
407 = go 0 e
408 where
409 go :: Int -> CoreExpr -> Bool
410 go credit (Lam x e) | isId x = go (credit+1) e
411 | otherwise = go credit e
412 go credit (App f (Type {})) = go credit f
413 go credit (App f a) | credit > 0
414 , exprIsTrivial a = go (credit-1) f
415 go credit (Tick _ e) = go credit e -- dubious
416 go credit (Cast e _) = go credit e
417 go _ (Var {}) = boringCxtOk
418 go _ _ = boringCxtNotOk
419
420 calcUnfoldingGuidance
421 :: DynFlags
422 -> Bool -- Definitely a top-level, bottoming binding
423 -> CoreExpr -- Expression to look at
424 -> UnfoldingGuidance
425 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
426 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
427 = calcUnfoldingGuidance dflags is_top_bottoming expr
428 calcUnfoldingGuidance dflags is_top_bottoming expr
429 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
430 TooBig -> UnfNever
431 SizeIs size cased_bndrs scrut_discount
432 | uncondInline expr n_val_bndrs size
433 -> UnfWhen { ug_unsat_ok = unSaturatedOk
434 , ug_boring_ok = boringCxtOk
435 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
436
437 | is_top_bottoming
438 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
439
440 | otherwise
441 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
442 , ug_size = size
443 , ug_res = scrut_discount }
444
445 where
446 (bndrs, body) = collectBinders expr
447 bOMB_OUT_SIZE = ufCreationThreshold dflags
448 -- Bomb out if size gets bigger than this
449 val_bndrs = filter isId bndrs
450 n_val_bndrs = length val_bndrs
451
452 mk_discount :: Bag (Id,Int) -> Id -> Int
453 mk_discount cbs bndr = foldlBag combine 0 cbs
454 where
455 combine acc (bndr', disc)
456 | bndr == bndr' = acc `plus_disc` disc
457 | otherwise = acc
458
459 plus_disc :: Int -> Int -> Int
460 plus_disc | isFunTy (idType bndr) = max
461 | otherwise = (+)
462 -- See Note [Function and non-function discounts]
463
464 {-
465 Note [Computing the size of an expression]
466 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
467 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
468 heuristics right has taken a long time. Here's the basic strategy:
469
470 * Variables, literals: 0
471 (Exception for string literals, see litSize.)
472
473 * Function applications (f e1 .. en): 1 + #value args
474
475 * Constructor applications: 1, regardless of #args
476
477 * Let(rec): 1 + size of components
478
479 * Note, cast: 0
480
481 Examples
482
483 Size Term
484 --------------
485 0 42#
486 0 x
487 0 True
488 2 f x
489 1 Just x
490 4 f (g x)
491
492 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
493 a function call to account for. Notice also that constructor applications
494 are very cheap, because exposing them to a caller is so valuable.
495
496 [25/5/11] All sizes are now multiplied by 10, except for primops
497 (which have sizes like 1 or 4. This makes primops look fantastically
498 cheap, and seems to be almost unversally beneficial. Done partly as a
499 result of #4978.
500
501 Note [Do not inline top-level bottoming functions]
502 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
503 The FloatOut pass has gone to some trouble to float out calls to 'error'
504 and similar friends. See Note [Bottoming floats] in SetLevels.
505 Do not re-inline them! But we *do* still inline if they are very small
506 (the uncondInline stuff).
507
508 Note [INLINE for small functions]
509 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
510 Consider {-# INLINE f #-}
511 f x = Just x
512 g y = f y
513 Then f's RHS is no larger than its LHS, so we should inline it into
514 even the most boring context. In general, f the function is
515 sufficiently small that its body is as small as the call itself, the
516 inline unconditionally, regardless of how boring the context is.
517
518 Things to note:
519
520 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
521 than the thing it's replacing. Notice that
522 (f x) --> (g 3) -- YES, unconditionally
523 (f x) --> x : [] -- YES, *even though* there are two
524 -- arguments to the cons
525 x --> g 3 -- NO
526 x --> Just v -- NO
527
528 It's very important not to unconditionally replace a variable by
529 a non-atomic term.
530
531 (2) We do this even if the thing isn't saturated, else we end up with the
532 silly situation that
533 f x y = x
534 ...map (f 3)...
535 doesn't inline. Even in a boring context, inlining without being
536 saturated will give a lambda instead of a PAP, and will be more
537 efficient at runtime.
538
539 (3) However, when the function's arity > 0, we do insist that it
540 has at least one value argument at the call site. (This check is
541 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
542 f = /\a \x:a. x
543 d = /\b. MkD (f b)
544 If we inline f here we get
545 d = /\b. MkD (\x:b. x)
546 and then prepareRhs floats out the argument, abstracting the type
547 variables, so we end up with the original again!
548
549 (4) We must be much more cautious about arity-zero things. Consider
550 let x = y +# z in ...
551 In *size* terms primops look very small, because the generate a
552 single instruction, but we do not want to unconditionally replace
553 every occurrence of x with (y +# z). So we only do the
554 unconditional-inline thing for *trivial* expressions.
555
556 NB: you might think that PostInlineUnconditionally would do this
557 but it doesn't fire for top-level things; see SimplUtils
558 Note [Top level and postInlineUnconditionally]
559 -}
560
561 uncondInline :: CoreExpr -> Arity -> Int -> Bool
562 -- Inline unconditionally if there no size increase
563 -- Size of call is arity (+1 for the function)
564 -- See Note [INLINE for small functions]
565 uncondInline rhs arity size
566 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
567 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
568
569 sizeExpr :: DynFlags
570 -> Int -- Bomb out if it gets bigger than this
571 -> [Id] -- Arguments; we're interested in which of these
572 -- get case'd
573 -> CoreExpr
574 -> ExprSize
575
576 -- Note [Computing the size of an expression]
577
578 sizeExpr dflags bOMB_OUT_SIZE top_args expr
579 = size_up expr
580 where
581 size_up (Cast e _) = size_up e
582 size_up (Tick _ e) = size_up e
583 size_up (Type _) = sizeZero -- Types cost nothing
584 size_up (Coercion _) = sizeZero
585 size_up (Lit lit) = sizeN (litSize lit)
586 size_up (Var f) | isRealWorldId f = sizeZero
587 -- Make sure we get constructor discounts even
588 -- on nullary constructors
589 | otherwise = size_up_call f [] 0
590
591 size_up (App fun arg)
592 | isTyCoArg arg = size_up fun
593 | otherwise = size_up arg `addSizeNSD`
594 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
595
596 size_up (Lam b e)
597 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
598 | otherwise = size_up e
599
600 size_up (Let (NonRec binder rhs) body)
601 = size_up_rhs (binder, rhs) `addSizeNSD`
602 size_up body `addSizeN`
603 size_up_alloc binder
604
605 size_up (Let (Rec pairs) body)
606 = foldr (addSizeNSD . size_up_rhs)
607 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
608 pairs
609
610 size_up (Case e _ _ alts)
611 | null alts
612 = size_up e -- case e of {} never returns, so take size of scrutinee
613
614 size_up (Case e _ _ alts)
615 -- Now alts is non-empty
616 | Just v <- is_top_arg e -- We are scrutinising an argument variable
617 = let
618 alt_sizes = map size_up_alt alts
619
620 -- alts_size tries to compute a good discount for
621 -- the case when we are scrutinising an argument variable
622 alts_size (SizeIs tot tot_disc tot_scrut)
623 -- Size of all alternatives
624 (SizeIs max _ _)
625 -- Size of biggest alternative
626 = SizeIs tot (unitBag (v, 20 + tot - max)
627 `unionBags` tot_disc) tot_scrut
628 -- If the variable is known, we produce a
629 -- discount that will take us back to 'max',
630 -- the size of the largest alternative The
631 -- 1+ is a little discount for reduced
632 -- allocation in the caller
633 --
634 -- Notice though, that we return tot_disc,
635 -- the total discount from all branches. I
636 -- think that's right.
637
638 alts_size tot_size _ = tot_size
639 in
640 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
641 (foldr1 maxSize alt_sizes)
642 -- Good to inline if an arg is scrutinised, because
643 -- that may eliminate allocation in the caller
644 -- And it eliminates the case itself
645 where
646 is_top_arg (Var v) | v `elem` top_args = Just v
647 is_top_arg (Cast e _) = is_top_arg e
648 is_top_arg _ = Nothing
649
650
651 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
652 foldr (addAltSize . size_up_alt) case_size alts
653 where
654 case_size
655 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
656 | otherwise = sizeZero
657 -- Normally we don't charge for the case itself, but
658 -- we charge one per alternative (see size_up_alt,
659 -- below) to account for the cost of the info table
660 -- and comparisons.
661 --
662 -- However, in certain cases (see is_inline_scrut
663 -- below), no code is generated for the case unless
664 -- there are multiple alts. In these cases we
665 -- subtract one, making the first alt free.
666 -- e.g. case x# +# y# of _ -> ... should cost 1
667 -- case touch# x# of _ -> ... should cost 0
668 -- (see #4978)
669 --
670 -- I would like to not have the "lengthAtMost alts 1"
671 -- condition above, but without that some programs got worse
672 -- (spectral/hartel/event and spectral/para). I don't fully
673 -- understand why. (SDM 24/5/11)
674
675 -- unboxed variables, inline primops and unsafe foreign calls
676 -- are all "inline" things:
677 is_inline_scrut (Var v) = isUnliftedType (idType v)
678 is_inline_scrut scrut
679 | (Var f, _) <- collectArgs scrut
680 = case idDetails f of
681 FCallId fc -> not (isSafeForeignCall fc)
682 PrimOpId op -> not (primOpOutOfLine op)
683 _other -> False
684 | otherwise
685 = False
686
687 size_up_rhs (bndr, rhs)
688 | Just join_arity <- isJoinId_maybe bndr
689 -- Skip arguments to join point
690 , (_bndrs, body) <- collectNBinders join_arity rhs
691 = size_up body
692 | otherwise
693 = size_up rhs
694
695 ------------
696 -- size_up_app is used when there's ONE OR MORE value args
697 size_up_app (App fun arg) args voids
698 | isTyCoArg arg = size_up_app fun args voids
699 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
700 | otherwise = size_up arg `addSizeNSD`
701 size_up_app fun (arg:args) voids
702 size_up_app (Var fun) args voids = size_up_call fun args voids
703 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
704 size_up_app (Cast expr _) args voids = size_up_app expr args voids
705 size_up_app other args voids = size_up other `addSizeN`
706 callSize (length args) voids
707 -- if the lhs is not an App or a Var, or an invisible thing like a
708 -- Tick or Cast, then we should charge for a complete call plus the
709 -- size of the lhs itself.
710
711 ------------
712 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
713 size_up_call fun val_args voids
714 = case idDetails fun of
715 FCallId _ -> sizeN (callSize (length val_args) voids)
716 DataConWorkId dc -> conSize dc (length val_args)
717 PrimOpId op -> primOpSize op (length val_args)
718 ClassOpId _ -> classOpSize dflags top_args val_args
719 _ -> funSize dflags top_args fun (length val_args) voids
720
721 ------------
722 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
723 -- Don't charge for args, so that wrappers look cheap
724 -- (See comments about wrappers with Case)
725 --
726 -- IMPORTANT: *do* charge 1 for the alternative, else we
727 -- find that giant case nests are treated as practically free
728 -- A good example is Foreign.C.Error.errnoToIOError
729
730 ------------
731 -- Cost to allocate binding with given binder
732 size_up_alloc bndr
733 | isTyVar bndr -- Doesn't exist at runtime
734 || isJoinId bndr -- Not allocated at all
735 || isUnliftedType (idType bndr) -- Doesn't live in heap
736 = 0
737 | otherwise
738 = 10
739
740 ------------
741 -- These addSize things have to be here because
742 -- I don't want to give them bOMB_OUT_SIZE as an argument
743 addSizeN TooBig _ = TooBig
744 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
745
746 -- addAltSize is used to add the sizes of case alternatives
747 addAltSize TooBig _ = TooBig
748 addAltSize _ TooBig = TooBig
749 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
750 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
751 (xs `unionBags` ys)
752 (d1 + d2) -- Note [addAltSize result discounts]
753
754 -- This variant ignores the result discount from its LEFT argument
755 -- It's used when the second argument isn't part of the result
756 addSizeNSD TooBig _ = TooBig
757 addSizeNSD _ TooBig = TooBig
758 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
759 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
760 (xs `unionBags` ys)
761 d2 -- Ignore d1
762
763 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
764
765 -- an expression of type State# RealWorld must be a variable
766 isRealWorldExpr (Var id) = isRealWorldId id
767 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
768 isRealWorldExpr _ = False
769
770 -- | Finds a nominal size of a string literal.
771 litSize :: Literal -> Int
772 -- Used by CoreUnfold.sizeExpr
773 litSize (LitNumber LitNumInteger _ _) = 100 -- Note [Size of literal integers]
774 litSize (LitNumber LitNumNatural _ _) = 100
775 litSize (MachStr str) = 10 + 10 * ((BS.length str + 3) `div` 4)
776 -- If size could be 0 then @f "x"@ might be too small
777 -- [Sept03: make literal strings a bit bigger to avoid fruitless
778 -- duplication of little strings]
779 litSize _other = 0 -- Must match size of nullary constructors
780 -- Key point: if x |-> 4, then x must inline unconditionally
781 -- (eg via case binding)
782
783 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
784 -- See Note [Conlike is interesting]
785 classOpSize _ _ []
786 = sizeZero
787 classOpSize dflags top_args (arg1 : other_args)
788 = SizeIs size arg_discount 0
789 where
790 size = 20 + (10 * length other_args)
791 -- If the class op is scrutinising a lambda bound dictionary then
792 -- give it a discount, to encourage the inlining of this function
793 -- The actual discount is rather arbitrarily chosen
794 arg_discount = case arg1 of
795 Var dict | dict `elem` top_args
796 -> unitBag (dict, ufDictDiscount dflags)
797 _other -> emptyBag
798
799 -- | The size of a function call
800 callSize
801 :: Int -- ^ number of value args
802 -> Int -- ^ number of value args that are void
803 -> Int
804 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
805 -- The 1+ is for the function itself
806 -- Add 1 for each non-trivial arg;
807 -- the allocation cost, as in let(rec)
808
809 -- | The size of a jump to a join point
810 jumpSize
811 :: Int -- ^ number of value args
812 -> Int -- ^ number of value args that are void
813 -> Int
814 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
815 -- A jump is 20% the size of a function call. Making jumps free reopens
816 -- bug #6048, but making them any more expensive loses a 21% improvement in
817 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
818 -- better solution?
819
820 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
821 -- Size for functions that are not constructors or primops
822 -- Note [Function applications]
823 funSize dflags top_args fun n_val_args voids
824 | fun `hasKey` buildIdKey = buildSize
825 | fun `hasKey` augmentIdKey = augmentSize
826 | otherwise = SizeIs size arg_discount res_discount
827 where
828 some_val_args = n_val_args > 0
829 is_join = isJoinId fun
830
831 size | is_join = jumpSize n_val_args voids
832 | not some_val_args = 0
833 | otherwise = callSize n_val_args voids
834
835 -- DISCOUNTS
836 -- See Note [Function and non-function discounts]
837 arg_discount | some_val_args && fun `elem` top_args
838 = unitBag (fun, ufFunAppDiscount dflags)
839 | otherwise = emptyBag
840 -- If the function is an argument and is applied
841 -- to some values, give it an arg-discount
842
843 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
844 | otherwise = 0
845 -- If the function is partially applied, show a result discount
846 -- XXX maybe behave like ConSize for eval'd variable
847
848 conSize :: DataCon -> Int -> ExprSize
849 conSize dc n_val_args
850 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
851
852 -- See Note [Unboxed tuple size and result discount]
853 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
854
855 -- See Note [Constructor size and result discount]
856 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
857
858 -- XXX still looks to large to me
859
860 {-
861 Note [Constructor size and result discount]
862 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
863 Treat a constructors application as size 10, regardless of how many
864 arguments it has; we are keen to expose them (and we charge separately
865 for their args). We can't treat them as size zero, else we find that
866 (Just x) has size 0, which is the same as a lone variable; and hence
867 'v' will always be replaced by (Just x), where v is bound to Just x.
868
869 The "result discount" is applied if the result of the call is
870 scrutinised (say by a case). For a constructor application that will
871 mean the constructor application will disappear, so we don't need to
872 charge it to the function. So the discount should at least match the
873 cost of the constructor application, namely 10. But to give a bit
874 of extra incentive we give a discount of 10*(1 + n_val_args).
875
876 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
877 and said it was an "unambiguous win", but its terribly dangerous
878 because a function with many many case branches, each finishing with
879 a constructor, can have an arbitrarily large discount. This led to
880 terrible code bloat: see Trac #6099.
881
882 Note [Unboxed tuple size and result discount]
883 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
884 However, unboxed tuples count as size zero. I found occasions where we had
885 f x y z = case op# x y z of { s -> (# s, () #) }
886 and f wasn't getting inlined.
887
888 I tried giving unboxed tuples a *result discount* of zero (see the
889 commented-out line). Why? When returned as a result they do not
890 allocate, so maybe we don't want to charge so much for them If you
891 have a non-zero discount here, we find that workers often get inlined
892 back into wrappers, because it look like
893 f x = case $wf x of (# a,b #) -> (a,b)
894 and we are keener because of the case. However while this change
895 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
896 more. All other changes were very small. So it's not a big deal but I
897 didn't adopt the idea.
898
899 Note [Function and non-function discounts]
900 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
901 We want a discount if the function is applied. A good example is
902 monadic combinators with continuation arguments, where inlining is
903 quite important.
904
905 But we don't want a big discount when a function is called many times
906 (see the detailed comments with Trac #6048) because if the function is
907 big it won't be inlined at its many call sites and no benefit results.
908 Indeed, we can get exponentially big inlinings this way; that is what
909 Trac #6048 is about.
910
911 On the other hand, for data-valued arguments, if there are lots of
912 case expressions in the body, each one will get smaller if we apply
913 the function to a constructor application, so we *want* a big discount
914 if the argument is scrutinised by many case expressions.
915
916 Conclusion:
917 - For functions, take the max of the discounts
918 - For data values, take the sum of the discounts
919
920
921 Note [Literal integer size]
922 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
923 Literal integers *can* be big (mkInteger [...coefficients...]), but
924 need not be (S# n). We just use an arbitrary big-ish constant here
925 so that, in particular, we don't inline top-level defns like
926 n = S# 5
927 There's no point in doing so -- any optimisations will see the S#
928 through n's unfolding. Nor will a big size inhibit unfoldings functions
929 that mention a literal Integer, because the float-out pass will float
930 all those constants to top level.
931 -}
932
933 primOpSize :: PrimOp -> Int -> ExprSize
934 primOpSize op n_val_args
935 = if primOpOutOfLine op
936 then sizeN (op_size + n_val_args)
937 else sizeN op_size
938 where
939 op_size = primOpCodeSize op
940
941
942 buildSize :: ExprSize
943 buildSize = SizeIs 0 emptyBag 40
944 -- We really want to inline applications of build
945 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
946 -- Indeed, we should add a result_discount because build is
947 -- very like a constructor. We don't bother to check that the
948 -- build is saturated (it usually is). The "-2" discounts for the \c n,
949 -- The "4" is rather arbitrary.
950
951 augmentSize :: ExprSize
952 augmentSize = SizeIs 0 emptyBag 40
953 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
954 -- e plus ys. The -2 accounts for the \cn
955
956 -- When we return a lambda, give a discount if it's used (applied)
957 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
958 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
959 lamScrutDiscount _ TooBig = TooBig
960
961 {-
962 Note [addAltSize result discounts]
963 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
964 When adding the size of alternatives, we *add* the result discounts
965 too, rather than take the *maximum*. For a multi-branch case, this
966 gives a discount for each branch that returns a constructor, making us
967 keener to inline. I did try using 'max' instead, but it makes nofib
968 'rewrite' and 'puzzle' allocate significantly more, and didn't make
969 binary sizes shrink significantly either.
970
971 Note [Discounts and thresholds]
972 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
973 Constants for discounts and thesholds are defined in main/DynFlags,
974 all of form ufXxxx. They are:
975
976 ufCreationThreshold
977 At a definition site, if the unfolding is bigger than this, we
978 may discard it altogether
979
980 ufUseThreshold
981 At a call site, if the unfolding, less discounts, is smaller than
982 this, then it's small enough inline
983
984 ufKeenessFactor
985 Factor by which the discounts are multiplied before
986 subtracting from size
987
988 ufDictDiscount
989 The discount for each occurrence of a dictionary argument
990 as an argument of a class method. Should be pretty small
991 else big functions may get inlined
992
993 ufFunAppDiscount
994 Discount for a function argument that is applied. Quite
995 large, because if we inline we avoid the higher-order call.
996
997 ufDearOp
998 The size of a foreign call or not-dupable PrimOp
999
1000 ufVeryAggressive
1001 If True, the compiler ignores all the thresholds and inlines very
1002 aggressively. It still adheres to arity, simplifier phase control and
1003 loop breakers.
1004
1005
1006 Note [Function applications]
1007 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1008 In a function application (f a b)
1009
1010 - If 'f' is an argument to the function being analysed,
1011 and there's at least one value arg, record a FunAppDiscount for f
1012
1013 - If the application if a PAP (arity > 2 in this example)
1014 record a *result* discount (because inlining
1015 with "extra" args in the call may mean that we now
1016 get a saturated application)
1017
1018 Code for manipulating sizes
1019 -}
1020
1021 -- | The size of a candidate expression for unfolding
1022 data ExprSize
1023 = TooBig
1024 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
1025 , _es_args :: !(Bag (Id,Int))
1026 -- ^ Arguments cased herein, and discount for each such
1027 , _es_discount :: {-# UNPACK #-} !Int
1028 -- ^ Size to subtract if result is scrutinised by a case
1029 -- expression
1030 }
1031
1032 instance Outputable ExprSize where
1033 ppr TooBig = text "TooBig"
1034 ppr (SizeIs a _ c) = brackets (int a <+> int c)
1035
1036 -- subtract the discount before deciding whether to bale out. eg. we
1037 -- want to inline a large constructor application into a selector:
1038 -- tup = (a_1, ..., a_99)
1039 -- x = case tup of ...
1040 --
1041 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
1042 mkSizeIs max n xs d | (n - d) > max = TooBig
1043 | otherwise = SizeIs n xs d
1044
1045 maxSize :: ExprSize -> ExprSize -> ExprSize
1046 maxSize TooBig _ = TooBig
1047 maxSize _ TooBig = TooBig
1048 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
1049 | otherwise = s2
1050
1051 sizeZero :: ExprSize
1052 sizeN :: Int -> ExprSize
1053
1054 sizeZero = SizeIs 0 emptyBag 0
1055 sizeN n = SizeIs n emptyBag 0
1056
1057 {-
1058 ************************************************************************
1059 * *
1060 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
1061 * *
1062 ************************************************************************
1063
1064 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
1065 we ``couldn't possibly use'' on the other side. Can be overridden w/
1066 flaggery. Just the same as smallEnoughToInline, except that it has no
1067 actual arguments.
1068 -}
1069
1070 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1071 couldBeSmallEnoughToInline dflags threshold rhs
1072 = case sizeExpr dflags threshold [] body of
1073 TooBig -> False
1074 _ -> True
1075 where
1076 (_, body) = collectBinders rhs
1077
1078 ----------------
1079 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1080 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1081 = size <= ufUseThreshold dflags
1082 smallEnoughToInline _ _
1083 = False
1084
1085 ----------------
1086
1087 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1088 -- Sees if the unfolding is pretty certain to inline
1089 -- If so, return a *stable* unfolding for it, that will always inline
1090 certainlyWillInline dflags fn_info
1091 = case unfoldingInfo fn_info of
1092 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1093 | loop_breaker -> Nothing -- Won't inline, so try w/w
1094 | otherwise -> do_cunf e g -- Depends on size, so look at that
1095
1096 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1097 -- to do so, and even if it is currently a
1098 -- loop breaker, it may not be later
1099
1100 _other_unf -> Nothing
1101
1102 where
1103 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1104 fn_unf = unfoldingInfo fn_info
1105
1106 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1107 do_cunf _ UnfNever = Nothing
1108 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1109 -- INLINE functions have UnfWhen
1110
1111 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1112 -- binary sizes go up by 10%! (This is with SplitObjs.)
1113 -- I'm not totally sure why.
1114 -- INLINABLE functions come via this path
1115 -- See Note [certainlyWillInline: INLINABLE]
1116 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1117 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1118 , case inlinePragmaSpec (inlinePragInfo fn_info) of
1119 NoInline -> False -- NOINLINE; do not say certainlyWillInline!
1120 _ -> True -- INLINE, INLINABLE, or nothing
1121 , not (isBottomingSig (strictnessInfo fn_info))
1122 -- Do not unconditionally inline a bottoming functions even if
1123 -- it seems smallish. We've carefully lifted it out to top level,
1124 -- so we don't want to re-inline it.
1125 , let arity = length args
1126 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1127 = Just (fn_unf { uf_src = InlineStable
1128 , uf_guidance = UnfWhen { ug_arity = arity
1129 , ug_unsat_ok = unSaturatedOk
1130 , ug_boring_ok = inlineBoringOk expr } })
1131 -- Note the "unsaturatedOk". A function like f = \ab. a
1132 -- will certainly inline, even if partially applied (f e), so we'd
1133 -- better make sure that the transformed inlining has the same property
1134 | otherwise
1135 = Nothing
1136
1137 {- Note [certainlyWillInline: be careful of thunks]
1138 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1139 Don't claim that thunks will certainly inline, because that risks work
1140 duplication. Even if the work duplication is not great (eg is_cheap
1141 holds), it can make a big difference in an inner loop In Trac #5623 we
1142 found that the WorkWrap phase thought that
1143 y = case x of F# v -> F# (v +# v)
1144 was certainlyWillInline, so the addition got duplicated.
1145
1146 Note [certainlyWillInline: INLINABLE]
1147 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1148 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1149 even though we have a stable inlining, so that strictness w/w takes
1150 place. It makes a big difference to efficiency, and the w/w pass knows
1151 how to transfer the INLINABLE info to the worker; see WorkWrap
1152 Note [Worker-wrapper for INLINABLE functions]
1153
1154 ************************************************************************
1155 * *
1156 \subsection{callSiteInline}
1157 * *
1158 ************************************************************************
1159
1160 This is the key function. It decides whether to inline a variable at a call site
1161
1162 callSiteInline is used at call sites, so it is a bit more generous.
1163 It's a very important function that embodies lots of heuristics.
1164 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1165 and occurs exactly once or
1166 occurs once in each branch of a case and is small
1167
1168 If the thing is in WHNF, there's no danger of duplicating work,
1169 so we can inline if it occurs once, or is small
1170
1171 NOTE: we don't want to inline top-level functions that always diverge.
1172 It just makes the code bigger. Tt turns out that the convenient way to prevent
1173 them inlining is to give them a NOINLINE pragma, which we do in
1174 StrictAnal.addStrictnessInfoToTopId
1175 -}
1176
1177 callSiteInline :: DynFlags
1178 -> Id -- The Id
1179 -> Bool -- True <=> unfolding is active
1180 -> Bool -- True if there are no arguments at all (incl type args)
1181 -> [ArgSummary] -- One for each value arg; True if it is interesting
1182 -> CallCtxt -- True <=> continuation is interesting
1183 -> Maybe CoreExpr -- Unfolding, if any
1184
1185 data ArgSummary = TrivArg -- Nothing interesting
1186 | NonTrivArg -- Arg has structure
1187 | ValueArg -- Arg is a con-app or PAP
1188 -- ..or con-like. Note [Conlike is interesting]
1189
1190 instance Outputable ArgSummary where
1191 ppr TrivArg = text "TrivArg"
1192 ppr NonTrivArg = text "NonTrivArg"
1193 ppr ValueArg = text "ValueArg"
1194
1195 nonTriv :: ArgSummary -> Bool
1196 nonTriv TrivArg = False
1197 nonTriv _ = True
1198
1199 data CallCtxt
1200 = BoringCtxt
1201 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1202 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1203 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1204
1205 | ValAppCtxt -- We're applied to at least one value arg
1206 -- This arises when we have ((f x |> co) y)
1207 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1208
1209 | CaseCtxt -- We're the scrutinee of a case
1210 -- that decomposes its scrutinee
1211
1212 instance Outputable CallCtxt where
1213 ppr CaseCtxt = text "CaseCtxt"
1214 ppr ValAppCtxt = text "ValAppCtxt"
1215 ppr BoringCtxt = text "BoringCtxt"
1216 ppr RhsCtxt = text "RhsCtxt"
1217 ppr DiscArgCtxt = text "DiscArgCtxt"
1218 ppr RuleArgCtxt = text "RuleArgCtxt"
1219
1220 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1221 = case idUnfolding id of
1222 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1223 -- Things with an INLINE pragma may have an unfolding *and*
1224 -- be a loop breaker (maybe the knot is not yet untied)
1225 CoreUnfolding { uf_tmpl = unf_template
1226 , uf_is_work_free = is_wf
1227 , uf_guidance = guidance, uf_expandable = is_exp }
1228 | active_unfolding -> tryUnfolding dflags id lone_variable
1229 arg_infos cont_info unf_template
1230 is_wf is_exp guidance
1231 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1232 NoUnfolding -> Nothing
1233 BootUnfolding -> Nothing
1234 OtherCon {} -> Nothing
1235 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1236
1237 traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1238 traceInline dflags inline_id str doc result
1239 | Just prefix <- inlineCheck dflags
1240 = if prefix `isPrefixOf` occNameString (getOccName inline_id)
1241 then pprTrace str doc result
1242 else result
1243 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1244 = pprTrace str doc result
1245 | otherwise
1246 = result
1247
1248 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1249 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1250 -> Maybe CoreExpr
1251 tryUnfolding dflags id lone_variable
1252 arg_infos cont_info unf_template
1253 is_wf is_exp guidance
1254 = case guidance of
1255 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1256
1257 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1258 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1259 -- See Note [INLINE for small functions (3)]
1260 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1261 | otherwise
1262 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1263 where
1264 some_benefit = calc_some_benefit uf_arity
1265 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1266
1267 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1268 | ufVeryAggressive dflags
1269 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1270 | is_wf && some_benefit && small_enough
1271 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1272 | otherwise
1273 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1274 where
1275 some_benefit = calc_some_benefit (length arg_discounts)
1276 extra_doc = text "discounted size =" <+> int discounted_size
1277 discounted_size = size - discount
1278 small_enough = discounted_size <= ufUseThreshold dflags
1279 discount = computeDiscount dflags arg_discounts
1280 res_discount arg_infos cont_info
1281
1282 where
1283 mk_doc some_benefit extra_doc yes_or_no
1284 = vcat [ text "arg infos" <+> ppr arg_infos
1285 , text "interesting continuation" <+> ppr cont_info
1286 , text "some_benefit" <+> ppr some_benefit
1287 , text "is exp:" <+> ppr is_exp
1288 , text "is work-free:" <+> ppr is_wf
1289 , text "guidance" <+> ppr guidance
1290 , extra_doc
1291 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1292
1293 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1294 n_val_args = length arg_infos
1295
1296 -- some_benefit is used when the RHS is small enough
1297 -- and the call has enough (or too many) value
1298 -- arguments (ie n_val_args >= arity). But there must
1299 -- be *something* interesting about some argument, or the
1300 -- result context, to make it worth inlining
1301 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1302 -- expected by the unfolding
1303 calc_some_benefit uf_arity
1304 | not saturated = interesting_args -- Under-saturated
1305 -- Note [Unsaturated applications]
1306 | otherwise = interesting_args -- Saturated or over-saturated
1307 || interesting_call
1308 where
1309 saturated = n_val_args >= uf_arity
1310 over_saturated = n_val_args > uf_arity
1311 interesting_args = any nonTriv arg_infos
1312 -- NB: (any nonTriv arg_infos) looks at the
1313 -- over-saturated args too which is "wrong";
1314 -- but if over-saturated we inline anyway.
1315
1316 interesting_call
1317 | over_saturated
1318 = True
1319 | otherwise
1320 = case cont_info of
1321 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1322 ValAppCtxt -> True -- Note [Cast then apply]
1323 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1324 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt]
1325 RhsCtxt -> uf_arity > 0 --
1326 _other -> False -- See Note [Nested functions]
1327
1328
1329 {-
1330 Note [Unfold into lazy contexts], Note [RHS of lets]
1331 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1332 When the call is the argument of a function with a RULE, or the RHS of a let,
1333 we are a little bit keener to inline. For example
1334 f y = (y,y,y)
1335 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1336 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1337 only we can't see it. Also
1338 x = f v
1339 could be expensive whereas
1340 x = case v of (a,b) -> a
1341 is patently cheap and may allow more eta expansion.
1342 So we treat the RHS of a let as not-totally-boring.
1343
1344 Note [Unsaturated applications]
1345 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1346 When a call is not saturated, we *still* inline if one of the
1347 arguments has interesting structure. That's sometimes very important.
1348 A good example is the Ord instance for Bool in Base:
1349
1350 Rec {
1351 $fOrdBool =GHC.Classes.D:Ord
1352 @ Bool
1353 ...
1354 $cmin_ajX
1355
1356 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1357 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1358 }
1359
1360 But the defn of GHC.Classes.$dmmin is:
1361
1362 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1363 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1364 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1365 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1366 GHC.Types.False -> y GHC.Types.True -> x }) -}
1367
1368 We *really* want to inline $dmmin, even though it has arity 3, in
1369 order to unravel the recursion.
1370
1371
1372 Note [Things to watch]
1373 ~~~~~~~~~~~~~~~~~~~~~~
1374 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1375 Assume x is exported, so not inlined unconditionally.
1376 Then we want x to inline unconditionally; no reason for it
1377 not to, and doing so avoids an indirection.
1378
1379 * { x = I# 3; ....f x.... }
1380 Make sure that x does not inline unconditionally!
1381 Lest we get extra allocation.
1382
1383 Note [Inlining an InlineRule]
1384 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1385 An InlineRules is used for
1386 (a) programmer INLINE pragmas
1387 (b) inlinings from worker/wrapper
1388
1389 For (a) the RHS may be large, and our contract is that we *only* inline
1390 when the function is applied to all the arguments on the LHS of the
1391 source-code defn. (The uf_arity in the rule.)
1392
1393 However for worker/wrapper it may be worth inlining even if the
1394 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1395 require saturation.
1396
1397 Note [Nested functions]
1398 ~~~~~~~~~~~~~~~~~~~~~~~
1399 At one time we treated a call of a non-top-level function as
1400 "interesting" (regardless of how boring the context) in the hope
1401 that inlining it would eliminate the binding, and its allocation.
1402 Specifically, in the default case of interesting_call we had
1403 _other -> not is_top && uf_arity > 0
1404
1405 But actually postInlineUnconditionally does some of this and overall
1406 it makes virtually no difference to nofib. So I simplified away this
1407 special case
1408
1409 Note [Cast then apply]
1410 ~~~~~~~~~~~~~~~~~~~~~~
1411 Consider
1412 myIndex = __inline_me ( (/\a. <blah>) |> co )
1413 co :: (forall a. a -> a) ~ (forall a. T a)
1414 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1415
1416 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1417 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1418
1419 Note [Inlining in ArgCtxt]
1420 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1421 The condition (arity > 0) here is very important, because otherwise
1422 we end up inlining top-level stuff into useless places; eg
1423 x = I# 3#
1424 f = \y. g x
1425 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1426 and 20% to 'power'.
1427
1428 At one stage I replaced this condition by 'True' (leading to the above
1429 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1430 to work ok now.
1431
1432 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1433 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1434 for the RHS of a 'let', we only profit from the inlining if we get a
1435 CONLIKE thing (modulo lets).
1436
1437 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1438 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1439 The "lone-variable" case is important. I spent ages messing about
1440 with unsatisfactory variants, but this is nice. The idea is that if a
1441 variable appears all alone
1442
1443 as an arg of lazy fn, or rhs BoringCtxt
1444 as scrutinee of a case CaseCtxt
1445 as arg of a fn ArgCtxt
1446 AND
1447 it is bound to a cheap expression
1448
1449 then we should not inline it (unless there is some other reason,
1450 e.g. it is the sole occurrence). That is what is happening at
1451 the use of 'lone_variable' in 'interesting_call'.
1452
1453 Why? At least in the case-scrutinee situation, turning
1454 let x = (a,b) in case x of y -> ...
1455 into
1456 let x = (a,b) in case (a,b) of y -> ...
1457 and thence to
1458 let x = (a,b) in let y = (a,b) in ...
1459 is bad if the binding for x will remain.
1460
1461 Another example: I discovered that strings
1462 were getting inlined straight back into applications of 'error'
1463 because the latter is strict.
1464 s = "foo"
1465 f = \x -> ...(error s)...
1466
1467 Fundamentally such contexts should not encourage inlining because, provided
1468 the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1469 context can ``see'' the unfolding of the variable (e.g. case or a
1470 RULE) so there's no gain.
1471
1472 However, watch out:
1473
1474 * Consider this:
1475 foo = _inline_ (\n. [n])
1476 bar = _inline_ (foo 20)
1477 baz = \n. case bar of { (m:_) -> m + n }
1478 Here we really want to inline 'bar' so that we can inline 'foo'
1479 and the whole thing unravels as it should obviously do. This is
1480 important: in the NDP project, 'bar' generates a closure data
1481 structure rather than a list.
1482
1483 So the non-inlining of lone_variables should only apply if the
1484 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1485 looks through the unfolding. Hence the "&& is_wf" in the
1486 InlineRule branch.
1487
1488 * Even a type application or coercion isn't a lone variable.
1489 Consider
1490 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1491 We had better inline that sucker! The case won't see through it.
1492
1493 For now, I'm treating treating a variable applied to types
1494 in a *lazy* context "lone". The motivating example was
1495 f = /\a. \x. BIG
1496 g = /\a. \y. h (f a)
1497 There's no advantage in inlining f here, and perhaps
1498 a significant disadvantage. Hence some_val_args in the Stop case
1499
1500 Note [Interaction of exprIsWorkFree and lone variables]
1501 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1502 The lone-variable test says "don't inline if a case expression
1503 scrutinises a lone variable whose unfolding is cheap". It's very
1504 important that, under these circumstances, exprIsConApp_maybe
1505 can spot a constructor application. So, for example, we don't
1506 consider
1507 let x = e in (x,x)
1508 to be cheap, and that's good because exprIsConApp_maybe doesn't
1509 think that expression is a constructor application.
1510
1511 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1512 rather than is_wf, which was utterly wrong, because the above
1513 expression responds True to exprIsHNF, which is what sets is_value.
1514
1515 This kind of thing can occur if you have
1516
1517 {-# INLINE foo #-}
1518 foo = let x = e in (x,x)
1519
1520 which Roman did.
1521
1522
1523 -}
1524
1525 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1526 -> Int
1527 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1528 -- We multiple the raw discounts (args_discount and result_discount)
1529 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1530 -- *size* whereas the discounts imply that there's some extra
1531 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1532 -- by inlining.
1533
1534 = 10 -- Discount of 10 because the result replaces the call
1535 -- so we count 10 for the function itself
1536
1537 + 10 * length actual_arg_discounts
1538 -- Discount of 10 for each arg supplied,
1539 -- because the result replaces the call
1540
1541 + round (ufKeenessFactor dflags *
1542 fromIntegral (total_arg_discount + res_discount'))
1543 where
1544 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1545 total_arg_discount = sum actual_arg_discounts
1546
1547 mk_arg_discount _ TrivArg = 0
1548 mk_arg_discount _ NonTrivArg = 10
1549 mk_arg_discount discount ValueArg = discount
1550
1551 res_discount'
1552 | LT <- arg_discounts `compareLength` arg_infos
1553 = res_discount -- Over-saturated
1554 | otherwise
1555 = case cont_info of
1556 BoringCtxt -> 0
1557 CaseCtxt -> res_discount -- Presumably a constructor
1558 ValAppCtxt -> res_discount -- Presumably a function
1559 _ -> 40 `min` res_discount
1560 -- ToDo: this 40 `min` res_discount doesn't seem right
1561 -- for DiscArgCtxt it shouldn't matter because the function will
1562 -- get the arg discount for any non-triv arg
1563 -- for RuleArgCtxt we do want to be keener to inline; but not only
1564 -- constructor results
1565 -- for RhsCtxt I suppose that exposing a data con is good in general
1566 -- And 40 seems very arbitrary
1567 --
1568 -- res_discount can be very large when a function returns
1569 -- constructors; but we only want to invoke that large discount
1570 -- when there's a case continuation.
1571 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1572 -- But we want to aovid inlining large functions that return
1573 -- constructors into contexts that are simply "interesting"