811ddad00eaef52e5d84342d2dc12ed8e7e5acc0
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import DynFlags
46 import CoreSyn
47 import PprCore () -- Instances
48 import OccurAnal ( occurAnalyseExpr )
49 import CoreOpt
50 import CoreArity ( manifestArity )
51 import CoreUtils
52 import Id
53 import Demand ( isBottomingSig )
54 import DataCon
55 import Literal
56 import PrimOp
57 import IdInfo
58 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
59 import Type
60 import PrelNames
61 import TysPrim ( realWorldStatePrimTy )
62 import Bag
63 import Util
64 import Outputable
65 import ForeignCall
66
67 import qualified Data.ByteString as BS
68
69 {-
70 ************************************************************************
71 * *
72 \subsection{Making unfoldings}
73 * *
74 ************************************************************************
75 -}
76
77 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
78 mkTopUnfolding dflags is_bottoming rhs
79 = mkUnfolding dflags InlineRhs True is_bottoming rhs
80
81 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
82 -- For implicit Ids, do a tiny bit of optimising first
83 mkImplicitUnfolding dflags expr
84 = mkTopUnfolding dflags False (simpleOptExpr expr)
85
86 -- Note [Top-level flag on inline rules]
87 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
88 -- Slight hack: note that mk_inline_rules conservatively sets the
89 -- top-level flag to True. It gets set more accurately by the simplifier
90 -- Simplify.simplUnfolding.
91
92 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
93 mkSimpleUnfolding dflags rhs
94 = mkUnfolding dflags InlineRhs False False rhs
95
96 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
97 mkDFunUnfolding bndrs con ops
98 = DFunUnfolding { df_bndrs = bndrs
99 , df_con = con
100 , df_args = map occurAnalyseExpr ops }
101 -- See Note [Occurrence analysis of unfoldings]
102
103 mkWwInlineRule :: CoreExpr -> Arity -> Unfolding
104 mkWwInlineRule expr arity
105 = mkCoreUnfolding InlineStable True
106 (simpleOptExpr expr)
107 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
108 , ug_boring_ok = boringCxtNotOk })
109
110 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
111 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
112 = mkCoreUnfolding InlineCompulsory True
113 (simpleOptExpr expr)
114 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
115 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
116
117 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
118 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
119 mkWorkerUnfolding dflags work_fn
120 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
121 , uf_is_top = top_lvl })
122 | isStableSource src
123 = mkCoreUnfolding src top_lvl new_tmpl guidance
124 where
125 new_tmpl = simpleOptExpr (work_fn tmpl)
126 guidance = calcUnfoldingGuidance dflags False new_tmpl
127
128 mkWorkerUnfolding _ _ _ = noUnfolding
129
130 -- | Make an unfolding that may be used unsaturated
131 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
132 -- manifest arity (the number of outer lambdas applications will
133 -- resolve before doing any work).
134 mkInlineUnfolding :: CoreExpr -> Unfolding
135 mkInlineUnfolding expr
136 = mkCoreUnfolding InlineStable
137 True -- Note [Top-level flag on inline rules]
138 expr' guide
139 where
140 expr' = simpleOptExpr expr
141 guide = UnfWhen { ug_arity = manifestArity expr'
142 , ug_unsat_ok = unSaturatedOk
143 , ug_boring_ok = boring_ok }
144 boring_ok = inlineBoringOk expr'
145
146 -- | Make an unfolding that will be used once the RHS has been saturated
147 -- to the given arity.
148 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
149 mkInlineUnfoldingWithArity arity expr
150 = mkCoreUnfolding InlineStable
151 True -- Note [Top-level flag on inline rules]
152 expr' guide
153 where
154 expr' = simpleOptExpr expr
155 guide = UnfWhen { ug_arity = arity
156 , ug_unsat_ok = needSaturated
157 , ug_boring_ok = boring_ok }
158 boring_ok = inlineBoringOk expr'
159
160 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
161 mkInlinableUnfolding dflags expr
162 = mkUnfolding dflags InlineStable False False expr'
163 where
164 expr' = simpleOptExpr expr
165
166 specUnfolding :: [Var] -> (CoreExpr -> CoreExpr) -> Arity -> Unfolding -> Unfolding
167 -- See Note [Specialising unfoldings]
168 -- specUnfolding spec_bndrs spec_app arity_decrease unf
169 -- = \spec_bndrs. spec_app( unf )
170 --
171 specUnfolding spec_bndrs spec_app arity_decrease
172 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
173 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
174 mkDFunUnfolding spec_bndrs con (map spec_arg args)
175 -- There is a hard-to-check assumption here that the spec_app has
176 -- enough applications to exactly saturate the old_bndrs
177 -- For DFunUnfoldings we transform
178 -- \old_bndrs. MkD <op1> ... <opn>
179 -- to
180 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
181 -- The ASSERT checks the value part of that
182 where
183 spec_arg arg = simpleOptExpr (spec_app (mkLams old_bndrs arg))
184 -- The beta-redexes created by spec_app will be
185 -- simplified away by simplOptExpr
186
187 specUnfolding spec_bndrs spec_app arity_decrease
188 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
189 , uf_is_top = top_lvl
190 , uf_guidance = old_guidance })
191 | isStableSource src -- See Note [Specialising unfoldings]
192 , UnfWhen { ug_arity = old_arity
193 , ug_unsat_ok = unsat_ok
194 , ug_boring_ok = boring_ok } <- old_guidance
195 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
196 , ug_unsat_ok = unsat_ok
197 , ug_boring_ok = boring_ok }
198 new_tmpl = simpleOptExpr (mkLams spec_bndrs (spec_app tmpl))
199 -- The beta-redexes created by spec_app will be
200 -- simplified away by simplOptExpr
201
202 in mkCoreUnfolding src top_lvl new_tmpl guidance
203
204 specUnfolding _ _ _ _ = noUnfolding
205
206 {- Note [Specialising unfoldings]
207 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
208 When we specialise a function for some given type-class arguments, we use
209 specUnfolding to specialise its unfolding. Some important points:
210
211 * If the original function has a DFunUnfolding, the specialised one
212 must do so too! Otherwise we lose the magic rules that make it
213 interact with ClassOps
214
215 * There is a bit of hack for INLINABLE functions:
216 f :: Ord a => ....
217 f = <big-rhs>
218 {- INLINABLE f #-}
219 Now if we specialise f, should the specialised version still have
220 an INLINABLE pragma? If it does, we'll capture a specialised copy
221 of <big-rhs> as its unfolding, and that probaby won't inline. But
222 if we don't, the specialised version of <big-rhs> might be small
223 enough to inline at a call site. This happens with Control.Monad.liftM3,
224 and can cause a lot more allocation as a result (nofib n-body shows this).
225
226 Moreover, keeping the INLINABLE thing isn't much help, because
227 the specialised function (probaby) isn't overloaded any more.
228
229 Conclusion: drop the INLINEALE pragma. In practice what this means is:
230 if a stable unfolding has UnfoldingGuidance of UnfWhen,
231 we keep it (so the specialised thing too will always inline)
232 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
233 (which arises from INLINABLE), we discard it
234 -}
235
236 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
237 -> UnfoldingGuidance -> Unfolding
238 -- Occurrence-analyses the expression before capturing it
239 mkCoreUnfolding src top_lvl expr guidance
240 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
241 -- See Note [Occurrence analysis of unfoldings]
242 uf_src = src,
243 uf_is_top = top_lvl,
244 uf_is_value = exprIsHNF expr,
245 uf_is_conlike = exprIsConLike expr,
246 uf_is_work_free = exprIsWorkFree expr,
247 uf_expandable = exprIsExpandable expr,
248 uf_guidance = guidance }
249
250 mkUnfolding :: DynFlags -> UnfoldingSource
251 -> Bool -- Is top-level
252 -> Bool -- Definitely a bottoming binding
253 -- (only relevant for top-level bindings)
254 -> CoreExpr
255 -> Unfolding
256 -- Calculates unfolding guidance
257 -- Occurrence-analyses the expression before capturing it
258 mkUnfolding dflags src is_top_lvl is_bottoming expr
259 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
260 -- See Note [Occurrence analysis of unfoldings]
261 uf_src = src,
262 uf_is_top = is_top_lvl,
263 uf_is_value = exprIsHNF expr,
264 uf_is_conlike = exprIsConLike expr,
265 uf_expandable = exprIsExpandable expr,
266 uf_is_work_free = exprIsWorkFree expr,
267 uf_guidance = guidance }
268 where
269 is_top_bottoming = is_top_lvl && is_bottoming
270 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
271 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
272 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
273
274 {-
275 Note [Occurrence analysis of unfoldings]
276 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
277 We do occurrence-analysis of unfoldings once and for all, when the
278 unfolding is built, rather than each time we inline them.
279
280 But given this decision it's vital that we do
281 *always* do it. Consider this unfolding
282 \x -> letrec { f = ...g...; g* = f } in body
283 where g* is (for some strange reason) the loop breaker. If we don't
284 occ-anal it when reading it in, we won't mark g as a loop breaker, and
285 we may inline g entirely in body, dropping its binding, and leaving
286 the occurrence in f out of scope. This happened in Trac #8892, where
287 the unfolding in question was a DFun unfolding.
288
289 But more generally, the simplifier is designed on the
290 basis that it is looking at occurrence-analysed expressions, so better
291 ensure that they acutally are.
292
293 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
294 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
295 Notice that we give the non-occur-analysed expression to
296 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
297 first; for example, sometimes during simplification, there's a large
298 let-bound thing which has been substituted, and so is now dead; so
299 'expr' contains two copies of the thing while the occurrence-analysed
300 expression doesn't.
301
302 Nevertheless, we *don't* and *must not* occ-analyse before computing
303 the size because
304
305 a) The size computation bales out after a while, whereas occurrence
306 analysis does not.
307
308 b) Residency increases sharply if you occ-anal first. I'm not
309 100% sure why, but it's a large effect. Compiling Cabal went
310 from residency of 534M to over 800M with this one change.
311
312 This can occasionally mean that the guidance is very pessimistic;
313 it gets fixed up next round. And it should be rare, because large
314 let-bound things that are dead are usually caught by preInlineUnconditionally
315
316
317 ************************************************************************
318 * *
319 \subsection{The UnfoldingGuidance type}
320 * *
321 ************************************************************************
322 -}
323
324 inlineBoringOk :: CoreExpr -> Bool
325 -- See Note [INLINE for small functions]
326 -- True => the result of inlining the expression is
327 -- no bigger than the expression itself
328 -- eg (\x y -> f y x)
329 -- This is a quick and dirty version. It doesn't attempt
330 -- to deal with (\x y z -> x (y z))
331 -- The really important one is (x `cast` c)
332 inlineBoringOk e
333 = go 0 e
334 where
335 go :: Int -> CoreExpr -> Bool
336 go credit (Lam x e) | isId x = go (credit+1) e
337 | otherwise = go credit e
338 go credit (App f (Type {})) = go credit f
339 go credit (App f a) | credit > 0
340 , exprIsTrivial a = go (credit-1) f
341 go credit (Tick _ e) = go credit e -- dubious
342 go credit (Cast e _) = go credit e
343 go _ (Var {}) = boringCxtOk
344 go _ _ = boringCxtNotOk
345
346 calcUnfoldingGuidance
347 :: DynFlags
348 -> Bool -- Definitely a top-level, bottoming binding
349 -> CoreExpr -- Expression to look at
350 -> UnfoldingGuidance
351 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
352 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
353 = calcUnfoldingGuidance dflags is_top_bottoming expr
354 calcUnfoldingGuidance dflags is_top_bottoming expr
355 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
356 TooBig -> UnfNever
357 SizeIs size cased_bndrs scrut_discount
358 | uncondInline expr n_val_bndrs size
359 -> UnfWhen { ug_unsat_ok = unSaturatedOk
360 , ug_boring_ok = boringCxtOk
361 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
362
363 | is_top_bottoming
364 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
365
366 | otherwise
367 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
368 , ug_size = size
369 , ug_res = scrut_discount }
370
371 where
372 (bndrs, body) = collectBinders expr
373 bOMB_OUT_SIZE = ufCreationThreshold dflags
374 -- Bomb out if size gets bigger than this
375 val_bndrs = filter isId bndrs
376 n_val_bndrs = length val_bndrs
377
378 mk_discount :: Bag (Id,Int) -> Id -> Int
379 mk_discount cbs bndr = foldlBag combine 0 cbs
380 where
381 combine acc (bndr', disc)
382 | bndr == bndr' = acc `plus_disc` disc
383 | otherwise = acc
384
385 plus_disc :: Int -> Int -> Int
386 plus_disc | isFunTy (idType bndr) = max
387 | otherwise = (+)
388 -- See Note [Function and non-function discounts]
389
390 {-
391 Note [Computing the size of an expression]
392 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
393 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
394 heuristics right has taken a long time. Here's the basic strategy:
395
396 * Variables, literals: 0
397 (Exception for string literals, see litSize.)
398
399 * Function applications (f e1 .. en): 1 + #value args
400
401 * Constructor applications: 1, regardless of #args
402
403 * Let(rec): 1 + size of components
404
405 * Note, cast: 0
406
407 Examples
408
409 Size Term
410 --------------
411 0 42#
412 0 x
413 0 True
414 2 f x
415 1 Just x
416 4 f (g x)
417
418 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
419 a function call to account for. Notice also that constructor applications
420 are very cheap, because exposing them to a caller is so valuable.
421
422 [25/5/11] All sizes are now multiplied by 10, except for primops
423 (which have sizes like 1 or 4. This makes primops look fantastically
424 cheap, and seems to be almost unversally beneficial. Done partly as a
425 result of #4978.
426
427 Note [Do not inline top-level bottoming functions]
428 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
429 The FloatOut pass has gone to some trouble to float out calls to 'error'
430 and similar friends. See Note [Bottoming floats] in SetLevels.
431 Do not re-inline them! But we *do* still inline if they are very small
432 (the uncondInline stuff).
433
434 Note [INLINE for small functions]
435 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
436 Consider {-# INLINE f #-}
437 f x = Just x
438 g y = f y
439 Then f's RHS is no larger than its LHS, so we should inline it into
440 even the most boring context. In general, f the function is
441 sufficiently small that its body is as small as the call itself, the
442 inline unconditionally, regardless of how boring the context is.
443
444 Things to note:
445
446 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
447 than the thing it's replacing. Notice that
448 (f x) --> (g 3) -- YES, unconditionally
449 (f x) --> x : [] -- YES, *even though* there are two
450 -- arguments to the cons
451 x --> g 3 -- NO
452 x --> Just v -- NO
453
454 It's very important not to unconditionally replace a variable by
455 a non-atomic term.
456
457 (2) We do this even if the thing isn't saturated, else we end up with the
458 silly situation that
459 f x y = x
460 ...map (f 3)...
461 doesn't inline. Even in a boring context, inlining without being
462 saturated will give a lambda instead of a PAP, and will be more
463 efficient at runtime.
464
465 (3) However, when the function's arity > 0, we do insist that it
466 has at least one value argument at the call site. (This check is
467 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
468 f = /\a \x:a. x
469 d = /\b. MkD (f b)
470 If we inline f here we get
471 d = /\b. MkD (\x:b. x)
472 and then prepareRhs floats out the argument, abstracting the type
473 variables, so we end up with the original again!
474
475 (4) We must be much more cautious about arity-zero things. Consider
476 let x = y +# z in ...
477 In *size* terms primops look very small, because the generate a
478 single instruction, but we do not want to unconditionally replace
479 every occurrence of x with (y +# z). So we only do the
480 unconditional-inline thing for *trivial* expressions.
481
482 NB: you might think that PostInlineUnconditionally would do this
483 but it doesn't fire for top-level things; see SimplUtils
484 Note [Top level and postInlineUnconditionally]
485 -}
486
487 uncondInline :: CoreExpr -> Arity -> Int -> Bool
488 -- Inline unconditionally if there no size increase
489 -- Size of call is arity (+1 for the function)
490 -- See Note [INLINE for small functions]
491 uncondInline rhs arity size
492 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
493 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
494
495 sizeExpr :: DynFlags
496 -> Int -- Bomb out if it gets bigger than this
497 -> [Id] -- Arguments; we're interested in which of these
498 -- get case'd
499 -> CoreExpr
500 -> ExprSize
501
502 -- Note [Computing the size of an expression]
503
504 sizeExpr dflags bOMB_OUT_SIZE top_args expr
505 = size_up expr
506 where
507 size_up (Cast e _) = size_up e
508 size_up (Tick _ e) = size_up e
509 size_up (Type _) = sizeZero -- Types cost nothing
510 size_up (Coercion _) = sizeZero
511 size_up (Lit lit) = sizeN (litSize lit)
512 size_up (Var f) | isRealWorldId f = sizeZero
513 -- Make sure we get constructor discounts even
514 -- on nullary constructors
515 | otherwise = size_up_call f [] 0
516
517 size_up (App fun arg)
518 | isTyCoArg arg = size_up fun
519 | otherwise = size_up arg `addSizeNSD`
520 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
521
522 size_up (Lam b e)
523 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
524 | otherwise = size_up e
525
526 size_up (Let (NonRec binder rhs) body)
527 = size_up_rhs (binder, rhs) `addSizeNSD`
528 size_up body `addSizeN`
529 size_up_alloc binder
530
531 size_up (Let (Rec pairs) body)
532 = foldr (addSizeNSD . size_up_rhs)
533 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
534 pairs
535
536 size_up (Case e _ _ alts)
537 | null alts
538 = size_up e -- case e of {} never returns, so take size of scrutinee
539
540 size_up (Case e _ _ alts)
541 -- Now alts is non-empty
542 | Just v <- is_top_arg e -- We are scrutinising an argument variable
543 = let
544 alt_sizes = map size_up_alt alts
545
546 -- alts_size tries to compute a good discount for
547 -- the case when we are scrutinising an argument variable
548 alts_size (SizeIs tot tot_disc tot_scrut)
549 -- Size of all alternatives
550 (SizeIs max _ _)
551 -- Size of biggest alternative
552 = SizeIs tot (unitBag (v, 20 + tot - max)
553 `unionBags` tot_disc) tot_scrut
554 -- If the variable is known, we produce a
555 -- discount that will take us back to 'max',
556 -- the size of the largest alternative The
557 -- 1+ is a little discount for reduced
558 -- allocation in the caller
559 --
560 -- Notice though, that we return tot_disc,
561 -- the total discount from all branches. I
562 -- think that's right.
563
564 alts_size tot_size _ = tot_size
565 in
566 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
567 (foldr1 maxSize alt_sizes)
568 -- Good to inline if an arg is scrutinised, because
569 -- that may eliminate allocation in the caller
570 -- And it eliminates the case itself
571 where
572 is_top_arg (Var v) | v `elem` top_args = Just v
573 is_top_arg (Cast e _) = is_top_arg e
574 is_top_arg _ = Nothing
575
576
577 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
578 foldr (addAltSize . size_up_alt) case_size alts
579 where
580 case_size
581 | is_inline_scrut e, not (lengthExceeds alts 1) = sizeN (-10)
582 | otherwise = sizeZero
583 -- Normally we don't charge for the case itself, but
584 -- we charge one per alternative (see size_up_alt,
585 -- below) to account for the cost of the info table
586 -- and comparisons.
587 --
588 -- However, in certain cases (see is_inline_scrut
589 -- below), no code is generated for the case unless
590 -- there are multiple alts. In these cases we
591 -- subtract one, making the first alt free.
592 -- e.g. case x# +# y# of _ -> ... should cost 1
593 -- case touch# x# of _ -> ... should cost 0
594 -- (see #4978)
595 --
596 -- I would like to not have the "not (lengthExceeds alts 1)"
597 -- condition above, but without that some programs got worse
598 -- (spectral/hartel/event and spectral/para). I don't fully
599 -- understand why. (SDM 24/5/11)
600
601 -- unboxed variables, inline primops and unsafe foreign calls
602 -- are all "inline" things:
603 is_inline_scrut (Var v) = isUnliftedType (idType v)
604 is_inline_scrut scrut
605 | (Var f, _) <- collectArgs scrut
606 = case idDetails f of
607 FCallId fc -> not (isSafeForeignCall fc)
608 PrimOpId op -> not (primOpOutOfLine op)
609 _other -> False
610 | otherwise
611 = False
612
613 size_up_rhs (bndr, rhs)
614 | Just join_arity <- isJoinId_maybe bndr
615 -- Skip arguments to join point
616 , (_bndrs, body) <- collectNBinders join_arity rhs
617 = size_up body
618 | otherwise
619 = size_up rhs
620
621 ------------
622 -- size_up_app is used when there's ONE OR MORE value args
623 size_up_app (App fun arg) args voids
624 | isTyCoArg arg = size_up_app fun args voids
625 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
626 | otherwise = size_up arg `addSizeNSD`
627 size_up_app fun (arg:args) voids
628 size_up_app (Var fun) args voids = size_up_call fun args voids
629 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
630 size_up_app (Cast expr _) args voids = size_up_app expr args voids
631 size_up_app other args voids = size_up other `addSizeN`
632 callSize (length args) voids
633 -- if the lhs is not an App or a Var, or an invisible thing like a
634 -- Tick or Cast, then we should charge for a complete call plus the
635 -- size of the lhs itself.
636
637 ------------
638 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
639 size_up_call fun val_args voids
640 = case idDetails fun of
641 FCallId _ -> sizeN (callSize (length val_args) voids)
642 DataConWorkId dc -> conSize dc (length val_args)
643 PrimOpId op -> primOpSize op (length val_args)
644 ClassOpId _ -> classOpSize dflags top_args val_args
645 _ -> funSize dflags top_args fun (length val_args) voids
646
647 ------------
648 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
649 -- Don't charge for args, so that wrappers look cheap
650 -- (See comments about wrappers with Case)
651 --
652 -- IMPORATANT: *do* charge 1 for the alternative, else we
653 -- find that giant case nests are treated as practically free
654 -- A good example is Foreign.C.Error.errnoToIOError
655
656 ------------
657 -- Cost to allocate binding with given binder
658 size_up_alloc bndr
659 | isTyVar bndr -- Doesn't exist at runtime
660 || isJoinId bndr -- Not allocated at all
661 || isUnliftedType (idType bndr) -- Doesn't live in heap
662 = 0
663 | otherwise
664 = 10
665
666 ------------
667 -- These addSize things have to be here because
668 -- I don't want to give them bOMB_OUT_SIZE as an argument
669 addSizeN TooBig _ = TooBig
670 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
671
672 -- addAltSize is used to add the sizes of case alternatives
673 addAltSize TooBig _ = TooBig
674 addAltSize _ TooBig = TooBig
675 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
676 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
677 (xs `unionBags` ys)
678 (d1 + d2) -- Note [addAltSize result discounts]
679
680 -- This variant ignores the result discount from its LEFT argument
681 -- It's used when the second argument isn't part of the result
682 addSizeNSD TooBig _ = TooBig
683 addSizeNSD _ TooBig = TooBig
684 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
685 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
686 (xs `unionBags` ys)
687 d2 -- Ignore d1
688
689 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
690
691 -- an expression of type State# RealWorld must be a variable
692 isRealWorldExpr (Var id) = isRealWorldId id
693 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
694 isRealWorldExpr _ = False
695
696 -- | Finds a nominal size of a string literal.
697 litSize :: Literal -> Int
698 -- Used by CoreUnfold.sizeExpr
699 litSize (LitInteger {}) = 100 -- Note [Size of literal integers]
700 litSize (MachStr str) = 10 + 10 * ((BS.length str + 3) `div` 4)
701 -- If size could be 0 then @f "x"@ might be too small
702 -- [Sept03: make literal strings a bit bigger to avoid fruitless
703 -- duplication of little strings]
704 litSize _other = 0 -- Must match size of nullary constructors
705 -- Key point: if x |-> 4, then x must inline unconditionally
706 -- (eg via case binding)
707
708 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
709 -- See Note [Conlike is interesting]
710 classOpSize _ _ []
711 = sizeZero
712 classOpSize dflags top_args (arg1 : other_args)
713 = SizeIs size arg_discount 0
714 where
715 size = 20 + (10 * length other_args)
716 -- If the class op is scrutinising a lambda bound dictionary then
717 -- give it a discount, to encourage the inlining of this function
718 -- The actual discount is rather arbitrarily chosen
719 arg_discount = case arg1 of
720 Var dict | dict `elem` top_args
721 -> unitBag (dict, ufDictDiscount dflags)
722 _other -> emptyBag
723
724 -- | The size of a function call
725 callSize
726 :: Int -- ^ number of value args
727 -> Int -- ^ number of value args that are void
728 -> Int
729 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
730 -- The 1+ is for the function itself
731 -- Add 1 for each non-trivial arg;
732 -- the allocation cost, as in let(rec)
733
734 -- | The size of a jump to a join point
735 jumpSize
736 :: Int -- ^ number of value args
737 -> Int -- ^ number of value args that are void
738 -> Int
739 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
740 -- A jump is 20% the size of a function call. Making jumps free reopens
741 -- bug #6048, but making them any more expensive loses a 21% improvement in
742 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
743 -- better solution?
744
745 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
746 -- Size for functions that are not constructors or primops
747 -- Note [Function applications]
748 funSize dflags top_args fun n_val_args voids
749 | fun `hasKey` buildIdKey = buildSize
750 | fun `hasKey` augmentIdKey = augmentSize
751 | otherwise = SizeIs size arg_discount res_discount
752 where
753 some_val_args = n_val_args > 0
754 is_join = isJoinId fun
755
756 size | is_join = jumpSize n_val_args voids
757 | not some_val_args = 0
758 | otherwise = callSize n_val_args voids
759
760 -- DISCOUNTS
761 -- See Note [Function and non-function discounts]
762 arg_discount | some_val_args && fun `elem` top_args
763 = unitBag (fun, ufFunAppDiscount dflags)
764 | otherwise = emptyBag
765 -- If the function is an argument and is applied
766 -- to some values, give it an arg-discount
767
768 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
769 | otherwise = 0
770 -- If the function is partially applied, show a result discount
771 -- XXX maybe behave like ConSize for eval'd variable
772
773 conSize :: DataCon -> Int -> ExprSize
774 conSize dc n_val_args
775 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
776
777 -- See Note [Unboxed tuple size and result discount]
778 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
779
780 -- See Note [Constructor size and result discount]
781 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
782
783 -- XXX still looks to large to me
784
785 {-
786 Note [Constructor size and result discount]
787 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
788 Treat a constructors application as size 10, regardless of how many
789 arguments it has; we are keen to expose them (and we charge separately
790 for their args). We can't treat them as size zero, else we find that
791 (Just x) has size 0, which is the same as a lone variable; and hence
792 'v' will always be replaced by (Just x), where v is bound to Just x.
793
794 The "result discount" is applied if the result of the call is
795 scrutinised (say by a case). For a constructor application that will
796 mean the constructor application will disappear, so we don't need to
797 charge it to the function. So the discount should at least match the
798 cost of the constructor application, namely 10. But to give a bit
799 of extra incentive we give a discount of 10*(1 + n_val_args).
800
801 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
802 and said it was an "unambiguous win", but its terribly dangerous
803 because a function with many many case branches, each finishing with
804 a constructor, can have an arbitrarily large discount. This led to
805 terrible code bloat: see Trac #6099.
806
807 Note [Unboxed tuple size and result discount]
808 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
809 However, unboxed tuples count as size zero. I found occasions where we had
810 f x y z = case op# x y z of { s -> (# s, () #) }
811 and f wasn't getting inlined.
812
813 I tried giving unboxed tuples a *result discount* of zero (see the
814 commented-out line). Why? When returned as a result they do not
815 allocate, so maybe we don't want to charge so much for them If you
816 have a non-zero discount here, we find that workers often get inlined
817 back into wrappers, because it look like
818 f x = case $wf x of (# a,b #) -> (a,b)
819 and we are keener because of the case. However while this change
820 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
821 more. All other changes were very small. So it's not a big deal but I
822 didn't adopt the idea.
823
824 Note [Function and non-function discounts]
825 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
826 We want a discount if the function is applied. A good example is
827 monadic combinators with continuation arguments, where inlining is
828 quite important.
829
830 But we don't want a big discount when a function is called many times
831 (see the detailed comments with Trac #6048) because if the function is
832 big it won't be inlined at its many call sites and no benefit results.
833 Indeed, we can get exponentially big inlinings this way; that is what
834 Trac #6048 is about.
835
836 On the other hand, for data-valued arguments, if there are lots of
837 case expressions in the body, each one will get smaller if we apply
838 the function to a constructor application, so we *want* a big discount
839 if the argument is scrutinised by many case expressions.
840
841 Conclusion:
842 - For functions, take the max of the discounts
843 - For data values, take the sum of the discounts
844
845
846 Note [Literal integer size]
847 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
848 Literal integers *can* be big (mkInteger [...coefficients...]), but
849 need not be (S# n). We just use an arbitrary big-ish constant here
850 so that, in particular, we don't inline top-level defns like
851 n = S# 5
852 There's no point in doing so -- any optimisations will see the S#
853 through n's unfolding. Nor will a big size inhibit unfoldings functions
854 that mention a literal Integer, because the float-out pass will float
855 all those constants to top level.
856 -}
857
858 primOpSize :: PrimOp -> Int -> ExprSize
859 primOpSize op n_val_args
860 = if primOpOutOfLine op
861 then sizeN (op_size + n_val_args)
862 else sizeN op_size
863 where
864 op_size = primOpCodeSize op
865
866
867 buildSize :: ExprSize
868 buildSize = SizeIs 0 emptyBag 40
869 -- We really want to inline applications of build
870 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
871 -- Indeed, we should add a result_discount because build is
872 -- very like a constructor. We don't bother to check that the
873 -- build is saturated (it usually is). The "-2" discounts for the \c n,
874 -- The "4" is rather arbitrary.
875
876 augmentSize :: ExprSize
877 augmentSize = SizeIs 0 emptyBag 40
878 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
879 -- e plus ys. The -2 accounts for the \cn
880
881 -- When we return a lambda, give a discount if it's used (applied)
882 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
883 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
884 lamScrutDiscount _ TooBig = TooBig
885
886 {-
887 Note [addAltSize result discounts]
888 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
889 When adding the size of alternatives, we *add* the result discounts
890 too, rather than take the *maximum*. For a multi-branch case, this
891 gives a discount for each branch that returns a constructor, making us
892 keener to inline. I did try using 'max' instead, but it makes nofib
893 'rewrite' and 'puzzle' allocate significantly more, and didn't make
894 binary sizes shrink significantly either.
895
896 Note [Discounts and thresholds]
897 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
898 Constants for discounts and thesholds are defined in main/DynFlags,
899 all of form ufXxxx. They are:
900
901 ufCreationThreshold
902 At a definition site, if the unfolding is bigger than this, we
903 may discard it altogether
904
905 ufUseThreshold
906 At a call site, if the unfolding, less discounts, is smaller than
907 this, then it's small enough inline
908
909 ufKeenessFactor
910 Factor by which the discounts are multiplied before
911 subtracting from size
912
913 ufDictDiscount
914 The discount for each occurrence of a dictionary argument
915 as an argument of a class method. Should be pretty small
916 else big functions may get inlined
917
918 ufFunAppDiscount
919 Discount for a function argument that is applied. Quite
920 large, because if we inline we avoid the higher-order call.
921
922 ufDearOp
923 The size of a foreign call or not-dupable PrimOp
924
925 ufVeryAggressive
926 If True, the compiler ignores all the thresholds and inlines very
927 aggressively. It still adheres to arity, simplifier phase control and
928 loop breakers.
929
930
931 Note [Function applications]
932 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
933 In a function application (f a b)
934
935 - If 'f' is an argument to the function being analysed,
936 and there's at least one value arg, record a FunAppDiscount for f
937
938 - If the application if a PAP (arity > 2 in this example)
939 record a *result* discount (because inlining
940 with "extra" args in the call may mean that we now
941 get a saturated application)
942
943 Code for manipulating sizes
944 -}
945
946 -- | The size of an candidate expression for unfolding
947 data ExprSize
948 = TooBig
949 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
950 , _es_args :: !(Bag (Id,Int))
951 -- ^ Arguments cased herein, and discount for each such
952 , _es_discount :: {-# UNPACK #-} !Int
953 -- ^ Size to subtract if result is scrutinised by a case
954 -- expression
955 }
956
957 instance Outputable ExprSize where
958 ppr TooBig = text "TooBig"
959 ppr (SizeIs a _ c) = brackets (int a <+> int c)
960
961 -- subtract the discount before deciding whether to bale out. eg. we
962 -- want to inline a large constructor application into a selector:
963 -- tup = (a_1, ..., a_99)
964 -- x = case tup of ...
965 --
966 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
967 mkSizeIs max n xs d | (n - d) > max = TooBig
968 | otherwise = SizeIs n xs d
969
970 maxSize :: ExprSize -> ExprSize -> ExprSize
971 maxSize TooBig _ = TooBig
972 maxSize _ TooBig = TooBig
973 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
974 | otherwise = s2
975
976 sizeZero :: ExprSize
977 sizeN :: Int -> ExprSize
978
979 sizeZero = SizeIs 0 emptyBag 0
980 sizeN n = SizeIs n emptyBag 0
981
982 {-
983 ************************************************************************
984 * *
985 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
986 * *
987 ************************************************************************
988
989 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
990 we ``couldn't possibly use'' on the other side. Can be overridden w/
991 flaggery. Just the same as smallEnoughToInline, except that it has no
992 actual arguments.
993 -}
994
995 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
996 couldBeSmallEnoughToInline dflags threshold rhs
997 = case sizeExpr dflags threshold [] body of
998 TooBig -> False
999 _ -> True
1000 where
1001 (_, body) = collectBinders rhs
1002
1003 ----------------
1004 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1005 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1006 = size <= ufUseThreshold dflags
1007 smallEnoughToInline _ _
1008 = False
1009
1010 ----------------
1011
1012 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1013 -- Sees if the unfolding is pretty certain to inline
1014 -- If so, return a *stable* unfolding for it, that will always inline
1015 certainlyWillInline dflags fn_info
1016 = case unfoldingInfo fn_info of
1017 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1018 | loop_breaker -> Nothing -- Won't inline, so try w/w
1019 | otherwise -> do_cunf e g -- Depends on size, so look at that
1020
1021 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1022 -- to do so, and even if it is currently a
1023 -- loop breaker, it may not be later
1024
1025 _other_unf -> Nothing
1026
1027 where
1028 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1029 fn_unf = unfoldingInfo fn_info
1030
1031 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1032 do_cunf _ UnfNever = Nothing
1033 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1034 -- INLINE functions have UnfWhen
1035
1036 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1037 -- binary sizes go up by 10%! (This is with SplitObjs.)
1038 -- I'm not totally sure why.
1039 -- INLINABLE functions come via this path
1040 -- See Note [certainlyWillInline: INLINABLE]
1041 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1042 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1043 , case inlinePragmaSpec (inlinePragInfo fn_info) of
1044 NoInline -> False -- NOINLINE; do not say certainlyWillInline!
1045 _ -> True -- INLINE, INLINABLE, or nothing
1046 , not (isBottomingSig (strictnessInfo fn_info))
1047 -- Do not unconditionally inline a bottoming functions even if
1048 -- it seems smallish. We've carefully lifted it out to top level,
1049 -- so we don't want to re-inline it.
1050 , let arity = length args
1051 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1052 = Just (fn_unf { uf_src = InlineStable
1053 , uf_guidance = UnfWhen { ug_arity = arity
1054 , ug_unsat_ok = unSaturatedOk
1055 , ug_boring_ok = inlineBoringOk expr } })
1056 -- Note the "unsaturatedOk". A function like f = \ab. a
1057 -- will certainly inline, even if partially applied (f e), so we'd
1058 -- better make sure that the transformed inlining has the same property
1059 | otherwise
1060 = Nothing
1061
1062 {- Note [certainlyWillInline: be careful of thunks]
1063 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1064 Don't claim that thunks will certainly inline, because that risks work
1065 duplication. Even if the work duplication is not great (eg is_cheap
1066 holds), it can make a big difference in an inner loop In Trac #5623 we
1067 found that the WorkWrap phase thought that
1068 y = case x of F# v -> F# (v +# v)
1069 was certainlyWillInline, so the addition got duplicated.
1070
1071 Note [certainlyWillInline: INLINABLE]
1072 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1073 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1074 even though we have a stable inlining, so that strictness w/w takes
1075 place. It makes a big difference to efficiency, and the w/w pass knows
1076 how to transfer the INLINABLE info to the worker; see WorkWrap
1077 Note [Worker-wrapper for INLINABLE functions]
1078
1079 ************************************************************************
1080 * *
1081 \subsection{callSiteInline}
1082 * *
1083 ************************************************************************
1084
1085 This is the key function. It decides whether to inline a variable at a call site
1086
1087 callSiteInline is used at call sites, so it is a bit more generous.
1088 It's a very important function that embodies lots of heuristics.
1089 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1090 and occurs exactly once or
1091 occurs once in each branch of a case and is small
1092
1093 If the thing is in WHNF, there's no danger of duplicating work,
1094 so we can inline if it occurs once, or is small
1095
1096 NOTE: we don't want to inline top-level functions that always diverge.
1097 It just makes the code bigger. Tt turns out that the convenient way to prevent
1098 them inlining is to give them a NOINLINE pragma, which we do in
1099 StrictAnal.addStrictnessInfoToTopId
1100 -}
1101
1102 callSiteInline :: DynFlags
1103 -> Id -- The Id
1104 -> Bool -- True <=> unfolding is active
1105 -> Bool -- True if there are no arguments at all (incl type args)
1106 -> [ArgSummary] -- One for each value arg; True if it is interesting
1107 -> CallCtxt -- True <=> continuation is interesting
1108 -> Maybe CoreExpr -- Unfolding, if any
1109
1110 data ArgSummary = TrivArg -- Nothing interesting
1111 | NonTrivArg -- Arg has structure
1112 | ValueArg -- Arg is a con-app or PAP
1113 -- ..or con-like. Note [Conlike is interesting]
1114
1115 instance Outputable ArgSummary where
1116 ppr TrivArg = text "TrivArg"
1117 ppr NonTrivArg = text "NonTrivArg"
1118 ppr ValueArg = text "ValueArg"
1119
1120 nonTriv :: ArgSummary -> Bool
1121 nonTriv TrivArg = False
1122 nonTriv _ = True
1123
1124 data CallCtxt
1125 = BoringCtxt
1126 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1127 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1128 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1129
1130 | ValAppCtxt -- We're applied to at least one value arg
1131 -- This arises when we have ((f x |> co) y)
1132 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1133
1134 | CaseCtxt -- We're the scrutinee of a case
1135 -- that decomposes its scrutinee
1136
1137 instance Outputable CallCtxt where
1138 ppr CaseCtxt = text "CaseCtxt"
1139 ppr ValAppCtxt = text "ValAppCtxt"
1140 ppr BoringCtxt = text "BoringCtxt"
1141 ppr RhsCtxt = text "RhsCtxt"
1142 ppr DiscArgCtxt = text "DiscArgCtxt"
1143 ppr RuleArgCtxt = text "RuleArgCtxt"
1144
1145 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1146 = case idUnfolding id of
1147 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1148 -- Things with an INLINE pragma may have an unfolding *and*
1149 -- be a loop breaker (maybe the knot is not yet untied)
1150 CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top
1151 , uf_is_work_free = is_wf
1152 , uf_guidance = guidance, uf_expandable = is_exp }
1153 | active_unfolding -> tryUnfolding dflags id lone_variable
1154 arg_infos cont_info unf_template is_top
1155 is_wf is_exp guidance
1156 | otherwise -> traceInline dflags "Inactive unfolding:" (ppr id) Nothing
1157 NoUnfolding -> Nothing
1158 BootUnfolding -> Nothing
1159 OtherCon {} -> Nothing
1160 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1161
1162 traceInline :: DynFlags -> String -> SDoc -> a -> a
1163 traceInline dflags str doc result
1164 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1165 = pprTrace str doc result
1166 | otherwise
1167 = result
1168
1169 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1170 -> CoreExpr -> Bool -> Bool -> Bool -> UnfoldingGuidance
1171 -> Maybe CoreExpr
1172 tryUnfolding dflags id lone_variable
1173 arg_infos cont_info unf_template is_top
1174 is_wf is_exp guidance
1175 = case guidance of
1176 UnfNever -> traceInline dflags str (text "UnfNever") Nothing
1177
1178 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1179 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1180 -- See Note [INLINE for small functions (3)]
1181 -> traceInline dflags str (mk_doc some_benefit empty True) (Just unf_template)
1182 | otherwise
1183 -> traceInline dflags str (mk_doc some_benefit empty False) Nothing
1184 where
1185 some_benefit = calc_some_benefit uf_arity
1186 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1187
1188 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1189 | ufVeryAggressive dflags
1190 -> traceInline dflags str (mk_doc some_benefit extra_doc True) (Just unf_template)
1191 | is_wf && some_benefit && small_enough
1192 -> traceInline dflags str (mk_doc some_benefit extra_doc True) (Just unf_template)
1193 | otherwise
1194 -> traceInline dflags str (mk_doc some_benefit extra_doc False) Nothing
1195 where
1196 some_benefit = calc_some_benefit (length arg_discounts)
1197 extra_doc = text "discounted size =" <+> int discounted_size
1198 discounted_size = size - discount
1199 small_enough = discounted_size <= ufUseThreshold dflags
1200 discount = computeDiscount dflags arg_discounts
1201 res_discount arg_infos cont_info
1202
1203 where
1204 mk_doc some_benefit extra_doc yes_or_no
1205 = vcat [ text "arg infos" <+> ppr arg_infos
1206 , text "interesting continuation" <+> ppr cont_info
1207 , text "some_benefit" <+> ppr some_benefit
1208 , text "is exp:" <+> ppr is_exp
1209 , text "is work-free:" <+> ppr is_wf
1210 , text "guidance" <+> ppr guidance
1211 , extra_doc
1212 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1213
1214 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1215 n_val_args = length arg_infos
1216
1217 -- some_benefit is used when the RHS is small enough
1218 -- and the call has enough (or too many) value
1219 -- arguments (ie n_val_args >= arity). But there must
1220 -- be *something* interesting about some argument, or the
1221 -- result context, to make it worth inlining
1222 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1223 -- expected by the unfolding
1224 calc_some_benefit uf_arity
1225 | not saturated = interesting_args -- Under-saturated
1226 -- Note [Unsaturated applications]
1227 | otherwise = interesting_args -- Saturated or over-saturated
1228 || interesting_call
1229 where
1230 saturated = n_val_args >= uf_arity
1231 over_saturated = n_val_args > uf_arity
1232 interesting_args = any nonTriv arg_infos
1233 -- NB: (any nonTriv arg_infos) looks at the
1234 -- over-saturated args too which is "wrong";
1235 -- but if over-saturated we inline anyway.
1236
1237 interesting_call
1238 | over_saturated
1239 = True
1240 | otherwise
1241 = case cont_info of
1242 CaseCtxt -> not (lone_variable && is_wf) -- Note [Lone variables]
1243 ValAppCtxt -> True -- Note [Cast then apply]
1244 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1245 DiscArgCtxt -> uf_arity > 0 --
1246 RhsCtxt -> uf_arity > 0 --
1247 _ -> not is_top && uf_arity > 0 -- Note [Nested functions]
1248 -- Note [Inlining in ArgCtxt]
1249
1250 {-
1251 Note [Unfold into lazy contexts], Note [RHS of lets]
1252 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1253 When the call is the argument of a function with a RULE, or the RHS of a let,
1254 we are a little bit keener to inline. For example
1255 f y = (y,y,y)
1256 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1257 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1258 only we can't see it. Also
1259 x = f v
1260 could be expensive whereas
1261 x = case v of (a,b) -> a
1262 is patently cheap and may allow more eta expansion.
1263 So we treat the RHS of a let as not-totally-boring.
1264
1265 Note [Unsaturated applications]
1266 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1267 When a call is not saturated, we *still* inline if one of the
1268 arguments has interesting structure. That's sometimes very important.
1269 A good example is the Ord instance for Bool in Base:
1270
1271 Rec {
1272 $fOrdBool =GHC.Classes.D:Ord
1273 @ Bool
1274 ...
1275 $cmin_ajX
1276
1277 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1278 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1279 }
1280
1281 But the defn of GHC.Classes.$dmmin is:
1282
1283 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1284 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1285 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1286 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1287 GHC.Types.False -> y GHC.Types.True -> x }) -}
1288
1289 We *really* want to inline $dmmin, even though it has arity 3, in
1290 order to unravel the recursion.
1291
1292
1293 Note [Things to watch]
1294 ~~~~~~~~~~~~~~~~~~~~~~
1295 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1296 Assume x is exported, so not inlined unconditionally.
1297 Then we want x to inline unconditionally; no reason for it
1298 not to, and doing so avoids an indirection.
1299
1300 * { x = I# 3; ....f x.... }
1301 Make sure that x does not inline unconditionally!
1302 Lest we get extra allocation.
1303
1304 Note [Inlining an InlineRule]
1305 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1306 An InlineRules is used for
1307 (a) programmer INLINE pragmas
1308 (b) inlinings from worker/wrapper
1309
1310 For (a) the RHS may be large, and our contract is that we *only* inline
1311 when the function is applied to all the arguments on the LHS of the
1312 source-code defn. (The uf_arity in the rule.)
1313
1314 However for worker/wrapper it may be worth inlining even if the
1315 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1316 require saturation.
1317
1318
1319 Note [Nested functions]
1320 ~~~~~~~~~~~~~~~~~~~~~~~
1321 If a function has a nested defn we also record some-benefit, on the
1322 grounds that we are often able to eliminate the binding, and hence the
1323 allocation, for the function altogether; this is good for join points.
1324 But this only makes sense for *functions*; inlining a constructor
1325 doesn't help allocation unless the result is scrutinised. UNLESS the
1326 constructor occurs just once, albeit possibly in multiple case
1327 branches. Then inlining it doesn't increase allocation, but it does
1328 increase the chance that the constructor won't be allocated at all in
1329 the branches that don't use it.
1330
1331 Note [Cast then apply]
1332 ~~~~~~~~~~~~~~~~~~~~~~
1333 Consider
1334 myIndex = __inline_me ( (/\a. <blah>) |> co )
1335 co :: (forall a. a -> a) ~ (forall a. T a)
1336 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1337
1338 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1339 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1340
1341 Note [Inlining in ArgCtxt]
1342 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1343 The condition (arity > 0) here is very important, because otherwise
1344 we end up inlining top-level stuff into useless places; eg
1345 x = I# 3#
1346 f = \y. g x
1347 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1348 and 20% to 'power'.
1349
1350 At one stage I replaced this condition by 'True' (leading to the above
1351 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1352 to work ok now.
1353
1354 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1355 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1356 for the RHS of a 'let', we only profit from the inlining if we get a
1357 CONLIKE thing (modulo lets).
1358
1359 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1360 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1361 The "lone-variable" case is important. I spent ages messing about
1362 with unsatisfactory variants, but this is nice. The idea is that if a
1363 variable appears all alone
1364
1365 as an arg of lazy fn, or rhs BoringCtxt
1366 as scrutinee of a case CaseCtxt
1367 as arg of a fn ArgCtxt
1368 AND
1369 it is bound to a cheap expression
1370
1371 then we should not inline it (unless there is some other reason,
1372 e.g. it is the sole occurrence). That is what is happening at
1373 the use of 'lone_variable' in 'interesting_call'.
1374
1375 Why? At least in the case-scrutinee situation, turning
1376 let x = (a,b) in case x of y -> ...
1377 into
1378 let x = (a,b) in case (a,b) of y -> ...
1379 and thence to
1380 let x = (a,b) in let y = (a,b) in ...
1381 is bad if the binding for x will remain.
1382
1383 Another example: I discovered that strings
1384 were getting inlined straight back into applications of 'error'
1385 because the latter is strict.
1386 s = "foo"
1387 f = \x -> ...(error s)...
1388
1389 Fundamentally such contexts should not encourage inlining because the
1390 context can ``see'' the unfolding of the variable (e.g. case or a
1391 RULE) so there's no gain. If the thing is bound to a value.
1392
1393 However, watch out:
1394
1395 * Consider this:
1396 foo = _inline_ (\n. [n])
1397 bar = _inline_ (foo 20)
1398 baz = \n. case bar of { (m:_) -> m + n }
1399 Here we really want to inline 'bar' so that we can inline 'foo'
1400 and the whole thing unravels as it should obviously do. This is
1401 important: in the NDP project, 'bar' generates a closure data
1402 structure rather than a list.
1403
1404 So the non-inlining of lone_variables should only apply if the
1405 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1406 looks through the unfolding. Hence the "&& is_wf" in the
1407 InlineRule branch.
1408
1409 * Even a type application or coercion isn't a lone variable.
1410 Consider
1411 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1412 We had better inline that sucker! The case won't see through it.
1413
1414 For now, I'm treating treating a variable applied to types
1415 in a *lazy* context "lone". The motivating example was
1416 f = /\a. \x. BIG
1417 g = /\a. \y. h (f a)
1418 There's no advantage in inlining f here, and perhaps
1419 a significant disadvantage. Hence some_val_args in the Stop case
1420
1421 Note [Interaction of exprIsWorkFree and lone variables]
1422 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1423 The lone-variable test says "don't inline if a case expression
1424 scrutinises a lone variable whose unfolding is cheap". It's very
1425 important that, under these circumstances, exprIsConApp_maybe
1426 can spot a constructor application. So, for example, we don't
1427 consider
1428 let x = e in (x,x)
1429 to be cheap, and that's good because exprIsConApp_maybe doesn't
1430 think that expression is a constructor application.
1431
1432 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1433 rather than is_wf, which was utterly wrong, because the above
1434 expression responds True to exprIsHNF, which is what sets is_value.
1435
1436 This kind of thing can occur if you have
1437
1438 {-# INLINE foo #-}
1439 foo = let x = e in (x,x)
1440
1441 which Roman did.
1442 -}
1443
1444 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1445 -> Int
1446 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1447 -- We multiple the raw discounts (args_discount and result_discount)
1448 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1449 -- *size* whereas the discounts imply that there's some extra
1450 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1451 -- by inlining.
1452
1453 = 10 -- Discount of 10 because the result replaces the call
1454 -- so we count 10 for the function itself
1455
1456 + 10 * length actual_arg_discounts
1457 -- Discount of 10 for each arg supplied,
1458 -- because the result replaces the call
1459
1460 + round (ufKeenessFactor dflags *
1461 fromIntegral (total_arg_discount + res_discount'))
1462 where
1463 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1464 total_arg_discount = sum actual_arg_discounts
1465
1466 mk_arg_discount _ TrivArg = 0
1467 mk_arg_discount _ NonTrivArg = 10
1468 mk_arg_discount discount ValueArg = discount
1469
1470 res_discount'
1471 | LT <- arg_discounts `compareLength` arg_infos
1472 = res_discount -- Over-saturated
1473 | otherwise
1474 = case cont_info of
1475 BoringCtxt -> 0
1476 CaseCtxt -> res_discount -- Presumably a constructor
1477 ValAppCtxt -> res_discount -- Presumably a function
1478 _ -> 40 `min` res_discount
1479 -- ToDo: this 40 `min` res_discount doesn't seem right
1480 -- for DiscArgCtxt it shouldn't matter because the function will
1481 -- get the arg discount for any non-triv arg
1482 -- for RuleArgCtxt we do want to be keener to inline; but not only
1483 -- constructor results
1484 -- for RhsCtxt I suppose that exposing a data con is good in general
1485 -- And 40 seems very arbitrary
1486 --
1487 -- res_discount can be very large when a function returns
1488 -- constructors; but we only want to invoke that large discount
1489 -- when there's a case continuation.
1490 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1491 -- But we want to aovid inlining large functions that return
1492 -- constructors into contexts that are simply "interesting"