f7e42659c28af4271f65e9a361fda0daef1f2c9d
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import DynFlags
46 import CoreSyn
47 import PprCore () -- Instances
48 import OccurAnal ( occurAnalyseExpr )
49 import CoreSubst hiding( substTy )
50 import CoreArity ( manifestArity )
51 import CoreUtils
52 import Id
53 import DataCon
54 import Literal
55 import PrimOp
56 import IdInfo
57 import BasicTypes ( Arity )
58 import Type
59 import PrelNames
60 import TysPrim ( realWorldStatePrimTy )
61 import Bag
62 import Util
63 import Outputable
64 import ForeignCall
65
66 import qualified Data.ByteString as BS
67
68 {-
69 ************************************************************************
70 * *
71 \subsection{Making unfoldings}
72 * *
73 ************************************************************************
74 -}
75
76 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
77 mkTopUnfolding dflags is_bottoming rhs
78 = mkUnfolding dflags InlineRhs True is_bottoming rhs
79
80 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
81 -- For implicit Ids, do a tiny bit of optimising first
82 mkImplicitUnfolding dflags expr
83 = mkTopUnfolding dflags False (simpleOptExpr expr)
84
85 -- Note [Top-level flag on inline rules]
86 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
87 -- Slight hack: note that mk_inline_rules conservatively sets the
88 -- top-level flag to True. It gets set more accurately by the simplifier
89 -- Simplify.simplUnfolding.
90
91 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
92 mkSimpleUnfolding dflags rhs
93 = mkUnfolding dflags InlineRhs False False rhs
94
95 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
96 mkDFunUnfolding bndrs con ops
97 = DFunUnfolding { df_bndrs = bndrs
98 , df_con = con
99 , df_args = map occurAnalyseExpr ops }
100 -- See Note [Occurrrence analysis of unfoldings]
101
102 mkWwInlineRule :: CoreExpr -> Arity -> Unfolding
103 mkWwInlineRule expr arity
104 = mkCoreUnfolding InlineStable True
105 (simpleOptExpr expr)
106 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
107 , ug_boring_ok = boringCxtNotOk })
108
109 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
110 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
111 = mkCoreUnfolding InlineCompulsory True
112 (simpleOptExpr expr)
113 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
114 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
115
116 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
117 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
118 mkWorkerUnfolding dflags work_fn
119 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
120 , uf_is_top = top_lvl })
121 | isStableSource src
122 = mkCoreUnfolding src top_lvl new_tmpl guidance
123 where
124 new_tmpl = simpleOptExpr (work_fn tmpl)
125 guidance = calcUnfoldingGuidance dflags False new_tmpl
126
127 mkWorkerUnfolding _ _ _ = noUnfolding
128
129 -- | Make an unfolding that may be used unsaturated
130 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
131 -- manifest arity (the number of outer lambdas applications will
132 -- resolve before doing any work).
133 mkInlineUnfolding :: CoreExpr -> Unfolding
134 mkInlineUnfolding expr
135 = mkCoreUnfolding InlineStable
136 True -- Note [Top-level flag on inline rules]
137 expr' guide
138 where
139 expr' = simpleOptExpr expr
140 guide = UnfWhen { ug_arity = manifestArity expr'
141 , ug_unsat_ok = unSaturatedOk
142 , ug_boring_ok = boring_ok }
143 boring_ok = inlineBoringOk expr'
144
145 -- | Make an unfolding that will be used once the RHS has been saturated
146 -- to the given arity.
147 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
148 mkInlineUnfoldingWithArity arity expr
149 = mkCoreUnfolding InlineStable
150 True -- Note [Top-level flag on inline rules]
151 expr' guide
152 where
153 expr' = simpleOptExpr expr
154 guide = UnfWhen { ug_arity = arity
155 , ug_unsat_ok = needSaturated
156 , ug_boring_ok = boring_ok }
157 boring_ok = inlineBoringOk expr'
158
159 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
160 mkInlinableUnfolding dflags expr
161 = mkUnfolding dflags InlineStable False False expr'
162 where
163 expr' = simpleOptExpr expr
164
165 specUnfolding :: [Var] -> (CoreExpr -> CoreExpr) -> Arity -> Unfolding -> Unfolding
166 -- See Note [Specialising unfoldings]
167 -- specUnfolding spec_bndrs spec_app arity_decrease unf
168 -- = \spec_bndrs. spec_app( unf )
169 --
170 specUnfolding spec_bndrs spec_app arity_decrease
171 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
172 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
173 mkDFunUnfolding spec_bndrs con (map spec_arg args)
174 -- There is a hard-to-check assumption here that the spec_app has
175 -- enough applications to exactly saturate the old_bndrs
176 -- For DFunUnfoldings we transform
177 -- \old_bndrs. MkD <op1> ... <opn>
178 -- to
179 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
180 -- The ASSERT checks the value part of that
181 where
182 spec_arg arg = simpleOptExpr (spec_app (mkLams old_bndrs arg))
183 -- The beta-redexes created by spec_app will be
184 -- simplified away by simplOptExpr
185
186 specUnfolding spec_bndrs spec_app arity_decrease
187 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
188 , uf_is_top = top_lvl
189 , uf_guidance = old_guidance })
190 | isStableSource src -- See Note [Specialising unfoldings]
191 , UnfWhen { ug_arity = old_arity
192 , ug_unsat_ok = unsat_ok
193 , ug_boring_ok = boring_ok } <- old_guidance
194 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
195 , ug_unsat_ok = unsat_ok
196 , ug_boring_ok = boring_ok }
197 new_tmpl = simpleOptExpr (mkLams spec_bndrs (spec_app tmpl))
198 -- The beta-redexes created by spec_app will be
199 -- simplified away by simplOptExpr
200
201 in mkCoreUnfolding src top_lvl new_tmpl guidance
202
203 specUnfolding _ _ _ _ = noUnfolding
204
205 {- Note [Specialising unfoldings]
206 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
207 When we specialise a function for some given type-class arguments, we use
208 specUnfolding to specialise its unfolding. Some important points:
209
210 * If the original function has a DFunUnfolding, the specialised one
211 must do so too! Otherwise we lose the magic rules that make it
212 interact with ClassOps
213
214 * There is a bit of hack for INLINABLE functions:
215 f :: Ord a => ....
216 f = <big-rhs>
217 {- INLINABLE f #-}
218 Now if we specialise f, should the specialised version still have
219 an INLINABLE pragma? If it does, we'll capture a specialised copy
220 of <big-rhs> as its unfolding, and that probaby won't inline. But
221 if we don't, the specialised version of <big-rhs> might be small
222 enough to inline at a call site. This happens with Control.Monad.liftM3,
223 and can cause a lot more allocation as a result (nofib n-body shows this).
224
225 Moreover, keeping the INLINABLE thing isn't much help, because
226 the specialised function (probaby) isn't overloaded any more.
227
228 Conclusion: drop the INLINEALE pragma. In practice what this means is:
229 if a stable unfolding has UnfoldingGuidance of UnfWhen,
230 we keep it (so the specialised thing too will always inline)
231 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
232 (which arises from INLINABLE), we discard it
233 -}
234
235 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
236 -> UnfoldingGuidance -> Unfolding
237 -- Occurrence-analyses the expression before capturing it
238 mkCoreUnfolding src top_lvl expr guidance
239 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
240 -- See Note [Occurrrence analysis of unfoldings]
241 uf_src = src,
242 uf_is_top = top_lvl,
243 uf_is_value = exprIsHNF expr,
244 uf_is_conlike = exprIsConLike expr,
245 uf_is_work_free = exprIsWorkFree expr,
246 uf_expandable = exprIsExpandable expr,
247 uf_guidance = guidance }
248
249 mkUnfolding :: DynFlags -> UnfoldingSource
250 -> Bool -- Is top-level
251 -> Bool -- Definitely a bottoming binding
252 -- (only relevant for top-level bindings)
253 -> CoreExpr
254 -> Unfolding
255 -- Calculates unfolding guidance
256 -- Occurrence-analyses the expression before capturing it
257 mkUnfolding dflags src is_top_lvl is_bottoming expr
258 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
259 -- See Note [Occurrrence analysis of unfoldings]
260 uf_src = src,
261 uf_is_top = is_top_lvl,
262 uf_is_value = exprIsHNF expr,
263 uf_is_conlike = exprIsConLike expr,
264 uf_expandable = exprIsExpandable expr,
265 uf_is_work_free = exprIsWorkFree expr,
266 uf_guidance = guidance }
267 where
268 is_top_bottoming = is_top_lvl && is_bottoming
269 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
270 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
271 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
272
273 {-
274 Note [Occurrence analysis of unfoldings]
275 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
276 We do occurrence-analysis of unfoldings once and for all, when the
277 unfolding is built, rather than each time we inline them.
278
279 But given this decision it's vital that we do
280 *always* do it. Consider this unfolding
281 \x -> letrec { f = ...g...; g* = f } in body
282 where g* is (for some strange reason) the loop breaker. If we don't
283 occ-anal it when reading it in, we won't mark g as a loop breaker, and
284 we may inline g entirely in body, dropping its binding, and leaving
285 the occurrence in f out of scope. This happened in Trac #8892, where
286 the unfolding in question was a DFun unfolding.
287
288 But more generally, the simplifier is designed on the
289 basis that it is looking at occurrence-analysed expressions, so better
290 ensure that they acutally are.
291
292 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
293 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
294 Notice that we give the non-occur-analysed expression to
295 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
296 first; for example, sometimes during simplification, there's a large
297 let-bound thing which has been substituted, and so is now dead; so
298 'expr' contains two copies of the thing while the occurrence-analysed
299 expression doesn't.
300
301 Nevertheless, we *don't* and *must not* occ-analyse before computing
302 the size because
303
304 a) The size computation bales out after a while, whereas occurrence
305 analysis does not.
306
307 b) Residency increases sharply if you occ-anal first. I'm not
308 100% sure why, but it's a large effect. Compiling Cabal went
309 from residency of 534M to over 800M with this one change.
310
311 This can occasionally mean that the guidance is very pessimistic;
312 it gets fixed up next round. And it should be rare, because large
313 let-bound things that are dead are usually caught by preInlineUnconditionally
314
315
316 ************************************************************************
317 * *
318 \subsection{The UnfoldingGuidance type}
319 * *
320 ************************************************************************
321 -}
322
323 inlineBoringOk :: CoreExpr -> Bool
324 -- See Note [INLINE for small functions]
325 -- True => the result of inlining the expression is
326 -- no bigger than the expression itself
327 -- eg (\x y -> f y x)
328 -- This is a quick and dirty version. It doesn't attempt
329 -- to deal with (\x y z -> x (y z))
330 -- The really important one is (x `cast` c)
331 inlineBoringOk e
332 = go 0 e
333 where
334 go :: Int -> CoreExpr -> Bool
335 go credit (Lam x e) | isId x = go (credit+1) e
336 | otherwise = go credit e
337 go credit (App f (Type {})) = go credit f
338 go credit (App f a) | credit > 0
339 , exprIsTrivial a = go (credit-1) f
340 go credit (Tick _ e) = go credit e -- dubious
341 go credit (Cast e _) = go credit e
342 go _ (Var {}) = boringCxtOk
343 go _ _ = boringCxtNotOk
344
345 calcUnfoldingGuidance
346 :: DynFlags
347 -> Bool -- Definitely a top-level, bottoming binding
348 -> CoreExpr -- Expression to look at
349 -> UnfoldingGuidance
350 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
351 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
352 = calcUnfoldingGuidance dflags is_top_bottoming expr
353 calcUnfoldingGuidance dflags is_top_bottoming expr
354 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
355 TooBig -> UnfNever
356 SizeIs size cased_bndrs scrut_discount
357 | uncondInline expr n_val_bndrs size
358 -> UnfWhen { ug_unsat_ok = unSaturatedOk
359 , ug_boring_ok = boringCxtOk
360 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
361
362 | is_top_bottoming
363 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
364
365 | otherwise
366 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
367 , ug_size = size
368 , ug_res = scrut_discount }
369
370 where
371 (bndrs, body) = collectBinders expr
372 bOMB_OUT_SIZE = ufCreationThreshold dflags
373 -- Bomb out if size gets bigger than this
374 val_bndrs = filter isId bndrs
375 n_val_bndrs = length val_bndrs
376
377 mk_discount :: Bag (Id,Int) -> Id -> Int
378 mk_discount cbs bndr = foldlBag combine 0 cbs
379 where
380 combine acc (bndr', disc)
381 | bndr == bndr' = acc `plus_disc` disc
382 | otherwise = acc
383
384 plus_disc :: Int -> Int -> Int
385 plus_disc | isFunTy (idType bndr) = max
386 | otherwise = (+)
387 -- See Note [Function and non-function discounts]
388
389 {-
390 Note [Computing the size of an expression]
391 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
392 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
393 heuristics right has taken a long time. Here's the basic strategy:
394
395 * Variables, literals: 0
396 (Exception for string literals, see litSize.)
397
398 * Function applications (f e1 .. en): 1 + #value args
399
400 * Constructor applications: 1, regardless of #args
401
402 * Let(rec): 1 + size of components
403
404 * Note, cast: 0
405
406 Examples
407
408 Size Term
409 --------------
410 0 42#
411 0 x
412 0 True
413 2 f x
414 1 Just x
415 4 f (g x)
416
417 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
418 a function call to account for. Notice also that constructor applications
419 are very cheap, because exposing them to a caller is so valuable.
420
421 [25/5/11] All sizes are now multiplied by 10, except for primops
422 (which have sizes like 1 or 4. This makes primops look fantastically
423 cheap, and seems to be almost unversally beneficial. Done partly as a
424 result of #4978.
425
426 Note [Do not inline top-level bottoming functions]
427 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
428 The FloatOut pass has gone to some trouble to float out calls to 'error'
429 and similar friends. See Note [Bottoming floats] in SetLevels.
430 Do not re-inline them! But we *do* still inline if they are very small
431 (the uncondInline stuff).
432
433 Note [INLINE for small functions]
434 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
435 Consider {-# INLINE f #-}
436 f x = Just x
437 g y = f y
438 Then f's RHS is no larger than its LHS, so we should inline it into
439 even the most boring context. In general, f the function is
440 sufficiently small that its body is as small as the call itself, the
441 inline unconditionally, regardless of how boring the context is.
442
443 Things to note:
444
445 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
446 than the thing it's replacing. Notice that
447 (f x) --> (g 3) -- YES, unconditionally
448 (f x) --> x : [] -- YES, *even though* there are two
449 -- arguments to the cons
450 x --> g 3 -- NO
451 x --> Just v -- NO
452
453 It's very important not to unconditionally replace a variable by
454 a non-atomic term.
455
456 (2) We do this even if the thing isn't saturated, else we end up with the
457 silly situation that
458 f x y = x
459 ...map (f 3)...
460 doesn't inline. Even in a boring context, inlining without being
461 saturated will give a lambda instead of a PAP, and will be more
462 efficient at runtime.
463
464 (3) However, when the function's arity > 0, we do insist that it
465 has at least one value argument at the call site. (This check is
466 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
467 f = /\a \x:a. x
468 d = /\b. MkD (f b)
469 If we inline f here we get
470 d = /\b. MkD (\x:b. x)
471 and then prepareRhs floats out the argument, abstracting the type
472 variables, so we end up with the original again!
473
474 (4) We must be much more cautious about arity-zero things. Consider
475 let x = y +# z in ...
476 In *size* terms primops look very small, because the generate a
477 single instruction, but we do not want to unconditionally replace
478 every occurrence of x with (y +# z). So we only do the
479 unconditional-inline thing for *trivial* expressions.
480
481 NB: you might think that PostInlineUnconditionally would do this
482 but it doesn't fire for top-level things; see SimplUtils
483 Note [Top level and postInlineUnconditionally]
484 -}
485
486 uncondInline :: CoreExpr -> Arity -> Int -> Bool
487 -- Inline unconditionally if there no size increase
488 -- Size of call is arity (+1 for the function)
489 -- See Note [INLINE for small functions]
490 uncondInline rhs arity size
491 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
492 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
493
494 sizeExpr :: DynFlags
495 -> Int -- Bomb out if it gets bigger than this
496 -> [Id] -- Arguments; we're interested in which of these
497 -- get case'd
498 -> CoreExpr
499 -> ExprSize
500
501 -- Note [Computing the size of an expression]
502
503 sizeExpr dflags bOMB_OUT_SIZE top_args expr
504 = size_up expr
505 where
506 size_up (Cast e _) = size_up e
507 size_up (Tick _ e) = size_up e
508 size_up (Type _) = sizeZero -- Types cost nothing
509 size_up (Coercion _) = sizeZero
510 size_up (Lit lit) = sizeN (litSize lit)
511 size_up (Var f) | isRealWorldId f = sizeZero
512 -- Make sure we get constructor discounts even
513 -- on nullary constructors
514 | otherwise = size_up_call f [] 0
515
516 size_up (App fun arg)
517 | isTyCoArg arg = size_up fun
518 | otherwise = size_up arg `addSizeNSD`
519 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
520
521 size_up (Lam b e)
522 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
523 | otherwise = size_up e
524
525 size_up (Let (NonRec binder rhs) body)
526 = size_up rhs `addSizeNSD`
527 size_up body `addSizeN`
528 (if isUnliftedType (idType binder) then 0 else 10)
529 -- For the allocation
530 -- If the binder has an unlifted type there is no allocation
531
532 size_up (Let (Rec pairs) body)
533 = foldr (addSizeNSD . size_up . snd)
534 (size_up body `addSizeN` (10 * length pairs)) -- (length pairs) for the allocation
535 pairs
536
537 size_up (Case e _ _ alts)
538 | Just v <- is_top_arg e -- We are scrutinising an argument variable
539 = let
540 alt_sizes = map size_up_alt alts
541
542 -- alts_size tries to compute a good discount for
543 -- the case when we are scrutinising an argument variable
544 alts_size (SizeIs tot tot_disc tot_scrut)
545 -- Size of all alternatives
546 (SizeIs max _ _)
547 -- Size of biggest alternative
548 = SizeIs tot (unitBag (v, 20 + tot - max)
549 `unionBags` tot_disc) tot_scrut
550 -- If the variable is known, we produce a
551 -- discount that will take us back to 'max',
552 -- the size of the largest alternative The
553 -- 1+ is a little discount for reduced
554 -- allocation in the caller
555 --
556 -- Notice though, that we return tot_disc,
557 -- the total discount from all branches. I
558 -- think that's right.
559
560 alts_size tot_size _ = tot_size
561 in
562 alts_size (foldr addAltSize sizeZero alt_sizes)
563 (foldr maxSize sizeZero alt_sizes)
564 -- Good to inline if an arg is scrutinised, because
565 -- that may eliminate allocation in the caller
566 -- And it eliminates the case itself
567 where
568 is_top_arg (Var v) | v `elem` top_args = Just v
569 is_top_arg (Cast e _) = is_top_arg e
570 is_top_arg _ = Nothing
571
572
573 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
574 foldr (addAltSize . size_up_alt) case_size alts
575 where
576 case_size
577 | is_inline_scrut e, not (lengthExceeds alts 1) = sizeN (-10)
578 | otherwise = sizeZero
579 -- Normally we don't charge for the case itself, but
580 -- we charge one per alternative (see size_up_alt,
581 -- below) to account for the cost of the info table
582 -- and comparisons.
583 --
584 -- However, in certain cases (see is_inline_scrut
585 -- below), no code is generated for the case unless
586 -- there are multiple alts. In these cases we
587 -- subtract one, making the first alt free.
588 -- e.g. case x# +# y# of _ -> ... should cost 1
589 -- case touch# x# of _ -> ... should cost 0
590 -- (see #4978)
591 --
592 -- I would like to not have the "not (lengthExceeds alts 1)"
593 -- condition above, but without that some programs got worse
594 -- (spectral/hartel/event and spectral/para). I don't fully
595 -- understand why. (SDM 24/5/11)
596
597 -- unboxed variables, inline primops and unsafe foreign calls
598 -- are all "inline" things:
599 is_inline_scrut (Var v) = isUnliftedType (idType v)
600 is_inline_scrut scrut
601 | (Var f, _) <- collectArgs scrut
602 = case idDetails f of
603 FCallId fc -> not (isSafeForeignCall fc)
604 PrimOpId op -> not (primOpOutOfLine op)
605 _other -> False
606 | otherwise
607 = False
608
609 ------------
610 -- size_up_app is used when there's ONE OR MORE value args
611 size_up_app (App fun arg) args voids
612 | isTyCoArg arg = size_up_app fun args voids
613 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
614 | otherwise = size_up arg `addSizeNSD`
615 size_up_app fun (arg:args) voids
616 size_up_app (Var fun) args voids = size_up_call fun args voids
617 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
618 size_up_app (Cast expr _) args voids = size_up_app expr args voids
619 size_up_app other args voids = size_up other `addSizeN`
620 callSize (length args) voids
621 -- if the lhs is not an App or a Var, or an invisible thing like a
622 -- Tick or Cast, then we should charge for a complete call plus the
623 -- size of the lhs itself.
624
625 ------------
626 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
627 size_up_call fun val_args voids
628 = case idDetails fun of
629 FCallId _ -> sizeN (callSize (length val_args) voids)
630 DataConWorkId dc -> conSize dc (length val_args)
631 PrimOpId op -> primOpSize op (length val_args)
632 ClassOpId _ -> classOpSize dflags top_args val_args
633 _ -> funSize dflags top_args fun (length val_args) voids
634
635 ------------
636 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
637 -- Don't charge for args, so that wrappers look cheap
638 -- (See comments about wrappers with Case)
639 --
640 -- IMPORATANT: *do* charge 1 for the alternative, else we
641 -- find that giant case nests are treated as practically free
642 -- A good example is Foreign.C.Error.errrnoToIOError
643
644 ------------
645 -- These addSize things have to be here because
646 -- I don't want to give them bOMB_OUT_SIZE as an argument
647 addSizeN TooBig _ = TooBig
648 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
649
650 -- addAltSize is used to add the sizes of case alternatives
651 addAltSize TooBig _ = TooBig
652 addAltSize _ TooBig = TooBig
653 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
654 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
655 (xs `unionBags` ys)
656 (d1 + d2) -- Note [addAltSize result discounts]
657
658 -- This variant ignores the result discount from its LEFT argument
659 -- It's used when the second argument isn't part of the result
660 addSizeNSD TooBig _ = TooBig
661 addSizeNSD _ TooBig = TooBig
662 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
663 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
664 (xs `unionBags` ys)
665 d2 -- Ignore d1
666
667 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
668
669 -- an expression of type State# RealWorld must be a variable
670 isRealWorldExpr (Var id) = isRealWorldId id
671 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
672 isRealWorldExpr _ = False
673
674 -- | Finds a nominal size of a string literal.
675 litSize :: Literal -> Int
676 -- Used by CoreUnfold.sizeExpr
677 litSize (LitInteger {}) = 100 -- Note [Size of literal integers]
678 litSize (MachStr str) = 10 + 10 * ((BS.length str + 3) `div` 4)
679 -- If size could be 0 then @f "x"@ might be too small
680 -- [Sept03: make literal strings a bit bigger to avoid fruitless
681 -- duplication of little strings]
682 litSize _other = 0 -- Must match size of nullary constructors
683 -- Key point: if x |-> 4, then x must inline unconditionally
684 -- (eg via case binding)
685
686 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
687 -- See Note [Conlike is interesting]
688 classOpSize _ _ []
689 = sizeZero
690 classOpSize dflags top_args (arg1 : other_args)
691 = SizeIs size arg_discount 0
692 where
693 size = 20 + (10 * length other_args)
694 -- If the class op is scrutinising a lambda bound dictionary then
695 -- give it a discount, to encourage the inlining of this function
696 -- The actual discount is rather arbitrarily chosen
697 arg_discount = case arg1 of
698 Var dict | dict `elem` top_args
699 -> unitBag (dict, ufDictDiscount dflags)
700 _other -> emptyBag
701
702 -- | The size of a function call
703 callSize
704 :: Int -- ^ number of value args
705 -> Int -- ^ number of value args that are void
706 -> Int
707 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
708
709 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
710 -- Size for functions that are not constructors or primops
711 -- Note [Function applications]
712 funSize dflags top_args fun n_val_args voids
713 | fun `hasKey` buildIdKey = buildSize
714 | fun `hasKey` augmentIdKey = augmentSize
715 | otherwise = SizeIs size arg_discount res_discount
716 where
717 some_val_args = n_val_args > 0
718
719 size | some_val_args = callSize n_val_args voids
720 | otherwise = 0
721 -- The 1+ is for the function itself
722 -- Add 1 for each non-trivial arg;
723 -- the allocation cost, as in let(rec)
724
725 -- DISCOUNTS
726 -- See Note [Function and non-function discounts]
727 arg_discount | some_val_args && fun `elem` top_args
728 = unitBag (fun, ufFunAppDiscount dflags)
729 | otherwise = emptyBag
730 -- If the function is an argument and is applied
731 -- to some values, give it an arg-discount
732
733 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
734 | otherwise = 0
735 -- If the function is partially applied, show a result discount
736
737 conSize :: DataCon -> Int -> ExprSize
738 conSize dc n_val_args
739 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
740
741 -- See Note [Unboxed tuple size and result discount]
742 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
743
744 -- See Note [Constructor size and result discount]
745 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
746
747 {-
748 Note [Constructor size and result discount]
749 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
750 Treat a constructors application as size 10, regardless of how many
751 arguments it has; we are keen to expose them (and we charge separately
752 for their args). We can't treat them as size zero, else we find that
753 (Just x) has size 0, which is the same as a lone variable; and hence
754 'v' will always be replaced by (Just x), where v is bound to Just x.
755
756 The "result discount" is applied if the result of the call is
757 scrutinised (say by a case). For a constructor application that will
758 mean the constructor application will disappear, so we don't need to
759 charge it to the function. So the discount should at least match the
760 cost of the constructor application, namely 10. But to give a bit
761 of extra incentive we give a discount of 10*(1 + n_val_args).
762
763 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
764 and said it was an "unambiguous win", but its terribly dangerous
765 because a function with many many case branches, each finishing with
766 a constructor, can have an arbitrarily large discount. This led to
767 terrible code bloat: see Trac #6099.
768
769 Note [Unboxed tuple size and result discount]
770 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
771 However, unboxed tuples count as size zero. I found occasions where we had
772 f x y z = case op# x y z of { s -> (# s, () #) }
773 and f wasn't getting inlined.
774
775 I tried giving unboxed tuples a *result discount* of zero (see the
776 commented-out line). Why? When returned as a result they do not
777 allocate, so maybe we don't want to charge so much for them If you
778 have a non-zero discount here, we find that workers often get inlined
779 back into wrappers, because it look like
780 f x = case $wf x of (# a,b #) -> (a,b)
781 and we are keener because of the case. However while this change
782 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
783 more. All other changes were very small. So it's not a big deal but I
784 didn't adopt the idea.
785
786 Note [Function and non-function discounts]
787 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
788 We want a discount if the function is applied. A good example is
789 monadic combinators with continuation arguments, where inlining is
790 quite important.
791
792 But we don't want a big discount when a function is called many times
793 (see the detailed comments with Trac #6048) because if the function is
794 big it won't be inlined at its many call sites and no benefit results.
795 Indeed, we can get exponentially big inlinings this way; that is what
796 Trac #6048 is about.
797
798 On the other hand, for data-valued arguments, if there are lots of
799 case expressions in the body, each one will get smaller if we apply
800 the function to a constructor application, so we *want* a big discount
801 if the argument is scrutinised by many case expressions.
802
803 Conclusion:
804 - For functions, take the max of the discounts
805 - For data values, take the sum of the discounts
806
807
808 Note [Literal integer size]
809 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
810 Literal integers *can* be big (mkInteger [...coefficients...]), but
811 need not be (S# n). We just use an aribitrary big-ish constant here
812 so that, in particular, we don't inline top-level defns like
813 n = S# 5
814 There's no point in doing so -- any optimisations will see the S#
815 through n's unfolding. Nor will a big size inhibit unfoldings functions
816 that mention a literal Integer, because the float-out pass will float
817 all those constants to top level.
818 -}
819
820 primOpSize :: PrimOp -> Int -> ExprSize
821 primOpSize op n_val_args
822 = if primOpOutOfLine op
823 then sizeN (op_size + n_val_args)
824 else sizeN op_size
825 where
826 op_size = primOpCodeSize op
827
828
829 buildSize :: ExprSize
830 buildSize = SizeIs 0 emptyBag 40
831 -- We really want to inline applications of build
832 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
833 -- Indeed, we should add a result_discount because build is
834 -- very like a constructor. We don't bother to check that the
835 -- build is saturated (it usually is). The "-2" discounts for the \c n,
836 -- The "4" is rather arbitrary.
837
838 augmentSize :: ExprSize
839 augmentSize = SizeIs 0 emptyBag 40
840 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
841 -- e plus ys. The -2 accounts for the \cn
842
843 -- When we return a lambda, give a discount if it's used (applied)
844 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
845 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
846 lamScrutDiscount _ TooBig = TooBig
847
848 {-
849 Note [addAltSize result discounts]
850 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
851 When adding the size of alternatives, we *add* the result discounts
852 too, rather than take the *maximum*. For a multi-branch case, this
853 gives a discount for each branch that returns a constructor, making us
854 keener to inline. I did try using 'max' instead, but it makes nofib
855 'rewrite' and 'puzzle' allocate significantly more, and didn't make
856 binary sizes shrink significantly either.
857
858 Note [Discounts and thresholds]
859 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
860 Constants for discounts and thesholds are defined in main/DynFlags,
861 all of form ufXxxx. They are:
862
863 ufCreationThreshold
864 At a definition site, if the unfolding is bigger than this, we
865 may discard it altogether
866
867 ufUseThreshold
868 At a call site, if the unfolding, less discounts, is smaller than
869 this, then it's small enough inline
870
871 ufKeenessFactor
872 Factor by which the discounts are multiplied before
873 subtracting from size
874
875 ufDictDiscount
876 The discount for each occurrence of a dictionary argument
877 as an argument of a class method. Should be pretty small
878 else big functions may get inlined
879
880 ufFunAppDiscount
881 Discount for a function argument that is applied. Quite
882 large, because if we inline we avoid the higher-order call.
883
884 ufDearOp
885 The size of a foreign call or not-dupable PrimOp
886
887
888 Note [Function applications]
889 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
890 In a function application (f a b)
891
892 - If 'f' is an argument to the function being analysed,
893 and there's at least one value arg, record a FunAppDiscount for f
894
895 - If the application if a PAP (arity > 2 in this example)
896 record a *result* discount (because inlining
897 with "extra" args in the call may mean that we now
898 get a saturated application)
899
900 Code for manipulating sizes
901 -}
902
903 -- | The size of an candidate expression for unfolding
904 data ExprSize
905 = TooBig
906 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
907 , _es_args :: !(Bag (Id,Int))
908 -- ^ Arguments cased herein, and discount for each such
909 , _es_discount :: {-# UNPACK #-} !Int
910 -- ^ Size to subtract if result is scrutinised by a case
911 -- expression
912 }
913
914 instance Outputable ExprSize where
915 ppr TooBig = text "TooBig"
916 ppr (SizeIs a _ c) = brackets (int a <+> int c)
917
918 -- subtract the discount before deciding whether to bale out. eg. we
919 -- want to inline a large constructor application into a selector:
920 -- tup = (a_1, ..., a_99)
921 -- x = case tup of ...
922 --
923 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
924 mkSizeIs max n xs d | (n - d) > max = TooBig
925 | otherwise = SizeIs n xs d
926
927 maxSize :: ExprSize -> ExprSize -> ExprSize
928 maxSize TooBig _ = TooBig
929 maxSize _ TooBig = TooBig
930 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
931 | otherwise = s2
932
933 sizeZero :: ExprSize
934 sizeN :: Int -> ExprSize
935
936 sizeZero = SizeIs 0 emptyBag 0
937 sizeN n = SizeIs n emptyBag 0
938
939 {-
940 ************************************************************************
941 * *
942 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
943 * *
944 ************************************************************************
945
946 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
947 we ``couldn't possibly use'' on the other side. Can be overridden w/
948 flaggery. Just the same as smallEnoughToInline, except that it has no
949 actual arguments.
950 -}
951
952 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
953 couldBeSmallEnoughToInline dflags threshold rhs
954 = case sizeExpr dflags threshold [] body of
955 TooBig -> False
956 _ -> True
957 where
958 (_, body) = collectBinders rhs
959
960 ----------------
961 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
962 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
963 = size <= ufUseThreshold dflags
964 smallEnoughToInline _ _
965 = False
966
967 ----------------
968
969 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
970 -- Sees if the unfolding is pretty certain to inline
971 -- If so, return a *stable* unfolding for it, that will always inline
972 certainlyWillInline dflags fn_info
973 = case unfoldingInfo fn_info of
974 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
975 | loop_breaker -> Nothing -- Won't inline, so try w/w
976 | otherwise -> do_cunf e g -- Depends on size, so look at that
977
978 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
979 -- to do so, and even if it is currently a
980 -- loop breaker, it may not be later
981
982 _other_unf -> Nothing
983
984 where
985 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
986 fn_unf = unfoldingInfo fn_info
987
988 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
989 do_cunf _ UnfNever = Nothing
990 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
991 -- INLINE functions have UnfWhen
992
993 -- The UnfIfGoodArgs case seems important. If we w/w small functions
994 -- binary sizes go up by 10%! (This is with SplitObjs.)
995 -- I'm not totally sure why.
996 -- INLINABLE functions come via this path
997 -- See Note [certainlyWillInline: INLINABLE]
998 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
999 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1000 , let arity = length args
1001 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1002 = Just (fn_unf { uf_src = InlineStable
1003 , uf_guidance = UnfWhen { ug_arity = arity
1004 , ug_unsat_ok = unSaturatedOk
1005 , ug_boring_ok = inlineBoringOk expr } })
1006 -- Note the "unsaturatedOk". A function like f = \ab. a
1007 -- will certainly inline, even if partially applied (f e), so we'd
1008 -- better make sure that the transformed inlining has the same property
1009 | otherwise
1010 = Nothing
1011
1012 {- Note [certainlyWillInline: be careful of thunks]
1013 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1014 Don't claim that thunks will certainly inline, because that risks work
1015 duplication. Even if the work duplication is not great (eg is_cheap
1016 holds), it can make a big difference in an inner loop In Trac #5623 we
1017 found that the WorkWrap phase thought that
1018 y = case x of F# v -> F# (v +# v)
1019 was certainlyWillInline, so the addition got duplicated.
1020
1021 Note [certainlyWillInline: INLINABLE]
1022 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1023 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1024 even though we have a stable inlining, so that strictness w/w takes
1025 place. It makes a big difference to efficiency, and the w/w pass knows
1026 how to transfer the INLINABLE info to the worker; see WorkWrap
1027 Note [Worker-wrapper for INLINABLE functions]
1028
1029 ************************************************************************
1030 * *
1031 \subsection{callSiteInline}
1032 * *
1033 ************************************************************************
1034
1035 This is the key function. It decides whether to inline a variable at a call site
1036
1037 callSiteInline is used at call sites, so it is a bit more generous.
1038 It's a very important function that embodies lots of heuristics.
1039 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1040 and occurs exactly once or
1041 occurs once in each branch of a case and is small
1042
1043 If the thing is in WHNF, there's no danger of duplicating work,
1044 so we can inline if it occurs once, or is small
1045
1046 NOTE: we don't want to inline top-level functions that always diverge.
1047 It just makes the code bigger. Tt turns out that the convenient way to prevent
1048 them inlining is to give them a NOINLINE pragma, which we do in
1049 StrictAnal.addStrictnessInfoToTopId
1050 -}
1051
1052 callSiteInline :: DynFlags
1053 -> Id -- The Id
1054 -> Bool -- True <=> unfolding is active
1055 -> Bool -- True if there are no arguments at all (incl type args)
1056 -> [ArgSummary] -- One for each value arg; True if it is interesting
1057 -> CallCtxt -- True <=> continuation is interesting
1058 -> Maybe CoreExpr -- Unfolding, if any
1059
1060 data ArgSummary = TrivArg -- Nothing interesting
1061 | NonTrivArg -- Arg has structure
1062 | ValueArg -- Arg is a con-app or PAP
1063 -- ..or con-like. Note [Conlike is interesting]
1064
1065 instance Outputable ArgSummary where
1066 ppr TrivArg = text "TrivArg"
1067 ppr NonTrivArg = text "NonTrivArg"
1068 ppr ValueArg = text "ValueArg"
1069
1070 nonTriv :: ArgSummary -> Bool
1071 nonTriv TrivArg = False
1072 nonTriv _ = True
1073
1074 data CallCtxt
1075 = BoringCtxt
1076 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1077 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1078 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1079
1080 | ValAppCtxt -- We're applied to at least one value arg
1081 -- This arises when we have ((f x |> co) y)
1082 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1083
1084 | CaseCtxt -- We're the scrutinee of a case
1085 -- that decomposes its scrutinee
1086
1087 instance Outputable CallCtxt where
1088 ppr CaseCtxt = text "CaseCtxt"
1089 ppr ValAppCtxt = text "ValAppCtxt"
1090 ppr BoringCtxt = text "BoringCtxt"
1091 ppr RhsCtxt = text "RhsCtxt"
1092 ppr DiscArgCtxt = text "DiscArgCtxt"
1093 ppr RuleArgCtxt = text "RuleArgCtxt"
1094
1095 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1096 = case idUnfolding id of
1097 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1098 -- Things with an INLINE pragma may have an unfolding *and*
1099 -- be a loop breaker (maybe the knot is not yet untied)
1100 CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top
1101 , uf_is_work_free = is_wf
1102 , uf_guidance = guidance, uf_expandable = is_exp }
1103 | active_unfolding -> tryUnfolding dflags id lone_variable
1104 arg_infos cont_info unf_template is_top
1105 is_wf is_exp guidance
1106 | otherwise -> traceInline dflags "Inactive unfolding:" (ppr id) Nothing
1107 NoUnfolding -> Nothing
1108 BootUnfolding -> Nothing
1109 OtherCon {} -> Nothing
1110 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1111
1112 traceInline :: DynFlags -> String -> SDoc -> a -> a
1113 traceInline dflags str doc result
1114 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1115 = pprTrace str doc result
1116 | otherwise
1117 = result
1118
1119 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1120 -> CoreExpr -> Bool -> Bool -> Bool -> UnfoldingGuidance
1121 -> Maybe CoreExpr
1122 tryUnfolding dflags id lone_variable
1123 arg_infos cont_info unf_template is_top
1124 is_wf is_exp guidance
1125 = case guidance of
1126 UnfNever -> traceInline dflags str (text "UnfNever") Nothing
1127
1128 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1129 | enough_args && (boring_ok || some_benefit)
1130 -- See Note [INLINE for small functions (3)]
1131 -> traceInline dflags str (mk_doc some_benefit empty True) (Just unf_template)
1132 | otherwise
1133 -> traceInline dflags str (mk_doc some_benefit empty False) Nothing
1134 where
1135 some_benefit = calc_some_benefit uf_arity
1136 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1137
1138 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1139 | is_wf && some_benefit && small_enough
1140 -> traceInline dflags str (mk_doc some_benefit extra_doc True) (Just unf_template)
1141 | otherwise
1142 -> traceInline dflags str (mk_doc some_benefit extra_doc False) Nothing
1143 where
1144 some_benefit = calc_some_benefit (length arg_discounts)
1145 extra_doc = text "discounted size =" <+> int discounted_size
1146 discounted_size = size - discount
1147 small_enough = discounted_size <= ufUseThreshold dflags
1148 discount = computeDiscount dflags arg_discounts
1149 res_discount arg_infos cont_info
1150
1151 where
1152 mk_doc some_benefit extra_doc yes_or_no
1153 = vcat [ text "arg infos" <+> ppr arg_infos
1154 , text "interesting continuation" <+> ppr cont_info
1155 , text "some_benefit" <+> ppr some_benefit
1156 , text "is exp:" <+> ppr is_exp
1157 , text "is work-free:" <+> ppr is_wf
1158 , text "guidance" <+> ppr guidance
1159 , extra_doc
1160 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1161
1162 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1163 n_val_args = length arg_infos
1164
1165 -- some_benefit is used when the RHS is small enough
1166 -- and the call has enough (or too many) value
1167 -- arguments (ie n_val_args >= arity). But there must
1168 -- be *something* interesting about some argument, or the
1169 -- result context, to make it worth inlining
1170 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1171 -- expected by the unfolding
1172 calc_some_benefit uf_arity
1173 | not saturated = interesting_args -- Under-saturated
1174 -- Note [Unsaturated applications]
1175 | otherwise = interesting_args -- Saturated or over-saturated
1176 || interesting_call
1177 where
1178 saturated = n_val_args >= uf_arity
1179 over_saturated = n_val_args > uf_arity
1180 interesting_args = any nonTriv arg_infos
1181 -- NB: (any nonTriv arg_infos) looks at the
1182 -- over-saturated args too which is "wrong";
1183 -- but if over-saturated we inline anyway.
1184
1185 interesting_call
1186 | over_saturated
1187 = True
1188 | otherwise
1189 = case cont_info of
1190 CaseCtxt -> not (lone_variable && is_wf) -- Note [Lone variables]
1191 ValAppCtxt -> True -- Note [Cast then apply]
1192 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1193 DiscArgCtxt -> uf_arity > 0 --
1194 RhsCtxt -> uf_arity > 0 --
1195 _ -> not is_top && uf_arity > 0 -- Note [Nested functions]
1196 -- Note [Inlining in ArgCtxt]
1197
1198 {-
1199 Note [Unfold into lazy contexts], Note [RHS of lets]
1200 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1201 When the call is the argument of a function with a RULE, or the RHS of a let,
1202 we are a little bit keener to inline. For example
1203 f y = (y,y,y)
1204 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1205 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1206 only we can't see it. Also
1207 x = f v
1208 could be expensive whereas
1209 x = case v of (a,b) -> a
1210 is patently cheap and may allow more eta expansion.
1211 So we treat the RHS of a let as not-totally-boring.
1212
1213 Note [Unsaturated applications]
1214 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1215 When a call is not saturated, we *still* inline if one of the
1216 arguments has interesting structure. That's sometimes very important.
1217 A good example is the Ord instance for Bool in Base:
1218
1219 Rec {
1220 $fOrdBool =GHC.Classes.D:Ord
1221 @ Bool
1222 ...
1223 $cmin_ajX
1224
1225 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1226 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1227 }
1228
1229 But the defn of GHC.Classes.$dmmin is:
1230
1231 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1232 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1233 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1234 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1235 GHC.Types.False -> y GHC.Types.True -> x }) -}
1236
1237 We *really* want to inline $dmmin, even though it has arity 3, in
1238 order to unravel the recursion.
1239
1240
1241 Note [Things to watch]
1242 ~~~~~~~~~~~~~~~~~~~~~~
1243 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1244 Assume x is exported, so not inlined unconditionally.
1245 Then we want x to inline unconditionally; no reason for it
1246 not to, and doing so avoids an indirection.
1247
1248 * { x = I# 3; ....f x.... }
1249 Make sure that x does not inline unconditionally!
1250 Lest we get extra allocation.
1251
1252 Note [Inlining an InlineRule]
1253 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1254 An InlineRules is used for
1255 (a) programmer INLINE pragmas
1256 (b) inlinings from worker/wrapper
1257
1258 For (a) the RHS may be large, and our contract is that we *only* inline
1259 when the function is applied to all the arguments on the LHS of the
1260 source-code defn. (The uf_arity in the rule.)
1261
1262 However for worker/wrapper it may be worth inlining even if the
1263 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1264 require saturation.
1265
1266
1267 Note [Nested functions]
1268 ~~~~~~~~~~~~~~~~~~~~~~~
1269 If a function has a nested defn we also record some-benefit, on the
1270 grounds that we are often able to eliminate the binding, and hence the
1271 allocation, for the function altogether; this is good for join points.
1272 But this only makes sense for *functions*; inlining a constructor
1273 doesn't help allocation unless the result is scrutinised. UNLESS the
1274 constructor occurs just once, albeit possibly in multiple case
1275 branches. Then inlining it doesn't increase allocation, but it does
1276 increase the chance that the constructor won't be allocated at all in
1277 the branches that don't use it.
1278
1279 Note [Cast then apply]
1280 ~~~~~~~~~~~~~~~~~~~~~~
1281 Consider
1282 myIndex = __inline_me ( (/\a. <blah>) |> co )
1283 co :: (forall a. a -> a) ~ (forall a. T a)
1284 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1285
1286 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1287 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1288
1289 Note [Inlining in ArgCtxt]
1290 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1291 The condition (arity > 0) here is very important, because otherwise
1292 we end up inlining top-level stuff into useless places; eg
1293 x = I# 3#
1294 f = \y. g x
1295 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1296 and 20% to 'power'.
1297
1298 At one stage I replaced this condition by 'True' (leading to the above
1299 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1300 to work ok now.
1301
1302 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1303 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1304 for the RHS of a 'let', we only profit from the inlining if we get a
1305 CONLIKE thing (modulo lets).
1306
1307 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1308 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1309 The "lone-variable" case is important. I spent ages messing about
1310 with unsatisfactory variants, but this is nice. The idea is that if a
1311 variable appears all alone
1312
1313 as an arg of lazy fn, or rhs BoringCtxt
1314 as scrutinee of a case CaseCtxt
1315 as arg of a fn ArgCtxt
1316 AND
1317 it is bound to a cheap expression
1318
1319 then we should not inline it (unless there is some other reason,
1320 e.g. is is the sole occurrence). That is what is happening at
1321 the use of 'lone_variable' in 'interesting_call'.
1322
1323 Why? At least in the case-scrutinee situation, turning
1324 let x = (a,b) in case x of y -> ...
1325 into
1326 let x = (a,b) in case (a,b) of y -> ...
1327 and thence to
1328 let x = (a,b) in let y = (a,b) in ...
1329 is bad if the binding for x will remain.
1330
1331 Another example: I discovered that strings
1332 were getting inlined straight back into applications of 'error'
1333 because the latter is strict.
1334 s = "foo"
1335 f = \x -> ...(error s)...
1336
1337 Fundamentally such contexts should not encourage inlining because the
1338 context can ``see'' the unfolding of the variable (e.g. case or a
1339 RULE) so there's no gain. If the thing is bound to a value.
1340
1341 However, watch out:
1342
1343 * Consider this:
1344 foo = _inline_ (\n. [n])
1345 bar = _inline_ (foo 20)
1346 baz = \n. case bar of { (m:_) -> m + n }
1347 Here we really want to inline 'bar' so that we can inline 'foo'
1348 and the whole thing unravels as it should obviously do. This is
1349 important: in the NDP project, 'bar' generates a closure data
1350 structure rather than a list.
1351
1352 So the non-inlining of lone_variables should only apply if the
1353 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1354 looks through the unfolding. Hence the "&& is_wf" in the
1355 InlineRule branch.
1356
1357 * Even a type application or coercion isn't a lone variable.
1358 Consider
1359 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1360 We had better inline that sucker! The case won't see through it.
1361
1362 For now, I'm treating treating a variable applied to types
1363 in a *lazy* context "lone". The motivating example was
1364 f = /\a. \x. BIG
1365 g = /\a. \y. h (f a)
1366 There's no advantage in inlining f here, and perhaps
1367 a significant disadvantage. Hence some_val_args in the Stop case
1368
1369 Note [Interaction of exprIsWorkFree and lone variables]
1370 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1371 The lone-variable test says "don't inline if a case expression
1372 scrutinises a lone variable whose unfolding is cheap". It's very
1373 important that, under these circumstances, exprIsConApp_maybe
1374 can spot a constructor application. So, for example, we don't
1375 consider
1376 let x = e in (x,x)
1377 to be cheap, and that's good because exprIsConApp_maybe doesn't
1378 think that expression is a constructor application.
1379
1380 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1381 rather than is_wf, which was utterly wrong, because the above
1382 expression responds True to exprIsHNF, which is what sets is_value.
1383
1384 This kind of thing can occur if you have
1385
1386 {-# INLINE foo #-}
1387 foo = let x = e in (x,x)
1388
1389 which Roman did.
1390 -}
1391
1392 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1393 -> Int
1394 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1395 -- We multiple the raw discounts (args_discount and result_discount)
1396 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1397 -- *size* whereas the discounts imply that there's some extra
1398 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1399 -- by inlining.
1400
1401 = 10 -- Discount of 10 because the result replaces the call
1402 -- so we count 10 for the function itself
1403
1404 + 10 * length actual_arg_discounts
1405 -- Discount of 10 for each arg supplied,
1406 -- because the result replaces the call
1407
1408 + round (ufKeenessFactor dflags *
1409 fromIntegral (total_arg_discount + res_discount'))
1410 where
1411 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1412 total_arg_discount = sum actual_arg_discounts
1413
1414 mk_arg_discount _ TrivArg = 0
1415 mk_arg_discount _ NonTrivArg = 10
1416 mk_arg_discount discount ValueArg = discount
1417
1418 res_discount'
1419 | LT <- arg_discounts `compareLength` arg_infos
1420 = res_discount -- Over-saturated
1421 | otherwise
1422 = case cont_info of
1423 BoringCtxt -> 0
1424 CaseCtxt -> res_discount -- Presumably a constructor
1425 ValAppCtxt -> res_discount -- Presumably a function
1426 _ -> 40 `min` res_discount
1427 -- ToDo: this 40 `min` res_discount doesn't seem right
1428 -- for DiscArgCtxt it shouldn't matter because the function will
1429 -- get the arg discount for any non-triv arg
1430 -- for RuleArgCtxt we do want to be keener to inline; but not only
1431 -- constructor results
1432 -- for RhsCtxt I suppose that exposing a data con is good in general
1433 -- And 40 seems very arbitrary
1434 --
1435 -- res_discount can be very large when a function returns
1436 -- constructors; but we only want to invoke that large discount
1437 -- when there's a case continuation.
1438 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1439 -- But we want to aovid inlining large functions that return
1440 -- constructors into contexts that are simply "interesting"