Typos in comments
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import PprCore () -- Instances
50 import OccurAnal ( occurAnalyseExpr )
51 import CoreOpt
52 import CoreArity ( manifestArity )
53 import CoreUtils
54 import Id
55 import Demand ( isBottomingSig )
56 import DataCon
57 import Literal
58 import PrimOp
59 import IdInfo
60 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
61 import Type
62 import PrelNames
63 import TysPrim ( realWorldStatePrimTy )
64 import Bag
65 import Util
66 import Outputable
67 import ForeignCall
68
69 import qualified Data.ByteString as BS
70
71 {-
72 ************************************************************************
73 * *
74 \subsection{Making unfoldings}
75 * *
76 ************************************************************************
77 -}
78
79 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
80 mkTopUnfolding dflags is_bottoming rhs
81 = mkUnfolding dflags InlineRhs True is_bottoming rhs
82
83 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
84 -- For implicit Ids, do a tiny bit of optimising first
85 mkImplicitUnfolding dflags expr
86 = mkTopUnfolding dflags False (simpleOptExpr expr)
87
88 -- Note [Top-level flag on inline rules]
89 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
90 -- Slight hack: note that mk_inline_rules conservatively sets the
91 -- top-level flag to True. It gets set more accurately by the simplifier
92 -- Simplify.simplUnfolding.
93
94 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
95 mkSimpleUnfolding dflags rhs
96 = mkUnfolding dflags InlineRhs False False rhs
97
98 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
99 mkDFunUnfolding bndrs con ops
100 = DFunUnfolding { df_bndrs = bndrs
101 , df_con = con
102 , df_args = map occurAnalyseExpr ops }
103 -- See Note [Occurrence analysis of unfoldings]
104
105 mkWwInlineRule :: CoreExpr -> Arity -> Unfolding
106 mkWwInlineRule expr arity
107 = mkCoreUnfolding InlineStable True
108 (simpleOptExpr expr)
109 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
110 , ug_boring_ok = boringCxtNotOk })
111
112 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
113 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
114 = mkCoreUnfolding InlineCompulsory True
115 (simpleOptExpr expr)
116 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
117 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
118
119 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
120 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
121 mkWorkerUnfolding dflags work_fn
122 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
123 , uf_is_top = top_lvl })
124 | isStableSource src
125 = mkCoreUnfolding src top_lvl new_tmpl guidance
126 where
127 new_tmpl = simpleOptExpr (work_fn tmpl)
128 guidance = calcUnfoldingGuidance dflags False new_tmpl
129
130 mkWorkerUnfolding _ _ _ = noUnfolding
131
132 -- | Make an unfolding that may be used unsaturated
133 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
134 -- manifest arity (the number of outer lambdas applications will
135 -- resolve before doing any work).
136 mkInlineUnfolding :: CoreExpr -> Unfolding
137 mkInlineUnfolding expr
138 = mkCoreUnfolding InlineStable
139 True -- Note [Top-level flag on inline rules]
140 expr' guide
141 where
142 expr' = simpleOptExpr expr
143 guide = UnfWhen { ug_arity = manifestArity expr'
144 , ug_unsat_ok = unSaturatedOk
145 , ug_boring_ok = boring_ok }
146 boring_ok = inlineBoringOk expr'
147
148 -- | Make an unfolding that will be used once the RHS has been saturated
149 -- to the given arity.
150 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
151 mkInlineUnfoldingWithArity arity expr
152 = mkCoreUnfolding InlineStable
153 True -- Note [Top-level flag on inline rules]
154 expr' guide
155 where
156 expr' = simpleOptExpr expr
157 guide = UnfWhen { ug_arity = arity
158 , ug_unsat_ok = needSaturated
159 , ug_boring_ok = boring_ok }
160 boring_ok = inlineBoringOk expr'
161
162 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
163 mkInlinableUnfolding dflags expr
164 = mkUnfolding dflags InlineStable False False expr'
165 where
166 expr' = simpleOptExpr expr
167
168 specUnfolding :: [Var] -> (CoreExpr -> CoreExpr) -> Arity -> Unfolding -> Unfolding
169 -- See Note [Specialising unfoldings]
170 -- specUnfolding spec_bndrs spec_app arity_decrease unf
171 -- = \spec_bndrs. spec_app( unf )
172 --
173 specUnfolding spec_bndrs spec_app arity_decrease
174 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
175 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
176 mkDFunUnfolding spec_bndrs con (map spec_arg args)
177 -- There is a hard-to-check assumption here that the spec_app has
178 -- enough applications to exactly saturate the old_bndrs
179 -- For DFunUnfoldings we transform
180 -- \old_bndrs. MkD <op1> ... <opn>
181 -- to
182 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
183 -- The ASSERT checks the value part of that
184 where
185 spec_arg arg = simpleOptExpr (spec_app (mkLams old_bndrs arg))
186 -- The beta-redexes created by spec_app will be
187 -- simplified away by simplOptExpr
188
189 specUnfolding spec_bndrs spec_app arity_decrease
190 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
191 , uf_is_top = top_lvl
192 , uf_guidance = old_guidance })
193 | isStableSource src -- See Note [Specialising unfoldings]
194 , UnfWhen { ug_arity = old_arity
195 , ug_unsat_ok = unsat_ok
196 , ug_boring_ok = boring_ok } <- old_guidance
197 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
198 , ug_unsat_ok = unsat_ok
199 , ug_boring_ok = boring_ok }
200 new_tmpl = simpleOptExpr (mkLams spec_bndrs (spec_app tmpl))
201 -- The beta-redexes created by spec_app will be
202 -- simplified away by simplOptExpr
203
204 in mkCoreUnfolding src top_lvl new_tmpl guidance
205
206 specUnfolding _ _ _ _ = noUnfolding
207
208 {- Note [Specialising unfoldings]
209 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
210 When we specialise a function for some given type-class arguments, we use
211 specUnfolding to specialise its unfolding. Some important points:
212
213 * If the original function has a DFunUnfolding, the specialised one
214 must do so too! Otherwise we lose the magic rules that make it
215 interact with ClassOps
216
217 * There is a bit of hack for INLINABLE functions:
218 f :: Ord a => ....
219 f = <big-rhs>
220 {- INLINABLE f #-}
221 Now if we specialise f, should the specialised version still have
222 an INLINABLE pragma? If it does, we'll capture a specialised copy
223 of <big-rhs> as its unfolding, and that probaby won't inline. But
224 if we don't, the specialised version of <big-rhs> might be small
225 enough to inline at a call site. This happens with Control.Monad.liftM3,
226 and can cause a lot more allocation as a result (nofib n-body shows this).
227
228 Moreover, keeping the INLINABLE thing isn't much help, because
229 the specialised function (probaby) isn't overloaded any more.
230
231 Conclusion: drop the INLINEALE pragma. In practice what this means is:
232 if a stable unfolding has UnfoldingGuidance of UnfWhen,
233 we keep it (so the specialised thing too will always inline)
234 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
235 (which arises from INLINABLE), we discard it
236 -}
237
238 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
239 -> UnfoldingGuidance -> Unfolding
240 -- Occurrence-analyses the expression before capturing it
241 mkCoreUnfolding src top_lvl expr guidance
242 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
243 -- See Note [Occurrence analysis of unfoldings]
244 uf_src = src,
245 uf_is_top = top_lvl,
246 uf_is_value = exprIsHNF expr,
247 uf_is_conlike = exprIsConLike expr,
248 uf_is_work_free = exprIsWorkFree expr,
249 uf_expandable = exprIsExpandable expr,
250 uf_guidance = guidance }
251
252 mkUnfolding :: DynFlags -> UnfoldingSource
253 -> Bool -- Is top-level
254 -> Bool -- Definitely a bottoming binding
255 -- (only relevant for top-level bindings)
256 -> CoreExpr
257 -> Unfolding
258 -- Calculates unfolding guidance
259 -- Occurrence-analyses the expression before capturing it
260 mkUnfolding dflags src is_top_lvl is_bottoming expr
261 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
262 -- See Note [Occurrence analysis of unfoldings]
263 uf_src = src,
264 uf_is_top = is_top_lvl,
265 uf_is_value = exprIsHNF expr,
266 uf_is_conlike = exprIsConLike expr,
267 uf_expandable = exprIsExpandable expr,
268 uf_is_work_free = exprIsWorkFree expr,
269 uf_guidance = guidance }
270 where
271 is_top_bottoming = is_top_lvl && is_bottoming
272 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
273 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
274 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
275
276 {-
277 Note [Occurrence analysis of unfoldings]
278 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
279 We do occurrence-analysis of unfoldings once and for all, when the
280 unfolding is built, rather than each time we inline them.
281
282 But given this decision it's vital that we do
283 *always* do it. Consider this unfolding
284 \x -> letrec { f = ...g...; g* = f } in body
285 where g* is (for some strange reason) the loop breaker. If we don't
286 occ-anal it when reading it in, we won't mark g as a loop breaker, and
287 we may inline g entirely in body, dropping its binding, and leaving
288 the occurrence in f out of scope. This happened in Trac #8892, where
289 the unfolding in question was a DFun unfolding.
290
291 But more generally, the simplifier is designed on the
292 basis that it is looking at occurrence-analysed expressions, so better
293 ensure that they acutally are.
294
295 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
296 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
297 Notice that we give the non-occur-analysed expression to
298 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
299 first; for example, sometimes during simplification, there's a large
300 let-bound thing which has been substituted, and so is now dead; so
301 'expr' contains two copies of the thing while the occurrence-analysed
302 expression doesn't.
303
304 Nevertheless, we *don't* and *must not* occ-analyse before computing
305 the size because
306
307 a) The size computation bales out after a while, whereas occurrence
308 analysis does not.
309
310 b) Residency increases sharply if you occ-anal first. I'm not
311 100% sure why, but it's a large effect. Compiling Cabal went
312 from residency of 534M to over 800M with this one change.
313
314 This can occasionally mean that the guidance is very pessimistic;
315 it gets fixed up next round. And it should be rare, because large
316 let-bound things that are dead are usually caught by preInlineUnconditionally
317
318
319 ************************************************************************
320 * *
321 \subsection{The UnfoldingGuidance type}
322 * *
323 ************************************************************************
324 -}
325
326 inlineBoringOk :: CoreExpr -> Bool
327 -- See Note [INLINE for small functions]
328 -- True => the result of inlining the expression is
329 -- no bigger than the expression itself
330 -- eg (\x y -> f y x)
331 -- This is a quick and dirty version. It doesn't attempt
332 -- to deal with (\x y z -> x (y z))
333 -- The really important one is (x `cast` c)
334 inlineBoringOk e
335 = go 0 e
336 where
337 go :: Int -> CoreExpr -> Bool
338 go credit (Lam x e) | isId x = go (credit+1) e
339 | otherwise = go credit e
340 go credit (App f (Type {})) = go credit f
341 go credit (App f a) | credit > 0
342 , exprIsTrivial a = go (credit-1) f
343 go credit (Tick _ e) = go credit e -- dubious
344 go credit (Cast e _) = go credit e
345 go _ (Var {}) = boringCxtOk
346 go _ _ = boringCxtNotOk
347
348 calcUnfoldingGuidance
349 :: DynFlags
350 -> Bool -- Definitely a top-level, bottoming binding
351 -> CoreExpr -- Expression to look at
352 -> UnfoldingGuidance
353 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
354 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
355 = calcUnfoldingGuidance dflags is_top_bottoming expr
356 calcUnfoldingGuidance dflags is_top_bottoming expr
357 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
358 TooBig -> UnfNever
359 SizeIs size cased_bndrs scrut_discount
360 | uncondInline expr n_val_bndrs size
361 -> UnfWhen { ug_unsat_ok = unSaturatedOk
362 , ug_boring_ok = boringCxtOk
363 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
364
365 | is_top_bottoming
366 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
367
368 | otherwise
369 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
370 , ug_size = size
371 , ug_res = scrut_discount }
372
373 where
374 (bndrs, body) = collectBinders expr
375 bOMB_OUT_SIZE = ufCreationThreshold dflags
376 -- Bomb out if size gets bigger than this
377 val_bndrs = filter isId bndrs
378 n_val_bndrs = length val_bndrs
379
380 mk_discount :: Bag (Id,Int) -> Id -> Int
381 mk_discount cbs bndr = foldlBag combine 0 cbs
382 where
383 combine acc (bndr', disc)
384 | bndr == bndr' = acc `plus_disc` disc
385 | otherwise = acc
386
387 plus_disc :: Int -> Int -> Int
388 plus_disc | isFunTy (idType bndr) = max
389 | otherwise = (+)
390 -- See Note [Function and non-function discounts]
391
392 {-
393 Note [Computing the size of an expression]
394 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
395 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
396 heuristics right has taken a long time. Here's the basic strategy:
397
398 * Variables, literals: 0
399 (Exception for string literals, see litSize.)
400
401 * Function applications (f e1 .. en): 1 + #value args
402
403 * Constructor applications: 1, regardless of #args
404
405 * Let(rec): 1 + size of components
406
407 * Note, cast: 0
408
409 Examples
410
411 Size Term
412 --------------
413 0 42#
414 0 x
415 0 True
416 2 f x
417 1 Just x
418 4 f (g x)
419
420 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
421 a function call to account for. Notice also that constructor applications
422 are very cheap, because exposing them to a caller is so valuable.
423
424 [25/5/11] All sizes are now multiplied by 10, except for primops
425 (which have sizes like 1 or 4. This makes primops look fantastically
426 cheap, and seems to be almost unversally beneficial. Done partly as a
427 result of #4978.
428
429 Note [Do not inline top-level bottoming functions]
430 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
431 The FloatOut pass has gone to some trouble to float out calls to 'error'
432 and similar friends. See Note [Bottoming floats] in SetLevels.
433 Do not re-inline them! But we *do* still inline if they are very small
434 (the uncondInline stuff).
435
436 Note [INLINE for small functions]
437 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
438 Consider {-# INLINE f #-}
439 f x = Just x
440 g y = f y
441 Then f's RHS is no larger than its LHS, so we should inline it into
442 even the most boring context. In general, f the function is
443 sufficiently small that its body is as small as the call itself, the
444 inline unconditionally, regardless of how boring the context is.
445
446 Things to note:
447
448 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
449 than the thing it's replacing. Notice that
450 (f x) --> (g 3) -- YES, unconditionally
451 (f x) --> x : [] -- YES, *even though* there are two
452 -- arguments to the cons
453 x --> g 3 -- NO
454 x --> Just v -- NO
455
456 It's very important not to unconditionally replace a variable by
457 a non-atomic term.
458
459 (2) We do this even if the thing isn't saturated, else we end up with the
460 silly situation that
461 f x y = x
462 ...map (f 3)...
463 doesn't inline. Even in a boring context, inlining without being
464 saturated will give a lambda instead of a PAP, and will be more
465 efficient at runtime.
466
467 (3) However, when the function's arity > 0, we do insist that it
468 has at least one value argument at the call site. (This check is
469 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
470 f = /\a \x:a. x
471 d = /\b. MkD (f b)
472 If we inline f here we get
473 d = /\b. MkD (\x:b. x)
474 and then prepareRhs floats out the argument, abstracting the type
475 variables, so we end up with the original again!
476
477 (4) We must be much more cautious about arity-zero things. Consider
478 let x = y +# z in ...
479 In *size* terms primops look very small, because the generate a
480 single instruction, but we do not want to unconditionally replace
481 every occurrence of x with (y +# z). So we only do the
482 unconditional-inline thing for *trivial* expressions.
483
484 NB: you might think that PostInlineUnconditionally would do this
485 but it doesn't fire for top-level things; see SimplUtils
486 Note [Top level and postInlineUnconditionally]
487 -}
488
489 uncondInline :: CoreExpr -> Arity -> Int -> Bool
490 -- Inline unconditionally if there no size increase
491 -- Size of call is arity (+1 for the function)
492 -- See Note [INLINE for small functions]
493 uncondInline rhs arity size
494 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
495 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
496
497 sizeExpr :: DynFlags
498 -> Int -- Bomb out if it gets bigger than this
499 -> [Id] -- Arguments; we're interested in which of these
500 -- get case'd
501 -> CoreExpr
502 -> ExprSize
503
504 -- Note [Computing the size of an expression]
505
506 sizeExpr dflags bOMB_OUT_SIZE top_args expr
507 = size_up expr
508 where
509 size_up (Cast e _) = size_up e
510 size_up (Tick _ e) = size_up e
511 size_up (Type _) = sizeZero -- Types cost nothing
512 size_up (Coercion _) = sizeZero
513 size_up (Lit lit) = sizeN (litSize lit)
514 size_up (Var f) | isRealWorldId f = sizeZero
515 -- Make sure we get constructor discounts even
516 -- on nullary constructors
517 | otherwise = size_up_call f [] 0
518
519 size_up (App fun arg)
520 | isTyCoArg arg = size_up fun
521 | otherwise = size_up arg `addSizeNSD`
522 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
523
524 size_up (Lam b e)
525 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
526 | otherwise = size_up e
527
528 size_up (Let (NonRec binder rhs) body)
529 = size_up_rhs (binder, rhs) `addSizeNSD`
530 size_up body `addSizeN`
531 size_up_alloc binder
532
533 size_up (Let (Rec pairs) body)
534 = foldr (addSizeNSD . size_up_rhs)
535 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
536 pairs
537
538 size_up (Case e _ _ alts)
539 | null alts
540 = size_up e -- case e of {} never returns, so take size of scrutinee
541
542 size_up (Case e _ _ alts)
543 -- Now alts is non-empty
544 | Just v <- is_top_arg e -- We are scrutinising an argument variable
545 = let
546 alt_sizes = map size_up_alt alts
547
548 -- alts_size tries to compute a good discount for
549 -- the case when we are scrutinising an argument variable
550 alts_size (SizeIs tot tot_disc tot_scrut)
551 -- Size of all alternatives
552 (SizeIs max _ _)
553 -- Size of biggest alternative
554 = SizeIs tot (unitBag (v, 20 + tot - max)
555 `unionBags` tot_disc) tot_scrut
556 -- If the variable is known, we produce a
557 -- discount that will take us back to 'max',
558 -- the size of the largest alternative The
559 -- 1+ is a little discount for reduced
560 -- allocation in the caller
561 --
562 -- Notice though, that we return tot_disc,
563 -- the total discount from all branches. I
564 -- think that's right.
565
566 alts_size tot_size _ = tot_size
567 in
568 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
569 (foldr1 maxSize alt_sizes)
570 -- Good to inline if an arg is scrutinised, because
571 -- that may eliminate allocation in the caller
572 -- And it eliminates the case itself
573 where
574 is_top_arg (Var v) | v `elem` top_args = Just v
575 is_top_arg (Cast e _) = is_top_arg e
576 is_top_arg _ = Nothing
577
578
579 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
580 foldr (addAltSize . size_up_alt) case_size alts
581 where
582 case_size
583 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
584 | otherwise = sizeZero
585 -- Normally we don't charge for the case itself, but
586 -- we charge one per alternative (see size_up_alt,
587 -- below) to account for the cost of the info table
588 -- and comparisons.
589 --
590 -- However, in certain cases (see is_inline_scrut
591 -- below), no code is generated for the case unless
592 -- there are multiple alts. In these cases we
593 -- subtract one, making the first alt free.
594 -- e.g. case x# +# y# of _ -> ... should cost 1
595 -- case touch# x# of _ -> ... should cost 0
596 -- (see #4978)
597 --
598 -- I would like to not have the "lengthAtMost alts 1"
599 -- condition above, but without that some programs got worse
600 -- (spectral/hartel/event and spectral/para). I don't fully
601 -- understand why. (SDM 24/5/11)
602
603 -- unboxed variables, inline primops and unsafe foreign calls
604 -- are all "inline" things:
605 is_inline_scrut (Var v) = isUnliftedType (idType v)
606 is_inline_scrut scrut
607 | (Var f, _) <- collectArgs scrut
608 = case idDetails f of
609 FCallId fc -> not (isSafeForeignCall fc)
610 PrimOpId op -> not (primOpOutOfLine op)
611 _other -> False
612 | otherwise
613 = False
614
615 size_up_rhs (bndr, rhs)
616 | Just join_arity <- isJoinId_maybe bndr
617 -- Skip arguments to join point
618 , (_bndrs, body) <- collectNBinders join_arity rhs
619 = size_up body
620 | otherwise
621 = size_up rhs
622
623 ------------
624 -- size_up_app is used when there's ONE OR MORE value args
625 size_up_app (App fun arg) args voids
626 | isTyCoArg arg = size_up_app fun args voids
627 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
628 | otherwise = size_up arg `addSizeNSD`
629 size_up_app fun (arg:args) voids
630 size_up_app (Var fun) args voids = size_up_call fun args voids
631 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
632 size_up_app (Cast expr _) args voids = size_up_app expr args voids
633 size_up_app other args voids = size_up other `addSizeN`
634 callSize (length args) voids
635 -- if the lhs is not an App or a Var, or an invisible thing like a
636 -- Tick or Cast, then we should charge for a complete call plus the
637 -- size of the lhs itself.
638
639 ------------
640 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
641 size_up_call fun val_args voids
642 = case idDetails fun of
643 FCallId _ -> sizeN (callSize (length val_args) voids)
644 DataConWorkId dc -> conSize dc (length val_args)
645 PrimOpId op -> primOpSize op (length val_args)
646 ClassOpId _ -> classOpSize dflags top_args val_args
647 _ -> funSize dflags top_args fun (length val_args) voids
648
649 ------------
650 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
651 -- Don't charge for args, so that wrappers look cheap
652 -- (See comments about wrappers with Case)
653 --
654 -- IMPORTANT: *do* charge 1 for the alternative, else we
655 -- find that giant case nests are treated as practically free
656 -- A good example is Foreign.C.Error.errnoToIOError
657
658 ------------
659 -- Cost to allocate binding with given binder
660 size_up_alloc bndr
661 | isTyVar bndr -- Doesn't exist at runtime
662 || isJoinId bndr -- Not allocated at all
663 || isUnliftedType (idType bndr) -- Doesn't live in heap
664 = 0
665 | otherwise
666 = 10
667
668 ------------
669 -- These addSize things have to be here because
670 -- I don't want to give them bOMB_OUT_SIZE as an argument
671 addSizeN TooBig _ = TooBig
672 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
673
674 -- addAltSize is used to add the sizes of case alternatives
675 addAltSize TooBig _ = TooBig
676 addAltSize _ TooBig = TooBig
677 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
678 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
679 (xs `unionBags` ys)
680 (d1 + d2) -- Note [addAltSize result discounts]
681
682 -- This variant ignores the result discount from its LEFT argument
683 -- It's used when the second argument isn't part of the result
684 addSizeNSD TooBig _ = TooBig
685 addSizeNSD _ TooBig = TooBig
686 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
687 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
688 (xs `unionBags` ys)
689 d2 -- Ignore d1
690
691 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
692
693 -- an expression of type State# RealWorld must be a variable
694 isRealWorldExpr (Var id) = isRealWorldId id
695 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
696 isRealWorldExpr _ = False
697
698 -- | Finds a nominal size of a string literal.
699 litSize :: Literal -> Int
700 -- Used by CoreUnfold.sizeExpr
701 litSize (LitInteger {}) = 100 -- Note [Size of literal integers]
702 litSize (MachStr str) = 10 + 10 * ((BS.length str + 3) `div` 4)
703 -- If size could be 0 then @f "x"@ might be too small
704 -- [Sept03: make literal strings a bit bigger to avoid fruitless
705 -- duplication of little strings]
706 litSize _other = 0 -- Must match size of nullary constructors
707 -- Key point: if x |-> 4, then x must inline unconditionally
708 -- (eg via case binding)
709
710 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
711 -- See Note [Conlike is interesting]
712 classOpSize _ _ []
713 = sizeZero
714 classOpSize dflags top_args (arg1 : other_args)
715 = SizeIs size arg_discount 0
716 where
717 size = 20 + (10 * length other_args)
718 -- If the class op is scrutinising a lambda bound dictionary then
719 -- give it a discount, to encourage the inlining of this function
720 -- The actual discount is rather arbitrarily chosen
721 arg_discount = case arg1 of
722 Var dict | dict `elem` top_args
723 -> unitBag (dict, ufDictDiscount dflags)
724 _other -> emptyBag
725
726 -- | The size of a function call
727 callSize
728 :: Int -- ^ number of value args
729 -> Int -- ^ number of value args that are void
730 -> Int
731 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
732 -- The 1+ is for the function itself
733 -- Add 1 for each non-trivial arg;
734 -- the allocation cost, as in let(rec)
735
736 -- | The size of a jump to a join point
737 jumpSize
738 :: Int -- ^ number of value args
739 -> Int -- ^ number of value args that are void
740 -> Int
741 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
742 -- A jump is 20% the size of a function call. Making jumps free reopens
743 -- bug #6048, but making them any more expensive loses a 21% improvement in
744 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
745 -- better solution?
746
747 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
748 -- Size for functions that are not constructors or primops
749 -- Note [Function applications]
750 funSize dflags top_args fun n_val_args voids
751 | fun `hasKey` buildIdKey = buildSize
752 | fun `hasKey` augmentIdKey = augmentSize
753 | otherwise = SizeIs size arg_discount res_discount
754 where
755 some_val_args = n_val_args > 0
756 is_join = isJoinId fun
757
758 size | is_join = jumpSize n_val_args voids
759 | not some_val_args = 0
760 | otherwise = callSize n_val_args voids
761
762 -- DISCOUNTS
763 -- See Note [Function and non-function discounts]
764 arg_discount | some_val_args && fun `elem` top_args
765 = unitBag (fun, ufFunAppDiscount dflags)
766 | otherwise = emptyBag
767 -- If the function is an argument and is applied
768 -- to some values, give it an arg-discount
769
770 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
771 | otherwise = 0
772 -- If the function is partially applied, show a result discount
773 -- XXX maybe behave like ConSize for eval'd variable
774
775 conSize :: DataCon -> Int -> ExprSize
776 conSize dc n_val_args
777 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
778
779 -- See Note [Unboxed tuple size and result discount]
780 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
781
782 -- See Note [Constructor size and result discount]
783 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
784
785 -- XXX still looks to large to me
786
787 {-
788 Note [Constructor size and result discount]
789 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
790 Treat a constructors application as size 10, regardless of how many
791 arguments it has; we are keen to expose them (and we charge separately
792 for their args). We can't treat them as size zero, else we find that
793 (Just x) has size 0, which is the same as a lone variable; and hence
794 'v' will always be replaced by (Just x), where v is bound to Just x.
795
796 The "result discount" is applied if the result of the call is
797 scrutinised (say by a case). For a constructor application that will
798 mean the constructor application will disappear, so we don't need to
799 charge it to the function. So the discount should at least match the
800 cost of the constructor application, namely 10. But to give a bit
801 of extra incentive we give a discount of 10*(1 + n_val_args).
802
803 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
804 and said it was an "unambiguous win", but its terribly dangerous
805 because a function with many many case branches, each finishing with
806 a constructor, can have an arbitrarily large discount. This led to
807 terrible code bloat: see Trac #6099.
808
809 Note [Unboxed tuple size and result discount]
810 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
811 However, unboxed tuples count as size zero. I found occasions where we had
812 f x y z = case op# x y z of { s -> (# s, () #) }
813 and f wasn't getting inlined.
814
815 I tried giving unboxed tuples a *result discount* of zero (see the
816 commented-out line). Why? When returned as a result they do not
817 allocate, so maybe we don't want to charge so much for them If you
818 have a non-zero discount here, we find that workers often get inlined
819 back into wrappers, because it look like
820 f x = case $wf x of (# a,b #) -> (a,b)
821 and we are keener because of the case. However while this change
822 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
823 more. All other changes were very small. So it's not a big deal but I
824 didn't adopt the idea.
825
826 Note [Function and non-function discounts]
827 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
828 We want a discount if the function is applied. A good example is
829 monadic combinators with continuation arguments, where inlining is
830 quite important.
831
832 But we don't want a big discount when a function is called many times
833 (see the detailed comments with Trac #6048) because if the function is
834 big it won't be inlined at its many call sites and no benefit results.
835 Indeed, we can get exponentially big inlinings this way; that is what
836 Trac #6048 is about.
837
838 On the other hand, for data-valued arguments, if there are lots of
839 case expressions in the body, each one will get smaller if we apply
840 the function to a constructor application, so we *want* a big discount
841 if the argument is scrutinised by many case expressions.
842
843 Conclusion:
844 - For functions, take the max of the discounts
845 - For data values, take the sum of the discounts
846
847
848 Note [Literal integer size]
849 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
850 Literal integers *can* be big (mkInteger [...coefficients...]), but
851 need not be (S# n). We just use an arbitrary big-ish constant here
852 so that, in particular, we don't inline top-level defns like
853 n = S# 5
854 There's no point in doing so -- any optimisations will see the S#
855 through n's unfolding. Nor will a big size inhibit unfoldings functions
856 that mention a literal Integer, because the float-out pass will float
857 all those constants to top level.
858 -}
859
860 primOpSize :: PrimOp -> Int -> ExprSize
861 primOpSize op n_val_args
862 = if primOpOutOfLine op
863 then sizeN (op_size + n_val_args)
864 else sizeN op_size
865 where
866 op_size = primOpCodeSize op
867
868
869 buildSize :: ExprSize
870 buildSize = SizeIs 0 emptyBag 40
871 -- We really want to inline applications of build
872 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
873 -- Indeed, we should add a result_discount because build is
874 -- very like a constructor. We don't bother to check that the
875 -- build is saturated (it usually is). The "-2" discounts for the \c n,
876 -- The "4" is rather arbitrary.
877
878 augmentSize :: ExprSize
879 augmentSize = SizeIs 0 emptyBag 40
880 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
881 -- e plus ys. The -2 accounts for the \cn
882
883 -- When we return a lambda, give a discount if it's used (applied)
884 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
885 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
886 lamScrutDiscount _ TooBig = TooBig
887
888 {-
889 Note [addAltSize result discounts]
890 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
891 When adding the size of alternatives, we *add* the result discounts
892 too, rather than take the *maximum*. For a multi-branch case, this
893 gives a discount for each branch that returns a constructor, making us
894 keener to inline. I did try using 'max' instead, but it makes nofib
895 'rewrite' and 'puzzle' allocate significantly more, and didn't make
896 binary sizes shrink significantly either.
897
898 Note [Discounts and thresholds]
899 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
900 Constants for discounts and thesholds are defined in main/DynFlags,
901 all of form ufXxxx. They are:
902
903 ufCreationThreshold
904 At a definition site, if the unfolding is bigger than this, we
905 may discard it altogether
906
907 ufUseThreshold
908 At a call site, if the unfolding, less discounts, is smaller than
909 this, then it's small enough inline
910
911 ufKeenessFactor
912 Factor by which the discounts are multiplied before
913 subtracting from size
914
915 ufDictDiscount
916 The discount for each occurrence of a dictionary argument
917 as an argument of a class method. Should be pretty small
918 else big functions may get inlined
919
920 ufFunAppDiscount
921 Discount for a function argument that is applied. Quite
922 large, because if we inline we avoid the higher-order call.
923
924 ufDearOp
925 The size of a foreign call or not-dupable PrimOp
926
927 ufVeryAggressive
928 If True, the compiler ignores all the thresholds and inlines very
929 aggressively. It still adheres to arity, simplifier phase control and
930 loop breakers.
931
932
933 Note [Function applications]
934 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
935 In a function application (f a b)
936
937 - If 'f' is an argument to the function being analysed,
938 and there's at least one value arg, record a FunAppDiscount for f
939
940 - If the application if a PAP (arity > 2 in this example)
941 record a *result* discount (because inlining
942 with "extra" args in the call may mean that we now
943 get a saturated application)
944
945 Code for manipulating sizes
946 -}
947
948 -- | The size of a candidate expression for unfolding
949 data ExprSize
950 = TooBig
951 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
952 , _es_args :: !(Bag (Id,Int))
953 -- ^ Arguments cased herein, and discount for each such
954 , _es_discount :: {-# UNPACK #-} !Int
955 -- ^ Size to subtract if result is scrutinised by a case
956 -- expression
957 }
958
959 instance Outputable ExprSize where
960 ppr TooBig = text "TooBig"
961 ppr (SizeIs a _ c) = brackets (int a <+> int c)
962
963 -- subtract the discount before deciding whether to bale out. eg. we
964 -- want to inline a large constructor application into a selector:
965 -- tup = (a_1, ..., a_99)
966 -- x = case tup of ...
967 --
968 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
969 mkSizeIs max n xs d | (n - d) > max = TooBig
970 | otherwise = SizeIs n xs d
971
972 maxSize :: ExprSize -> ExprSize -> ExprSize
973 maxSize TooBig _ = TooBig
974 maxSize _ TooBig = TooBig
975 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
976 | otherwise = s2
977
978 sizeZero :: ExprSize
979 sizeN :: Int -> ExprSize
980
981 sizeZero = SizeIs 0 emptyBag 0
982 sizeN n = SizeIs n emptyBag 0
983
984 {-
985 ************************************************************************
986 * *
987 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
988 * *
989 ************************************************************************
990
991 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
992 we ``couldn't possibly use'' on the other side. Can be overridden w/
993 flaggery. Just the same as smallEnoughToInline, except that it has no
994 actual arguments.
995 -}
996
997 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
998 couldBeSmallEnoughToInline dflags threshold rhs
999 = case sizeExpr dflags threshold [] body of
1000 TooBig -> False
1001 _ -> True
1002 where
1003 (_, body) = collectBinders rhs
1004
1005 ----------------
1006 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1007 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1008 = size <= ufUseThreshold dflags
1009 smallEnoughToInline _ _
1010 = False
1011
1012 ----------------
1013
1014 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1015 -- Sees if the unfolding is pretty certain to inline
1016 -- If so, return a *stable* unfolding for it, that will always inline
1017 certainlyWillInline dflags fn_info
1018 = case unfoldingInfo fn_info of
1019 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1020 | loop_breaker -> Nothing -- Won't inline, so try w/w
1021 | otherwise -> do_cunf e g -- Depends on size, so look at that
1022
1023 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1024 -- to do so, and even if it is currently a
1025 -- loop breaker, it may not be later
1026
1027 _other_unf -> Nothing
1028
1029 where
1030 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1031 fn_unf = unfoldingInfo fn_info
1032
1033 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1034 do_cunf _ UnfNever = Nothing
1035 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1036 -- INLINE functions have UnfWhen
1037
1038 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1039 -- binary sizes go up by 10%! (This is with SplitObjs.)
1040 -- I'm not totally sure why.
1041 -- INLINABLE functions come via this path
1042 -- See Note [certainlyWillInline: INLINABLE]
1043 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1044 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1045 , case inlinePragmaSpec (inlinePragInfo fn_info) of
1046 NoInline -> False -- NOINLINE; do not say certainlyWillInline!
1047 _ -> True -- INLINE, INLINABLE, or nothing
1048 , not (isBottomingSig (strictnessInfo fn_info))
1049 -- Do not unconditionally inline a bottoming functions even if
1050 -- it seems smallish. We've carefully lifted it out to top level,
1051 -- so we don't want to re-inline it.
1052 , let arity = length args
1053 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1054 = Just (fn_unf { uf_src = InlineStable
1055 , uf_guidance = UnfWhen { ug_arity = arity
1056 , ug_unsat_ok = unSaturatedOk
1057 , ug_boring_ok = inlineBoringOk expr } })
1058 -- Note the "unsaturatedOk". A function like f = \ab. a
1059 -- will certainly inline, even if partially applied (f e), so we'd
1060 -- better make sure that the transformed inlining has the same property
1061 | otherwise
1062 = Nothing
1063
1064 {- Note [certainlyWillInline: be careful of thunks]
1065 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1066 Don't claim that thunks will certainly inline, because that risks work
1067 duplication. Even if the work duplication is not great (eg is_cheap
1068 holds), it can make a big difference in an inner loop In Trac #5623 we
1069 found that the WorkWrap phase thought that
1070 y = case x of F# v -> F# (v +# v)
1071 was certainlyWillInline, so the addition got duplicated.
1072
1073 Note [certainlyWillInline: INLINABLE]
1074 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1075 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1076 even though we have a stable inlining, so that strictness w/w takes
1077 place. It makes a big difference to efficiency, and the w/w pass knows
1078 how to transfer the INLINABLE info to the worker; see WorkWrap
1079 Note [Worker-wrapper for INLINABLE functions]
1080
1081 ************************************************************************
1082 * *
1083 \subsection{callSiteInline}
1084 * *
1085 ************************************************************************
1086
1087 This is the key function. It decides whether to inline a variable at a call site
1088
1089 callSiteInline is used at call sites, so it is a bit more generous.
1090 It's a very important function that embodies lots of heuristics.
1091 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1092 and occurs exactly once or
1093 occurs once in each branch of a case and is small
1094
1095 If the thing is in WHNF, there's no danger of duplicating work,
1096 so we can inline if it occurs once, or is small
1097
1098 NOTE: we don't want to inline top-level functions that always diverge.
1099 It just makes the code bigger. Tt turns out that the convenient way to prevent
1100 them inlining is to give them a NOINLINE pragma, which we do in
1101 StrictAnal.addStrictnessInfoToTopId
1102 -}
1103
1104 callSiteInline :: DynFlags
1105 -> Id -- The Id
1106 -> Bool -- True <=> unfolding is active
1107 -> Bool -- True if there are no arguments at all (incl type args)
1108 -> [ArgSummary] -- One for each value arg; True if it is interesting
1109 -> CallCtxt -- True <=> continuation is interesting
1110 -> Maybe CoreExpr -- Unfolding, if any
1111
1112 data ArgSummary = TrivArg -- Nothing interesting
1113 | NonTrivArg -- Arg has structure
1114 | ValueArg -- Arg is a con-app or PAP
1115 -- ..or con-like. Note [Conlike is interesting]
1116
1117 instance Outputable ArgSummary where
1118 ppr TrivArg = text "TrivArg"
1119 ppr NonTrivArg = text "NonTrivArg"
1120 ppr ValueArg = text "ValueArg"
1121
1122 nonTriv :: ArgSummary -> Bool
1123 nonTriv TrivArg = False
1124 nonTriv _ = True
1125
1126 data CallCtxt
1127 = BoringCtxt
1128 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1129 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1130 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1131
1132 | ValAppCtxt -- We're applied to at least one value arg
1133 -- This arises when we have ((f x |> co) y)
1134 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1135
1136 | CaseCtxt -- We're the scrutinee of a case
1137 -- that decomposes its scrutinee
1138
1139 instance Outputable CallCtxt where
1140 ppr CaseCtxt = text "CaseCtxt"
1141 ppr ValAppCtxt = text "ValAppCtxt"
1142 ppr BoringCtxt = text "BoringCtxt"
1143 ppr RhsCtxt = text "RhsCtxt"
1144 ppr DiscArgCtxt = text "DiscArgCtxt"
1145 ppr RuleArgCtxt = text "RuleArgCtxt"
1146
1147 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1148 = case idUnfolding id of
1149 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1150 -- Things with an INLINE pragma may have an unfolding *and*
1151 -- be a loop breaker (maybe the knot is not yet untied)
1152 CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top
1153 , uf_is_work_free = is_wf
1154 , uf_guidance = guidance, uf_expandable = is_exp }
1155 | active_unfolding -> tryUnfolding dflags id lone_variable
1156 arg_infos cont_info unf_template is_top
1157 is_wf is_exp guidance
1158 | otherwise -> traceInline dflags "Inactive unfolding:" (ppr id) Nothing
1159 NoUnfolding -> Nothing
1160 BootUnfolding -> Nothing
1161 OtherCon {} -> Nothing
1162 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1163
1164 traceInline :: DynFlags -> String -> SDoc -> a -> a
1165 traceInline dflags str doc result
1166 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1167 = pprTrace str doc result
1168 | otherwise
1169 = result
1170
1171 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1172 -> CoreExpr -> Bool -> Bool -> Bool -> UnfoldingGuidance
1173 -> Maybe CoreExpr
1174 tryUnfolding dflags id lone_variable
1175 arg_infos cont_info unf_template is_top
1176 is_wf is_exp guidance
1177 = case guidance of
1178 UnfNever -> traceInline dflags str (text "UnfNever") Nothing
1179
1180 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1181 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1182 -- See Note [INLINE for small functions (3)]
1183 -> traceInline dflags str (mk_doc some_benefit empty True) (Just unf_template)
1184 | otherwise
1185 -> traceInline dflags str (mk_doc some_benefit empty False) Nothing
1186 where
1187 some_benefit = calc_some_benefit uf_arity
1188 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1189
1190 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1191 | ufVeryAggressive dflags
1192 -> traceInline dflags str (mk_doc some_benefit extra_doc True) (Just unf_template)
1193 | is_wf && some_benefit && small_enough
1194 -> traceInline dflags str (mk_doc some_benefit extra_doc True) (Just unf_template)
1195 | otherwise
1196 -> traceInline dflags str (mk_doc some_benefit extra_doc False) Nothing
1197 where
1198 some_benefit = calc_some_benefit (length arg_discounts)
1199 extra_doc = text "discounted size =" <+> int discounted_size
1200 discounted_size = size - discount
1201 small_enough = discounted_size <= ufUseThreshold dflags
1202 discount = computeDiscount dflags arg_discounts
1203 res_discount arg_infos cont_info
1204
1205 where
1206 mk_doc some_benefit extra_doc yes_or_no
1207 = vcat [ text "arg infos" <+> ppr arg_infos
1208 , text "interesting continuation" <+> ppr cont_info
1209 , text "some_benefit" <+> ppr some_benefit
1210 , text "is exp:" <+> ppr is_exp
1211 , text "is work-free:" <+> ppr is_wf
1212 , text "guidance" <+> ppr guidance
1213 , extra_doc
1214 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1215
1216 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1217 n_val_args = length arg_infos
1218
1219 -- some_benefit is used when the RHS is small enough
1220 -- and the call has enough (or too many) value
1221 -- arguments (ie n_val_args >= arity). But there must
1222 -- be *something* interesting about some argument, or the
1223 -- result context, to make it worth inlining
1224 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1225 -- expected by the unfolding
1226 calc_some_benefit uf_arity
1227 | not saturated = interesting_args -- Under-saturated
1228 -- Note [Unsaturated applications]
1229 | otherwise = interesting_args -- Saturated or over-saturated
1230 || interesting_call
1231 where
1232 saturated = n_val_args >= uf_arity
1233 over_saturated = n_val_args > uf_arity
1234 interesting_args = any nonTriv arg_infos
1235 -- NB: (any nonTriv arg_infos) looks at the
1236 -- over-saturated args too which is "wrong";
1237 -- but if over-saturated we inline anyway.
1238
1239 interesting_call
1240 | over_saturated
1241 = True
1242 | otherwise
1243 = case cont_info of
1244 CaseCtxt -> not (lone_variable && is_wf) -- Note [Lone variables]
1245 ValAppCtxt -> True -- Note [Cast then apply]
1246 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1247 DiscArgCtxt -> uf_arity > 0 --
1248 RhsCtxt -> uf_arity > 0 --
1249 _ -> not is_top && uf_arity > 0 -- Note [Nested functions]
1250 -- Note [Inlining in ArgCtxt]
1251
1252 {-
1253 Note [Unfold into lazy contexts], Note [RHS of lets]
1254 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1255 When the call is the argument of a function with a RULE, or the RHS of a let,
1256 we are a little bit keener to inline. For example
1257 f y = (y,y,y)
1258 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1259 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1260 only we can't see it. Also
1261 x = f v
1262 could be expensive whereas
1263 x = case v of (a,b) -> a
1264 is patently cheap and may allow more eta expansion.
1265 So we treat the RHS of a let as not-totally-boring.
1266
1267 Note [Unsaturated applications]
1268 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1269 When a call is not saturated, we *still* inline if one of the
1270 arguments has interesting structure. That's sometimes very important.
1271 A good example is the Ord instance for Bool in Base:
1272
1273 Rec {
1274 $fOrdBool =GHC.Classes.D:Ord
1275 @ Bool
1276 ...
1277 $cmin_ajX
1278
1279 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1280 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1281 }
1282
1283 But the defn of GHC.Classes.$dmmin is:
1284
1285 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1286 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1287 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1288 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1289 GHC.Types.False -> y GHC.Types.True -> x }) -}
1290
1291 We *really* want to inline $dmmin, even though it has arity 3, in
1292 order to unravel the recursion.
1293
1294
1295 Note [Things to watch]
1296 ~~~~~~~~~~~~~~~~~~~~~~
1297 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1298 Assume x is exported, so not inlined unconditionally.
1299 Then we want x to inline unconditionally; no reason for it
1300 not to, and doing so avoids an indirection.
1301
1302 * { x = I# 3; ....f x.... }
1303 Make sure that x does not inline unconditionally!
1304 Lest we get extra allocation.
1305
1306 Note [Inlining an InlineRule]
1307 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1308 An InlineRules is used for
1309 (a) programmer INLINE pragmas
1310 (b) inlinings from worker/wrapper
1311
1312 For (a) the RHS may be large, and our contract is that we *only* inline
1313 when the function is applied to all the arguments on the LHS of the
1314 source-code defn. (The uf_arity in the rule.)
1315
1316 However for worker/wrapper it may be worth inlining even if the
1317 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1318 require saturation.
1319
1320
1321 Note [Nested functions]
1322 ~~~~~~~~~~~~~~~~~~~~~~~
1323 If a function has a nested defn we also record some-benefit, on the
1324 grounds that we are often able to eliminate the binding, and hence the
1325 allocation, for the function altogether; this is good for join points.
1326 But this only makes sense for *functions*; inlining a constructor
1327 doesn't help allocation unless the result is scrutinised. UNLESS the
1328 constructor occurs just once, albeit possibly in multiple case
1329 branches. Then inlining it doesn't increase allocation, but it does
1330 increase the chance that the constructor won't be allocated at all in
1331 the branches that don't use it.
1332
1333 Note [Cast then apply]
1334 ~~~~~~~~~~~~~~~~~~~~~~
1335 Consider
1336 myIndex = __inline_me ( (/\a. <blah>) |> co )
1337 co :: (forall a. a -> a) ~ (forall a. T a)
1338 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1339
1340 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1341 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1342
1343 Note [Inlining in ArgCtxt]
1344 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1345 The condition (arity > 0) here is very important, because otherwise
1346 we end up inlining top-level stuff into useless places; eg
1347 x = I# 3#
1348 f = \y. g x
1349 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1350 and 20% to 'power'.
1351
1352 At one stage I replaced this condition by 'True' (leading to the above
1353 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1354 to work ok now.
1355
1356 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1357 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1358 for the RHS of a 'let', we only profit from the inlining if we get a
1359 CONLIKE thing (modulo lets).
1360
1361 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1362 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1363 The "lone-variable" case is important. I spent ages messing about
1364 with unsatisfactory variants, but this is nice. The idea is that if a
1365 variable appears all alone
1366
1367 as an arg of lazy fn, or rhs BoringCtxt
1368 as scrutinee of a case CaseCtxt
1369 as arg of a fn ArgCtxt
1370 AND
1371 it is bound to a cheap expression
1372
1373 then we should not inline it (unless there is some other reason,
1374 e.g. it is the sole occurrence). That is what is happening at
1375 the use of 'lone_variable' in 'interesting_call'.
1376
1377 Why? At least in the case-scrutinee situation, turning
1378 let x = (a,b) in case x of y -> ...
1379 into
1380 let x = (a,b) in case (a,b) of y -> ...
1381 and thence to
1382 let x = (a,b) in let y = (a,b) in ...
1383 is bad if the binding for x will remain.
1384
1385 Another example: I discovered that strings
1386 were getting inlined straight back into applications of 'error'
1387 because the latter is strict.
1388 s = "foo"
1389 f = \x -> ...(error s)...
1390
1391 Fundamentally such contexts should not encourage inlining because the
1392 context can ``see'' the unfolding of the variable (e.g. case or a
1393 RULE) so there's no gain. If the thing is bound to a value.
1394
1395 However, watch out:
1396
1397 * Consider this:
1398 foo = _inline_ (\n. [n])
1399 bar = _inline_ (foo 20)
1400 baz = \n. case bar of { (m:_) -> m + n }
1401 Here we really want to inline 'bar' so that we can inline 'foo'
1402 and the whole thing unravels as it should obviously do. This is
1403 important: in the NDP project, 'bar' generates a closure data
1404 structure rather than a list.
1405
1406 So the non-inlining of lone_variables should only apply if the
1407 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1408 looks through the unfolding. Hence the "&& is_wf" in the
1409 InlineRule branch.
1410
1411 * Even a type application or coercion isn't a lone variable.
1412 Consider
1413 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1414 We had better inline that sucker! The case won't see through it.
1415
1416 For now, I'm treating treating a variable applied to types
1417 in a *lazy* context "lone". The motivating example was
1418 f = /\a. \x. BIG
1419 g = /\a. \y. h (f a)
1420 There's no advantage in inlining f here, and perhaps
1421 a significant disadvantage. Hence some_val_args in the Stop case
1422
1423 Note [Interaction of exprIsWorkFree and lone variables]
1424 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1425 The lone-variable test says "don't inline if a case expression
1426 scrutinises a lone variable whose unfolding is cheap". It's very
1427 important that, under these circumstances, exprIsConApp_maybe
1428 can spot a constructor application. So, for example, we don't
1429 consider
1430 let x = e in (x,x)
1431 to be cheap, and that's good because exprIsConApp_maybe doesn't
1432 think that expression is a constructor application.
1433
1434 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1435 rather than is_wf, which was utterly wrong, because the above
1436 expression responds True to exprIsHNF, which is what sets is_value.
1437
1438 This kind of thing can occur if you have
1439
1440 {-# INLINE foo #-}
1441 foo = let x = e in (x,x)
1442
1443 which Roman did.
1444 -}
1445
1446 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1447 -> Int
1448 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1449 -- We multiple the raw discounts (args_discount and result_discount)
1450 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1451 -- *size* whereas the discounts imply that there's some extra
1452 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1453 -- by inlining.
1454
1455 = 10 -- Discount of 10 because the result replaces the call
1456 -- so we count 10 for the function itself
1457
1458 + 10 * length actual_arg_discounts
1459 -- Discount of 10 for each arg supplied,
1460 -- because the result replaces the call
1461
1462 + round (ufKeenessFactor dflags *
1463 fromIntegral (total_arg_discount + res_discount'))
1464 where
1465 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1466 total_arg_discount = sum actual_arg_discounts
1467
1468 mk_arg_discount _ TrivArg = 0
1469 mk_arg_discount _ NonTrivArg = 10
1470 mk_arg_discount discount ValueArg = discount
1471
1472 res_discount'
1473 | LT <- arg_discounts `compareLength` arg_infos
1474 = res_discount -- Over-saturated
1475 | otherwise
1476 = case cont_info of
1477 BoringCtxt -> 0
1478 CaseCtxt -> res_discount -- Presumably a constructor
1479 ValAppCtxt -> res_discount -- Presumably a function
1480 _ -> 40 `min` res_discount
1481 -- ToDo: this 40 `min` res_discount doesn't seem right
1482 -- for DiscArgCtxt it shouldn't matter because the function will
1483 -- get the arg discount for any non-triv arg
1484 -- for RuleArgCtxt we do want to be keener to inline; but not only
1485 -- constructor results
1486 -- for RhsCtxt I suppose that exposing a data con is good in general
1487 -- And 40 seems very arbitrary
1488 --
1489 -- res_discount can be very large when a function returns
1490 -- constructors; but we only want to invoke that large discount
1491 -- when there's a case continuation.
1492 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1493 -- But we want to aovid inlining large functions that return
1494 -- constructors into contexts that are simply "interesting"