Lint types in newFamInst
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import PprCore () -- Instances
50 import OccurAnal ( occurAnalyseExpr )
51 import CoreOpt
52 import CoreArity ( manifestArity )
53 import CoreUtils
54 import Id
55 import Demand ( isBottomingSig )
56 import DataCon
57 import Literal
58 import PrimOp
59 import IdInfo
60 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
61 import Type
62 import PrelNames
63 import TysPrim ( realWorldStatePrimTy )
64 import Bag
65 import Util
66 import Outputable
67 import ForeignCall
68 import Name
69
70 import qualified Data.ByteString as BS
71 import Data.List
72
73 {-
74 ************************************************************************
75 * *
76 \subsection{Making unfoldings}
77 * *
78 ************************************************************************
79 -}
80
81 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
82 mkTopUnfolding dflags is_bottoming rhs
83 = mkUnfolding dflags InlineRhs True is_bottoming rhs
84
85 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
86 -- For implicit Ids, do a tiny bit of optimising first
87 mkImplicitUnfolding dflags expr
88 = mkTopUnfolding dflags False (simpleOptExpr expr)
89
90 -- Note [Top-level flag on inline rules]
91 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 -- Slight hack: note that mk_inline_rules conservatively sets the
93 -- top-level flag to True. It gets set more accurately by the simplifier
94 -- Simplify.simplUnfolding.
95
96 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
97 mkSimpleUnfolding dflags rhs
98 = mkUnfolding dflags InlineRhs False False rhs
99
100 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
101 mkDFunUnfolding bndrs con ops
102 = DFunUnfolding { df_bndrs = bndrs
103 , df_con = con
104 , df_args = map occurAnalyseExpr ops }
105 -- See Note [Occurrence analysis of unfoldings]
106
107 mkWwInlineRule :: CoreExpr -> Arity -> Unfolding
108 mkWwInlineRule expr arity
109 = mkCoreUnfolding InlineStable True
110 (simpleOptExpr expr)
111 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
112 , ug_boring_ok = boringCxtNotOk })
113
114 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
115 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
116 = mkCoreUnfolding InlineCompulsory True
117 (simpleOptExpr expr)
118 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
119 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
120
121 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
122 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
123 mkWorkerUnfolding dflags work_fn
124 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
125 , uf_is_top = top_lvl })
126 | isStableSource src
127 = mkCoreUnfolding src top_lvl new_tmpl guidance
128 where
129 new_tmpl = simpleOptExpr (work_fn tmpl)
130 guidance = calcUnfoldingGuidance dflags False new_tmpl
131
132 mkWorkerUnfolding _ _ _ = noUnfolding
133
134 -- | Make an unfolding that may be used unsaturated
135 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
136 -- manifest arity (the number of outer lambdas applications will
137 -- resolve before doing any work).
138 mkInlineUnfolding :: CoreExpr -> Unfolding
139 mkInlineUnfolding expr
140 = mkCoreUnfolding InlineStable
141 True -- Note [Top-level flag on inline rules]
142 expr' guide
143 where
144 expr' = simpleOptExpr expr
145 guide = UnfWhen { ug_arity = manifestArity expr'
146 , ug_unsat_ok = unSaturatedOk
147 , ug_boring_ok = boring_ok }
148 boring_ok = inlineBoringOk expr'
149
150 -- | Make an unfolding that will be used once the RHS has been saturated
151 -- to the given arity.
152 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
153 mkInlineUnfoldingWithArity arity expr
154 = mkCoreUnfolding InlineStable
155 True -- Note [Top-level flag on inline rules]
156 expr' guide
157 where
158 expr' = simpleOptExpr expr
159 guide = UnfWhen { ug_arity = arity
160 , ug_unsat_ok = needSaturated
161 , ug_boring_ok = boring_ok }
162 boring_ok = inlineBoringOk expr'
163
164 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
165 mkInlinableUnfolding dflags expr
166 = mkUnfolding dflags InlineStable False False expr'
167 where
168 expr' = simpleOptExpr expr
169
170 specUnfolding :: [Var] -> (CoreExpr -> CoreExpr) -> Arity -> Unfolding -> Unfolding
171 -- See Note [Specialising unfoldings]
172 -- specUnfolding spec_bndrs spec_app arity_decrease unf
173 -- = \spec_bndrs. spec_app( unf )
174 --
175 specUnfolding spec_bndrs spec_app arity_decrease
176 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
177 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
178 mkDFunUnfolding spec_bndrs con (map spec_arg args)
179 -- There is a hard-to-check assumption here that the spec_app has
180 -- enough applications to exactly saturate the old_bndrs
181 -- For DFunUnfoldings we transform
182 -- \old_bndrs. MkD <op1> ... <opn>
183 -- to
184 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
185 -- The ASSERT checks the value part of that
186 where
187 spec_arg arg = simpleOptExpr (spec_app (mkLams old_bndrs arg))
188 -- The beta-redexes created by spec_app will be
189 -- simplified away by simplOptExpr
190
191 specUnfolding spec_bndrs spec_app arity_decrease
192 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
193 , uf_is_top = top_lvl
194 , uf_guidance = old_guidance })
195 | isStableSource src -- See Note [Specialising unfoldings]
196 , UnfWhen { ug_arity = old_arity
197 , ug_unsat_ok = unsat_ok
198 , ug_boring_ok = boring_ok } <- old_guidance
199 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
200 , ug_unsat_ok = unsat_ok
201 , ug_boring_ok = boring_ok }
202 new_tmpl = simpleOptExpr (mkLams spec_bndrs (spec_app tmpl))
203 -- The beta-redexes created by spec_app will be
204 -- simplified away by simplOptExpr
205
206 in mkCoreUnfolding src top_lvl new_tmpl guidance
207
208 specUnfolding _ _ _ _ = noUnfolding
209
210 {- Note [Specialising unfoldings]
211 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
212 When we specialise a function for some given type-class arguments, we use
213 specUnfolding to specialise its unfolding. Some important points:
214
215 * If the original function has a DFunUnfolding, the specialised one
216 must do so too! Otherwise we lose the magic rules that make it
217 interact with ClassOps
218
219 * There is a bit of hack for INLINABLE functions:
220 f :: Ord a => ....
221 f = <big-rhs>
222 {- INLINABLE f #-}
223 Now if we specialise f, should the specialised version still have
224 an INLINABLE pragma? If it does, we'll capture a specialised copy
225 of <big-rhs> as its unfolding, and that probaby won't inline. But
226 if we don't, the specialised version of <big-rhs> might be small
227 enough to inline at a call site. This happens with Control.Monad.liftM3,
228 and can cause a lot more allocation as a result (nofib n-body shows this).
229
230 Moreover, keeping the INLINABLE thing isn't much help, because
231 the specialised function (probaby) isn't overloaded any more.
232
233 Conclusion: drop the INLINEALE pragma. In practice what this means is:
234 if a stable unfolding has UnfoldingGuidance of UnfWhen,
235 we keep it (so the specialised thing too will always inline)
236 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
237 (which arises from INLINABLE), we discard it
238 -}
239
240 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
241 -> UnfoldingGuidance -> Unfolding
242 -- Occurrence-analyses the expression before capturing it
243 mkCoreUnfolding src top_lvl expr guidance
244 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
245 -- See Note [Occurrence analysis of unfoldings]
246 uf_src = src,
247 uf_is_top = top_lvl,
248 uf_is_value = exprIsHNF expr,
249 uf_is_conlike = exprIsConLike expr,
250 uf_is_work_free = exprIsWorkFree expr,
251 uf_expandable = exprIsExpandable expr,
252 uf_guidance = guidance }
253
254 mkUnfolding :: DynFlags -> UnfoldingSource
255 -> Bool -- Is top-level
256 -> Bool -- Definitely a bottoming binding
257 -- (only relevant for top-level bindings)
258 -> CoreExpr
259 -> Unfolding
260 -- Calculates unfolding guidance
261 -- Occurrence-analyses the expression before capturing it
262 mkUnfolding dflags src is_top_lvl is_bottoming expr
263 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
264 -- See Note [Occurrence analysis of unfoldings]
265 uf_src = src,
266 uf_is_top = is_top_lvl,
267 uf_is_value = exprIsHNF expr,
268 uf_is_conlike = exprIsConLike expr,
269 uf_expandable = exprIsExpandable expr,
270 uf_is_work_free = exprIsWorkFree expr,
271 uf_guidance = guidance }
272 where
273 is_top_bottoming = is_top_lvl && is_bottoming
274 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
275 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
276 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
277
278 {-
279 Note [Occurrence analysis of unfoldings]
280 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
281 We do occurrence-analysis of unfoldings once and for all, when the
282 unfolding is built, rather than each time we inline them.
283
284 But given this decision it's vital that we do
285 *always* do it. Consider this unfolding
286 \x -> letrec { f = ...g...; g* = f } in body
287 where g* is (for some strange reason) the loop breaker. If we don't
288 occ-anal it when reading it in, we won't mark g as a loop breaker, and
289 we may inline g entirely in body, dropping its binding, and leaving
290 the occurrence in f out of scope. This happened in Trac #8892, where
291 the unfolding in question was a DFun unfolding.
292
293 But more generally, the simplifier is designed on the
294 basis that it is looking at occurrence-analysed expressions, so better
295 ensure that they acutally are.
296
297 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
298 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
299 Notice that we give the non-occur-analysed expression to
300 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
301 first; for example, sometimes during simplification, there's a large
302 let-bound thing which has been substituted, and so is now dead; so
303 'expr' contains two copies of the thing while the occurrence-analysed
304 expression doesn't.
305
306 Nevertheless, we *don't* and *must not* occ-analyse before computing
307 the size because
308
309 a) The size computation bales out after a while, whereas occurrence
310 analysis does not.
311
312 b) Residency increases sharply if you occ-anal first. I'm not
313 100% sure why, but it's a large effect. Compiling Cabal went
314 from residency of 534M to over 800M with this one change.
315
316 This can occasionally mean that the guidance is very pessimistic;
317 it gets fixed up next round. And it should be rare, because large
318 let-bound things that are dead are usually caught by preInlineUnconditionally
319
320
321 ************************************************************************
322 * *
323 \subsection{The UnfoldingGuidance type}
324 * *
325 ************************************************************************
326 -}
327
328 inlineBoringOk :: CoreExpr -> Bool
329 -- See Note [INLINE for small functions]
330 -- True => the result of inlining the expression is
331 -- no bigger than the expression itself
332 -- eg (\x y -> f y x)
333 -- This is a quick and dirty version. It doesn't attempt
334 -- to deal with (\x y z -> x (y z))
335 -- The really important one is (x `cast` c)
336 inlineBoringOk e
337 = go 0 e
338 where
339 go :: Int -> CoreExpr -> Bool
340 go credit (Lam x e) | isId x = go (credit+1) e
341 | otherwise = go credit e
342 go credit (App f (Type {})) = go credit f
343 go credit (App f a) | credit > 0
344 , exprIsTrivial a = go (credit-1) f
345 go credit (Tick _ e) = go credit e -- dubious
346 go credit (Cast e _) = go credit e
347 go _ (Var {}) = boringCxtOk
348 go _ _ = boringCxtNotOk
349
350 calcUnfoldingGuidance
351 :: DynFlags
352 -> Bool -- Definitely a top-level, bottoming binding
353 -> CoreExpr -- Expression to look at
354 -> UnfoldingGuidance
355 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
356 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
357 = calcUnfoldingGuidance dflags is_top_bottoming expr
358 calcUnfoldingGuidance dflags is_top_bottoming expr
359 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
360 TooBig -> UnfNever
361 SizeIs size cased_bndrs scrut_discount
362 | uncondInline expr n_val_bndrs size
363 -> UnfWhen { ug_unsat_ok = unSaturatedOk
364 , ug_boring_ok = boringCxtOk
365 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
366
367 | is_top_bottoming
368 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
369
370 | otherwise
371 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
372 , ug_size = size
373 , ug_res = scrut_discount }
374
375 where
376 (bndrs, body) = collectBinders expr
377 bOMB_OUT_SIZE = ufCreationThreshold dflags
378 -- Bomb out if size gets bigger than this
379 val_bndrs = filter isId bndrs
380 n_val_bndrs = length val_bndrs
381
382 mk_discount :: Bag (Id,Int) -> Id -> Int
383 mk_discount cbs bndr = foldlBag combine 0 cbs
384 where
385 combine acc (bndr', disc)
386 | bndr == bndr' = acc `plus_disc` disc
387 | otherwise = acc
388
389 plus_disc :: Int -> Int -> Int
390 plus_disc | isFunTy (idType bndr) = max
391 | otherwise = (+)
392 -- See Note [Function and non-function discounts]
393
394 {-
395 Note [Computing the size of an expression]
396 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
397 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
398 heuristics right has taken a long time. Here's the basic strategy:
399
400 * Variables, literals: 0
401 (Exception for string literals, see litSize.)
402
403 * Function applications (f e1 .. en): 1 + #value args
404
405 * Constructor applications: 1, regardless of #args
406
407 * Let(rec): 1 + size of components
408
409 * Note, cast: 0
410
411 Examples
412
413 Size Term
414 --------------
415 0 42#
416 0 x
417 0 True
418 2 f x
419 1 Just x
420 4 f (g x)
421
422 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
423 a function call to account for. Notice also that constructor applications
424 are very cheap, because exposing them to a caller is so valuable.
425
426 [25/5/11] All sizes are now multiplied by 10, except for primops
427 (which have sizes like 1 or 4. This makes primops look fantastically
428 cheap, and seems to be almost unversally beneficial. Done partly as a
429 result of #4978.
430
431 Note [Do not inline top-level bottoming functions]
432 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
433 The FloatOut pass has gone to some trouble to float out calls to 'error'
434 and similar friends. See Note [Bottoming floats] in SetLevels.
435 Do not re-inline them! But we *do* still inline if they are very small
436 (the uncondInline stuff).
437
438 Note [INLINE for small functions]
439 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
440 Consider {-# INLINE f #-}
441 f x = Just x
442 g y = f y
443 Then f's RHS is no larger than its LHS, so we should inline it into
444 even the most boring context. In general, f the function is
445 sufficiently small that its body is as small as the call itself, the
446 inline unconditionally, regardless of how boring the context is.
447
448 Things to note:
449
450 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
451 than the thing it's replacing. Notice that
452 (f x) --> (g 3) -- YES, unconditionally
453 (f x) --> x : [] -- YES, *even though* there are two
454 -- arguments to the cons
455 x --> g 3 -- NO
456 x --> Just v -- NO
457
458 It's very important not to unconditionally replace a variable by
459 a non-atomic term.
460
461 (2) We do this even if the thing isn't saturated, else we end up with the
462 silly situation that
463 f x y = x
464 ...map (f 3)...
465 doesn't inline. Even in a boring context, inlining without being
466 saturated will give a lambda instead of a PAP, and will be more
467 efficient at runtime.
468
469 (3) However, when the function's arity > 0, we do insist that it
470 has at least one value argument at the call site. (This check is
471 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
472 f = /\a \x:a. x
473 d = /\b. MkD (f b)
474 If we inline f here we get
475 d = /\b. MkD (\x:b. x)
476 and then prepareRhs floats out the argument, abstracting the type
477 variables, so we end up with the original again!
478
479 (4) We must be much more cautious about arity-zero things. Consider
480 let x = y +# z in ...
481 In *size* terms primops look very small, because the generate a
482 single instruction, but we do not want to unconditionally replace
483 every occurrence of x with (y +# z). So we only do the
484 unconditional-inline thing for *trivial* expressions.
485
486 NB: you might think that PostInlineUnconditionally would do this
487 but it doesn't fire for top-level things; see SimplUtils
488 Note [Top level and postInlineUnconditionally]
489 -}
490
491 uncondInline :: CoreExpr -> Arity -> Int -> Bool
492 -- Inline unconditionally if there no size increase
493 -- Size of call is arity (+1 for the function)
494 -- See Note [INLINE for small functions]
495 uncondInline rhs arity size
496 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
497 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
498
499 sizeExpr :: DynFlags
500 -> Int -- Bomb out if it gets bigger than this
501 -> [Id] -- Arguments; we're interested in which of these
502 -- get case'd
503 -> CoreExpr
504 -> ExprSize
505
506 -- Note [Computing the size of an expression]
507
508 sizeExpr dflags bOMB_OUT_SIZE top_args expr
509 = size_up expr
510 where
511 size_up (Cast e _) = size_up e
512 size_up (Tick _ e) = size_up e
513 size_up (Type _) = sizeZero -- Types cost nothing
514 size_up (Coercion _) = sizeZero
515 size_up (Lit lit) = sizeN (litSize lit)
516 size_up (Var f) | isRealWorldId f = sizeZero
517 -- Make sure we get constructor discounts even
518 -- on nullary constructors
519 | otherwise = size_up_call f [] 0
520
521 size_up (App fun arg)
522 | isTyCoArg arg = size_up fun
523 | otherwise = size_up arg `addSizeNSD`
524 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
525
526 size_up (Lam b e)
527 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
528 | otherwise = size_up e
529
530 size_up (Let (NonRec binder rhs) body)
531 = size_up_rhs (binder, rhs) `addSizeNSD`
532 size_up body `addSizeN`
533 size_up_alloc binder
534
535 size_up (Let (Rec pairs) body)
536 = foldr (addSizeNSD . size_up_rhs)
537 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
538 pairs
539
540 size_up (Case e _ _ alts)
541 | null alts
542 = size_up e -- case e of {} never returns, so take size of scrutinee
543
544 size_up (Case e _ _ alts)
545 -- Now alts is non-empty
546 | Just v <- is_top_arg e -- We are scrutinising an argument variable
547 = let
548 alt_sizes = map size_up_alt alts
549
550 -- alts_size tries to compute a good discount for
551 -- the case when we are scrutinising an argument variable
552 alts_size (SizeIs tot tot_disc tot_scrut)
553 -- Size of all alternatives
554 (SizeIs max _ _)
555 -- Size of biggest alternative
556 = SizeIs tot (unitBag (v, 20 + tot - max)
557 `unionBags` tot_disc) tot_scrut
558 -- If the variable is known, we produce a
559 -- discount that will take us back to 'max',
560 -- the size of the largest alternative The
561 -- 1+ is a little discount for reduced
562 -- allocation in the caller
563 --
564 -- Notice though, that we return tot_disc,
565 -- the total discount from all branches. I
566 -- think that's right.
567
568 alts_size tot_size _ = tot_size
569 in
570 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
571 (foldr1 maxSize alt_sizes)
572 -- Good to inline if an arg is scrutinised, because
573 -- that may eliminate allocation in the caller
574 -- And it eliminates the case itself
575 where
576 is_top_arg (Var v) | v `elem` top_args = Just v
577 is_top_arg (Cast e _) = is_top_arg e
578 is_top_arg _ = Nothing
579
580
581 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
582 foldr (addAltSize . size_up_alt) case_size alts
583 where
584 case_size
585 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
586 | otherwise = sizeZero
587 -- Normally we don't charge for the case itself, but
588 -- we charge one per alternative (see size_up_alt,
589 -- below) to account for the cost of the info table
590 -- and comparisons.
591 --
592 -- However, in certain cases (see is_inline_scrut
593 -- below), no code is generated for the case unless
594 -- there are multiple alts. In these cases we
595 -- subtract one, making the first alt free.
596 -- e.g. case x# +# y# of _ -> ... should cost 1
597 -- case touch# x# of _ -> ... should cost 0
598 -- (see #4978)
599 --
600 -- I would like to not have the "lengthAtMost alts 1"
601 -- condition above, but without that some programs got worse
602 -- (spectral/hartel/event and spectral/para). I don't fully
603 -- understand why. (SDM 24/5/11)
604
605 -- unboxed variables, inline primops and unsafe foreign calls
606 -- are all "inline" things:
607 is_inline_scrut (Var v) = isUnliftedType (idType v)
608 is_inline_scrut scrut
609 | (Var f, _) <- collectArgs scrut
610 = case idDetails f of
611 FCallId fc -> not (isSafeForeignCall fc)
612 PrimOpId op -> not (primOpOutOfLine op)
613 _other -> False
614 | otherwise
615 = False
616
617 size_up_rhs (bndr, rhs)
618 | Just join_arity <- isJoinId_maybe bndr
619 -- Skip arguments to join point
620 , (_bndrs, body) <- collectNBinders join_arity rhs
621 = size_up body
622 | otherwise
623 = size_up rhs
624
625 ------------
626 -- size_up_app is used when there's ONE OR MORE value args
627 size_up_app (App fun arg) args voids
628 | isTyCoArg arg = size_up_app fun args voids
629 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
630 | otherwise = size_up arg `addSizeNSD`
631 size_up_app fun (arg:args) voids
632 size_up_app (Var fun) args voids = size_up_call fun args voids
633 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
634 size_up_app (Cast expr _) args voids = size_up_app expr args voids
635 size_up_app other args voids = size_up other `addSizeN`
636 callSize (length args) voids
637 -- if the lhs is not an App or a Var, or an invisible thing like a
638 -- Tick or Cast, then we should charge for a complete call plus the
639 -- size of the lhs itself.
640
641 ------------
642 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
643 size_up_call fun val_args voids
644 = case idDetails fun of
645 FCallId _ -> sizeN (callSize (length val_args) voids)
646 DataConWorkId dc -> conSize dc (length val_args)
647 PrimOpId op -> primOpSize op (length val_args)
648 ClassOpId _ -> classOpSize dflags top_args val_args
649 _ -> funSize dflags top_args fun (length val_args) voids
650
651 ------------
652 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
653 -- Don't charge for args, so that wrappers look cheap
654 -- (See comments about wrappers with Case)
655 --
656 -- IMPORTANT: *do* charge 1 for the alternative, else we
657 -- find that giant case nests are treated as practically free
658 -- A good example is Foreign.C.Error.errnoToIOError
659
660 ------------
661 -- Cost to allocate binding with given binder
662 size_up_alloc bndr
663 | isTyVar bndr -- Doesn't exist at runtime
664 || isJoinId bndr -- Not allocated at all
665 || isUnliftedType (idType bndr) -- Doesn't live in heap
666 = 0
667 | otherwise
668 = 10
669
670 ------------
671 -- These addSize things have to be here because
672 -- I don't want to give them bOMB_OUT_SIZE as an argument
673 addSizeN TooBig _ = TooBig
674 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
675
676 -- addAltSize is used to add the sizes of case alternatives
677 addAltSize TooBig _ = TooBig
678 addAltSize _ TooBig = TooBig
679 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
680 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
681 (xs `unionBags` ys)
682 (d1 + d2) -- Note [addAltSize result discounts]
683
684 -- This variant ignores the result discount from its LEFT argument
685 -- It's used when the second argument isn't part of the result
686 addSizeNSD TooBig _ = TooBig
687 addSizeNSD _ TooBig = TooBig
688 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
689 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
690 (xs `unionBags` ys)
691 d2 -- Ignore d1
692
693 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
694
695 -- an expression of type State# RealWorld must be a variable
696 isRealWorldExpr (Var id) = isRealWorldId id
697 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
698 isRealWorldExpr _ = False
699
700 -- | Finds a nominal size of a string literal.
701 litSize :: Literal -> Int
702 -- Used by CoreUnfold.sizeExpr
703 litSize (LitInteger {}) = 100 -- Note [Size of literal integers]
704 litSize (MachStr str) = 10 + 10 * ((BS.length str + 3) `div` 4)
705 -- If size could be 0 then @f "x"@ might be too small
706 -- [Sept03: make literal strings a bit bigger to avoid fruitless
707 -- duplication of little strings]
708 litSize _other = 0 -- Must match size of nullary constructors
709 -- Key point: if x |-> 4, then x must inline unconditionally
710 -- (eg via case binding)
711
712 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
713 -- See Note [Conlike is interesting]
714 classOpSize _ _ []
715 = sizeZero
716 classOpSize dflags top_args (arg1 : other_args)
717 = SizeIs size arg_discount 0
718 where
719 size = 20 + (10 * length other_args)
720 -- If the class op is scrutinising a lambda bound dictionary then
721 -- give it a discount, to encourage the inlining of this function
722 -- The actual discount is rather arbitrarily chosen
723 arg_discount = case arg1 of
724 Var dict | dict `elem` top_args
725 -> unitBag (dict, ufDictDiscount dflags)
726 _other -> emptyBag
727
728 -- | The size of a function call
729 callSize
730 :: Int -- ^ number of value args
731 -> Int -- ^ number of value args that are void
732 -> Int
733 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
734 -- The 1+ is for the function itself
735 -- Add 1 for each non-trivial arg;
736 -- the allocation cost, as in let(rec)
737
738 -- | The size of a jump to a join point
739 jumpSize
740 :: Int -- ^ number of value args
741 -> Int -- ^ number of value args that are void
742 -> Int
743 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
744 -- A jump is 20% the size of a function call. Making jumps free reopens
745 -- bug #6048, but making them any more expensive loses a 21% improvement in
746 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
747 -- better solution?
748
749 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
750 -- Size for functions that are not constructors or primops
751 -- Note [Function applications]
752 funSize dflags top_args fun n_val_args voids
753 | fun `hasKey` buildIdKey = buildSize
754 | fun `hasKey` augmentIdKey = augmentSize
755 | otherwise = SizeIs size arg_discount res_discount
756 where
757 some_val_args = n_val_args > 0
758 is_join = isJoinId fun
759
760 size | is_join = jumpSize n_val_args voids
761 | not some_val_args = 0
762 | otherwise = callSize n_val_args voids
763
764 -- DISCOUNTS
765 -- See Note [Function and non-function discounts]
766 arg_discount | some_val_args && fun `elem` top_args
767 = unitBag (fun, ufFunAppDiscount dflags)
768 | otherwise = emptyBag
769 -- If the function is an argument and is applied
770 -- to some values, give it an arg-discount
771
772 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
773 | otherwise = 0
774 -- If the function is partially applied, show a result discount
775 -- XXX maybe behave like ConSize for eval'd variable
776
777 conSize :: DataCon -> Int -> ExprSize
778 conSize dc n_val_args
779 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
780
781 -- See Note [Unboxed tuple size and result discount]
782 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
783
784 -- See Note [Constructor size and result discount]
785 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
786
787 -- XXX still looks to large to me
788
789 {-
790 Note [Constructor size and result discount]
791 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
792 Treat a constructors application as size 10, regardless of how many
793 arguments it has; we are keen to expose them (and we charge separately
794 for their args). We can't treat them as size zero, else we find that
795 (Just x) has size 0, which is the same as a lone variable; and hence
796 'v' will always be replaced by (Just x), where v is bound to Just x.
797
798 The "result discount" is applied if the result of the call is
799 scrutinised (say by a case). For a constructor application that will
800 mean the constructor application will disappear, so we don't need to
801 charge it to the function. So the discount should at least match the
802 cost of the constructor application, namely 10. But to give a bit
803 of extra incentive we give a discount of 10*(1 + n_val_args).
804
805 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
806 and said it was an "unambiguous win", but its terribly dangerous
807 because a function with many many case branches, each finishing with
808 a constructor, can have an arbitrarily large discount. This led to
809 terrible code bloat: see Trac #6099.
810
811 Note [Unboxed tuple size and result discount]
812 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
813 However, unboxed tuples count as size zero. I found occasions where we had
814 f x y z = case op# x y z of { s -> (# s, () #) }
815 and f wasn't getting inlined.
816
817 I tried giving unboxed tuples a *result discount* of zero (see the
818 commented-out line). Why? When returned as a result they do not
819 allocate, so maybe we don't want to charge so much for them If you
820 have a non-zero discount here, we find that workers often get inlined
821 back into wrappers, because it look like
822 f x = case $wf x of (# a,b #) -> (a,b)
823 and we are keener because of the case. However while this change
824 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
825 more. All other changes were very small. So it's not a big deal but I
826 didn't adopt the idea.
827
828 Note [Function and non-function discounts]
829 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
830 We want a discount if the function is applied. A good example is
831 monadic combinators with continuation arguments, where inlining is
832 quite important.
833
834 But we don't want a big discount when a function is called many times
835 (see the detailed comments with Trac #6048) because if the function is
836 big it won't be inlined at its many call sites and no benefit results.
837 Indeed, we can get exponentially big inlinings this way; that is what
838 Trac #6048 is about.
839
840 On the other hand, for data-valued arguments, if there are lots of
841 case expressions in the body, each one will get smaller if we apply
842 the function to a constructor application, so we *want* a big discount
843 if the argument is scrutinised by many case expressions.
844
845 Conclusion:
846 - For functions, take the max of the discounts
847 - For data values, take the sum of the discounts
848
849
850 Note [Literal integer size]
851 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
852 Literal integers *can* be big (mkInteger [...coefficients...]), but
853 need not be (S# n). We just use an arbitrary big-ish constant here
854 so that, in particular, we don't inline top-level defns like
855 n = S# 5
856 There's no point in doing so -- any optimisations will see the S#
857 through n's unfolding. Nor will a big size inhibit unfoldings functions
858 that mention a literal Integer, because the float-out pass will float
859 all those constants to top level.
860 -}
861
862 primOpSize :: PrimOp -> Int -> ExprSize
863 primOpSize op n_val_args
864 = if primOpOutOfLine op
865 then sizeN (op_size + n_val_args)
866 else sizeN op_size
867 where
868 op_size = primOpCodeSize op
869
870
871 buildSize :: ExprSize
872 buildSize = SizeIs 0 emptyBag 40
873 -- We really want to inline applications of build
874 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
875 -- Indeed, we should add a result_discount because build is
876 -- very like a constructor. We don't bother to check that the
877 -- build is saturated (it usually is). The "-2" discounts for the \c n,
878 -- The "4" is rather arbitrary.
879
880 augmentSize :: ExprSize
881 augmentSize = SizeIs 0 emptyBag 40
882 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
883 -- e plus ys. The -2 accounts for the \cn
884
885 -- When we return a lambda, give a discount if it's used (applied)
886 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
887 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
888 lamScrutDiscount _ TooBig = TooBig
889
890 {-
891 Note [addAltSize result discounts]
892 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
893 When adding the size of alternatives, we *add* the result discounts
894 too, rather than take the *maximum*. For a multi-branch case, this
895 gives a discount for each branch that returns a constructor, making us
896 keener to inline. I did try using 'max' instead, but it makes nofib
897 'rewrite' and 'puzzle' allocate significantly more, and didn't make
898 binary sizes shrink significantly either.
899
900 Note [Discounts and thresholds]
901 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
902 Constants for discounts and thesholds are defined in main/DynFlags,
903 all of form ufXxxx. They are:
904
905 ufCreationThreshold
906 At a definition site, if the unfolding is bigger than this, we
907 may discard it altogether
908
909 ufUseThreshold
910 At a call site, if the unfolding, less discounts, is smaller than
911 this, then it's small enough inline
912
913 ufKeenessFactor
914 Factor by which the discounts are multiplied before
915 subtracting from size
916
917 ufDictDiscount
918 The discount for each occurrence of a dictionary argument
919 as an argument of a class method. Should be pretty small
920 else big functions may get inlined
921
922 ufFunAppDiscount
923 Discount for a function argument that is applied. Quite
924 large, because if we inline we avoid the higher-order call.
925
926 ufDearOp
927 The size of a foreign call or not-dupable PrimOp
928
929 ufVeryAggressive
930 If True, the compiler ignores all the thresholds and inlines very
931 aggressively. It still adheres to arity, simplifier phase control and
932 loop breakers.
933
934
935 Note [Function applications]
936 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
937 In a function application (f a b)
938
939 - If 'f' is an argument to the function being analysed,
940 and there's at least one value arg, record a FunAppDiscount for f
941
942 - If the application if a PAP (arity > 2 in this example)
943 record a *result* discount (because inlining
944 with "extra" args in the call may mean that we now
945 get a saturated application)
946
947 Code for manipulating sizes
948 -}
949
950 -- | The size of a candidate expression for unfolding
951 data ExprSize
952 = TooBig
953 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
954 , _es_args :: !(Bag (Id,Int))
955 -- ^ Arguments cased herein, and discount for each such
956 , _es_discount :: {-# UNPACK #-} !Int
957 -- ^ Size to subtract if result is scrutinised by a case
958 -- expression
959 }
960
961 instance Outputable ExprSize where
962 ppr TooBig = text "TooBig"
963 ppr (SizeIs a _ c) = brackets (int a <+> int c)
964
965 -- subtract the discount before deciding whether to bale out. eg. we
966 -- want to inline a large constructor application into a selector:
967 -- tup = (a_1, ..., a_99)
968 -- x = case tup of ...
969 --
970 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
971 mkSizeIs max n xs d | (n - d) > max = TooBig
972 | otherwise = SizeIs n xs d
973
974 maxSize :: ExprSize -> ExprSize -> ExprSize
975 maxSize TooBig _ = TooBig
976 maxSize _ TooBig = TooBig
977 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
978 | otherwise = s2
979
980 sizeZero :: ExprSize
981 sizeN :: Int -> ExprSize
982
983 sizeZero = SizeIs 0 emptyBag 0
984 sizeN n = SizeIs n emptyBag 0
985
986 {-
987 ************************************************************************
988 * *
989 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
990 * *
991 ************************************************************************
992
993 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
994 we ``couldn't possibly use'' on the other side. Can be overridden w/
995 flaggery. Just the same as smallEnoughToInline, except that it has no
996 actual arguments.
997 -}
998
999 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1000 couldBeSmallEnoughToInline dflags threshold rhs
1001 = case sizeExpr dflags threshold [] body of
1002 TooBig -> False
1003 _ -> True
1004 where
1005 (_, body) = collectBinders rhs
1006
1007 ----------------
1008 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1009 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1010 = size <= ufUseThreshold dflags
1011 smallEnoughToInline _ _
1012 = False
1013
1014 ----------------
1015
1016 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1017 -- Sees if the unfolding is pretty certain to inline
1018 -- If so, return a *stable* unfolding for it, that will always inline
1019 certainlyWillInline dflags fn_info
1020 = case unfoldingInfo fn_info of
1021 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1022 | loop_breaker -> Nothing -- Won't inline, so try w/w
1023 | otherwise -> do_cunf e g -- Depends on size, so look at that
1024
1025 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1026 -- to do so, and even if it is currently a
1027 -- loop breaker, it may not be later
1028
1029 _other_unf -> Nothing
1030
1031 where
1032 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1033 fn_unf = unfoldingInfo fn_info
1034
1035 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1036 do_cunf _ UnfNever = Nothing
1037 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1038 -- INLINE functions have UnfWhen
1039
1040 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1041 -- binary sizes go up by 10%! (This is with SplitObjs.)
1042 -- I'm not totally sure why.
1043 -- INLINABLE functions come via this path
1044 -- See Note [certainlyWillInline: INLINABLE]
1045 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1046 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1047 , case inlinePragmaSpec (inlinePragInfo fn_info) of
1048 NoInline -> False -- NOINLINE; do not say certainlyWillInline!
1049 _ -> True -- INLINE, INLINABLE, or nothing
1050 , not (isBottomingSig (strictnessInfo fn_info))
1051 -- Do not unconditionally inline a bottoming functions even if
1052 -- it seems smallish. We've carefully lifted it out to top level,
1053 -- so we don't want to re-inline it.
1054 , let arity = length args
1055 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1056 = Just (fn_unf { uf_src = InlineStable
1057 , uf_guidance = UnfWhen { ug_arity = arity
1058 , ug_unsat_ok = unSaturatedOk
1059 , ug_boring_ok = inlineBoringOk expr } })
1060 -- Note the "unsaturatedOk". A function like f = \ab. a
1061 -- will certainly inline, even if partially applied (f e), so we'd
1062 -- better make sure that the transformed inlining has the same property
1063 | otherwise
1064 = Nothing
1065
1066 {- Note [certainlyWillInline: be careful of thunks]
1067 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1068 Don't claim that thunks will certainly inline, because that risks work
1069 duplication. Even if the work duplication is not great (eg is_cheap
1070 holds), it can make a big difference in an inner loop In Trac #5623 we
1071 found that the WorkWrap phase thought that
1072 y = case x of F# v -> F# (v +# v)
1073 was certainlyWillInline, so the addition got duplicated.
1074
1075 Note [certainlyWillInline: INLINABLE]
1076 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1077 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1078 even though we have a stable inlining, so that strictness w/w takes
1079 place. It makes a big difference to efficiency, and the w/w pass knows
1080 how to transfer the INLINABLE info to the worker; see WorkWrap
1081 Note [Worker-wrapper for INLINABLE functions]
1082
1083 ************************************************************************
1084 * *
1085 \subsection{callSiteInline}
1086 * *
1087 ************************************************************************
1088
1089 This is the key function. It decides whether to inline a variable at a call site
1090
1091 callSiteInline is used at call sites, so it is a bit more generous.
1092 It's a very important function that embodies lots of heuristics.
1093 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1094 and occurs exactly once or
1095 occurs once in each branch of a case and is small
1096
1097 If the thing is in WHNF, there's no danger of duplicating work,
1098 so we can inline if it occurs once, or is small
1099
1100 NOTE: we don't want to inline top-level functions that always diverge.
1101 It just makes the code bigger. Tt turns out that the convenient way to prevent
1102 them inlining is to give them a NOINLINE pragma, which we do in
1103 StrictAnal.addStrictnessInfoToTopId
1104 -}
1105
1106 callSiteInline :: DynFlags
1107 -> Id -- The Id
1108 -> Bool -- True <=> unfolding is active
1109 -> Bool -- True if there are no arguments at all (incl type args)
1110 -> [ArgSummary] -- One for each value arg; True if it is interesting
1111 -> CallCtxt -- True <=> continuation is interesting
1112 -> Maybe CoreExpr -- Unfolding, if any
1113
1114 data ArgSummary = TrivArg -- Nothing interesting
1115 | NonTrivArg -- Arg has structure
1116 | ValueArg -- Arg is a con-app or PAP
1117 -- ..or con-like. Note [Conlike is interesting]
1118
1119 instance Outputable ArgSummary where
1120 ppr TrivArg = text "TrivArg"
1121 ppr NonTrivArg = text "NonTrivArg"
1122 ppr ValueArg = text "ValueArg"
1123
1124 nonTriv :: ArgSummary -> Bool
1125 nonTriv TrivArg = False
1126 nonTriv _ = True
1127
1128 data CallCtxt
1129 = BoringCtxt
1130 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1131 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1132 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1133
1134 | ValAppCtxt -- We're applied to at least one value arg
1135 -- This arises when we have ((f x |> co) y)
1136 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1137
1138 | CaseCtxt -- We're the scrutinee of a case
1139 -- that decomposes its scrutinee
1140
1141 instance Outputable CallCtxt where
1142 ppr CaseCtxt = text "CaseCtxt"
1143 ppr ValAppCtxt = text "ValAppCtxt"
1144 ppr BoringCtxt = text "BoringCtxt"
1145 ppr RhsCtxt = text "RhsCtxt"
1146 ppr DiscArgCtxt = text "DiscArgCtxt"
1147 ppr RuleArgCtxt = text "RuleArgCtxt"
1148
1149 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1150 = case idUnfolding id of
1151 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1152 -- Things with an INLINE pragma may have an unfolding *and*
1153 -- be a loop breaker (maybe the knot is not yet untied)
1154 CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top
1155 , uf_is_work_free = is_wf
1156 , uf_guidance = guidance, uf_expandable = is_exp }
1157 | active_unfolding -> tryUnfolding dflags id lone_variable
1158 arg_infos cont_info unf_template is_top
1159 is_wf is_exp guidance
1160 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1161 NoUnfolding -> Nothing
1162 BootUnfolding -> Nothing
1163 OtherCon {} -> Nothing
1164 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1165
1166 traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1167 traceInline dflags inline_id str doc result
1168 | Just prefix <- inlineCheck dflags
1169 = if prefix `isPrefixOf` occNameString (getOccName inline_id)
1170 then pprTrace str doc result
1171 else result
1172 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1173 = pprTrace str doc result
1174 | otherwise
1175 = result
1176
1177 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1178 -> CoreExpr -> Bool -> Bool -> Bool -> UnfoldingGuidance
1179 -> Maybe CoreExpr
1180 tryUnfolding dflags id lone_variable
1181 arg_infos cont_info unf_template is_top
1182 is_wf is_exp guidance
1183 = case guidance of
1184 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1185
1186 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1187 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1188 -- See Note [INLINE for small functions (3)]
1189 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1190 | otherwise
1191 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1192 where
1193 some_benefit = calc_some_benefit uf_arity
1194 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1195
1196 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1197 | ufVeryAggressive dflags
1198 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1199 | is_wf && some_benefit && small_enough
1200 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1201 | otherwise
1202 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1203 where
1204 some_benefit = calc_some_benefit (length arg_discounts)
1205 extra_doc = text "discounted size =" <+> int discounted_size
1206 discounted_size = size - discount
1207 small_enough = discounted_size <= ufUseThreshold dflags
1208 discount = computeDiscount dflags arg_discounts
1209 res_discount arg_infos cont_info
1210
1211 where
1212 mk_doc some_benefit extra_doc yes_or_no
1213 = vcat [ text "arg infos" <+> ppr arg_infos
1214 , text "interesting continuation" <+> ppr cont_info
1215 , text "some_benefit" <+> ppr some_benefit
1216 , text "is exp:" <+> ppr is_exp
1217 , text "is work-free:" <+> ppr is_wf
1218 , text "guidance" <+> ppr guidance
1219 , extra_doc
1220 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1221
1222 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1223 n_val_args = length arg_infos
1224
1225 -- some_benefit is used when the RHS is small enough
1226 -- and the call has enough (or too many) value
1227 -- arguments (ie n_val_args >= arity). But there must
1228 -- be *something* interesting about some argument, or the
1229 -- result context, to make it worth inlining
1230 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1231 -- expected by the unfolding
1232 calc_some_benefit uf_arity
1233 | not saturated = interesting_args -- Under-saturated
1234 -- Note [Unsaturated applications]
1235 | otherwise = interesting_args -- Saturated or over-saturated
1236 || interesting_call
1237 where
1238 saturated = n_val_args >= uf_arity
1239 over_saturated = n_val_args > uf_arity
1240 interesting_args = any nonTriv arg_infos
1241 -- NB: (any nonTriv arg_infos) looks at the
1242 -- over-saturated args too which is "wrong";
1243 -- but if over-saturated we inline anyway.
1244
1245 interesting_call
1246 | over_saturated
1247 = True
1248 | otherwise
1249 = case cont_info of
1250 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1251 ValAppCtxt -> True -- Note [Cast then apply]
1252 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1253 DiscArgCtxt -> uf_arity > 0 --
1254 RhsCtxt -> uf_arity > 0 --
1255 _ -> not is_top && uf_arity > 0 -- Note [Nested functions]
1256 -- Note [Inlining in ArgCtxt]
1257
1258 {-
1259 Note [Unfold into lazy contexts], Note [RHS of lets]
1260 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1261 When the call is the argument of a function with a RULE, or the RHS of a let,
1262 we are a little bit keener to inline. For example
1263 f y = (y,y,y)
1264 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1265 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1266 only we can't see it. Also
1267 x = f v
1268 could be expensive whereas
1269 x = case v of (a,b) -> a
1270 is patently cheap and may allow more eta expansion.
1271 So we treat the RHS of a let as not-totally-boring.
1272
1273 Note [Unsaturated applications]
1274 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1275 When a call is not saturated, we *still* inline if one of the
1276 arguments has interesting structure. That's sometimes very important.
1277 A good example is the Ord instance for Bool in Base:
1278
1279 Rec {
1280 $fOrdBool =GHC.Classes.D:Ord
1281 @ Bool
1282 ...
1283 $cmin_ajX
1284
1285 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1286 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1287 }
1288
1289 But the defn of GHC.Classes.$dmmin is:
1290
1291 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1292 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1293 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1294 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1295 GHC.Types.False -> y GHC.Types.True -> x }) -}
1296
1297 We *really* want to inline $dmmin, even though it has arity 3, in
1298 order to unravel the recursion.
1299
1300
1301 Note [Things to watch]
1302 ~~~~~~~~~~~~~~~~~~~~~~
1303 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1304 Assume x is exported, so not inlined unconditionally.
1305 Then we want x to inline unconditionally; no reason for it
1306 not to, and doing so avoids an indirection.
1307
1308 * { x = I# 3; ....f x.... }
1309 Make sure that x does not inline unconditionally!
1310 Lest we get extra allocation.
1311
1312 Note [Inlining an InlineRule]
1313 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1314 An InlineRules is used for
1315 (a) programmer INLINE pragmas
1316 (b) inlinings from worker/wrapper
1317
1318 For (a) the RHS may be large, and our contract is that we *only* inline
1319 when the function is applied to all the arguments on the LHS of the
1320 source-code defn. (The uf_arity in the rule.)
1321
1322 However for worker/wrapper it may be worth inlining even if the
1323 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1324 require saturation.
1325
1326
1327 Note [Nested functions]
1328 ~~~~~~~~~~~~~~~~~~~~~~~
1329 If a function has a nested defn we also record some-benefit, on the
1330 grounds that we are often able to eliminate the binding, and hence the
1331 allocation, for the function altogether; this is good for join points.
1332 But this only makes sense for *functions*; inlining a constructor
1333 doesn't help allocation unless the result is scrutinised. UNLESS the
1334 constructor occurs just once, albeit possibly in multiple case
1335 branches. Then inlining it doesn't increase allocation, but it does
1336 increase the chance that the constructor won't be allocated at all in
1337 the branches that don't use it.
1338
1339 Note [Cast then apply]
1340 ~~~~~~~~~~~~~~~~~~~~~~
1341 Consider
1342 myIndex = __inline_me ( (/\a. <blah>) |> co )
1343 co :: (forall a. a -> a) ~ (forall a. T a)
1344 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1345
1346 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1347 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1348
1349 Note [Inlining in ArgCtxt]
1350 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1351 The condition (arity > 0) here is very important, because otherwise
1352 we end up inlining top-level stuff into useless places; eg
1353 x = I# 3#
1354 f = \y. g x
1355 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1356 and 20% to 'power'.
1357
1358 At one stage I replaced this condition by 'True' (leading to the above
1359 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1360 to work ok now.
1361
1362 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1363 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1364 for the RHS of a 'let', we only profit from the inlining if we get a
1365 CONLIKE thing (modulo lets).
1366
1367 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1368 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1369 The "lone-variable" case is important. I spent ages messing about
1370 with unsatisfactory variants, but this is nice. The idea is that if a
1371 variable appears all alone
1372
1373 as an arg of lazy fn, or rhs BoringCtxt
1374 as scrutinee of a case CaseCtxt
1375 as arg of a fn ArgCtxt
1376 AND
1377 it is bound to a cheap expression
1378
1379 then we should not inline it (unless there is some other reason,
1380 e.g. it is the sole occurrence). That is what is happening at
1381 the use of 'lone_variable' in 'interesting_call'.
1382
1383 Why? At least in the case-scrutinee situation, turning
1384 let x = (a,b) in case x of y -> ...
1385 into
1386 let x = (a,b) in case (a,b) of y -> ...
1387 and thence to
1388 let x = (a,b) in let y = (a,b) in ...
1389 is bad if the binding for x will remain.
1390
1391 Another example: I discovered that strings
1392 were getting inlined straight back into applications of 'error'
1393 because the latter is strict.
1394 s = "foo"
1395 f = \x -> ...(error s)...
1396
1397 Fundamentally such contexts should not encourage inlining because, provided
1398 the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1399 context can ``see'' the unfolding of the variable (e.g. case or a
1400 RULE) so there's no gain.
1401
1402 However, watch out:
1403
1404 * Consider this:
1405 foo = _inline_ (\n. [n])
1406 bar = _inline_ (foo 20)
1407 baz = \n. case bar of { (m:_) -> m + n }
1408 Here we really want to inline 'bar' so that we can inline 'foo'
1409 and the whole thing unravels as it should obviously do. This is
1410 important: in the NDP project, 'bar' generates a closure data
1411 structure rather than a list.
1412
1413 So the non-inlining of lone_variables should only apply if the
1414 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1415 looks through the unfolding. Hence the "&& is_wf" in the
1416 InlineRule branch.
1417
1418 * Even a type application or coercion isn't a lone variable.
1419 Consider
1420 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1421 We had better inline that sucker! The case won't see through it.
1422
1423 For now, I'm treating treating a variable applied to types
1424 in a *lazy* context "lone". The motivating example was
1425 f = /\a. \x. BIG
1426 g = /\a. \y. h (f a)
1427 There's no advantage in inlining f here, and perhaps
1428 a significant disadvantage. Hence some_val_args in the Stop case
1429
1430 Note [Interaction of exprIsWorkFree and lone variables]
1431 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1432 The lone-variable test says "don't inline if a case expression
1433 scrutinises a lone variable whose unfolding is cheap". It's very
1434 important that, under these circumstances, exprIsConApp_maybe
1435 can spot a constructor application. So, for example, we don't
1436 consider
1437 let x = e in (x,x)
1438 to be cheap, and that's good because exprIsConApp_maybe doesn't
1439 think that expression is a constructor application.
1440
1441 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1442 rather than is_wf, which was utterly wrong, because the above
1443 expression responds True to exprIsHNF, which is what sets is_value.
1444
1445 This kind of thing can occur if you have
1446
1447 {-# INLINE foo #-}
1448 foo = let x = e in (x,x)
1449
1450 which Roman did.
1451 -}
1452
1453 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1454 -> Int
1455 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1456 -- We multiple the raw discounts (args_discount and result_discount)
1457 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1458 -- *size* whereas the discounts imply that there's some extra
1459 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1460 -- by inlining.
1461
1462 = 10 -- Discount of 10 because the result replaces the call
1463 -- so we count 10 for the function itself
1464
1465 + 10 * length actual_arg_discounts
1466 -- Discount of 10 for each arg supplied,
1467 -- because the result replaces the call
1468
1469 + round (ufKeenessFactor dflags *
1470 fromIntegral (total_arg_discount + res_discount'))
1471 where
1472 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1473 total_arg_discount = sum actual_arg_discounts
1474
1475 mk_arg_discount _ TrivArg = 0
1476 mk_arg_discount _ NonTrivArg = 10
1477 mk_arg_discount discount ValueArg = discount
1478
1479 res_discount'
1480 | LT <- arg_discounts `compareLength` arg_infos
1481 = res_discount -- Over-saturated
1482 | otherwise
1483 = case cont_info of
1484 BoringCtxt -> 0
1485 CaseCtxt -> res_discount -- Presumably a constructor
1486 ValAppCtxt -> res_discount -- Presumably a function
1487 _ -> 40 `min` res_discount
1488 -- ToDo: this 40 `min` res_discount doesn't seem right
1489 -- for DiscArgCtxt it shouldn't matter because the function will
1490 -- get the arg discount for any non-triv arg
1491 -- for RuleArgCtxt we do want to be keener to inline; but not only
1492 -- constructor results
1493 -- for RhsCtxt I suppose that exposing a data con is good in general
1494 -- And 40 seems very arbitrary
1495 --
1496 -- res_discount can be very large when a function returns
1497 -- constructors; but we only want to invoke that large discount
1498 -- when there's a case continuation.
1499 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1500 -- But we want to aovid inlining large functions that return
1501 -- constructors into contexts that are simply "interesting"