Set strictness correctly for JoinIds
[ghc.git] / compiler / coreSyn / CoreUnfold.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The AQUA Project, Glasgow University, 1994-1998
4
5
6 Core-syntax unfoldings
7
8 Unfoldings (which can travel across module boundaries) are in Core
9 syntax (namely @CoreExpr@s).
10
11 The type @Unfolding@ sits ``above'' simply-Core-expressions
12 unfoldings, capturing ``higher-level'' things we know about a binding,
13 usually things that the simplifier found out (e.g., ``it's a
14 literal''). In the corner of a @CoreUnfolding@ unfolding, you will
15 find, unsurprisingly, a Core expression.
16 -}
17
18 {-# LANGUAGE CPP #-}
19
20 module CoreUnfold (
21 Unfolding, UnfoldingGuidance, -- Abstract types
22
23 noUnfolding, mkImplicitUnfolding,
24 mkUnfolding, mkCoreUnfolding,
25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26 mkInlineUnfolding, mkInlineUnfoldingWithArity,
27 mkInlinableUnfolding, mkWwInlineRule,
28 mkCompulsoryUnfolding, mkDFunUnfolding,
29 specUnfolding,
30
31 ArgSummary(..),
32
33 couldBeSmallEnoughToInline, inlineBoringOk,
34 certainlyWillInline, smallEnoughToInline,
35
36 callSiteInline, CallCtxt(..),
37
38 -- Reexport from CoreSubst (it only live there so it can be used
39 -- by the Very Simple Optimiser)
40 exprIsConApp_maybe, exprIsLiteral_maybe
41 ) where
42
43 #include "HsVersions.h"
44
45 import GhcPrelude
46
47 import DynFlags
48 import CoreSyn
49 import PprCore () -- Instances
50 import OccurAnal ( occurAnalyseExpr )
51 import CoreOpt
52 import CoreArity ( manifestArity )
53 import CoreUtils
54 import Id
55 import Demand ( isBottomingSig )
56 import DataCon
57 import Literal
58 import PrimOp
59 import IdInfo
60 import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec )
61 import Type
62 import PrelNames
63 import TysPrim ( realWorldStatePrimTy )
64 import Bag
65 import Util
66 import Outputable
67 import ForeignCall
68 import Name
69
70 import qualified Data.ByteString as BS
71 import Data.List
72
73 {-
74 ************************************************************************
75 * *
76 \subsection{Making unfoldings}
77 * *
78 ************************************************************************
79 -}
80
81 mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
82 mkTopUnfolding dflags is_bottoming rhs
83 = mkUnfolding dflags InlineRhs True is_bottoming rhs
84
85 mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
86 -- For implicit Ids, do a tiny bit of optimising first
87 mkImplicitUnfolding dflags expr
88 = mkTopUnfolding dflags False (simpleOptExpr dflags expr)
89
90 -- Note [Top-level flag on inline rules]
91 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
92 -- Slight hack: note that mk_inline_rules conservatively sets the
93 -- top-level flag to True. It gets set more accurately by the simplifier
94 -- Simplify.simplUnfolding.
95
96 mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
97 mkSimpleUnfolding dflags rhs
98 = mkUnfolding dflags InlineRhs False False rhs
99
100 mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
101 mkDFunUnfolding bndrs con ops
102 = DFunUnfolding { df_bndrs = bndrs
103 , df_con = con
104 , df_args = map occurAnalyseExpr ops }
105 -- See Note [Occurrence analysis of unfoldings]
106
107 mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
108 mkWwInlineRule dflags expr arity
109 = mkCoreUnfolding InlineStable True
110 (simpleOptExpr dflags expr)
111 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
112 , ug_boring_ok = boringCxtNotOk })
113
114 mkCompulsoryUnfolding :: CoreExpr -> Unfolding
115 mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded
116 = mkCoreUnfolding InlineCompulsory True
117 (simpleOptExpr unsafeGlobalDynFlags expr)
118 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter
119 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
120
121 mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
122 -- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
123 mkWorkerUnfolding dflags work_fn
124 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
125 , uf_is_top = top_lvl })
126 | isStableSource src
127 = mkCoreUnfolding src top_lvl new_tmpl guidance
128 where
129 new_tmpl = simpleOptExpr dflags (work_fn tmpl)
130 guidance = calcUnfoldingGuidance dflags False new_tmpl
131
132 mkWorkerUnfolding _ _ _ = noUnfolding
133
134 -- | Make an unfolding that may be used unsaturated
135 -- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
136 -- manifest arity (the number of outer lambdas applications will
137 -- resolve before doing any work).
138 mkInlineUnfolding :: CoreExpr -> Unfolding
139 mkInlineUnfolding expr
140 = mkCoreUnfolding InlineStable
141 True -- Note [Top-level flag on inline rules]
142 expr' guide
143 where
144 expr' = simpleOptExpr unsafeGlobalDynFlags expr
145 guide = UnfWhen { ug_arity = manifestArity expr'
146 , ug_unsat_ok = unSaturatedOk
147 , ug_boring_ok = boring_ok }
148 boring_ok = inlineBoringOk expr'
149
150 -- | Make an unfolding that will be used once the RHS has been saturated
151 -- to the given arity.
152 mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
153 mkInlineUnfoldingWithArity arity expr
154 = mkCoreUnfolding InlineStable
155 True -- Note [Top-level flag on inline rules]
156 expr' guide
157 where
158 expr' = simpleOptExpr unsafeGlobalDynFlags expr
159 guide = UnfWhen { ug_arity = arity
160 , ug_unsat_ok = needSaturated
161 , ug_boring_ok = boring_ok }
162 boring_ok = inlineBoringOk expr'
163
164 mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
165 mkInlinableUnfolding dflags expr
166 = mkUnfolding dflags InlineStable False False expr'
167 where
168 expr' = simpleOptExpr dflags expr
169
170 specUnfolding :: DynFlags -> [Var] -> (CoreExpr -> CoreExpr) -> Arity
171 -> Unfolding -> Unfolding
172 -- See Note [Specialising unfoldings]
173 -- specUnfolding spec_bndrs spec_app arity_decrease unf
174 -- = \spec_bndrs. spec_app( unf )
175 --
176 specUnfolding dflags spec_bndrs spec_app arity_decrease
177 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
178 = ASSERT2( arity_decrease == count isId old_bndrs - count isId spec_bndrs, ppr df )
179 mkDFunUnfolding spec_bndrs con (map spec_arg args)
180 -- There is a hard-to-check assumption here that the spec_app has
181 -- enough applications to exactly saturate the old_bndrs
182 -- For DFunUnfoldings we transform
183 -- \old_bndrs. MkD <op1> ... <opn>
184 -- to
185 -- \new_bndrs. MkD (spec_app(\old_bndrs. <op1>)) ... ditto <opn>
186 -- The ASSERT checks the value part of that
187 where
188 spec_arg arg = simpleOptExpr dflags (spec_app (mkLams old_bndrs arg))
189 -- The beta-redexes created by spec_app will be
190 -- simplified away by simplOptExpr
191
192 specUnfolding dflags spec_bndrs spec_app arity_decrease
193 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
194 , uf_is_top = top_lvl
195 , uf_guidance = old_guidance })
196 | isStableSource src -- See Note [Specialising unfoldings]
197 , UnfWhen { ug_arity = old_arity
198 , ug_unsat_ok = unsat_ok
199 , ug_boring_ok = boring_ok } <- old_guidance
200 = let guidance = UnfWhen { ug_arity = old_arity - arity_decrease
201 , ug_unsat_ok = unsat_ok
202 , ug_boring_ok = boring_ok }
203 new_tmpl = simpleOptExpr dflags (mkLams spec_bndrs (spec_app tmpl))
204 -- The beta-redexes created by spec_app will be
205 -- simplified away by simplOptExpr
206
207 in mkCoreUnfolding src top_lvl new_tmpl guidance
208
209 specUnfolding _ _ _ _ _ = noUnfolding
210
211 {- Note [Specialising unfoldings]
212 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213 When we specialise a function for some given type-class arguments, we use
214 specUnfolding to specialise its unfolding. Some important points:
215
216 * If the original function has a DFunUnfolding, the specialised one
217 must do so too! Otherwise we lose the magic rules that make it
218 interact with ClassOps
219
220 * There is a bit of hack for INLINABLE functions:
221 f :: Ord a => ....
222 f = <big-rhs>
223 {- INLINABLE f #-}
224 Now if we specialise f, should the specialised version still have
225 an INLINABLE pragma? If it does, we'll capture a specialised copy
226 of <big-rhs> as its unfolding, and that probaby won't inline. But
227 if we don't, the specialised version of <big-rhs> might be small
228 enough to inline at a call site. This happens with Control.Monad.liftM3,
229 and can cause a lot more allocation as a result (nofib n-body shows this).
230
231 Moreover, keeping the INLINABLE thing isn't much help, because
232 the specialised function (probaby) isn't overloaded any more.
233
234 Conclusion: drop the INLINEALE pragma. In practice what this means is:
235 if a stable unfolding has UnfoldingGuidance of UnfWhen,
236 we keep it (so the specialised thing too will always inline)
237 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
238 (which arises from INLINABLE), we discard it
239 -}
240
241 mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
242 -> UnfoldingGuidance -> Unfolding
243 -- Occurrence-analyses the expression before capturing it
244 mkCoreUnfolding src top_lvl expr guidance
245 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
246 -- See Note [Occurrence analysis of unfoldings]
247 uf_src = src,
248 uf_is_top = top_lvl,
249 uf_is_value = exprIsHNF expr,
250 uf_is_conlike = exprIsConLike expr,
251 uf_is_work_free = exprIsWorkFree expr,
252 uf_expandable = exprIsExpandable expr,
253 uf_guidance = guidance }
254
255 mkUnfolding :: DynFlags -> UnfoldingSource
256 -> Bool -- Is top-level
257 -> Bool -- Definitely a bottoming binding
258 -- (only relevant for top-level bindings)
259 -> CoreExpr
260 -> Unfolding
261 -- Calculates unfolding guidance
262 -- Occurrence-analyses the expression before capturing it
263 mkUnfolding dflags src is_top_lvl is_bottoming expr
264 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
265 -- See Note [Occurrence analysis of unfoldings]
266 uf_src = src,
267 uf_is_top = is_top_lvl,
268 uf_is_value = exprIsHNF expr,
269 uf_is_conlike = exprIsConLike expr,
270 uf_expandable = exprIsExpandable expr,
271 uf_is_work_free = exprIsWorkFree expr,
272 uf_guidance = guidance }
273 where
274 is_top_bottoming = is_top_lvl && is_bottoming
275 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr
276 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
277 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
278
279 {-
280 Note [Occurrence analysis of unfoldings]
281 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
282 We do occurrence-analysis of unfoldings once and for all, when the
283 unfolding is built, rather than each time we inline them.
284
285 But given this decision it's vital that we do
286 *always* do it. Consider this unfolding
287 \x -> letrec { f = ...g...; g* = f } in body
288 where g* is (for some strange reason) the loop breaker. If we don't
289 occ-anal it when reading it in, we won't mark g as a loop breaker, and
290 we may inline g entirely in body, dropping its binding, and leaving
291 the occurrence in f out of scope. This happened in Trac #8892, where
292 the unfolding in question was a DFun unfolding.
293
294 But more generally, the simplifier is designed on the
295 basis that it is looking at occurrence-analysed expressions, so better
296 ensure that they acutally are.
297
298 Note [Calculate unfolding guidance on the non-occ-anal'd expression]
299 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
300 Notice that we give the non-occur-analysed expression to
301 calcUnfoldingGuidance. In some ways it'd be better to occur-analyse
302 first; for example, sometimes during simplification, there's a large
303 let-bound thing which has been substituted, and so is now dead; so
304 'expr' contains two copies of the thing while the occurrence-analysed
305 expression doesn't.
306
307 Nevertheless, we *don't* and *must not* occ-analyse before computing
308 the size because
309
310 a) The size computation bales out after a while, whereas occurrence
311 analysis does not.
312
313 b) Residency increases sharply if you occ-anal first. I'm not
314 100% sure why, but it's a large effect. Compiling Cabal went
315 from residency of 534M to over 800M with this one change.
316
317 This can occasionally mean that the guidance is very pessimistic;
318 it gets fixed up next round. And it should be rare, because large
319 let-bound things that are dead are usually caught by preInlineUnconditionally
320
321
322 ************************************************************************
323 * *
324 \subsection{The UnfoldingGuidance type}
325 * *
326 ************************************************************************
327 -}
328
329 inlineBoringOk :: CoreExpr -> Bool
330 -- See Note [INLINE for small functions]
331 -- True => the result of inlining the expression is
332 -- no bigger than the expression itself
333 -- eg (\x y -> f y x)
334 -- This is a quick and dirty version. It doesn't attempt
335 -- to deal with (\x y z -> x (y z))
336 -- The really important one is (x `cast` c)
337 inlineBoringOk e
338 = go 0 e
339 where
340 go :: Int -> CoreExpr -> Bool
341 go credit (Lam x e) | isId x = go (credit+1) e
342 | otherwise = go credit e
343 go credit (App f (Type {})) = go credit f
344 go credit (App f a) | credit > 0
345 , exprIsTrivial a = go (credit-1) f
346 go credit (Tick _ e) = go credit e -- dubious
347 go credit (Cast e _) = go credit e
348 go _ (Var {}) = boringCxtOk
349 go _ _ = boringCxtNotOk
350
351 calcUnfoldingGuidance
352 :: DynFlags
353 -> Bool -- Definitely a top-level, bottoming binding
354 -> CoreExpr -- Expression to look at
355 -> UnfoldingGuidance
356 calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
357 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding
358 = calcUnfoldingGuidance dflags is_top_bottoming expr
359 calcUnfoldingGuidance dflags is_top_bottoming expr
360 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
361 TooBig -> UnfNever
362 SizeIs size cased_bndrs scrut_discount
363 | uncondInline expr n_val_bndrs size
364 -> UnfWhen { ug_unsat_ok = unSaturatedOk
365 , ug_boring_ok = boringCxtOk
366 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions]
367
368 | is_top_bottoming
369 -> UnfNever -- See Note [Do not inline top-level bottoming functions]
370
371 | otherwise
372 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs
373 , ug_size = size
374 , ug_res = scrut_discount }
375
376 where
377 (bndrs, body) = collectBinders expr
378 bOMB_OUT_SIZE = ufCreationThreshold dflags
379 -- Bomb out if size gets bigger than this
380 val_bndrs = filter isId bndrs
381 n_val_bndrs = length val_bndrs
382
383 mk_discount :: Bag (Id,Int) -> Id -> Int
384 mk_discount cbs bndr = foldlBag combine 0 cbs
385 where
386 combine acc (bndr', disc)
387 | bndr == bndr' = acc `plus_disc` disc
388 | otherwise = acc
389
390 plus_disc :: Int -> Int -> Int
391 plus_disc | isFunTy (idType bndr) = max
392 | otherwise = (+)
393 -- See Note [Function and non-function discounts]
394
395 {-
396 Note [Computing the size of an expression]
397 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
398 The basic idea of sizeExpr is obvious enough: count nodes. But getting the
399 heuristics right has taken a long time. Here's the basic strategy:
400
401 * Variables, literals: 0
402 (Exception for string literals, see litSize.)
403
404 * Function applications (f e1 .. en): 1 + #value args
405
406 * Constructor applications: 1, regardless of #args
407
408 * Let(rec): 1 + size of components
409
410 * Note, cast: 0
411
412 Examples
413
414 Size Term
415 --------------
416 0 42#
417 0 x
418 0 True
419 2 f x
420 1 Just x
421 4 f (g x)
422
423 Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
424 a function call to account for. Notice also that constructor applications
425 are very cheap, because exposing them to a caller is so valuable.
426
427 [25/5/11] All sizes are now multiplied by 10, except for primops
428 (which have sizes like 1 or 4. This makes primops look fantastically
429 cheap, and seems to be almost unversally beneficial. Done partly as a
430 result of #4978.
431
432 Note [Do not inline top-level bottoming functions]
433 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
434 The FloatOut pass has gone to some trouble to float out calls to 'error'
435 and similar friends. See Note [Bottoming floats] in SetLevels.
436 Do not re-inline them! But we *do* still inline if they are very small
437 (the uncondInline stuff).
438
439 Note [INLINE for small functions]
440 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
441 Consider {-# INLINE f #-}
442 f x = Just x
443 g y = f y
444 Then f's RHS is no larger than its LHS, so we should inline it into
445 even the most boring context. In general, f the function is
446 sufficiently small that its body is as small as the call itself, the
447 inline unconditionally, regardless of how boring the context is.
448
449 Things to note:
450
451 (1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
452 than the thing it's replacing. Notice that
453 (f x) --> (g 3) -- YES, unconditionally
454 (f x) --> x : [] -- YES, *even though* there are two
455 -- arguments to the cons
456 x --> g 3 -- NO
457 x --> Just v -- NO
458
459 It's very important not to unconditionally replace a variable by
460 a non-atomic term.
461
462 (2) We do this even if the thing isn't saturated, else we end up with the
463 silly situation that
464 f x y = x
465 ...map (f 3)...
466 doesn't inline. Even in a boring context, inlining without being
467 saturated will give a lambda instead of a PAP, and will be more
468 efficient at runtime.
469
470 (3) However, when the function's arity > 0, we do insist that it
471 has at least one value argument at the call site. (This check is
472 made in the UnfWhen case of callSiteInline.) Otherwise we find this:
473 f = /\a \x:a. x
474 d = /\b. MkD (f b)
475 If we inline f here we get
476 d = /\b. MkD (\x:b. x)
477 and then prepareRhs floats out the argument, abstracting the type
478 variables, so we end up with the original again!
479
480 (4) We must be much more cautious about arity-zero things. Consider
481 let x = y +# z in ...
482 In *size* terms primops look very small, because the generate a
483 single instruction, but we do not want to unconditionally replace
484 every occurrence of x with (y +# z). So we only do the
485 unconditional-inline thing for *trivial* expressions.
486
487 NB: you might think that PostInlineUnconditionally would do this
488 but it doesn't fire for top-level things; see SimplUtils
489 Note [Top level and postInlineUnconditionally]
490 -}
491
492 uncondInline :: CoreExpr -> Arity -> Int -> Bool
493 -- Inline unconditionally if there no size increase
494 -- Size of call is arity (+1 for the function)
495 -- See Note [INLINE for small functions]
496 uncondInline rhs arity size
497 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
498 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4)
499
500 sizeExpr :: DynFlags
501 -> Int -- Bomb out if it gets bigger than this
502 -> [Id] -- Arguments; we're interested in which of these
503 -- get case'd
504 -> CoreExpr
505 -> ExprSize
506
507 -- Note [Computing the size of an expression]
508
509 sizeExpr dflags bOMB_OUT_SIZE top_args expr
510 = size_up expr
511 where
512 size_up (Cast e _) = size_up e
513 size_up (Tick _ e) = size_up e
514 size_up (Type _) = sizeZero -- Types cost nothing
515 size_up (Coercion _) = sizeZero
516 size_up (Lit lit) = sizeN (litSize lit)
517 size_up (Var f) | isRealWorldId f = sizeZero
518 -- Make sure we get constructor discounts even
519 -- on nullary constructors
520 | otherwise = size_up_call f [] 0
521
522 size_up (App fun arg)
523 | isTyCoArg arg = size_up fun
524 | otherwise = size_up arg `addSizeNSD`
525 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
526
527 size_up (Lam b e)
528 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
529 | otherwise = size_up e
530
531 size_up (Let (NonRec binder rhs) body)
532 = size_up_rhs (binder, rhs) `addSizeNSD`
533 size_up body `addSizeN`
534 size_up_alloc binder
535
536 size_up (Let (Rec pairs) body)
537 = foldr (addSizeNSD . size_up_rhs)
538 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
539 pairs
540
541 size_up (Case e _ _ alts)
542 | null alts
543 = size_up e -- case e of {} never returns, so take size of scrutinee
544
545 size_up (Case e _ _ alts)
546 -- Now alts is non-empty
547 | Just v <- is_top_arg e -- We are scrutinising an argument variable
548 = let
549 alt_sizes = map size_up_alt alts
550
551 -- alts_size tries to compute a good discount for
552 -- the case when we are scrutinising an argument variable
553 alts_size (SizeIs tot tot_disc tot_scrut)
554 -- Size of all alternatives
555 (SizeIs max _ _)
556 -- Size of biggest alternative
557 = SizeIs tot (unitBag (v, 20 + tot - max)
558 `unionBags` tot_disc) tot_scrut
559 -- If the variable is known, we produce a
560 -- discount that will take us back to 'max',
561 -- the size of the largest alternative The
562 -- 1+ is a little discount for reduced
563 -- allocation in the caller
564 --
565 -- Notice though, that we return tot_disc,
566 -- the total discount from all branches. I
567 -- think that's right.
568
569 alts_size tot_size _ = tot_size
570 in
571 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty
572 (foldr1 maxSize alt_sizes)
573 -- Good to inline if an arg is scrutinised, because
574 -- that may eliminate allocation in the caller
575 -- And it eliminates the case itself
576 where
577 is_top_arg (Var v) | v `elem` top_args = Just v
578 is_top_arg (Cast e _) = is_top_arg e
579 is_top_arg _ = Nothing
580
581
582 size_up (Case e _ _ alts) = size_up e `addSizeNSD`
583 foldr (addAltSize . size_up_alt) case_size alts
584 where
585 case_size
586 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
587 | otherwise = sizeZero
588 -- Normally we don't charge for the case itself, but
589 -- we charge one per alternative (see size_up_alt,
590 -- below) to account for the cost of the info table
591 -- and comparisons.
592 --
593 -- However, in certain cases (see is_inline_scrut
594 -- below), no code is generated for the case unless
595 -- there are multiple alts. In these cases we
596 -- subtract one, making the first alt free.
597 -- e.g. case x# +# y# of _ -> ... should cost 1
598 -- case touch# x# of _ -> ... should cost 0
599 -- (see #4978)
600 --
601 -- I would like to not have the "lengthAtMost alts 1"
602 -- condition above, but without that some programs got worse
603 -- (spectral/hartel/event and spectral/para). I don't fully
604 -- understand why. (SDM 24/5/11)
605
606 -- unboxed variables, inline primops and unsafe foreign calls
607 -- are all "inline" things:
608 is_inline_scrut (Var v) = isUnliftedType (idType v)
609 is_inline_scrut scrut
610 | (Var f, _) <- collectArgs scrut
611 = case idDetails f of
612 FCallId fc -> not (isSafeForeignCall fc)
613 PrimOpId op -> not (primOpOutOfLine op)
614 _other -> False
615 | otherwise
616 = False
617
618 size_up_rhs (bndr, rhs)
619 | Just join_arity <- isJoinId_maybe bndr
620 -- Skip arguments to join point
621 , (_bndrs, body) <- collectNBinders join_arity rhs
622 = size_up body
623 | otherwise
624 = size_up rhs
625
626 ------------
627 -- size_up_app is used when there's ONE OR MORE value args
628 size_up_app (App fun arg) args voids
629 | isTyCoArg arg = size_up_app fun args voids
630 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1)
631 | otherwise = size_up arg `addSizeNSD`
632 size_up_app fun (arg:args) voids
633 size_up_app (Var fun) args voids = size_up_call fun args voids
634 size_up_app (Tick _ expr) args voids = size_up_app expr args voids
635 size_up_app (Cast expr _) args voids = size_up_app expr args voids
636 size_up_app other args voids = size_up other `addSizeN`
637 callSize (length args) voids
638 -- if the lhs is not an App or a Var, or an invisible thing like a
639 -- Tick or Cast, then we should charge for a complete call plus the
640 -- size of the lhs itself.
641
642 ------------
643 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
644 size_up_call fun val_args voids
645 = case idDetails fun of
646 FCallId _ -> sizeN (callSize (length val_args) voids)
647 DataConWorkId dc -> conSize dc (length val_args)
648 PrimOpId op -> primOpSize op (length val_args)
649 ClassOpId _ -> classOpSize dflags top_args val_args
650 _ -> funSize dflags top_args fun (length val_args) voids
651
652 ------------
653 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
654 -- Don't charge for args, so that wrappers look cheap
655 -- (See comments about wrappers with Case)
656 --
657 -- IMPORTANT: *do* charge 1 for the alternative, else we
658 -- find that giant case nests are treated as practically free
659 -- A good example is Foreign.C.Error.errnoToIOError
660
661 ------------
662 -- Cost to allocate binding with given binder
663 size_up_alloc bndr
664 | isTyVar bndr -- Doesn't exist at runtime
665 || isJoinId bndr -- Not allocated at all
666 || isUnliftedType (idType bndr) -- Doesn't live in heap
667 = 0
668 | otherwise
669 = 10
670
671 ------------
672 -- These addSize things have to be here because
673 -- I don't want to give them bOMB_OUT_SIZE as an argument
674 addSizeN TooBig _ = TooBig
675 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
676
677 -- addAltSize is used to add the sizes of case alternatives
678 addAltSize TooBig _ = TooBig
679 addAltSize _ TooBig = TooBig
680 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
681 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
682 (xs `unionBags` ys)
683 (d1 + d2) -- Note [addAltSize result discounts]
684
685 -- This variant ignores the result discount from its LEFT argument
686 -- It's used when the second argument isn't part of the result
687 addSizeNSD TooBig _ = TooBig
688 addSizeNSD _ TooBig = TooBig
689 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
690 = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
691 (xs `unionBags` ys)
692 d2 -- Ignore d1
693
694 isRealWorldId id = idType id `eqType` realWorldStatePrimTy
695
696 -- an expression of type State# RealWorld must be a variable
697 isRealWorldExpr (Var id) = isRealWorldId id
698 isRealWorldExpr (Tick _ e) = isRealWorldExpr e
699 isRealWorldExpr _ = False
700
701 -- | Finds a nominal size of a string literal.
702 litSize :: Literal -> Int
703 -- Used by CoreUnfold.sizeExpr
704 litSize (LitNumber LitNumInteger _ _) = 100 -- Note [Size of literal integers]
705 litSize (LitNumber LitNumNatural _ _) = 100
706 litSize (MachStr str) = 10 + 10 * ((BS.length str + 3) `div` 4)
707 -- If size could be 0 then @f "x"@ might be too small
708 -- [Sept03: make literal strings a bit bigger to avoid fruitless
709 -- duplication of little strings]
710 litSize _other = 0 -- Must match size of nullary constructors
711 -- Key point: if x |-> 4, then x must inline unconditionally
712 -- (eg via case binding)
713
714 classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
715 -- See Note [Conlike is interesting]
716 classOpSize _ _ []
717 = sizeZero
718 classOpSize dflags top_args (arg1 : other_args)
719 = SizeIs size arg_discount 0
720 where
721 size = 20 + (10 * length other_args)
722 -- If the class op is scrutinising a lambda bound dictionary then
723 -- give it a discount, to encourage the inlining of this function
724 -- The actual discount is rather arbitrarily chosen
725 arg_discount = case arg1 of
726 Var dict | dict `elem` top_args
727 -> unitBag (dict, ufDictDiscount dflags)
728 _other -> emptyBag
729
730 -- | The size of a function call
731 callSize
732 :: Int -- ^ number of value args
733 -> Int -- ^ number of value args that are void
734 -> Int
735 callSize n_val_args voids = 10 * (1 + n_val_args - voids)
736 -- The 1+ is for the function itself
737 -- Add 1 for each non-trivial arg;
738 -- the allocation cost, as in let(rec)
739
740 -- | The size of a jump to a join point
741 jumpSize
742 :: Int -- ^ number of value args
743 -> Int -- ^ number of value args that are void
744 -> Int
745 jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
746 -- A jump is 20% the size of a function call. Making jumps free reopens
747 -- bug #6048, but making them any more expensive loses a 21% improvement in
748 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
749 -- better solution?
750
751 funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
752 -- Size for functions that are not constructors or primops
753 -- Note [Function applications]
754 funSize dflags top_args fun n_val_args voids
755 | fun `hasKey` buildIdKey = buildSize
756 | fun `hasKey` augmentIdKey = augmentSize
757 | otherwise = SizeIs size arg_discount res_discount
758 where
759 some_val_args = n_val_args > 0
760 is_join = isJoinId fun
761
762 size | is_join = jumpSize n_val_args voids
763 | not some_val_args = 0
764 | otherwise = callSize n_val_args voids
765
766 -- DISCOUNTS
767 -- See Note [Function and non-function discounts]
768 arg_discount | some_val_args && fun `elem` top_args
769 = unitBag (fun, ufFunAppDiscount dflags)
770 | otherwise = emptyBag
771 -- If the function is an argument and is applied
772 -- to some values, give it an arg-discount
773
774 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
775 | otherwise = 0
776 -- If the function is partially applied, show a result discount
777 -- XXX maybe behave like ConSize for eval'd variable
778
779 conSize :: DataCon -> Int -> ExprSize
780 conSize dc n_val_args
781 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables
782
783 -- See Note [Unboxed tuple size and result discount]
784 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
785
786 -- See Note [Constructor size and result discount]
787 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
788
789 -- XXX still looks to large to me
790
791 {-
792 Note [Constructor size and result discount]
793 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
794 Treat a constructors application as size 10, regardless of how many
795 arguments it has; we are keen to expose them (and we charge separately
796 for their args). We can't treat them as size zero, else we find that
797 (Just x) has size 0, which is the same as a lone variable; and hence
798 'v' will always be replaced by (Just x), where v is bound to Just x.
799
800 The "result discount" is applied if the result of the call is
801 scrutinised (say by a case). For a constructor application that will
802 mean the constructor application will disappear, so we don't need to
803 charge it to the function. So the discount should at least match the
804 cost of the constructor application, namely 10. But to give a bit
805 of extra incentive we give a discount of 10*(1 + n_val_args).
806
807 Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
808 and said it was an "unambiguous win", but its terribly dangerous
809 because a function with many many case branches, each finishing with
810 a constructor, can have an arbitrarily large discount. This led to
811 terrible code bloat: see Trac #6099.
812
813 Note [Unboxed tuple size and result discount]
814 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
815 However, unboxed tuples count as size zero. I found occasions where we had
816 f x y z = case op# x y z of { s -> (# s, () #) }
817 and f wasn't getting inlined.
818
819 I tried giving unboxed tuples a *result discount* of zero (see the
820 commented-out line). Why? When returned as a result they do not
821 allocate, so maybe we don't want to charge so much for them If you
822 have a non-zero discount here, we find that workers often get inlined
823 back into wrappers, because it look like
824 f x = case $wf x of (# a,b #) -> (a,b)
825 and we are keener because of the case. However while this change
826 shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
827 more. All other changes were very small. So it's not a big deal but I
828 didn't adopt the idea.
829
830 Note [Function and non-function discounts]
831 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
832 We want a discount if the function is applied. A good example is
833 monadic combinators with continuation arguments, where inlining is
834 quite important.
835
836 But we don't want a big discount when a function is called many times
837 (see the detailed comments with Trac #6048) because if the function is
838 big it won't be inlined at its many call sites and no benefit results.
839 Indeed, we can get exponentially big inlinings this way; that is what
840 Trac #6048 is about.
841
842 On the other hand, for data-valued arguments, if there are lots of
843 case expressions in the body, each one will get smaller if we apply
844 the function to a constructor application, so we *want* a big discount
845 if the argument is scrutinised by many case expressions.
846
847 Conclusion:
848 - For functions, take the max of the discounts
849 - For data values, take the sum of the discounts
850
851
852 Note [Literal integer size]
853 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
854 Literal integers *can* be big (mkInteger [...coefficients...]), but
855 need not be (S# n). We just use an arbitrary big-ish constant here
856 so that, in particular, we don't inline top-level defns like
857 n = S# 5
858 There's no point in doing so -- any optimisations will see the S#
859 through n's unfolding. Nor will a big size inhibit unfoldings functions
860 that mention a literal Integer, because the float-out pass will float
861 all those constants to top level.
862 -}
863
864 primOpSize :: PrimOp -> Int -> ExprSize
865 primOpSize op n_val_args
866 = if primOpOutOfLine op
867 then sizeN (op_size + n_val_args)
868 else sizeN op_size
869 where
870 op_size = primOpCodeSize op
871
872
873 buildSize :: ExprSize
874 buildSize = SizeIs 0 emptyBag 40
875 -- We really want to inline applications of build
876 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
877 -- Indeed, we should add a result_discount because build is
878 -- very like a constructor. We don't bother to check that the
879 -- build is saturated (it usually is). The "-2" discounts for the \c n,
880 -- The "4" is rather arbitrary.
881
882 augmentSize :: ExprSize
883 augmentSize = SizeIs 0 emptyBag 40
884 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
885 -- e plus ys. The -2 accounts for the \cn
886
887 -- When we return a lambda, give a discount if it's used (applied)
888 lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
889 lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
890 lamScrutDiscount _ TooBig = TooBig
891
892 {-
893 Note [addAltSize result discounts]
894 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
895 When adding the size of alternatives, we *add* the result discounts
896 too, rather than take the *maximum*. For a multi-branch case, this
897 gives a discount for each branch that returns a constructor, making us
898 keener to inline. I did try using 'max' instead, but it makes nofib
899 'rewrite' and 'puzzle' allocate significantly more, and didn't make
900 binary sizes shrink significantly either.
901
902 Note [Discounts and thresholds]
903 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
904 Constants for discounts and thesholds are defined in main/DynFlags,
905 all of form ufXxxx. They are:
906
907 ufCreationThreshold
908 At a definition site, if the unfolding is bigger than this, we
909 may discard it altogether
910
911 ufUseThreshold
912 At a call site, if the unfolding, less discounts, is smaller than
913 this, then it's small enough inline
914
915 ufKeenessFactor
916 Factor by which the discounts are multiplied before
917 subtracting from size
918
919 ufDictDiscount
920 The discount for each occurrence of a dictionary argument
921 as an argument of a class method. Should be pretty small
922 else big functions may get inlined
923
924 ufFunAppDiscount
925 Discount for a function argument that is applied. Quite
926 large, because if we inline we avoid the higher-order call.
927
928 ufDearOp
929 The size of a foreign call or not-dupable PrimOp
930
931 ufVeryAggressive
932 If True, the compiler ignores all the thresholds and inlines very
933 aggressively. It still adheres to arity, simplifier phase control and
934 loop breakers.
935
936
937 Note [Function applications]
938 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
939 In a function application (f a b)
940
941 - If 'f' is an argument to the function being analysed,
942 and there's at least one value arg, record a FunAppDiscount for f
943
944 - If the application if a PAP (arity > 2 in this example)
945 record a *result* discount (because inlining
946 with "extra" args in the call may mean that we now
947 get a saturated application)
948
949 Code for manipulating sizes
950 -}
951
952 -- | The size of a candidate expression for unfolding
953 data ExprSize
954 = TooBig
955 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found
956 , _es_args :: !(Bag (Id,Int))
957 -- ^ Arguments cased herein, and discount for each such
958 , _es_discount :: {-# UNPACK #-} !Int
959 -- ^ Size to subtract if result is scrutinised by a case
960 -- expression
961 }
962
963 instance Outputable ExprSize where
964 ppr TooBig = text "TooBig"
965 ppr (SizeIs a _ c) = brackets (int a <+> int c)
966
967 -- subtract the discount before deciding whether to bale out. eg. we
968 -- want to inline a large constructor application into a selector:
969 -- tup = (a_1, ..., a_99)
970 -- x = case tup of ...
971 --
972 mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
973 mkSizeIs max n xs d | (n - d) > max = TooBig
974 | otherwise = SizeIs n xs d
975
976 maxSize :: ExprSize -> ExprSize -> ExprSize
977 maxSize TooBig _ = TooBig
978 maxSize _ TooBig = TooBig
979 maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1
980 | otherwise = s2
981
982 sizeZero :: ExprSize
983 sizeN :: Int -> ExprSize
984
985 sizeZero = SizeIs 0 emptyBag 0
986 sizeN n = SizeIs n emptyBag 0
987
988 {-
989 ************************************************************************
990 * *
991 \subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
992 * *
993 ************************************************************************
994
995 We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
996 we ``couldn't possibly use'' on the other side. Can be overridden w/
997 flaggery. Just the same as smallEnoughToInline, except that it has no
998 actual arguments.
999 -}
1000
1001 couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1002 couldBeSmallEnoughToInline dflags threshold rhs
1003 = case sizeExpr dflags threshold [] body of
1004 TooBig -> False
1005 _ -> True
1006 where
1007 (_, body) = collectBinders rhs
1008
1009 ----------------
1010 smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1011 smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1012 = size <= ufUseThreshold dflags
1013 smallEnoughToInline _ _
1014 = False
1015
1016 ----------------
1017
1018 certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1019 -- Sees if the unfolding is pretty certain to inline
1020 -- If so, return a *stable* unfolding for it, that will always inline
1021 certainlyWillInline dflags fn_info
1022 = case unfoldingInfo fn_info of
1023 CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1024 | loop_breaker -> Nothing -- Won't inline, so try w/w
1025 | otherwise -> do_cunf e g -- Depends on size, so look at that
1026
1027 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense
1028 -- to do so, and even if it is currently a
1029 -- loop breaker, it may not be later
1030
1031 _other_unf -> Nothing
1032
1033 where
1034 loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1035 fn_unf = unfoldingInfo fn_info
1036
1037 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1038 do_cunf _ UnfNever = Nothing
1039 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1040 -- INLINE functions have UnfWhen
1041
1042 -- The UnfIfGoodArgs case seems important. If we w/w small functions
1043 -- binary sizes go up by 10%! (This is with SplitObjs.)
1044 -- I'm not totally sure why.
1045 -- INLINABLE functions come via this path
1046 -- See Note [certainlyWillInline: INLINABLE]
1047 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1048 | not (null args) -- See Note [certainlyWillInline: be careful of thunks]
1049 , case inlinePragmaSpec (inlinePragInfo fn_info) of
1050 NoInline -> False -- NOINLINE; do not say certainlyWillInline!
1051 _ -> True -- INLINE, INLINABLE, or nothing
1052 , not (isBottomingSig (strictnessInfo fn_info))
1053 -- Do not unconditionally inline a bottoming functions even if
1054 -- it seems smallish. We've carefully lifted it out to top level,
1055 -- so we don't want to re-inline it.
1056 , let arity = length args
1057 , size - (10 * (arity + 1)) <= ufUseThreshold dflags
1058 = Just (fn_unf { uf_src = InlineStable
1059 , uf_guidance = UnfWhen { ug_arity = arity
1060 , ug_unsat_ok = unSaturatedOk
1061 , ug_boring_ok = inlineBoringOk expr } })
1062 -- Note the "unsaturatedOk". A function like f = \ab. a
1063 -- will certainly inline, even if partially applied (f e), so we'd
1064 -- better make sure that the transformed inlining has the same property
1065 | otherwise
1066 = Nothing
1067
1068 {- Note [certainlyWillInline: be careful of thunks]
1069 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1070 Don't claim that thunks will certainly inline, because that risks work
1071 duplication. Even if the work duplication is not great (eg is_cheap
1072 holds), it can make a big difference in an inner loop In Trac #5623 we
1073 found that the WorkWrap phase thought that
1074 y = case x of F# v -> F# (v +# v)
1075 was certainlyWillInline, so the addition got duplicated.
1076
1077 Note [certainlyWillInline: INLINABLE]
1078 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1079 certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1080 even though we have a stable inlining, so that strictness w/w takes
1081 place. It makes a big difference to efficiency, and the w/w pass knows
1082 how to transfer the INLINABLE info to the worker; see WorkWrap
1083 Note [Worker-wrapper for INLINABLE functions]
1084
1085 ************************************************************************
1086 * *
1087 \subsection{callSiteInline}
1088 * *
1089 ************************************************************************
1090
1091 This is the key function. It decides whether to inline a variable at a call site
1092
1093 callSiteInline is used at call sites, so it is a bit more generous.
1094 It's a very important function that embodies lots of heuristics.
1095 A non-WHNF can be inlined if it doesn't occur inside a lambda,
1096 and occurs exactly once or
1097 occurs once in each branch of a case and is small
1098
1099 If the thing is in WHNF, there's no danger of duplicating work,
1100 so we can inline if it occurs once, or is small
1101
1102 NOTE: we don't want to inline top-level functions that always diverge.
1103 It just makes the code bigger. Tt turns out that the convenient way to prevent
1104 them inlining is to give them a NOINLINE pragma, which we do in
1105 StrictAnal.addStrictnessInfoToTopId
1106 -}
1107
1108 callSiteInline :: DynFlags
1109 -> Id -- The Id
1110 -> Bool -- True <=> unfolding is active
1111 -> Bool -- True if there are no arguments at all (incl type args)
1112 -> [ArgSummary] -- One for each value arg; True if it is interesting
1113 -> CallCtxt -- True <=> continuation is interesting
1114 -> Maybe CoreExpr -- Unfolding, if any
1115
1116 data ArgSummary = TrivArg -- Nothing interesting
1117 | NonTrivArg -- Arg has structure
1118 | ValueArg -- Arg is a con-app or PAP
1119 -- ..or con-like. Note [Conlike is interesting]
1120
1121 instance Outputable ArgSummary where
1122 ppr TrivArg = text "TrivArg"
1123 ppr NonTrivArg = text "NonTrivArg"
1124 ppr ValueArg = text "ValueArg"
1125
1126 nonTriv :: ArgSummary -> Bool
1127 nonTriv TrivArg = False
1128 nonTriv _ = True
1129
1130 data CallCtxt
1131 = BoringCtxt
1132 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets]
1133 | DiscArgCtxt -- Argument of a function with non-zero arg discount
1134 | RuleArgCtxt -- We are somewhere in the argument of a function with rules
1135
1136 | ValAppCtxt -- We're applied to at least one value arg
1137 -- This arises when we have ((f x |> co) y)
1138 -- Then the (f x) has argument 'x' but in a ValAppCtxt
1139
1140 | CaseCtxt -- We're the scrutinee of a case
1141 -- that decomposes its scrutinee
1142
1143 instance Outputable CallCtxt where
1144 ppr CaseCtxt = text "CaseCtxt"
1145 ppr ValAppCtxt = text "ValAppCtxt"
1146 ppr BoringCtxt = text "BoringCtxt"
1147 ppr RhsCtxt = text "RhsCtxt"
1148 ppr DiscArgCtxt = text "DiscArgCtxt"
1149 ppr RuleArgCtxt = text "RuleArgCtxt"
1150
1151 callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1152 = case idUnfolding id of
1153 -- idUnfolding checks for loop-breakers, returning NoUnfolding
1154 -- Things with an INLINE pragma may have an unfolding *and*
1155 -- be a loop breaker (maybe the knot is not yet untied)
1156 CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top
1157 , uf_is_work_free = is_wf
1158 , uf_guidance = guidance, uf_expandable = is_exp }
1159 | active_unfolding -> tryUnfolding dflags id lone_variable
1160 arg_infos cont_info unf_template is_top
1161 is_wf is_exp guidance
1162 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1163 NoUnfolding -> Nothing
1164 BootUnfolding -> Nothing
1165 OtherCon {} -> Nothing
1166 DFunUnfolding {} -> Nothing -- Never unfold a DFun
1167
1168 traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1169 traceInline dflags inline_id str doc result
1170 | Just prefix <- inlineCheck dflags
1171 = if prefix `isPrefixOf` occNameString (getOccName inline_id)
1172 then pprTrace str doc result
1173 else result
1174 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1175 = pprTrace str doc result
1176 | otherwise
1177 = result
1178
1179 tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1180 -> CoreExpr -> Bool -> Bool -> Bool -> UnfoldingGuidance
1181 -> Maybe CoreExpr
1182 tryUnfolding dflags id lone_variable
1183 arg_infos cont_info unf_template is_top
1184 is_wf is_exp guidance
1185 = case guidance of
1186 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1187
1188 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1189 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1190 -- See Note [INLINE for small functions (3)]
1191 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1192 | otherwise
1193 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1194 where
1195 some_benefit = calc_some_benefit uf_arity
1196 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1197
1198 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1199 | ufVeryAggressive dflags
1200 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1201 | is_wf && some_benefit && small_enough
1202 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1203 | otherwise
1204 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1205 where
1206 some_benefit = calc_some_benefit (length arg_discounts)
1207 extra_doc = text "discounted size =" <+> int discounted_size
1208 discounted_size = size - discount
1209 small_enough = discounted_size <= ufUseThreshold dflags
1210 discount = computeDiscount dflags arg_discounts
1211 res_discount arg_infos cont_info
1212
1213 where
1214 mk_doc some_benefit extra_doc yes_or_no
1215 = vcat [ text "arg infos" <+> ppr arg_infos
1216 , text "interesting continuation" <+> ppr cont_info
1217 , text "some_benefit" <+> ppr some_benefit
1218 , text "is exp:" <+> ppr is_exp
1219 , text "is work-free:" <+> ppr is_wf
1220 , text "guidance" <+> ppr guidance
1221 , extra_doc
1222 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1223
1224 str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1225 n_val_args = length arg_infos
1226
1227 -- some_benefit is used when the RHS is small enough
1228 -- and the call has enough (or too many) value
1229 -- arguments (ie n_val_args >= arity). But there must
1230 -- be *something* interesting about some argument, or the
1231 -- result context, to make it worth inlining
1232 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args
1233 -- expected by the unfolding
1234 calc_some_benefit uf_arity
1235 | not saturated = interesting_args -- Under-saturated
1236 -- Note [Unsaturated applications]
1237 | otherwise = interesting_args -- Saturated or over-saturated
1238 || interesting_call
1239 where
1240 saturated = n_val_args >= uf_arity
1241 over_saturated = n_val_args > uf_arity
1242 interesting_args = any nonTriv arg_infos
1243 -- NB: (any nonTriv arg_infos) looks at the
1244 -- over-saturated args too which is "wrong";
1245 -- but if over-saturated we inline anyway.
1246
1247 interesting_call
1248 | over_saturated
1249 = True
1250 | otherwise
1251 = case cont_info of
1252 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables]
1253 ValAppCtxt -> True -- Note [Cast then apply]
1254 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts]
1255 DiscArgCtxt -> uf_arity > 0 --
1256 RhsCtxt -> uf_arity > 0 --
1257 _ -> not is_top && uf_arity > 0 -- Note [Nested functions]
1258 -- Note [Inlining in ArgCtxt]
1259
1260 {-
1261 Note [Unfold into lazy contexts], Note [RHS of lets]
1262 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1263 When the call is the argument of a function with a RULE, or the RHS of a let,
1264 we are a little bit keener to inline. For example
1265 f y = (y,y,y)
1266 g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1267 We'd inline 'f' if the call was in a case context, and it kind-of-is,
1268 only we can't see it. Also
1269 x = f v
1270 could be expensive whereas
1271 x = case v of (a,b) -> a
1272 is patently cheap and may allow more eta expansion.
1273 So we treat the RHS of a let as not-totally-boring.
1274
1275 Note [Unsaturated applications]
1276 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1277 When a call is not saturated, we *still* inline if one of the
1278 arguments has interesting structure. That's sometimes very important.
1279 A good example is the Ord instance for Bool in Base:
1280
1281 Rec {
1282 $fOrdBool =GHC.Classes.D:Ord
1283 @ Bool
1284 ...
1285 $cmin_ajX
1286
1287 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1288 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1289 }
1290
1291 But the defn of GHC.Classes.$dmmin is:
1292
1293 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1294 {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1295 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1296 case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1297 GHC.Types.False -> y GHC.Types.True -> x }) -}
1298
1299 We *really* want to inline $dmmin, even though it has arity 3, in
1300 order to unravel the recursion.
1301
1302
1303 Note [Things to watch]
1304 ~~~~~~~~~~~~~~~~~~~~~~
1305 * { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1306 Assume x is exported, so not inlined unconditionally.
1307 Then we want x to inline unconditionally; no reason for it
1308 not to, and doing so avoids an indirection.
1309
1310 * { x = I# 3; ....f x.... }
1311 Make sure that x does not inline unconditionally!
1312 Lest we get extra allocation.
1313
1314 Note [Inlining an InlineRule]
1315 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1316 An InlineRules is used for
1317 (a) programmer INLINE pragmas
1318 (b) inlinings from worker/wrapper
1319
1320 For (a) the RHS may be large, and our contract is that we *only* inline
1321 when the function is applied to all the arguments on the LHS of the
1322 source-code defn. (The uf_arity in the rule.)
1323
1324 However for worker/wrapper it may be worth inlining even if the
1325 arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1326 require saturation.
1327
1328
1329 Note [Nested functions]
1330 ~~~~~~~~~~~~~~~~~~~~~~~
1331 If a function has a nested defn we also record some-benefit, on the
1332 grounds that we are often able to eliminate the binding, and hence the
1333 allocation, for the function altogether; this is good for join points.
1334 But this only makes sense for *functions*; inlining a constructor
1335 doesn't help allocation unless the result is scrutinised. UNLESS the
1336 constructor occurs just once, albeit possibly in multiple case
1337 branches. Then inlining it doesn't increase allocation, but it does
1338 increase the chance that the constructor won't be allocated at all in
1339 the branches that don't use it.
1340
1341 Note [Cast then apply]
1342 ~~~~~~~~~~~~~~~~~~~~~~
1343 Consider
1344 myIndex = __inline_me ( (/\a. <blah>) |> co )
1345 co :: (forall a. a -> a) ~ (forall a. T a)
1346 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1347
1348 We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1349 no value arguments. The ValAppCtxt gives it enough incentive to inline.
1350
1351 Note [Inlining in ArgCtxt]
1352 ~~~~~~~~~~~~~~~~~~~~~~~~~~
1353 The condition (arity > 0) here is very important, because otherwise
1354 we end up inlining top-level stuff into useless places; eg
1355 x = I# 3#
1356 f = \y. g x
1357 This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1358 and 20% to 'power'.
1359
1360 At one stage I replaced this condition by 'True' (leading to the above
1361 slow-down). The motivation was test eyeball/inline1.hs; but that seems
1362 to work ok now.
1363
1364 NOTE: arguably, we should inline in ArgCtxt only if the result of the
1365 call is at least CONLIKE. At least for the cases where we use ArgCtxt
1366 for the RHS of a 'let', we only profit from the inlining if we get a
1367 CONLIKE thing (modulo lets).
1368
1369 Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables]
1370 ~~~~~~~~~~~~~~~~~~~~~ which appears below
1371 The "lone-variable" case is important. I spent ages messing about
1372 with unsatisfactory variants, but this is nice. The idea is that if a
1373 variable appears all alone
1374
1375 as an arg of lazy fn, or rhs BoringCtxt
1376 as scrutinee of a case CaseCtxt
1377 as arg of a fn ArgCtxt
1378 AND
1379 it is bound to a cheap expression
1380
1381 then we should not inline it (unless there is some other reason,
1382 e.g. it is the sole occurrence). That is what is happening at
1383 the use of 'lone_variable' in 'interesting_call'.
1384
1385 Why? At least in the case-scrutinee situation, turning
1386 let x = (a,b) in case x of y -> ...
1387 into
1388 let x = (a,b) in case (a,b) of y -> ...
1389 and thence to
1390 let x = (a,b) in let y = (a,b) in ...
1391 is bad if the binding for x will remain.
1392
1393 Another example: I discovered that strings
1394 were getting inlined straight back into applications of 'error'
1395 because the latter is strict.
1396 s = "foo"
1397 f = \x -> ...(error s)...
1398
1399 Fundamentally such contexts should not encourage inlining because, provided
1400 the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1401 context can ``see'' the unfolding of the variable (e.g. case or a
1402 RULE) so there's no gain.
1403
1404 However, watch out:
1405
1406 * Consider this:
1407 foo = _inline_ (\n. [n])
1408 bar = _inline_ (foo 20)
1409 baz = \n. case bar of { (m:_) -> m + n }
1410 Here we really want to inline 'bar' so that we can inline 'foo'
1411 and the whole thing unravels as it should obviously do. This is
1412 important: in the NDP project, 'bar' generates a closure data
1413 structure rather than a list.
1414
1415 So the non-inlining of lone_variables should only apply if the
1416 unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1417 looks through the unfolding. Hence the "&& is_wf" in the
1418 InlineRule branch.
1419
1420 * Even a type application or coercion isn't a lone variable.
1421 Consider
1422 case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1423 We had better inline that sucker! The case won't see through it.
1424
1425 For now, I'm treating treating a variable applied to types
1426 in a *lazy* context "lone". The motivating example was
1427 f = /\a. \x. BIG
1428 g = /\a. \y. h (f a)
1429 There's no advantage in inlining f here, and perhaps
1430 a significant disadvantage. Hence some_val_args in the Stop case
1431
1432 Note [Interaction of exprIsWorkFree and lone variables]
1433 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1434 The lone-variable test says "don't inline if a case expression
1435 scrutinises a lone variable whose unfolding is cheap". It's very
1436 important that, under these circumstances, exprIsConApp_maybe
1437 can spot a constructor application. So, for example, we don't
1438 consider
1439 let x = e in (x,x)
1440 to be cheap, and that's good because exprIsConApp_maybe doesn't
1441 think that expression is a constructor application.
1442
1443 In the 'not (lone_variable && is_wf)' test, I used to test is_value
1444 rather than is_wf, which was utterly wrong, because the above
1445 expression responds True to exprIsHNF, which is what sets is_value.
1446
1447 This kind of thing can occur if you have
1448
1449 {-# INLINE foo #-}
1450 foo = let x = e in (x,x)
1451
1452 which Roman did.
1453 -}
1454
1455 computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1456 -> Int
1457 computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1458 -- We multiple the raw discounts (args_discount and result_discount)
1459 -- ty opt_UnfoldingKeenessFactor because the former have to do with
1460 -- *size* whereas the discounts imply that there's some extra
1461 -- *efficiency* to be gained (e.g. beta reductions, case reductions)
1462 -- by inlining.
1463
1464 = 10 -- Discount of 10 because the result replaces the call
1465 -- so we count 10 for the function itself
1466
1467 + 10 * length actual_arg_discounts
1468 -- Discount of 10 for each arg supplied,
1469 -- because the result replaces the call
1470
1471 + round (ufKeenessFactor dflags *
1472 fromIntegral (total_arg_discount + res_discount'))
1473 where
1474 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1475 total_arg_discount = sum actual_arg_discounts
1476
1477 mk_arg_discount _ TrivArg = 0
1478 mk_arg_discount _ NonTrivArg = 10
1479 mk_arg_discount discount ValueArg = discount
1480
1481 res_discount'
1482 | LT <- arg_discounts `compareLength` arg_infos
1483 = res_discount -- Over-saturated
1484 | otherwise
1485 = case cont_info of
1486 BoringCtxt -> 0
1487 CaseCtxt -> res_discount -- Presumably a constructor
1488 ValAppCtxt -> res_discount -- Presumably a function
1489 _ -> 40 `min` res_discount
1490 -- ToDo: this 40 `min` res_discount doesn't seem right
1491 -- for DiscArgCtxt it shouldn't matter because the function will
1492 -- get the arg discount for any non-triv arg
1493 -- for RuleArgCtxt we do want to be keener to inline; but not only
1494 -- constructor results
1495 -- for RhsCtxt I suppose that exposing a data con is good in general
1496 -- And 40 seems very arbitrary
1497 --
1498 -- res_discount can be very large when a function returns
1499 -- constructors; but we only want to invoke that large discount
1500 -- when there's a case continuation.
1501 -- Otherwise we, rather arbitrarily, threshold it. Yuk.
1502 -- But we want to aovid inlining large functions that return
1503 -- constructors into contexts that are simply "interesting"