ghc-prim: Don't allocate a thunk for each unpacked UTF-8 character
[ghc.git] / libraries / ghc-prim / GHC / CString.hs
1 {-# LANGUAGE MagicHash, NoImplicitPrelude, BangPatterns #-}
2 -----------------------------------------------------------------------------
3 -- |
4 -- Module : GHC.CString
5 -- Copyright : (c) The University of Glasgow 2011
6 -- License : see libraries/ghc-prim/LICENSE
7 --
8 -- Maintainer : cvs-ghc@haskell.org
9 -- Stability : internal
10 -- Portability : non-portable (GHC Extensions)
11 --
12 -- GHC C strings definitions (previously in GHC.Base).
13 -- Use GHC.Exts from the base package instead of importing this
14 -- module directly.
15 --
16 -----------------------------------------------------------------------------
17
18 module GHC.CString (
19 unpackCString#, unpackAppendCString#, unpackFoldrCString#,
20 unpackCStringUtf8#, unpackNBytes#
21 ) where
22
23 import GHC.Types
24 import GHC.Prim
25
26 -----------------------------------------------------------------------------
27 -- Unpacking C strings
28 -----------------------------------------------------------------------------
29
30 -- This code is needed for virtually all programs, since it's used for
31 -- unpacking the strings of error messages.
32
33 -- Used to be in GHC.Base, but was moved to ghc-prim because the new generics
34 -- stuff uses Strings in the representation, so to give representations for
35 -- ghc-prim types we need unpackCString#
36
37 {- Note [Inlining unpackCString#]
38 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
39 There's really no point in ever inlining things like unpackCString# as the loop
40 doesn't specialise in an interesting way and we can't deforest the list
41 constructors (we'd want to use unpackFoldrCString# for this). Moreover, it's
42 pretty small, so there's a danger that it'll be inlined at every literal, which
43 is a waste.
44
45 Moreover, inlining early may interfere with a variety of rules that are supposed
46 to match unpackCString#,
47
48 * BuiltInRules in PrelRules.hs; e.g.
49 eqString (unpackCString# (Lit s1)) (unpackCString# (Lit s2)
50 = s1 == s2
51
52 * unpacking rules; e.g. in GHC.Base,
53 unpackCString# a
54 = build (unpackFoldrCString# a)
55
56 * stream fusion rules; e.g. in the `text` library,
57 unstream (S.map safe (S.streamList (GHC.unpackCString# a)))
58 = unpackCString# a
59
60 Moreover, we want to make it CONLIKE, so that:
61
62 * the rules in PrelRules will fire when the string is let-bound.
63 E.g. the eqString rule in PrelRules
64 eqString (unpackCString# (Lit s1)) (unpackCString# (Lit s2) = s1==s2
65
66 * exprIsConApp_maybe will see the string when we have
67 let x = unpackCString# "foo"#
68 ...(case x of algs)...
69
70 All of this goes for unpackCStringUtf8# too.
71 -}
72
73 unpackCString# :: Addr# -> [Char]
74 {-# NOINLINE CONLIKE unpackCString# #-}
75 unpackCString# addr
76 = unpack 0#
77 where
78 unpack nh
79 | isTrue# (ch `eqChar#` '\0'#) = []
80 | True = C# ch : unpack (nh +# 1#)
81 where
82 !ch = indexCharOffAddr# addr nh
83
84 unpackAppendCString# :: Addr# -> [Char] -> [Char]
85 {-# NOINLINE unpackAppendCString# #-}
86 -- See the NOINLINE note on unpackCString#
87 unpackAppendCString# addr rest
88 = unpack 0#
89 where
90 unpack nh
91 | isTrue# (ch `eqChar#` '\0'#) = rest
92 | True = C# ch : unpack (nh +# 1#)
93 where
94 !ch = indexCharOffAddr# addr nh
95
96 unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a
97
98 -- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#
99
100 -- It also has a BuiltInRule in PrelRules.hs:
101 -- unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n)
102 -- = unpackFoldrCString# "foobaz" c n
103
104 {-# NOINLINE unpackFoldrCString# #-}
105 -- At one stage I had NOINLINE [0] on the grounds that, unlike
106 -- unpackCString#, there *is* some point in inlining
107 -- unpackFoldrCString#, because we get better code for the
108 -- higher-order function call. BUT there may be a lot of
109 -- literal strings, and making a separate 'unpack' loop for
110 -- each is highly gratuitous. See nofib/real/anna/PrettyPrint.
111
112 unpackFoldrCString# addr f z
113 = unpack 0#
114 where
115 unpack nh
116 | isTrue# (ch `eqChar#` '\0'#) = z
117 | True = C# ch `f` unpack (nh +# 1#)
118 where
119 !ch = indexCharOffAddr# addr nh
120
121 -- There's really no point in inlining this for the same reasons as
122 -- unpackCString. See Note [Inlining unpackCString#] above for details.
123 unpackCStringUtf8# :: Addr# -> [Char]
124 {-# NOINLINE CONLIKE unpackCStringUtf8# #-}
125 unpackCStringUtf8# addr
126 = unpack 0#
127 where
128 -- We take care to strictly evaluate the character decoding as
129 -- indexCharOffAddr# is marked with the can_fail flag and
130 -- consequently GHC won't evaluate the expression unless it is absolutely
131 -- needed.
132 unpack nh
133 | isTrue# (ch `eqChar#` '\0'# ) = []
134 | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpack (nh +# 1#)
135 | isTrue# (ch `leChar#` '\xDF'#) =
136 let !c = C# (chr# (((ord# ch -# 0xC0#) `uncheckedIShiftL#` 6#) +#
137 (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#)))
138 in c : unpack (nh +# 2#)
139 | isTrue# (ch `leChar#` '\xEF'#) =
140 let !c = C# (chr# (((ord# ch -# 0xE0#) `uncheckedIShiftL#` 12#) +#
141 ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 6#) +#
142 (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#)))
143 in c : unpack (nh +# 3#)
144 | True =
145 let !c = C# (chr# (((ord# ch -# 0xF0#) `uncheckedIShiftL#` 18#) +#
146 ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +#
147 ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#` 6#) +#
148 (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#)))
149 in c : unpack (nh +# 4#)
150 where
151 !ch = indexCharOffAddr# addr nh
152
153 -- There's really no point in inlining this for the same reasons as
154 -- unpackCString. See Note [Inlining unpackCString#] above for details.
155 unpackNBytes# :: Addr# -> Int# -> [Char]
156 {-# NOINLINE unpackNBytes# #-}
157 unpackNBytes# _addr 0# = []
158 unpackNBytes# addr len# = unpack [] (len# -# 1#)
159 where
160 unpack acc i#
161 | isTrue# (i# <# 0#) = acc
162 | True =
163 case indexCharOffAddr# addr i# of
164 ch -> unpack (C# ch : acc) (i# -# 1#)
165