Merge branch 'wip/8-8-ghci' of gitlab.haskell.org:ghc/ghc into wip/ghc-8.8-merges
[ghc.git] / compiler / utils / StringBuffer.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-}
10 {-# OPTIONS_GHC -O2 #-}
11 -- We always optimise this, otherwise performance of a non-optimised
12 -- compiler is severely affected
13
14 module StringBuffer
15 (
16 StringBuffer(..),
17 -- non-abstract for vs\/HaskellService
18
19 -- * Creation\/destruction
20 hGetStringBuffer,
21 hGetStringBufferBlock,
22 hPutStringBuffer,
23 appendStringBuffers,
24 stringToStringBuffer,
25
26 -- * Inspection
27 nextChar,
28 currentChar,
29 prevChar,
30 atEnd,
31
32 -- * Moving and comparison
33 stepOn,
34 offsetBytes,
35 byteDiff,
36 atLine,
37
38 -- * Conversion
39 lexemeToString,
40 lexemeToFastString,
41 decodePrevNChars,
42
43 -- * Parsing integers
44 parseUnsignedInteger,
45 ) where
46
47 #include "HsVersions.h"
48
49 import GhcPrelude
50
51 import Encoding
52 import FastString
53 import FastFunctions
54 import PlainPanic
55 import Util
56
57 import Data.Maybe
58 import Control.Exception
59 import System.IO
60 import System.IO.Unsafe ( unsafePerformIO )
61 import GHC.IO.Encoding.UTF8 ( mkUTF8 )
62 import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) )
63
64 import GHC.Exts
65
66 import Foreign
67
68 -- -----------------------------------------------------------------------------
69 -- The StringBuffer type
70
71 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
72 -- The bytes are intended to be *immutable*. There are pure
73 -- operations to read the contents of a StringBuffer.
74 --
75 -- A StringBuffer may have a finalizer, depending on how it was
76 -- obtained.
77 --
78 data StringBuffer
79 = StringBuffer {
80 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
81 len :: {-# UNPACK #-} !Int, -- length
82 cur :: {-# UNPACK #-} !Int -- current pos
83 }
84 -- The buffer is assumed to be UTF-8 encoded, and furthermore
85 -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
86 -- decoder doesn't have to check for overflow at every single byte
87 -- of a multibyte sequence.
88
89 instance Show StringBuffer where
90 showsPrec _ s = showString "<stringbuffer("
91 . shows (len s) . showString "," . shows (cur s)
92 . showString ")>"
93
94 -- -----------------------------------------------------------------------------
95 -- Creation / Destruction
96
97 -- | Read a file into a 'StringBuffer'. The resulting buffer is automatically
98 -- managed by the garbage collector.
99 hGetStringBuffer :: FilePath -> IO StringBuffer
100 hGetStringBuffer fname = do
101 h <- openBinaryFile fname ReadMode
102 size_i <- hFileSize h
103 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
104 let size = fromIntegral $ size_i - offset_i
105 buf <- mallocForeignPtrArray (size+3)
106 withForeignPtr buf $ \ptr -> do
107 r <- if size == 0 then return 0 else hGetBuf h ptr size
108 hClose h
109 if (r /= size)
110 then ioError (userError "short read of file")
111 else newUTF8StringBuffer buf ptr size
112
113 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
114 hGetStringBufferBlock handle wanted
115 = do size_i <- hFileSize handle
116 offset_i <- hTell handle >>= skipBOM handle size_i
117 let size = min wanted (fromIntegral $ size_i-offset_i)
118 buf <- mallocForeignPtrArray (size+3)
119 withForeignPtr buf $ \ptr ->
120 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
121 if r /= size
122 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
123 else newUTF8StringBuffer buf ptr size
124
125 hPutStringBuffer :: Handle -> StringBuffer -> IO ()
126 hPutStringBuffer hdl (StringBuffer buf len cur)
127 = do withForeignPtr (plusForeignPtr buf cur) $ \ptr ->
128 hPutBuf hdl ptr len
129
130 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
131 -- and return the new position of the handle in bytes.
132 --
133 -- This is better than treating #FEFF as whitespace,
134 -- because that would mess up layout. We don't have a concept
135 -- of zero-width whitespace in Haskell: all whitespace codepoints
136 -- have a width of one column.
137 skipBOM :: Handle -> Integer -> Integer -> IO Integer
138 skipBOM h size offset =
139 -- Only skip BOM at the beginning of a file.
140 if size > 0 && offset == 0
141 then do
142 -- Validate assumption that handle is in binary mode.
143 ASSERTM( hGetEncoding h >>= return . isNothing )
144 -- Temporarily select utf8 encoding with error ignoring,
145 -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
146 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
147 c <- hLookAhead h
148 if c == '\xfeff'
149 then hGetChar h >> hTell h
150 else return offset
151 else return offset
152 where
153 safeEncoding = mkUTF8 IgnoreCodingFailure
154
155 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
156 newUTF8StringBuffer buf ptr size = do
157 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
158 -- sentinels for UTF-8 decoding
159 return $ StringBuffer buf size 0
160
161 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
162 appendStringBuffers sb1 sb2
163 = do newBuf <- mallocForeignPtrArray (size+3)
164 withForeignPtr newBuf $ \ptr ->
165 withForeignPtr (buf sb1) $ \sb1Ptr ->
166 withForeignPtr (buf sb2) $ \sb2Ptr ->
167 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
168 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
169 pokeArray (ptr `advancePtr` size) [0,0,0]
170 return (StringBuffer newBuf size 0)
171 where sb1_len = calcLen sb1
172 sb2_len = calcLen sb2
173 calcLen sb = len sb - cur sb
174 size = sb1_len + sb2_len
175
176 -- | Encode a 'String' into a 'StringBuffer' as UTF-8. The resulting buffer
177 -- is automatically managed by the garbage collector.
178 stringToStringBuffer :: String -> StringBuffer
179 stringToStringBuffer str =
180 unsafePerformIO $ do
181 let size = utf8EncodedLength str
182 buf <- mallocForeignPtrArray (size+3)
183 withForeignPtr buf $ \ptr -> do
184 utf8EncodeString ptr str
185 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
186 -- sentinels for UTF-8 decoding
187 return (StringBuffer buf size 0)
188
189 -- -----------------------------------------------------------------------------
190 -- Grab a character
191
192 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
193 -- the remaining portion (analogous to 'Data.List.uncons'). __Warning:__ The
194 -- behavior is undefined if the 'StringBuffer' is empty. The result shares
195 -- the same buffer as the original. Similar to 'utf8DecodeChar', if the
196 -- character cannot be decoded as UTF-8, @\'\\0\'@ is returned.
197 {-# INLINE nextChar #-}
198 nextChar :: StringBuffer -> (Char,StringBuffer)
199 nextChar (StringBuffer buf len (I# cur#)) =
200 -- Getting our fingers dirty a little here, but this is performance-critical
201 inlinePerformIO $ do
202 withForeignPtr buf $ \(Ptr a#) -> do
203 case utf8DecodeChar# (a# `plusAddr#` cur#) of
204 (# c#, nBytes# #) ->
205 let cur' = I# (cur# +# nBytes#) in
206 return (C# c#, StringBuffer buf len cur')
207
208 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
209 -- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
210 -- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character
211 -- cannot be decoded as UTF-8, @\'\\0\'@ is returned.
212 currentChar :: StringBuffer -> Char
213 currentChar = fst . nextChar
214
215 prevChar :: StringBuffer -> Char -> Char
216 prevChar (StringBuffer _ _ 0) deflt = deflt
217 prevChar (StringBuffer buf _ cur) _ =
218 inlinePerformIO $ do
219 withForeignPtr buf $ \p -> do
220 p' <- utf8PrevChar (p `plusPtr` cur)
221 return (fst (utf8DecodeChar p'))
222
223 -- -----------------------------------------------------------------------------
224 -- Moving
225
226 -- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
227 -- to 'Data.List.tail'). __Warning:__ The behavior is undefined if the
228 -- 'StringBuffer' is empty. The result shares the same buffer as the
229 -- original.
230 stepOn :: StringBuffer -> StringBuffer
231 stepOn s = snd (nextChar s)
232
233 -- | Return a 'StringBuffer' with the first @n@ bytes removed. __Warning:__
234 -- If there aren't enough characters, the returned 'StringBuffer' will be
235 -- invalid and any use of it may lead to undefined behavior. The result
236 -- shares the same buffer as the original.
237 offsetBytes :: Int -- ^ @n@, the number of bytes
238 -> StringBuffer
239 -> StringBuffer
240 offsetBytes i s = s { cur = cur s + i }
241
242 -- | Compute the difference in offset between two 'StringBuffer's that share
243 -- the same buffer. __Warning:__ The behavior is undefined if the
244 -- 'StringBuffer's use separate buffers.
245 byteDiff :: StringBuffer -> StringBuffer -> Int
246 byteDiff s1 s2 = cur s2 - cur s1
247
248 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
249 atEnd :: StringBuffer -> Bool
250 atEnd (StringBuffer _ l c) = l == c
251
252 -- | Computes a 'StringBuffer' which points to the first character of the
253 -- wanted line. Lines begin at 1.
254 atLine :: Int -> StringBuffer -> Maybe StringBuffer
255 atLine line sb@(StringBuffer buf len _) =
256 inlinePerformIO $
257 withForeignPtr buf $ \p -> do
258 p' <- skipToLine line len p
259 if p' == nullPtr
260 then return Nothing
261 else
262 let
263 delta = p' `minusPtr` p
264 in return $ Just (sb { cur = delta
265 , len = len - delta
266 })
267
268 skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
269 skipToLine !line !len !op0 = go 1 op0
270 where
271 !opend = op0 `plusPtr` len
272
273 go !i_line !op
274 | op >= opend = pure nullPtr
275 | i_line == line = pure op
276 | otherwise = do
277 w <- peek op :: IO Word8
278 case w of
279 10 -> go (i_line + 1) (plusPtr op 1)
280 13 -> do
281 -- this is safe because a 'StringBuffer' is
282 -- guaranteed to have 3 bytes sentinel values.
283 w' <- peek (plusPtr op 1) :: IO Word8
284 case w' of
285 10 -> go (i_line + 1) (plusPtr op 2)
286 _ -> go (i_line + 1) (plusPtr op 1)
287 _ -> go i_line (plusPtr op 1)
288
289 -- -----------------------------------------------------------------------------
290 -- Conversion
291
292 -- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
293 -- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
294 -- they will be replaced with @\'\\0\'@.
295 lexemeToString :: StringBuffer
296 -> Int -- ^ @n@, the number of bytes
297 -> String
298 lexemeToString _ 0 = ""
299 lexemeToString (StringBuffer buf _ cur) bytes =
300 utf8DecodeStringLazy buf cur bytes
301
302 lexemeToFastString :: StringBuffer
303 -> Int -- ^ @n@, the number of bytes
304 -> FastString
305 lexemeToFastString _ 0 = nilFS
306 lexemeToFastString (StringBuffer buf _ cur) len =
307 inlinePerformIO $
308 withForeignPtr buf $ \ptr ->
309 return $! mkFastStringBytes (ptr `plusPtr` cur) len
310
311 -- | Return the previous @n@ characters (or fewer if we are less than @n@
312 -- characters into the buffer.
313 decodePrevNChars :: Int -> StringBuffer -> String
314 decodePrevNChars n (StringBuffer buf _ cur) =
315 inlinePerformIO $ withForeignPtr buf $ \p0 ->
316 go p0 n "" (p0 `plusPtr` (cur - 1))
317 where
318 go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
319 go buf0 n acc p | n == 0 || buf0 >= p = return acc
320 go buf0 n acc p = do
321 p' <- utf8PrevChar p
322 let (c,_) = utf8DecodeChar p'
323 go buf0 (n - 1) (c:acc) p'
324
325 -- -----------------------------------------------------------------------------
326 -- Parsing integer strings in various bases
327 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
328 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
329 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
330 go i x | i == len = x
331 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
332 '_' -> go (i + 1) x -- skip "_" (#14473)
333 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
334 in go 0 0