Caret diag.: Avoid decoding whole module if only specific line is needed
[ghc.git] / compiler / utils / StringBuffer.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-}
10 {-# OPTIONS_GHC -O #-}
11 -- We always optimise this, otherwise performance of a non-optimised
12 -- compiler is severely affected
13
14 module StringBuffer
15 (
16 StringBuffer(..),
17 -- non-abstract for vs\/HaskellService
18
19 -- * Creation\/destruction
20 hGetStringBuffer,
21 hGetStringBufferBlock,
22 appendStringBuffers,
23 stringToStringBuffer,
24
25 -- * Inspection
26 nextChar,
27 currentChar,
28 prevChar,
29 atEnd,
30
31 -- * Moving and comparison
32 stepOn,
33 offsetBytes,
34 byteDiff,
35 atLine,
36
37 -- * Conversion
38 lexemeToString,
39 lexemeToFastString,
40 decodePrevNChars,
41
42 -- * Parsing integers
43 parseUnsignedInteger,
44 ) where
45
46 #include "HsVersions.h"
47
48 import Encoding
49 import FastString
50 import FastFunctions
51 import Outputable
52 import Util
53
54 import Data.Maybe
55 import Control.Exception
56 import System.IO
57 import System.IO.Unsafe ( unsafePerformIO )
58 import GHC.IO.Encoding.UTF8 ( mkUTF8 )
59 import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) )
60
61 import GHC.Exts
62
63 import Foreign
64
65 -- -----------------------------------------------------------------------------
66 -- The StringBuffer type
67
68 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
69 -- The bytes are intended to be *immutable*. There are pure
70 -- operations to read the contents of a StringBuffer.
71 --
72 -- A StringBuffer may have a finalizer, depending on how it was
73 -- obtained.
74 --
75 data StringBuffer
76 = StringBuffer {
77 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
78 len :: {-# UNPACK #-} !Int, -- length
79 cur :: {-# UNPACK #-} !Int -- current pos
80 }
81 -- The buffer is assumed to be UTF-8 encoded, and furthermore
82 -- we add three '\0' bytes to the end as sentinels so that the
83 -- decoder doesn't have to check for overflow at every single byte
84 -- of a multibyte sequence.
85
86 instance Show StringBuffer where
87 showsPrec _ s = showString "<stringbuffer("
88 . shows (len s) . showString "," . shows (cur s)
89 . showString ")>"
90
91 -- -----------------------------------------------------------------------------
92 -- Creation / Destruction
93
94 -- | Read a file into a 'StringBuffer'. The resulting buffer is automatically
95 -- managed by the garbage collector.
96 hGetStringBuffer :: FilePath -> IO StringBuffer
97 hGetStringBuffer fname = do
98 h <- openBinaryFile fname ReadMode
99 size_i <- hFileSize h
100 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
101 let size = fromIntegral $ size_i - offset_i
102 buf <- mallocForeignPtrArray (size+3)
103 withForeignPtr buf $ \ptr -> do
104 r <- if size == 0 then return 0 else hGetBuf h ptr size
105 hClose h
106 if (r /= size)
107 then ioError (userError "short read of file")
108 else newUTF8StringBuffer buf ptr size
109
110 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
111 hGetStringBufferBlock handle wanted
112 = do size_i <- hFileSize handle
113 offset_i <- hTell handle >>= skipBOM handle size_i
114 let size = min wanted (fromIntegral $ size_i-offset_i)
115 buf <- mallocForeignPtrArray (size+3)
116 withForeignPtr buf $ \ptr ->
117 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
118 if r /= size
119 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
120 else newUTF8StringBuffer buf ptr size
121
122 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
123 -- and return the new position of the handle in bytes.
124 --
125 -- This is better than treating #FEFF as whitespace,
126 -- because that would mess up layout. We don't have a concept
127 -- of zero-width whitespace in Haskell: all whitespace codepoints
128 -- have a width of one column.
129 skipBOM :: Handle -> Integer -> Integer -> IO Integer
130 skipBOM h size offset =
131 -- Only skip BOM at the beginning of a file.
132 if size > 0 && offset == 0
133 then do
134 -- Validate assumption that handle is in binary mode.
135 ASSERTM( hGetEncoding h >>= return . isNothing )
136 -- Temporarily select utf8 encoding with error ignoring,
137 -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
138 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
139 c <- hLookAhead h
140 if c == '\xfeff'
141 then hGetChar h >> hTell h
142 else return offset
143 else return offset
144 where
145 safeEncoding = mkUTF8 IgnoreCodingFailure
146
147 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
148 newUTF8StringBuffer buf ptr size = do
149 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
150 -- sentinels for UTF-8 decoding
151 return $ StringBuffer buf size 0
152
153 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
154 appendStringBuffers sb1 sb2
155 = do newBuf <- mallocForeignPtrArray (size+3)
156 withForeignPtr newBuf $ \ptr ->
157 withForeignPtr (buf sb1) $ \sb1Ptr ->
158 withForeignPtr (buf sb2) $ \sb2Ptr ->
159 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
160 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
161 pokeArray (ptr `advancePtr` size) [0,0,0]
162 return (StringBuffer newBuf size 0)
163 where sb1_len = calcLen sb1
164 sb2_len = calcLen sb2
165 calcLen sb = len sb - cur sb
166 size = sb1_len + sb2_len
167
168 -- | Encode a 'String' into a 'StringBuffer' as UTF-8. The resulting buffer
169 -- is automatically managed by the garbage collector.
170 stringToStringBuffer :: String -> StringBuffer
171 stringToStringBuffer str =
172 unsafePerformIO $ do
173 let size = utf8EncodedLength str
174 buf <- mallocForeignPtrArray (size+3)
175 withForeignPtr buf $ \ptr -> do
176 utf8EncodeString ptr str
177 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
178 -- sentinels for UTF-8 decoding
179 return (StringBuffer buf size 0)
180
181 -- -----------------------------------------------------------------------------
182 -- Grab a character
183
184 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
185 -- the remaining portion (analogous to 'Data.List.uncons'). __Warning:__ The
186 -- behavior is undefined if the 'StringBuffer' is empty. The result shares
187 -- the same buffer as the original. Similar to 'utf8DecodeChar', if the
188 -- character cannot be decoded as UTF-8, '\0' is returned.
189 {-# INLINE nextChar #-}
190 nextChar :: StringBuffer -> (Char,StringBuffer)
191 nextChar (StringBuffer buf len (I# cur#)) =
192 -- Getting our fingers dirty a little here, but this is performance-critical
193 inlinePerformIO $ do
194 withForeignPtr buf $ \(Ptr a#) -> do
195 case utf8DecodeChar# (a# `plusAddr#` cur#) of
196 (# c#, nBytes# #) ->
197 let cur' = I# (cur# +# nBytes#) in
198 return (C# c#, StringBuffer buf len cur')
199
200 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
201 -- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
202 -- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character
203 -- cannot be decoded as UTF-8, '\0' is returned.
204 currentChar :: StringBuffer -> Char
205 currentChar = fst . nextChar
206
207 prevChar :: StringBuffer -> Char -> Char
208 prevChar (StringBuffer _ _ 0) deflt = deflt
209 prevChar (StringBuffer buf _ cur) _ =
210 inlinePerformIO $ do
211 withForeignPtr buf $ \p -> do
212 p' <- utf8PrevChar (p `plusPtr` cur)
213 return (fst (utf8DecodeChar p'))
214
215 -- -----------------------------------------------------------------------------
216 -- Moving
217
218 -- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
219 -- to 'Data.List.tail'). __Warning:__ The behavior is undefined if the
220 -- 'StringBuffer' is empty. The result shares the same buffer as the
221 -- original.
222 stepOn :: StringBuffer -> StringBuffer
223 stepOn s = snd (nextChar s)
224
225 -- | Return a 'StringBuffer' with the first @n@ bytes removed. __Warning:__
226 -- If there aren't enough characters, the returned 'StringBuffer' will be
227 -- invalid and any use of it may lead to undefined behavior. The result
228 -- shares the same buffer as the original.
229 offsetBytes :: Int -- ^ @n@, the number of bytes
230 -> StringBuffer
231 -> StringBuffer
232 offsetBytes i s = s { cur = cur s + i }
233
234 -- | Compute the difference in offset between two 'StringBuffer's that share
235 -- the same buffer. __Warning:__ The behavior is undefined if the
236 -- 'StringBuffer's use separate buffers.
237 byteDiff :: StringBuffer -> StringBuffer -> Int
238 byteDiff s1 s2 = cur s2 - cur s1
239
240 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
241 atEnd :: StringBuffer -> Bool
242 atEnd (StringBuffer _ l c) = l == c
243
244 -- | Computes a 'StringBuffer' which points to the first character of the
245 -- wanted line. Lines begin at 1.
246 atLine :: Int -> StringBuffer -> Maybe StringBuffer
247 atLine line sb@(StringBuffer buf len _) =
248 inlinePerformIO $
249 withForeignPtr buf $ \p -> do
250 p' <- skipToLine line len p
251 if p' == nullPtr
252 then return Nothing
253 else
254 let
255 delta = p' `minusPtr` p
256 in return $ Just (sb { cur = delta
257 , len = len - delta
258 })
259
260 skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
261 skipToLine !line !len !op0 = go 1 op0
262 where
263 !opend = op0 `plusPtr` len
264
265 go !i_line !op
266 | op >= opend = pure nullPtr
267 | i_line == line = pure op
268 | otherwise = do
269 w <- peek op :: IO Word8
270 case w of
271 10 -> go (i_line + 1) (plusPtr op 1)
272 13 -> do
273 -- this is safe because a 'StringBuffer' is
274 -- guaranteed to have 3 bytes sentinel values.
275 w' <- peek (plusPtr op 1) :: IO Word8
276 case w' of
277 10 -> go (i_line + 1) (plusPtr op 2)
278 _ -> go (i_line + 1) (plusPtr op 1)
279 _ -> go i_line (plusPtr op 1)
280
281 -- -----------------------------------------------------------------------------
282 -- Conversion
283
284 -- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
285 -- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
286 -- they will be replaced with '\0'.
287 lexemeToString :: StringBuffer
288 -> Int -- ^ @n@, the number of bytes
289 -> String
290 lexemeToString _ 0 = ""
291 lexemeToString (StringBuffer buf _ cur) bytes =
292 utf8DecodeStringLazy buf cur bytes
293
294 lexemeToFastString :: StringBuffer
295 -> Int -- ^ @n@, the number of bytes
296 -> FastString
297 lexemeToFastString _ 0 = nilFS
298 lexemeToFastString (StringBuffer buf _ cur) len =
299 inlinePerformIO $
300 withForeignPtr buf $ \ptr ->
301 return $! mkFastStringBytes (ptr `plusPtr` cur) len
302
303 -- | Return the previous @n@ characters (or fewer if we are less than @n@
304 -- characters into the buffer.
305 decodePrevNChars :: Int -> StringBuffer -> String
306 decodePrevNChars n (StringBuffer buf _ cur) =
307 inlinePerformIO $ withForeignPtr buf $ \p0 ->
308 go p0 n "" (p0 `plusPtr` (cur - 1))
309 where
310 go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
311 go buf0 n acc p | n == 0 || buf0 >= p = return acc
312 go buf0 n acc p = do
313 p' <- utf8PrevChar p
314 let (c,_) = utf8DecodeChar p'
315 go buf0 (n - 1) (c:acc) p'
316
317 -- -----------------------------------------------------------------------------
318 -- Parsing integer strings in various bases
319 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
320 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
321 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
322 go i x | i == len = x
323 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
324 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
325 in go 0 0