Properly escape character literals in Haddocks
[ghc.git] / compiler / utils / StringBuffer.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-}
10 {-# OPTIONS_GHC -O2 #-}
11 -- We always optimise this, otherwise performance of a non-optimised
12 -- compiler is severely affected
13
14 module StringBuffer
15 (
16 StringBuffer(..),
17 -- non-abstract for vs\/HaskellService
18
19 -- * Creation\/destruction
20 hGetStringBuffer,
21 hGetStringBufferBlock,
22 appendStringBuffers,
23 stringToStringBuffer,
24
25 -- * Inspection
26 nextChar,
27 currentChar,
28 prevChar,
29 atEnd,
30
31 -- * Moving and comparison
32 stepOn,
33 offsetBytes,
34 byteDiff,
35 atLine,
36
37 -- * Conversion
38 lexemeToString,
39 lexemeToFastString,
40 decodePrevNChars,
41
42 -- * Parsing integers
43 parseUnsignedInteger,
44 ) where
45
46 #include "HsVersions.h"
47
48 import GhcPrelude
49
50 import Encoding
51 import FastString
52 import FastFunctions
53 import Outputable
54 import Util
55
56 import Data.Maybe
57 import Control.Exception
58 import System.IO
59 import System.IO.Unsafe ( unsafePerformIO )
60 import GHC.IO.Encoding.UTF8 ( mkUTF8 )
61 import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) )
62
63 import GHC.Exts
64
65 import Foreign
66
67 -- -----------------------------------------------------------------------------
68 -- The StringBuffer type
69
70 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
71 -- The bytes are intended to be *immutable*. There are pure
72 -- operations to read the contents of a StringBuffer.
73 --
74 -- A StringBuffer may have a finalizer, depending on how it was
75 -- obtained.
76 --
77 data StringBuffer
78 = StringBuffer {
79 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
80 len :: {-# UNPACK #-} !Int, -- length
81 cur :: {-# UNPACK #-} !Int -- current pos
82 }
83 -- The buffer is assumed to be UTF-8 encoded, and furthermore
84 -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
85 -- decoder doesn't have to check for overflow at every single byte
86 -- of a multibyte sequence.
87
88 instance Show StringBuffer where
89 showsPrec _ s = showString "<stringbuffer("
90 . shows (len s) . showString "," . shows (cur s)
91 . showString ")>"
92
93 -- -----------------------------------------------------------------------------
94 -- Creation / Destruction
95
96 -- | Read a file into a 'StringBuffer'. The resulting buffer is automatically
97 -- managed by the garbage collector.
98 hGetStringBuffer :: FilePath -> IO StringBuffer
99 hGetStringBuffer fname = do
100 h <- openBinaryFile fname ReadMode
101 size_i <- hFileSize h
102 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
103 let size = fromIntegral $ size_i - offset_i
104 buf <- mallocForeignPtrArray (size+3)
105 withForeignPtr buf $ \ptr -> do
106 r <- if size == 0 then return 0 else hGetBuf h ptr size
107 hClose h
108 if (r /= size)
109 then ioError (userError "short read of file")
110 else newUTF8StringBuffer buf ptr size
111
112 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
113 hGetStringBufferBlock handle wanted
114 = do size_i <- hFileSize handle
115 offset_i <- hTell handle >>= skipBOM handle size_i
116 let size = min wanted (fromIntegral $ size_i-offset_i)
117 buf <- mallocForeignPtrArray (size+3)
118 withForeignPtr buf $ \ptr ->
119 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
120 if r /= size
121 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
122 else newUTF8StringBuffer buf ptr size
123
124 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
125 -- and return the new position of the handle in bytes.
126 --
127 -- This is better than treating #FEFF as whitespace,
128 -- because that would mess up layout. We don't have a concept
129 -- of zero-width whitespace in Haskell: all whitespace codepoints
130 -- have a width of one column.
131 skipBOM :: Handle -> Integer -> Integer -> IO Integer
132 skipBOM h size offset =
133 -- Only skip BOM at the beginning of a file.
134 if size > 0 && offset == 0
135 then do
136 -- Validate assumption that handle is in binary mode.
137 ASSERTM( hGetEncoding h >>= return . isNothing )
138 -- Temporarily select utf8 encoding with error ignoring,
139 -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
140 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
141 c <- hLookAhead h
142 if c == '\xfeff'
143 then hGetChar h >> hTell h
144 else return offset
145 else return offset
146 where
147 safeEncoding = mkUTF8 IgnoreCodingFailure
148
149 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
150 newUTF8StringBuffer buf ptr size = do
151 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
152 -- sentinels for UTF-8 decoding
153 return $ StringBuffer buf size 0
154
155 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
156 appendStringBuffers sb1 sb2
157 = do newBuf <- mallocForeignPtrArray (size+3)
158 withForeignPtr newBuf $ \ptr ->
159 withForeignPtr (buf sb1) $ \sb1Ptr ->
160 withForeignPtr (buf sb2) $ \sb2Ptr ->
161 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
162 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
163 pokeArray (ptr `advancePtr` size) [0,0,0]
164 return (StringBuffer newBuf size 0)
165 where sb1_len = calcLen sb1
166 sb2_len = calcLen sb2
167 calcLen sb = len sb - cur sb
168 size = sb1_len + sb2_len
169
170 -- | Encode a 'String' into a 'StringBuffer' as UTF-8. The resulting buffer
171 -- is automatically managed by the garbage collector.
172 stringToStringBuffer :: String -> StringBuffer
173 stringToStringBuffer str =
174 unsafePerformIO $ do
175 let size = utf8EncodedLength str
176 buf <- mallocForeignPtrArray (size+3)
177 withForeignPtr buf $ \ptr -> do
178 utf8EncodeString ptr str
179 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
180 -- sentinels for UTF-8 decoding
181 return (StringBuffer buf size 0)
182
183 -- -----------------------------------------------------------------------------
184 -- Grab a character
185
186 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
187 -- the remaining portion (analogous to 'Data.List.uncons'). __Warning:__ The
188 -- behavior is undefined if the 'StringBuffer' is empty. The result shares
189 -- the same buffer as the original. Similar to 'utf8DecodeChar', if the
190 -- character cannot be decoded as UTF-8, @\'\\0\'@ is returned.
191 {-# INLINE nextChar #-}
192 nextChar :: StringBuffer -> (Char,StringBuffer)
193 nextChar (StringBuffer buf len (I# cur#)) =
194 -- Getting our fingers dirty a little here, but this is performance-critical
195 inlinePerformIO $ do
196 withForeignPtr buf $ \(Ptr a#) -> do
197 case utf8DecodeChar# (a# `plusAddr#` cur#) of
198 (# c#, nBytes# #) ->
199 let cur' = I# (cur# +# nBytes#) in
200 return (C# c#, StringBuffer buf len cur')
201
202 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
203 -- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
204 -- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character
205 -- cannot be decoded as UTF-8, @\'\\0\'@ is returned.
206 currentChar :: StringBuffer -> Char
207 currentChar = fst . nextChar
208
209 prevChar :: StringBuffer -> Char -> Char
210 prevChar (StringBuffer _ _ 0) deflt = deflt
211 prevChar (StringBuffer buf _ cur) _ =
212 inlinePerformIO $ do
213 withForeignPtr buf $ \p -> do
214 p' <- utf8PrevChar (p `plusPtr` cur)
215 return (fst (utf8DecodeChar p'))
216
217 -- -----------------------------------------------------------------------------
218 -- Moving
219
220 -- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
221 -- to 'Data.List.tail'). __Warning:__ The behavior is undefined if the
222 -- 'StringBuffer' is empty. The result shares the same buffer as the
223 -- original.
224 stepOn :: StringBuffer -> StringBuffer
225 stepOn s = snd (nextChar s)
226
227 -- | Return a 'StringBuffer' with the first @n@ bytes removed. __Warning:__
228 -- If there aren't enough characters, the returned 'StringBuffer' will be
229 -- invalid and any use of it may lead to undefined behavior. The result
230 -- shares the same buffer as the original.
231 offsetBytes :: Int -- ^ @n@, the number of bytes
232 -> StringBuffer
233 -> StringBuffer
234 offsetBytes i s = s { cur = cur s + i }
235
236 -- | Compute the difference in offset between two 'StringBuffer's that share
237 -- the same buffer. __Warning:__ The behavior is undefined if the
238 -- 'StringBuffer's use separate buffers.
239 byteDiff :: StringBuffer -> StringBuffer -> Int
240 byteDiff s1 s2 = cur s2 - cur s1
241
242 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
243 atEnd :: StringBuffer -> Bool
244 atEnd (StringBuffer _ l c) = l == c
245
246 -- | Computes a 'StringBuffer' which points to the first character of the
247 -- wanted line. Lines begin at 1.
248 atLine :: Int -> StringBuffer -> Maybe StringBuffer
249 atLine line sb@(StringBuffer buf len _) =
250 inlinePerformIO $
251 withForeignPtr buf $ \p -> do
252 p' <- skipToLine line len p
253 if p' == nullPtr
254 then return Nothing
255 else
256 let
257 delta = p' `minusPtr` p
258 in return $ Just (sb { cur = delta
259 , len = len - delta
260 })
261
262 skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
263 skipToLine !line !len !op0 = go 1 op0
264 where
265 !opend = op0 `plusPtr` len
266
267 go !i_line !op
268 | op >= opend = pure nullPtr
269 | i_line == line = pure op
270 | otherwise = do
271 w <- peek op :: IO Word8
272 case w of
273 10 -> go (i_line + 1) (plusPtr op 1)
274 13 -> do
275 -- this is safe because a 'StringBuffer' is
276 -- guaranteed to have 3 bytes sentinel values.
277 w' <- peek (plusPtr op 1) :: IO Word8
278 case w' of
279 10 -> go (i_line + 1) (plusPtr op 2)
280 _ -> go (i_line + 1) (plusPtr op 1)
281 _ -> go i_line (plusPtr op 1)
282
283 -- -----------------------------------------------------------------------------
284 -- Conversion
285
286 -- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
287 -- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
288 -- they will be replaced with @\'\\0\'@.
289 lexemeToString :: StringBuffer
290 -> Int -- ^ @n@, the number of bytes
291 -> String
292 lexemeToString _ 0 = ""
293 lexemeToString (StringBuffer buf _ cur) bytes =
294 utf8DecodeStringLazy buf cur bytes
295
296 lexemeToFastString :: StringBuffer
297 -> Int -- ^ @n@, the number of bytes
298 -> FastString
299 lexemeToFastString _ 0 = nilFS
300 lexemeToFastString (StringBuffer buf _ cur) len =
301 inlinePerformIO $
302 withForeignPtr buf $ \ptr ->
303 return $! mkFastStringBytes (ptr `plusPtr` cur) len
304
305 -- | Return the previous @n@ characters (or fewer if we are less than @n@
306 -- characters into the buffer.
307 decodePrevNChars :: Int -> StringBuffer -> String
308 decodePrevNChars n (StringBuffer buf _ cur) =
309 inlinePerformIO $ withForeignPtr buf $ \p0 ->
310 go p0 n "" (p0 `plusPtr` (cur - 1))
311 where
312 go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
313 go buf0 n acc p | n == 0 || buf0 >= p = return acc
314 go buf0 n acc p = do
315 p' <- utf8PrevChar p
316 let (c,_) = utf8DecodeChar p'
317 go buf0 (n - 1) (c:acc) p'
318
319 -- -----------------------------------------------------------------------------
320 -- Parsing integer strings in various bases
321 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
322 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
323 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
324 go i x | i == len = x
325 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
326 '_' -> go (i + 1) x -- skip "_" (#14473)
327 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
328 in go 0 0