fcc344554b278a54f40bccb813554a7c0fa57600
[ghc.git] / compiler / utils / StringBuffer.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE CPP, MagicHash, UnboxedTuples #-}
10 {-# OPTIONS_GHC -O #-}
11 -- We always optimise this, otherwise performance of a non-optimised
12 -- compiler is severely affected
13
14 module StringBuffer
15 (
16 StringBuffer(..),
17 -- non-abstract for vs\/HaskellService
18
19 -- * Creation\/destruction
20 hGetStringBuffer,
21 hGetStringBufferBlock,
22 appendStringBuffers,
23 stringToStringBuffer,
24
25 -- * Inspection
26 nextChar,
27 currentChar,
28 prevChar,
29 atEnd,
30
31 -- * Moving and comparison
32 stepOn,
33 offsetBytes,
34 byteDiff,
35
36 -- * Conversion
37 lexemeToString,
38 lexemeToFastString,
39 decodePrevNChars,
40
41 -- * Parsing integers
42 parseUnsignedInteger,
43 ) where
44
45 #include "HsVersions.h"
46
47 import Encoding
48 import FastString
49 import FastFunctions
50 import Outputable
51 import Util
52
53 import Data.Maybe
54 import Control.Exception
55 import System.IO
56 import System.IO.Unsafe ( unsafePerformIO )
57 import GHC.IO.Encoding.UTF8 ( mkUTF8 )
58 import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) )
59
60 import GHC.Exts
61
62 import Foreign
63
64 -- -----------------------------------------------------------------------------
65 -- The StringBuffer type
66
67 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
68 -- The bytes are intended to be *immutable*. There are pure
69 -- operations to read the contents of a StringBuffer.
70 --
71 -- A StringBuffer may have a finalizer, depending on how it was
72 -- obtained.
73 --
74 data StringBuffer
75 = StringBuffer {
76 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
77 len :: {-# UNPACK #-} !Int, -- length
78 cur :: {-# UNPACK #-} !Int -- current pos
79 }
80 -- The buffer is assumed to be UTF-8 encoded, and furthermore
81 -- we add three '\0' bytes to the end as sentinels so that the
82 -- decoder doesn't have to check for overflow at every single byte
83 -- of a multibyte sequence.
84
85 instance Show StringBuffer where
86 showsPrec _ s = showString "<stringbuffer("
87 . shows (len s) . showString "," . shows (cur s)
88 . showString ")>"
89
90 -- -----------------------------------------------------------------------------
91 -- Creation / Destruction
92
93 -- | Read a file into a 'StringBuffer'. The resulting buffer is automatically
94 -- managed by the garbage collector.
95 hGetStringBuffer :: FilePath -> IO StringBuffer
96 hGetStringBuffer fname = do
97 h <- openBinaryFile fname ReadMode
98 size_i <- hFileSize h
99 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
100 let size = fromIntegral $ size_i - offset_i
101 buf <- mallocForeignPtrArray (size+3)
102 withForeignPtr buf $ \ptr -> do
103 r <- if size == 0 then return 0 else hGetBuf h ptr size
104 hClose h
105 if (r /= size)
106 then ioError (userError "short read of file")
107 else newUTF8StringBuffer buf ptr size
108
109 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
110 hGetStringBufferBlock handle wanted
111 = do size_i <- hFileSize handle
112 offset_i <- hTell handle >>= skipBOM handle size_i
113 let size = min wanted (fromIntegral $ size_i-offset_i)
114 buf <- mallocForeignPtrArray (size+3)
115 withForeignPtr buf $ \ptr ->
116 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
117 if r /= size
118 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
119 else newUTF8StringBuffer buf ptr size
120
121 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
122 -- and return the new position of the handle in bytes.
123 --
124 -- This is better than treating #FEFF as whitespace,
125 -- because that would mess up layout. We don't have a concept
126 -- of zero-width whitespace in Haskell: all whitespace codepoints
127 -- have a width of one column.
128 skipBOM :: Handle -> Integer -> Integer -> IO Integer
129 skipBOM h size offset =
130 -- Only skip BOM at the beginning of a file.
131 if size > 0 && offset == 0
132 then do
133 -- Validate assumption that handle is in binary mode.
134 ASSERTM( hGetEncoding h >>= return . isNothing )
135 -- Temporarily select utf8 encoding with error ignoring,
136 -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
137 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
138 c <- hLookAhead h
139 if c == '\xfeff'
140 then hGetChar h >> hTell h
141 else return offset
142 else return offset
143 where
144 safeEncoding = mkUTF8 IgnoreCodingFailure
145
146 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
147 newUTF8StringBuffer buf ptr size = do
148 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
149 -- sentinels for UTF-8 decoding
150 return $ StringBuffer buf size 0
151
152 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
153 appendStringBuffers sb1 sb2
154 = do newBuf <- mallocForeignPtrArray (size+3)
155 withForeignPtr newBuf $ \ptr ->
156 withForeignPtr (buf sb1) $ \sb1Ptr ->
157 withForeignPtr (buf sb2) $ \sb2Ptr ->
158 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
159 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
160 pokeArray (ptr `advancePtr` size) [0,0,0]
161 return (StringBuffer newBuf size 0)
162 where sb1_len = calcLen sb1
163 sb2_len = calcLen sb2
164 calcLen sb = len sb - cur sb
165 size = sb1_len + sb2_len
166
167 -- | Encode a 'String' into a 'StringBuffer' as UTF-8. The resulting buffer
168 -- is automatically managed by the garbage collector.
169 stringToStringBuffer :: String -> StringBuffer
170 stringToStringBuffer str =
171 unsafePerformIO $ do
172 let size = utf8EncodedLength str
173 buf <- mallocForeignPtrArray (size+3)
174 withForeignPtr buf $ \ptr -> do
175 utf8EncodeString ptr str
176 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
177 -- sentinels for UTF-8 decoding
178 return (StringBuffer buf size 0)
179
180 -- -----------------------------------------------------------------------------
181 -- Grab a character
182
183 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
184 -- the remaining portion (analogous to 'Data.List.uncons'). __Warning:__ The
185 -- behavior is undefined if the 'StringBuffer' is empty. The result shares
186 -- the same buffer as the original. Similar to 'utf8DecodeChar', if the
187 -- character cannot be decoded as UTF-8, '\0' is returned.
188 {-# INLINE nextChar #-}
189 nextChar :: StringBuffer -> (Char,StringBuffer)
190 nextChar (StringBuffer buf len (I# cur#)) =
191 -- Getting our fingers dirty a little here, but this is performance-critical
192 inlinePerformIO $ do
193 withForeignPtr buf $ \(Ptr a#) -> do
194 case utf8DecodeChar# (a# `plusAddr#` cur#) of
195 (# c#, nBytes# #) ->
196 let cur' = I# (cur# +# nBytes#) in
197 return (C# c#, StringBuffer buf len cur')
198
199 -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
200 -- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
201 -- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character
202 -- cannot be decoded as UTF-8, '\0' is returned.
203 currentChar :: StringBuffer -> Char
204 currentChar = fst . nextChar
205
206 prevChar :: StringBuffer -> Char -> Char
207 prevChar (StringBuffer _ _ 0) deflt = deflt
208 prevChar (StringBuffer buf _ cur) _ =
209 inlinePerformIO $ do
210 withForeignPtr buf $ \p -> do
211 p' <- utf8PrevChar (p `plusPtr` cur)
212 return (fst (utf8DecodeChar p'))
213
214 -- -----------------------------------------------------------------------------
215 -- Moving
216
217 -- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
218 -- to 'Data.List.tail'). __Warning:__ The behavior is undefined if the
219 -- 'StringBuffer' is empty. The result shares the same buffer as the
220 -- original.
221 stepOn :: StringBuffer -> StringBuffer
222 stepOn s = snd (nextChar s)
223
224 -- | Return a 'StringBuffer' with the first @n@ bytes removed. __Warning:__
225 -- If there aren't enough characters, the returned 'StringBuffer' will be
226 -- invalid and any use of it may lead to undefined behavior. The result
227 -- shares the same buffer as the original.
228 offsetBytes :: Int -- ^ @n@, the number of bytes
229 -> StringBuffer
230 -> StringBuffer
231 offsetBytes i s = s { cur = cur s + i }
232
233 -- | Compute the difference in offset between two 'StringBuffer's that share
234 -- the same buffer. __Warning:__ The behavior is undefined if the
235 -- 'StringBuffer's use separate buffers.
236 byteDiff :: StringBuffer -> StringBuffer -> Int
237 byteDiff s1 s2 = cur s2 - cur s1
238
239 -- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
240 atEnd :: StringBuffer -> Bool
241 atEnd (StringBuffer _ l c) = l == c
242
243 -- -----------------------------------------------------------------------------
244 -- Conversion
245
246 -- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
247 -- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
248 -- they will be replaced with '\0'.
249 lexemeToString :: StringBuffer
250 -> Int -- ^ @n@, the number of bytes
251 -> String
252 lexemeToString _ 0 = ""
253 lexemeToString (StringBuffer buf _ cur) bytes =
254 utf8DecodeStringLazy buf cur bytes
255
256 lexemeToFastString :: StringBuffer
257 -> Int -- ^ @n@, the number of bytes
258 -> FastString
259 lexemeToFastString _ 0 = nilFS
260 lexemeToFastString (StringBuffer buf _ cur) len =
261 inlinePerformIO $
262 withForeignPtr buf $ \ptr ->
263 return $! mkFastStringBytes (ptr `plusPtr` cur) len
264
265 -- | Return the previous @n@ characters (or fewer if we are less than @n@
266 -- characters into the buffer.
267 decodePrevNChars :: Int -> StringBuffer -> String
268 decodePrevNChars n (StringBuffer buf _ cur) =
269 inlinePerformIO $ withForeignPtr buf $ \p0 ->
270 go p0 n "" (p0 `plusPtr` (cur - 1))
271 where
272 go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
273 go buf0 n acc p | n == 0 || buf0 >= p = return acc
274 go buf0 n acc p = do
275 p' <- utf8PrevChar p
276 let (c,_) = utf8DecodeChar p'
277 go buf0 (n - 1) (c:acc) p'
278
279 -- -----------------------------------------------------------------------------
280 -- Parsing integer strings in various bases
281 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
282 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
283 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
284 go i x | i == len = x
285 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
286 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
287 in go 0 0