Package environments
[ghc.git] / compiler / utils / StringBuffer.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-}
10 {-# OPTIONS_GHC -O -funbox-strict-fields #-}
11 -- We always optimise this, otherwise performance of a non-optimised
12 -- compiler is severely affected
13
14 module StringBuffer
15 (
16 StringBuffer(..),
17 -- non-abstract for vs\/HaskellService
18
19 -- * Creation\/destruction
20 hGetStringBuffer,
21 hGetStringBufferBlock,
22 appendStringBuffers,
23 stringToStringBuffer,
24
25 -- * Inspection
26 nextChar,
27 currentChar,
28 prevChar,
29 atEnd,
30
31 -- * Moving and comparison
32 stepOn,
33 offsetBytes,
34 byteDiff,
35
36 -- * Conversion
37 lexemeToString,
38 lexemeToFastString,
39
40 -- * Parsing integers
41 parseUnsignedInteger,
42 ) where
43
44 #include "HsVersions.h"
45
46 import Encoding
47 import FastString
48 import FastTypes
49 import FastFunctions
50 import Outputable
51 import Util
52
53 import Data.Maybe
54 import Control.Exception
55 import System.IO
56 import System.IO.Unsafe ( unsafePerformIO )
57
58 import GHC.Exts
59
60 #if __GLASGOW_HASKELL__ >= 709
61 import Foreign
62 #else
63 import Foreign.Safe
64 #endif
65
66 -- -----------------------------------------------------------------------------
67 -- The StringBuffer type
68
69 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
70 -- The bytes are intended to be *immutable*. There are pure
71 -- operations to read the contents of a StringBuffer.
72 --
73 -- A StringBuffer may have a finalizer, depending on how it was
74 -- obtained.
75 --
76 data StringBuffer
77 = StringBuffer {
78 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
79 len :: {-# UNPACK #-} !Int, -- length
80 cur :: {-# UNPACK #-} !Int -- current pos
81 }
82 -- The buffer is assumed to be UTF-8 encoded, and furthermore
83 -- we add three '\0' bytes to the end as sentinels so that the
84 -- decoder doesn't have to check for overflow at every single byte
85 -- of a multibyte sequence.
86
87 instance Show StringBuffer where
88 showsPrec _ s = showString "<stringbuffer("
89 . shows (len s) . showString "," . shows (cur s)
90 . showString ")>"
91
92 -- -----------------------------------------------------------------------------
93 -- Creation / Destruction
94
95 hGetStringBuffer :: FilePath -> IO StringBuffer
96 hGetStringBuffer fname = do
97 h <- openBinaryFile fname ReadMode
98 size_i <- hFileSize h
99 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
100 let size = fromIntegral $ size_i - offset_i
101 buf <- mallocForeignPtrArray (size+3)
102 withForeignPtr buf $ \ptr -> do
103 r <- if size == 0 then return 0 else hGetBuf h ptr size
104 hClose h
105 if (r /= size)
106 then ioError (userError "short read of file")
107 else newUTF8StringBuffer buf ptr size
108
109 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
110 hGetStringBufferBlock handle wanted
111 = do size_i <- hFileSize handle
112 offset_i <- hTell handle >>= skipBOM handle size_i
113 let size = min wanted (fromIntegral $ size_i-offset_i)
114 buf <- mallocForeignPtrArray (size+3)
115 withForeignPtr buf $ \ptr ->
116 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
117 if r /= size
118 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
119 else newUTF8StringBuffer buf ptr size
120
121 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
122 -- and return the new position of the handle in bytes.
123 --
124 -- This is better than treating #FEFF as whitespace,
125 -- because that would mess up layout. We don't have a concept
126 -- of zero-width whitespace in Haskell: all whitespace codepoints
127 -- have a width of one column.
128 skipBOM :: Handle -> Integer -> Integer -> IO Integer
129 skipBOM h size offset =
130 -- Only skip BOM at the beginning of a file.
131 if size > 0 && offset == 0
132 then do
133 -- Validate assumption that handle is in binary mode.
134 ASSERTM( hGetEncoding h >>= return . isNothing )
135 -- Temporarily select text mode to make `hLookAhead` and
136 -- `hGetChar` return full Unicode characters.
137 bracket_ (hSetBinaryMode h False) (hSetBinaryMode h True) $ do
138 c <- hLookAhead h
139 if c == '\xfeff'
140 then hGetChar h >> hTell h
141 else return offset
142 else return offset
143
144 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
145 newUTF8StringBuffer buf ptr size = do
146 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
147 -- sentinels for UTF-8 decoding
148 return $ StringBuffer buf size 0
149
150 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
151 appendStringBuffers sb1 sb2
152 = do newBuf <- mallocForeignPtrArray (size+3)
153 withForeignPtr newBuf $ \ptr ->
154 withForeignPtr (buf sb1) $ \sb1Ptr ->
155 withForeignPtr (buf sb2) $ \sb2Ptr ->
156 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
157 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
158 pokeArray (ptr `advancePtr` size) [0,0,0]
159 return (StringBuffer newBuf size 0)
160 where sb1_len = calcLen sb1
161 sb2_len = calcLen sb2
162 calcLen sb = len sb - cur sb
163 size = sb1_len + sb2_len
164
165 stringToStringBuffer :: String -> StringBuffer
166 stringToStringBuffer str =
167 unsafePerformIO $ do
168 let size = utf8EncodedLength str
169 buf <- mallocForeignPtrArray (size+3)
170 withForeignPtr buf $ \ptr -> do
171 utf8EncodeString ptr str
172 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
173 -- sentinels for UTF-8 decoding
174 return (StringBuffer buf size 0)
175
176 -- -----------------------------------------------------------------------------
177 -- Grab a character
178
179 -- Getting our fingers dirty a little here, but this is performance-critical
180 {-# INLINE nextChar #-}
181 nextChar :: StringBuffer -> (Char,StringBuffer)
182 nextChar (StringBuffer buf len (I# cur#)) =
183 inlinePerformIO $ do
184 withForeignPtr buf $ \(Ptr a#) -> do
185 case utf8DecodeChar# (a# `plusAddr#` cur#) of
186 (# c#, nBytes# #) ->
187 let cur' = I# (cur# +# nBytes#) in
188 return (C# c#, StringBuffer buf len cur')
189
190 currentChar :: StringBuffer -> Char
191 currentChar = fst . nextChar
192
193 prevChar :: StringBuffer -> Char -> Char
194 prevChar (StringBuffer _ _ 0) deflt = deflt
195 prevChar (StringBuffer buf _ cur) _ =
196 inlinePerformIO $ do
197 withForeignPtr buf $ \p -> do
198 p' <- utf8PrevChar (p `plusPtr` cur)
199 return (fst (utf8DecodeChar p'))
200
201 -- -----------------------------------------------------------------------------
202 -- Moving
203
204 stepOn :: StringBuffer -> StringBuffer
205 stepOn s = snd (nextChar s)
206
207 offsetBytes :: Int -> StringBuffer -> StringBuffer
208 offsetBytes i s = s { cur = cur s + i }
209
210 byteDiff :: StringBuffer -> StringBuffer -> Int
211 byteDiff s1 s2 = cur s2 - cur s1
212
213 atEnd :: StringBuffer -> Bool
214 atEnd (StringBuffer _ l c) = l == c
215
216 -- -----------------------------------------------------------------------------
217 -- Conversion
218
219 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
220 lexemeToString _ 0 = ""
221 lexemeToString (StringBuffer buf _ cur) bytes =
222 inlinePerformIO $
223 withForeignPtr buf $ \ptr ->
224 utf8DecodeString (ptr `plusPtr` cur) bytes
225
226 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
227 lexemeToFastString _ 0 = nilFS
228 lexemeToFastString (StringBuffer buf _ cur) len =
229 inlinePerformIO $
230 withForeignPtr buf $ \ptr ->
231 return $! mkFastStringBytes (ptr `plusPtr` cur) len
232
233 -- -----------------------------------------------------------------------------
234 -- Parsing integer strings in various bases
235 {-
236 byteOff :: StringBuffer -> Int -> Char
237 byteOff (StringBuffer buf _ cur) i =
238 inlinePerformIO $ withForeignPtr buf $ \ptr -> do
239 -- return $! cBox (indexWord8OffFastPtrAsFastChar
240 -- (pUnbox ptr) (iUnbox (cur+i)))
241 --or
242 -- w <- peek (ptr `plusPtr` (cur+i))
243 -- return (unsafeChr (fromIntegral (w::Word8)))
244 -}
245 -- | XXX assumes ASCII digits only (by using byteOff)
246 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
247 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
248 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
249 --LOL, in implementations where the indexing needs slow unsafePerformIO,
250 --this is less (not more) efficient than using the IO monad explicitly
251 --here.
252 !ptr' = pUnbox ptr
253 byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
254 go i x | i == len = x
255 | otherwise = case byteOff i of
256 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
257 in go 0 0