Skip a possible BOM in utf8 encoding
[ghc.git] / compiler / utils / StringBuffer.hs
1 {-
2 (c) The University of Glasgow 2006
3 (c) The University of Glasgow, 1997-2006
4
5
6 Buffers for scanning string input stored in external arrays.
7 -}
8
9 {-# LANGUAGE CPP, MagicHash, UnboxedTuples #-}
10 {-# OPTIONS_GHC -O #-}
11 -- We always optimise this, otherwise performance of a non-optimised
12 -- compiler is severely affected
13
14 module StringBuffer
15 (
16 StringBuffer(..),
17 -- non-abstract for vs\/HaskellService
18
19 -- * Creation\/destruction
20 hGetStringBuffer,
21 hGetStringBufferBlock,
22 appendStringBuffers,
23 stringToStringBuffer,
24
25 -- * Inspection
26 nextChar,
27 currentChar,
28 prevChar,
29 atEnd,
30
31 -- * Moving and comparison
32 stepOn,
33 offsetBytes,
34 byteDiff,
35
36 -- * Conversion
37 lexemeToString,
38 lexemeToFastString,
39
40 -- * Parsing integers
41 parseUnsignedInteger,
42 ) where
43
44 #include "HsVersions.h"
45
46 import Encoding
47 import FastString
48 import FastFunctions
49 import Outputable
50 import Util
51
52 import Data.Maybe
53 import Control.Exception
54 import System.IO
55 import System.IO.Unsafe ( unsafePerformIO )
56 import GHC.IO.Encoding.UTF8 ( mkUTF8 )
57 import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) )
58
59 import GHC.Exts
60
61 #if __GLASGOW_HASKELL__ >= 709
62 import Foreign
63 #else
64 import Foreign.Safe
65 #endif
66
67 -- -----------------------------------------------------------------------------
68 -- The StringBuffer type
69
70 -- |A StringBuffer is an internal pointer to a sized chunk of bytes.
71 -- The bytes are intended to be *immutable*. There are pure
72 -- operations to read the contents of a StringBuffer.
73 --
74 -- A StringBuffer may have a finalizer, depending on how it was
75 -- obtained.
76 --
77 data StringBuffer
78 = StringBuffer {
79 buf :: {-# UNPACK #-} !(ForeignPtr Word8),
80 len :: {-# UNPACK #-} !Int, -- length
81 cur :: {-# UNPACK #-} !Int -- current pos
82 }
83 -- The buffer is assumed to be UTF-8 encoded, and furthermore
84 -- we add three '\0' bytes to the end as sentinels so that the
85 -- decoder doesn't have to check for overflow at every single byte
86 -- of a multibyte sequence.
87
88 instance Show StringBuffer where
89 showsPrec _ s = showString "<stringbuffer("
90 . shows (len s) . showString "," . shows (cur s)
91 . showString ")>"
92
93 -- -----------------------------------------------------------------------------
94 -- Creation / Destruction
95
96 hGetStringBuffer :: FilePath -> IO StringBuffer
97 hGetStringBuffer fname = do
98 h <- openBinaryFile fname ReadMode
99 size_i <- hFileSize h
100 offset_i <- skipBOM h size_i 0 -- offset is 0 initially
101 let size = fromIntegral $ size_i - offset_i
102 buf <- mallocForeignPtrArray (size+3)
103 withForeignPtr buf $ \ptr -> do
104 r <- if size == 0 then return 0 else hGetBuf h ptr size
105 hClose h
106 if (r /= size)
107 then ioError (userError "short read of file")
108 else newUTF8StringBuffer buf ptr size
109
110 hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
111 hGetStringBufferBlock handle wanted
112 = do size_i <- hFileSize handle
113 offset_i <- hTell handle >>= skipBOM handle size_i
114 let size = min wanted (fromIntegral $ size_i-offset_i)
115 buf <- mallocForeignPtrArray (size+3)
116 withForeignPtr buf $ \ptr ->
117 do r <- if size == 0 then return 0 else hGetBuf handle ptr size
118 if r /= size
119 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
120 else newUTF8StringBuffer buf ptr size
121
122 -- | Skip the byte-order mark if there is one (see #1744 and #6016),
123 -- and return the new position of the handle in bytes.
124 --
125 -- This is better than treating #FEFF as whitespace,
126 -- because that would mess up layout. We don't have a concept
127 -- of zero-width whitespace in Haskell: all whitespace codepoints
128 -- have a width of one column.
129 skipBOM :: Handle -> Integer -> Integer -> IO Integer
130 skipBOM h size offset =
131 -- Only skip BOM at the beginning of a file.
132 if size > 0 && offset == 0
133 then do
134 -- Validate assumption that handle is in binary mode.
135 ASSERTM( hGetEncoding h >>= return . isNothing )
136 -- Temporarily select utf8 encoding with error ignoring,
137 -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
138 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
139 c <- hLookAhead h
140 if c == '\xfeff'
141 then hGetChar h >> hTell h
142 else return offset
143 else return offset
144 where
145 safeEncoding = mkUTF8 IgnoreCodingFailure
146
147 newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
148 newUTF8StringBuffer buf ptr size = do
149 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
150 -- sentinels for UTF-8 decoding
151 return $ StringBuffer buf size 0
152
153 appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
154 appendStringBuffers sb1 sb2
155 = do newBuf <- mallocForeignPtrArray (size+3)
156 withForeignPtr newBuf $ \ptr ->
157 withForeignPtr (buf sb1) $ \sb1Ptr ->
158 withForeignPtr (buf sb2) $ \sb2Ptr ->
159 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
160 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
161 pokeArray (ptr `advancePtr` size) [0,0,0]
162 return (StringBuffer newBuf size 0)
163 where sb1_len = calcLen sb1
164 sb2_len = calcLen sb2
165 calcLen sb = len sb - cur sb
166 size = sb1_len + sb2_len
167
168 stringToStringBuffer :: String -> StringBuffer
169 stringToStringBuffer str =
170 unsafePerformIO $ do
171 let size = utf8EncodedLength str
172 buf <- mallocForeignPtrArray (size+3)
173 withForeignPtr buf $ \ptr -> do
174 utf8EncodeString ptr str
175 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
176 -- sentinels for UTF-8 decoding
177 return (StringBuffer buf size 0)
178
179 -- -----------------------------------------------------------------------------
180 -- Grab a character
181
182 -- Getting our fingers dirty a little here, but this is performance-critical
183 {-# INLINE nextChar #-}
184 nextChar :: StringBuffer -> (Char,StringBuffer)
185 nextChar (StringBuffer buf len (I# cur#)) =
186 inlinePerformIO $ do
187 withForeignPtr buf $ \(Ptr a#) -> do
188 case utf8DecodeChar# (a# `plusAddr#` cur#) of
189 (# c#, nBytes# #) ->
190 let cur' = I# (cur# +# nBytes#) in
191 return (C# c#, StringBuffer buf len cur')
192
193 currentChar :: StringBuffer -> Char
194 currentChar = fst . nextChar
195
196 prevChar :: StringBuffer -> Char -> Char
197 prevChar (StringBuffer _ _ 0) deflt = deflt
198 prevChar (StringBuffer buf _ cur) _ =
199 inlinePerformIO $ do
200 withForeignPtr buf $ \p -> do
201 p' <- utf8PrevChar (p `plusPtr` cur)
202 return (fst (utf8DecodeChar p'))
203
204 -- -----------------------------------------------------------------------------
205 -- Moving
206
207 stepOn :: StringBuffer -> StringBuffer
208 stepOn s = snd (nextChar s)
209
210 offsetBytes :: Int -> StringBuffer -> StringBuffer
211 offsetBytes i s = s { cur = cur s + i }
212
213 byteDiff :: StringBuffer -> StringBuffer -> Int
214 byteDiff s1 s2 = cur s2 - cur s1
215
216 atEnd :: StringBuffer -> Bool
217 atEnd (StringBuffer _ l c) = l == c
218
219 -- -----------------------------------------------------------------------------
220 -- Conversion
221
222 lexemeToString :: StringBuffer -> Int {-bytes-} -> String
223 lexemeToString _ 0 = ""
224 lexemeToString (StringBuffer buf _ cur) bytes =
225 inlinePerformIO $
226 withForeignPtr buf $ \ptr ->
227 utf8DecodeString (ptr `plusPtr` cur) bytes
228
229 lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
230 lexemeToFastString _ 0 = nilFS
231 lexemeToFastString (StringBuffer buf _ cur) len =
232 inlinePerformIO $
233 withForeignPtr buf $ \ptr ->
234 return $! mkFastStringBytes (ptr `plusPtr` cur) len
235
236 -- -----------------------------------------------------------------------------
237 -- Parsing integer strings in various bases
238 parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
239 parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
240 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
241 go i x | i == len = x
242 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
243 char -> go (i + 1) (x * radix + toInteger (char_to_int char))
244 in go 0 0