Add `Data.Text.Lazy.Encoding.decodeLatin1` ISO-8859-1 decoding function
[packages/text.git] / Data / Text / Lazy / Encoding.hs
1 {-# LANGUAGE BangPatterns #-}
2 -- |
3 -- Module : Data.Text.Lazy.Encoding
4 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan
5 --
6 -- License : BSD-style
7 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
8 -- duncan@haskell.org
9 -- Stability : experimental
10 -- Portability : portable
11 --
12 -- Functions for converting lazy 'Text' values to and from lazy
13 -- 'ByteString', using several standard encodings.
14 --
15 -- To gain access to a much larger variety of encodings, use the
16 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
17
18 module Data.Text.Lazy.Encoding
19 (
20 -- * Decoding ByteStrings to Text
21 -- $strict
22 decodeASCII
23 , decodeLatin1
24 , decodeUtf8
25 , decodeUtf16LE
26 , decodeUtf16BE
27 , decodeUtf32LE
28 , decodeUtf32BE
29
30 -- ** Catchable failure
31 , decodeUtf8'
32
33 -- ** Controllable error handling
34 , decodeUtf8With
35 , decodeUtf16LEWith
36 , decodeUtf16BEWith
37 , decodeUtf32LEWith
38 , decodeUtf32BEWith
39
40 -- * Encoding Text to ByteStrings
41 , encodeUtf8
42 , encodeUtf16LE
43 , encodeUtf16BE
44 , encodeUtf32LE
45 , encodeUtf32BE
46 ) where
47
48 import Control.Exception (evaluate, try)
49 import Data.Bits ((.&.))
50 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
51 import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
52 import System.IO.Unsafe (unsafePerformIO)
53 import qualified Data.ByteString as S
54 import qualified Data.ByteString.Lazy as B
55 import qualified Data.ByteString.Lazy.Internal as B
56 import qualified Data.ByteString.Unsafe as S
57 import qualified Data.Text as T
58 import qualified Data.Text.Encoding as TE
59 import qualified Data.Text.Lazy.Encoding.Fusion as E
60 import qualified Data.Text.Lazy.Fusion as F
61
62 -- $strict
63 --
64 -- All of the single-parameter functions for decoding bytestrings
65 -- encoded in one of the Unicode Transformation Formats (UTF) operate
66 -- in a /strict/ mode: each will throw an exception if given invalid
67 -- input.
68 --
69 -- Each function has a variant, whose name is suffixed with -'With',
70 -- that gives greater control over the handling of decoding errors.
71 -- For instance, 'decodeUtf8' will throw an exception, but
72 -- 'decodeUtf8With' allows the programmer to determine what to do on a
73 -- decoding error.
74
-- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- This function is deprecated.  Use 'decodeUtf8' instead: that is the
-- function this one delegates to, and the one named by the
-- deprecation warning.
decodeASCII :: B.ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
82
-- | Decode a lazy 'ByteString' containing Latin-1 (aka ISO-8859-1)
-- encoded text.
--
-- Every strict chunk of the input is decoded independently with the
-- strict 'TE.decodeLatin1' and the results are reassembled, in order,
-- with 'chunk'.
decodeLatin1 :: B.ByteString -> Text
decodeLatin1 = go
  where
    go B.Empty        = empty
    go (B.Chunk c cs) = chunk (TE.decodeLatin1 c) (go cs)
86
-- | Decode a lazy 'ByteString' containing UTF-8 encoded text, passing
-- the given 'OnDecodeError' handler to the strict decoder and
-- consulting it directly for bytes left over at the end of the input.
--
-- The input is processed one strict chunk at a time.  A chunk whose
-- final bytes do not look like the end of a complete code point has
-- its tail held back; that tail is then extended byte by byte from
-- the following chunks until it does.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr bs0 = fast bs0
  where
    decodeStrict = TE.decodeUtf8With onErr

    -- Fast path: decode a whole chunk at once when it ends cleanly,
    -- otherwise decode the leading part and carry the tail into 'slow'.
    fast B.Empty = empty
    fast (B.Chunk c cs)
      | isComplete c = chunk (decodeStrict c) (fast cs)
      | otherwise    = chunk (decodeStrict whole) (slow partial cs)
      where
        (whole, partial) = S.splitAt pivot c
        n = S.length c
        -- True when the byte @k@ positions from the end has both of
        -- its top two bits set.
        leadAt k = n >= k && S.unsafeIndex c (n - k) .&. 0xc0 == 0xc0
        -- Split just before such a byte if it is the last or
        -- second-to-last one; otherwise three bytes from the end.
        pivot
          | leadAt 1  = n - 1
          | leadAt 2  = n - 2
          | otherwise = n - 3

    -- Slow path: grow the carried bytes one at a time until they end
    -- cleanly, then resume the fast path on what remains.
    slow pending rest = {-# SCC "decodeUtf8With'/slow" #-}
      case B.uncons rest of
        Just (w, rest')
          | isComplete pending' -> chunk (decodeStrict pending') (fast rest')
          | otherwise           -> slow pending' rest'
          where pending' = S.snoc pending w
        -- Input exhausted: report each carried byte to the handler,
        -- then report end of input once nothing is left.
        Nothing ->
          case S.uncons pending of
            Just (b, pending') -> recover (Just b) (slow pending' rest)
            Nothing            -> recover Nothing empty

    -- Ask the handler what to do about an error; if it supplies a
    -- character, emit it as its own chunk ahead of the continuation.
    recover mb k =
      case onErr desc mb of
        Nothing -> k
        Just ch -> Chunk (T.singleton ch) k

    -- Does the byte string end with a single-byte character, or with
    -- a 2-, 3- or 4-byte lead byte sitting exactly that far from the
    -- end?
    isComplete s = {-# SCC "decodeUtf8With'/isComplete" #-}
      byteFromEnd 1 .&. 0x80 == 0 ||
      (n >= 2 && byteFromEnd 2 .&. 0xe0 == 0xc0) ||
      (n >= 3 && byteFromEnd 3 .&. 0xf0 == 0xe0) ||
      (n >= 4 && byteFromEnd 4 .&. 0xf8 == 0xf0)
      where
        n = S.length s
        byteFromEnd k = S.unsafeIndex s (n - k)

    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
{-# INLINE[0] decodeUtf8With #-}
124
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- This is 'decodeUtf8With' applied to 'strictDecode': if the input
-- contains any invalid UTF-8 data, an exception will be thrown that
-- cannot be caught in pure code.  For more control over the handling
-- of invalid data, use 'decodeUtf8'' or 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}
135
136 -- This rule seems to cause performance loss.
137 {- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
138 forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
139
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' bs = unsafePerformIO (try (evaluate forced))
  where
    decoded = decodeUtf8 bs
    -- Walk the whole chunk spine before handing the text back, so
    -- that a decoding exception is raised inside 'try' rather than
    -- later, when the caller consumes the result.
    forced = walk decoded `seq` decoded
    walk Empty          = ()
    walk (Chunk _ rest) = walk rest
{-# INLINE decodeUtf8' #-}
156
-- | Encode text using UTF-8 encoding.
--
-- Each strict chunk of the input is encoded with the strict
-- 'TE.encodeUtf8' and becomes one chunk of the resulting lazy
-- 'ByteString'.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 = foldrChunks step B.Empty
  where
    step c rest = B.Chunk (TE.encodeUtf8 c) rest
160
-- | Decode text from little endian UTF-16 encoding, passing the given
-- 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf16LE onErr bs
{-# INLINE decodeUtf16LEWith #-}
165
-- | Decode text from little endian UTF-16 encoding.
--
-- This is 'decodeUtf16LEWith' applied to 'strictDecode': if the input
-- contains any invalid little endian UTF-16 data, an exception will
-- be thrown.  For more control over the handling of invalid data, use
-- 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
174
-- | Decode text from big endian UTF-16 encoding, passing the given
-- 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf16BE onErr bs
{-# INLINE decodeUtf16BEWith #-}
179
-- | Decode text from big endian UTF-16 encoding.
--
-- This is 'decodeUtf16BEWith' applied to 'strictDecode': if the input
-- contains any invalid big endian UTF-16 data, an exception will be
-- thrown.  For more control over the handling of invalid data, use
-- 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
188
-- | Encode text using little endian UTF-16 encoding.
--
-- Each strict chunk is encoded separately with the strict
-- 'TE.encodeUtf16LE'; the pieces are joined with 'B.fromChunks'.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt = B.fromChunks (foldrChunks step [] txt)
  where
    step c acc = TE.encodeUtf16LE c : acc
{-# INLINE encodeUtf16LE #-}
193
-- | Encode text using big endian UTF-16 encoding.
--
-- Each strict chunk is encoded separately with the strict
-- 'TE.encodeUtf16BE'; the pieces are joined with 'B.fromChunks'.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt = B.fromChunks (foldrChunks step [] txt)
  where
    step c acc = TE.encodeUtf16BE c : acc
{-# INLINE encodeUtf16BE #-}
198
-- | Decode text from little endian UTF-32 encoding, passing the given
-- 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf32LE onErr bs
{-# INLINE decodeUtf32LEWith #-}
203
-- | Decode text from little endian UTF-32 encoding.
--
-- This is 'decodeUtf32LEWith' applied to 'strictDecode': if the input
-- contains any invalid little endian UTF-32 data, an exception will
-- be thrown.  For more control over the handling of invalid data, use
-- 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
212
-- | Decode text from big endian UTF-32 encoding, passing the given
-- 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr bs = F.unstream decoded
  where
    decoded = E.streamUtf32BE onErr bs
{-# INLINE decodeUtf32BEWith #-}
217
-- | Decode text from big endian UTF-32 encoding.
--
-- This is 'decodeUtf32BEWith' applied to 'strictDecode': if the input
-- contains any invalid big endian UTF-32 data, an exception will be
-- thrown.  For more control over the handling of invalid data, use
-- 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
226
-- | Encode text using little endian UTF-32 encoding.
--
-- Each strict chunk is encoded separately with the strict
-- 'TE.encodeUtf32LE'; the pieces are joined with 'B.fromChunks'.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt = B.fromChunks (foldrChunks step [] txt)
  where
    step c acc = TE.encodeUtf32LE c : acc
{-# INLINE encodeUtf32LE #-}
231
-- | Encode text using big endian UTF-32 encoding.
--
-- Each strict chunk is encoded separately with the strict
-- 'TE.encodeUtf32BE'; the pieces are joined with 'B.fromChunks'.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt = B.fromChunks (foldrChunks step [] txt)
  where
    step c acc = TE.encodeUtf32BE c : acc
{-# INLINE encodeUtf32BE #-}