Many small documentation improvements.
[packages/text.git] / Data / Text / Lazy / Encoding.hs
1 {-# LANGUAGE BangPatterns #-}
2 -- |
3 -- Module : Data.Text.Lazy.Encoding
4 -- Copyright : (c) 2009, 2010 Bryan O'Sullivan
5 --
6 -- License : BSD-style
7 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
8 -- duncan@haskell.org
9 -- Stability : experimental
10 -- Portability : portable
11 --
12 -- Functions for converting lazy 'Text' values to and from lazy
13 -- 'ByteString', using several standard encodings.
14 --
15 -- To gain access to a much larger variety of encodings, use the
16 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
17
18 module Data.Text.Lazy.Encoding
19 (
20 -- * Decoding ByteStrings to Text
21 -- $strict
22 decodeASCII
23 , decodeUtf8
24 , decodeUtf16LE
25 , decodeUtf16BE
26 , decodeUtf32LE
27 , decodeUtf32BE
28 -- ** Controllable error handling
29 , decodeUtf8With
30 , decodeUtf16LEWith
31 , decodeUtf16BEWith
32 , decodeUtf32LEWith
33 , decodeUtf32BEWith
34
35 -- * Encoding Text to ByteStrings
36 , encodeUtf8
37 , encodeUtf16LE
38 , encodeUtf16BE
39 , encodeUtf32LE
40 , encodeUtf32BE
41 ) where
42
43 import Data.Bits ((.&.))
44 import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
45 import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
46 import qualified Data.ByteString as S
47 import qualified Data.ByteString.Lazy as B
48 import qualified Data.ByteString.Lazy.Internal as B
49 import qualified Data.ByteString.Unsafe as S
50 import qualified Data.Text as T
51 import qualified Data.Text.Encoding as TE
52 import qualified Data.Text.Lazy.Encoding.Fusion as E
53 import qualified Data.Text.Lazy.Fusion as F
54
55 -- $strict
56 --
57 -- All of the single-parameter functions for decoding bytestrings
58 -- encoded in one of the Unicode Transformation Formats (UTF) operate
59 -- in a /strict/ mode: each will throw an exception if given invalid
60 -- input.
61 --
62 -- Each function has a variant, whose name is suffixed with -'With',
63 -- that gives greater control over the handling of decoding errors.
64 -- For instance, 'decodeUtf8' will throw an exception, but
65 -- 'decodeUtf8With' allows the programmer to determine what to do on a
66 -- decoding error.
67
-- | Decode a lazy 'ByteString' containing 7-bit ASCII encoded text.
--
-- Every chunk of the input is decoded on its own with the strict
-- ASCII decoder, and the decoded pieces are assembled into a lazy
-- 'Text' in the same order.
decodeASCII :: B.ByteString -> Text
decodeASCII bs = B.foldrChunks step empty bs
  where
    -- Decode one strict chunk and put it in front of the decoded rest.
    step piece rest = chunk (TE.decodeASCII piece) rest
{-# INLINE decodeASCII #-}
72
-- | Decode a lazy 'ByteString' containing UTF-8 encoded text, using
-- the supplied handler to deal with invalid input.
--
-- The input is decoded one chunk at a time.  When a chunk does not
-- end on a character boundary, its trailing bytes are carried over
-- and joined with bytes from the following chunks until they end on a
-- boundary again.
--
-- If the input ends while bytes are still being carried over, they
-- are passed to the strict decoder, so any complete characters among
-- them are kept and only the bytes that really are invalid reach the
-- handler.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr bs0 = fast bs0
  where
    -- Strict decoder for a single piece; invalid bytes inside the
    -- piece are reported to 'onErr' by the strict implementation.
    decode = TE.decodeUtf8With onErr
    -- Fast path: a chunk that ends on a character boundary is decoded
    -- as a whole.  Otherwise everything before the pivot is decoded
    -- and the tail is carried into 'slow'.
    fast (B.Chunk p ps) | isComplete p = chunk (decode p) (fast ps)
                        | otherwise    = chunk (decode h) (slow t ps)
      where (h,t) = S.splitAt pivot p
            -- Split in front of a possible lead byte found in one of
            -- the last two positions, else three bytes from the end.
            -- (A negative pivot makes 'S.splitAt' carry the whole chunk.)
            pivot | at 1      = len-1
                  | at 2      = len-2
                  | otherwise = len-3
            len  = S.length p
            -- Does the byte n positions from the end have its top two
            -- bits set, i.e. could it start a multi-byte sequence?
            at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
    fast B.Empty = empty
    -- Slow path: 'i' holds carried-over bytes that do not yet end on
    -- a character boundary.  Bytes are pulled from the remaining input
    -- one at a time until they do, then the fast path resumes.
    slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
                case B.uncons bs of
                  Just (w,bs') | isComplete i' -> chunk (decode i') (fast bs')
                               | otherwise     -> slow i' bs'
                    where i' = S.snoc i w
                  -- End of input.  The carried bytes may still contain
                  -- complete characters (e.g. a valid two-byte sequence
                  -- followed by a stray continuation byte), so decode
                  -- them properly instead of reporting each byte as an
                  -- error.
                  Nothing -> chunk (decode i) empty
    -- Does the string end on a character boundary?  True when the last
    -- byte is ASCII, or when a 2-, 3- or 4-byte lead byte sits exactly
    -- 2, 3 or 4 bytes from the end.  Uses 'S.unsafeIndex', so the
    -- argument must be non-empty (lazy ByteString chunks are assumed
    -- to be non-empty, and 'slow' only passes strings built by 'S.snoc').
    isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
                    ix 1 .&. 0x80 == 0 ||
                    (len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
                    (len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
                    (len >= 4 && ix 4 .&. 0xf8 == 0xf0)
      where len = S.length bs
            ix n = S.unsafeIndex bs (len-n)
{-# INLINE[0] decodeUtf8With #-}
110
-- | Decode a lazy 'ByteString' containing UTF-8 encoded text in
-- strict mode.
--
-- This is 'decodeUtf8With' applied to the 'strictDecode' handler, so
-- invalid UTF-8 data in the input causes an exception to be thrown.
-- Call 'decodeUtf8With' directly to choose a different way of
-- handling invalid data.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 bs = decodeUtf8With strictDecode bs
{-# INLINE[0] decodeUtf8 #-}
119
120 -- This rule seems to cause performance loss.
121 {- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
122 forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
123
-- | Encode text using UTF-8 encoding.
--
-- Each chunk of the lazy 'Text' is encoded with the strict encoder and
-- becomes one chunk of the resulting lazy 'ByteString'.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 txt = foldrChunks step B.Empty txt
  where
    -- Encode one strict chunk and put it in front of the encoded rest.
    step piece rest = B.Chunk (TE.encodeUtf8 piece) rest
127
-- | Decode text from little endian UTF-16 encoding, passing the
-- supplied handler to the underlying stream decoder so that it
-- decides what happens on a decoding error.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr = F.unstream . E.streamUtf16LE onErr
{-# INLINE decodeUtf16LEWith #-}
132
-- | Decode text from little endian UTF-16 encoding in strict mode.
--
-- This is 'decodeUtf16LEWith' applied to the 'strictDecode' handler,
-- so invalid little endian UTF-16 data in the input causes an
-- exception to be thrown.  Call 'decodeUtf16LEWith' directly to choose
-- a different way of handling invalid data.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE bs = decodeUtf16LEWith strictDecode bs
{-# INLINE decodeUtf16LE #-}
141
-- | Decode text from big endian UTF-16 encoding, passing the supplied
-- handler to the underlying stream decoder so that it decides what
-- happens on a decoding error.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr = F.unstream . E.streamUtf16BE onErr
{-# INLINE decodeUtf16BEWith #-}
146
-- | Decode text from big endian UTF-16 encoding in strict mode.
--
-- This is 'decodeUtf16BEWith' applied to the 'strictDecode' handler,
-- so invalid big endian UTF-16 data in the input causes an exception
-- to be thrown.  Call 'decodeUtf16BEWith' directly to choose a
-- different way of handling invalid data.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE bs = decodeUtf16BEWith strictDecode bs
{-# INLINE decodeUtf16BE #-}
155
-- | Encode text using little endian UTF-16 encoding.
--
-- Each chunk of the lazy 'Text' is encoded with the strict encoder,
-- and the encoded pieces are combined with 'B.fromChunks'.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt = B.fromChunks encoded
  where
    encoded = foldrChunks (\piece rest -> TE.encodeUtf16LE piece : rest) [] txt
{-# INLINE encodeUtf16LE #-}
160
-- | Encode text using big endian UTF-16 encoding.
--
-- Each chunk of the lazy 'Text' is encoded with the strict encoder,
-- and the encoded pieces are combined with 'B.fromChunks'.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt = B.fromChunks encoded
  where
    encoded = foldrChunks (\piece rest -> TE.encodeUtf16BE piece : rest) [] txt
{-# INLINE encodeUtf16BE #-}
165
-- | Decode text from little endian UTF-32 encoding, passing the
-- supplied handler to the underlying stream decoder so that it
-- decides what happens on a decoding error.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr = F.unstream . E.streamUtf32LE onErr
{-# INLINE decodeUtf32LEWith #-}
170
-- | Decode text from little endian UTF-32 encoding in strict mode.
--
-- This is 'decodeUtf32LEWith' applied to the 'strictDecode' handler,
-- so invalid little endian UTF-32 data in the input causes an
-- exception to be thrown.  Call 'decodeUtf32LEWith' directly to choose
-- a different way of handling invalid data.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE bs = decodeUtf32LEWith strictDecode bs
{-# INLINE decodeUtf32LE #-}
179
-- | Decode text from big endian UTF-32 encoding, passing the supplied
-- handler to the underlying stream decoder so that it decides what
-- happens on a decoding error.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr = F.unstream . E.streamUtf32BE onErr
{-# INLINE decodeUtf32BEWith #-}
184
-- | Decode text from big endian UTF-32 encoding in strict mode.
--
-- This is 'decodeUtf32BEWith' applied to the 'strictDecode' handler,
-- so invalid big endian UTF-32 data in the input causes an exception
-- to be thrown.  Call 'decodeUtf32BEWith' directly to choose a
-- different way of handling invalid data.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE bs = decodeUtf32BEWith strictDecode bs
{-# INLINE decodeUtf32BE #-}
193
-- | Encode text using little endian UTF-32 encoding.
--
-- Each chunk of the lazy 'Text' is encoded with the strict encoder,
-- and the encoded pieces are combined with 'B.fromChunks'.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt = B.fromChunks encoded
  where
    encoded = foldrChunks (\piece rest -> TE.encodeUtf32LE piece : rest) [] txt
{-# INLINE encodeUtf32LE #-}
198
-- | Encode text using big endian UTF-32 encoding.
--
-- Each chunk of the lazy 'Text' is encoded with the strict encoder,
-- and the encoded pieces are combined with 'B.fromChunks'.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt = B.fromChunks encoded
  where
    encoded = foldrChunks (\piece rest -> TE.encodeUtf32BE piece : rest) [] txt
{-# INLINE encodeUtf32BE #-}