implemented 'Text -> Builder' UTF-8 encoders
authorSimon Meier <iridcode@gmail.com>
Tue, 31 Jan 2012 07:31:48 +0000 (08:31 +0100)
committerSimon Meier <iridcode@gmail.com>
Fri, 3 Feb 2012 10:40:32 +0000 (11:40 +0100)
It uses a coupled end-of-input-and-output boundary and exploits the UTF-16
representation of the 'Text' value. According to preliminary benchmarks, it is
25% faster than the existing 'encodeUtf8 :: Text -> ByteString' function.

We also support an 'encodeUtf8AsciiEscaped' encoder that allows to special
case encoding of ASCII characters. This is a very useful function for
implementing escaping encoders, e.g., for JSON or HTML output.

Data/Text/Encoding.hs

index 476f89d..6339130 100644 (file)
@@ -49,6 +49,8 @@ module Data.Text.Encoding
     -- * Generic encoding of Text
     , encodeStreamWithB
     , encodeTextWithB
+    , encodeUtf8Builder
+    , encodeUtf8Escaped
     ) where
 
 import Control.Exception (evaluate, try)
@@ -61,7 +63,7 @@ import Data.Bits ((.&.))
 import Data.ByteString as B
 import Data.ByteString.Internal as B
 import Data.ByteString.Lazy.Builder.Internal as B
-import Data.ByteString.Lazy.Builder.BasicEncoding.Internal as B
+import qualified Data.ByteString.Lazy.Builder.BasicEncoding.Internal as B
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal (Text(..), textP)
 import Data.Text.UnsafeChar (ord, unsafeWrite)
@@ -310,3 +312,98 @@ encodeStreamWithB be =
 {-# INLINE encodeTextWithB #-}
 encodeTextWithB :: B.BoundedEncoding Char -> Text -> B.Builder
 encodeTextWithB be = encodeStreamWithB be . F.stream
+
+-- | Encode text using UTF-8 encoding.
+encodeUtf8Builder :: Text -> B.Builder
+encodeUtf8Builder (Text arr off len) =
+    B.builder step
+  where
+    bound   = 4
+    iend    = off + len
+    step !k =
+        outerLoop off
+      where
+        outerLoop !i0 !br@(BufferRange op0 ope)
+          | i0 >= iend                = k br
+          | op0 `plusPtr` bound < ope =
+              goPartial (i0 + min outRemaining inpRemaining)
+          | otherwise  = return $ bufferFull bound op0 (outerLoop i0)
+          where
+            outRemaining = (ope `minusPtr` op0) `div` bound
+            inpRemaining = iend - i0
+
+            goPartial !iendTmp = go i0 op0
+              where
+                go !i !op
+                  | i < iendTmp = case A.unsafeIndex arr i of
+                      w | w <= 0x7F -> do
+                            poke8 0 w
+                            go (i + 1) (op `plusPtr` 1)
+                        | w <= 0x7FF -> do
+                            poke8 0 $ (w `shiftR` 6) + 0xC0
+                            poke8 1 $ (w .&. 0x3f) + 0x80
+                            go (i + 1) (op `plusPtr` 2)
+                        | 0xD800 <= w && w <= 0xDBFF -> do
+                            let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1))
+                            poke8 0 $ (c `shiftR` 18) + 0xF0
+                            poke8 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80
+                            poke8 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80
+                            poke8 3 $ (c .&. 0x3F) + 0x80
+                            go (i + 2) (op `plusPtr` 4)
+                        | otherwise -> do
+                            poke8 0 $ (w `shiftR` 12) + 0xE0
+                            poke8 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80
+                            poke8 2 $ (w .&. 0x3F) + 0x80
+                            go (i + 1) (op `plusPtr` 3)
+                  | otherwise =
+                      outerLoop i (BufferRange op ope)
+                  where
+                    poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
+
+-- | Encode text using UTF-8 encoding and escape the ASCII characters using
+-- a 'BoundedEncoding'.
+encodeUtf8Escaped :: B.BoundedEncoding Word8 -> Text -> B.Builder
+encodeUtf8Escaped be (Text arr off len) =
+    B.builder step
+  where
+    bound   = max 4 $ B.sizeBound be
+    iend    = off + len
+    step !k =
+        outerLoop off
+      where
+        outerLoop !i0 !br@(BufferRange op0 ope)
+          | i0 >= iend                = k br
+          | op0 `plusPtr` bound < ope =
+              goPartial (i0 + min outRemaining inpRemaining)
+          | otherwise  = return $ bufferFull bound op0 (outerLoop i0)
+          where
+            outRemaining = (ope `minusPtr` op0) `div` bound
+            inpRemaining = iend - i0
+
+            goPartial !iendTmp = go i0 op0
+              where
+                go !i !op
+                  | i < iendTmp = case A.unsafeIndex arr i of
+                      w | w <= 0x7F -> do
+                            B.runB be (fromIntegral w) op >>= go (i + 1)
+                        | w <= 0x7FF -> do
+                            poke8 0 $ (w `shiftR` 6) + 0xC0
+                            poke8 1 $ (w .&. 0x3f) + 0x80
+                            go (i + 1) (op `plusPtr` 2)
+                        | 0xD800 <= w && w <= 0xDBFF -> do
+                            let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1))
+                            poke8 0 $ (c `shiftR` 18) + 0xF0
+                            poke8 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80
+                            poke8 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80
+                            poke8 3 $ (c .&. 0x3F) + 0x80
+                            go (i + 2) (op `plusPtr` 4)
+                        | otherwise -> do
+                            poke8 0 $ (w `shiftR` 12) + 0xE0
+                            poke8 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80
+                            poke8 2 $ (w .&. 0x3F) + 0x80
+                            go (i + 1) (op `plusPtr` 3)
+                  | otherwise =
+                      outerLoop i (BufferRange op ope)
+                  where
+                    poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
+