153 lines
4.6 KiB
Haskell
153 lines
4.6 KiB
Haskell
{-# LANGUAGE DeriveDataTypeable #-}
|
|
{- | This module implements UTF-16 encoding and decoding as in RFC 2781.
|
|
See <http://en.wikipedia.org/wiki/UTF-16> for more information.
|
|
-}
|
|
module Data.Encoding.UTF16
|
|
(UTF16(..)
|
|
) where
|
|
|
|
import Data.Encoding.Base
|
|
import Data.Char(ord,chr)
|
|
import Data.Bits
|
|
import Data.Int
|
|
import Data.Word
|
|
import Data.ByteString as BS
|
|
import qualified Data.ByteString.Lazy as LBS
|
|
import Prelude hiding (length)
|
|
import Control.Exception
|
|
import Data.Dynamic (toDyn)
|
|
import Data.Typeable
|
|
|
|
data UTF16
|
|
= UTF16 -- ^ Decodes big and little endian, encodes big endian.
|
|
| UTF16BE -- ^ Big endian decoding and encoding, fails if the string isn\'t actually big endian.
|
|
| UTF16LE -- ^ Little endian decoding and encoding.
|
|
deriving (Eq,Show,Typeable)
|
|
|
|
utf16enc :: Bool -> (EncodeState,String) -> Maybe (Word8,(EncodeState,String))
|
|
utf16enc _ (Done,[]) = Nothing
|
|
utf16enc True (Done,x:xs)
|
|
| n<=0x0000FFFF = Just
|
|
(fromIntegral $ n `shiftR` 8
|
|
,(Put1 (fromIntegral $ n),xs))
|
|
| n<=0x0010FFFF = Just
|
|
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)
|
|
,(Put3 (fromIntegral $ (n' `shiftR` 10))
|
|
(fromIntegral $
|
|
0xDC .|. ((n' `shiftR` 8) .&. 0x03))
|
|
(fromIntegral n'),xs))
|
|
| otherwise = throwDyn $ HasNoRepresentation x
|
|
where
|
|
n = ord x
|
|
n' = n - 0x10000
|
|
utf16enc False (Done,x:xs)
|
|
| n<=0x0000FFFF = Just
|
|
(fromIntegral $ n
|
|
,(Put1 (fromIntegral $ n `shiftR` 8),xs))
|
|
| n<=0x0010FFFF = Just
|
|
(fromIntegral n'
|
|
,(Put3 (fromIntegral $
|
|
0xDC .|. ((n' `shiftR` 8) .&. 0x03))
|
|
(fromIntegral $ (n' `shiftR` 10))
|
|
(fromIntegral $ 0xD8 .|. (n' `shiftR` 18)),xs))
|
|
| otherwise = throwDyn $ HasNoRepresentation x
|
|
where
|
|
n = ord x
|
|
n' = n - 0x10000
|
|
utf16enc _ (Put3 w1 w2 w3,xs) = Just (w1,(Put2 w2 w3,xs))
|
|
utf16enc _ (Put2 w1 w2,xs) = Just (w1,(Put1 w2,xs))
|
|
utf16enc _ (Put1 w1,xs) = Just (w1,(Done,xs))
|
|
|
|
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int) #-}
|
|
{-# SPECIALIZE utf16dec :: Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,Int64) #-}
|
|
utf16dec :: Num a => Bool -> Word8 -> Word8 -> Word8 -> Word8 -> (Char,a)
|
|
utf16dec be s1 s2 s3 s4
|
|
| w1< 0xD8 || w1> 0xDF
|
|
= (chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2),2)
|
|
| w1> 0xDB = throwDyn $ IllegalCharacter w1
|
|
| w3< 0xDC || w3>0xDF = throwDyn $ IllegalCharacter w3
|
|
| otherwise = (chr $ (((fromIntegral w1 .&. 0x03) `shiftL` 18)
|
|
.|. ((fromIntegral w2) `shiftL` 10)
|
|
.|. ((fromIntegral w3 .&. 0x03) `shiftL` 8)
|
|
.|. (fromIntegral w4)) + 0x10000,4)
|
|
where
|
|
(w1,w2,w3,w4) = if be then (s1,s2,s3,s4) else (s2,s1,s4,s3)
|
|
|
|
instance Encoding UTF16 where
|
|
encode enc str = unfoldr (utf16enc (enc/=UTF16LE)) (case enc of
|
|
UTF16 -> Put2 0xFE 0xFF
|
|
_ -> Done,str)
|
|
encodeLazy enc str = LBS.unfoldr (utf16enc (enc/=UTF16LE)) (case enc of
|
|
UTF16 -> Put2 0xFE 0xFF
|
|
_ -> Done,str)
|
|
encodable _ c = ord c <= 0x0010FFFF
|
|
decode bo str = case findByteOrder str of
|
|
Nothing -> decode' (bo/=UTF16LE) 0
|
|
Just big -> decode' big 2
|
|
where
|
|
l = BS.length str
|
|
decode' be i = if i>=l
|
|
then []
|
|
else c:decode' be (i+took)
|
|
where
|
|
(c,took) = mapException (\ex -> case ex of
|
|
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
|
|
_ -> ex) (utf16dec be s1 s2 s3 s4)
|
|
s1 = index str i
|
|
s2 = index str (i+1)
|
|
s3 = index str (i+2)
|
|
s4 = index str (i+3)
|
|
decodeLazy bo str = case findByteOrderLazy str of
|
|
Nothing -> decode' (bo/=UTF16LE) 0
|
|
Just big -> decode' big 2
|
|
where
|
|
l = LBS.length str
|
|
decode' be i = if i>=l
|
|
then []
|
|
else c:decode' be (i+took)
|
|
where
|
|
(c,took) = mapException (\ex -> case ex of
|
|
ErrorCall _ -> DynException (toDyn UnexpectedEnd)
|
|
_ -> ex) (utf16dec be s1 s2 s3 s4)
|
|
s1 = LBS.index str i
|
|
s2 = LBS.index str (i+1)
|
|
s3 = LBS.index str (i+2)
|
|
s4 = LBS.index str (i+3)
|
|
decodable bo str = case findByteOrder str of
|
|
Nothing -> check' (bo/=UTF16LE) (length str) 0
|
|
Just big -> check' big (length str) 2
|
|
where
|
|
check' be m i
|
|
| m == i = True
|
|
| m == i+1 = False
|
|
| w1< 0xD8 || w1> 0xDF = check' be m (i+2)
|
|
| w1> 0xDB = False
|
|
| m <= i+3 = False
|
|
| w3< 0xDC || w3>0xDF = False
|
|
| otherwise = check' be m (i+4)
|
|
where
|
|
(w1,w3) = if be then (s1,s3) else (s2,s4)
|
|
s1 = index str i
|
|
s2 = index str (i+1)
|
|
s3 = index str (i+2)
|
|
s4 = index str (i+3)
|
|
|
|
findByteOrder :: ByteString -> Maybe Bool
|
|
findByteOrder str
|
|
| length str < 2 = Nothing
|
|
| w1 == 0xFE && w2 == 0xFF = Just True
|
|
| w1 == 0xFF && w2 == 0xFE = Just False
|
|
| otherwise = Nothing
|
|
where
|
|
w1 = index str 0
|
|
w2 = index str 1
|
|
|
|
|
|
findByteOrderLazy :: LBS.ByteString -> Maybe Bool
|
|
findByteOrderLazy str = case LBS.unpack (LBS.take 2 str) of
|
|
[w1,w2]
|
|
| w1 == 0xFE && w2 == 0xFF -> Just True
|
|
| w1 == 0xFF && w2 == 0xFE -> Just False
|
|
| otherwise -> Nothing
|
|
_ -> Nothing
|