diff --git a/Data/Encoding/UTF32.hs b/Data/Encoding/UTF32.hs index 2611b7e..b8598dc 100644 --- a/Data/Encoding/UTF32.hs +++ b/Data/Encoding/UTF32.hs @@ -12,18 +12,45 @@ import Data.Encoding.Base import Data.Word import Control.Exception (throwDyn) import Data.Typeable +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS -data UTF32 = UTF32 deriving (Eq,Show,Typeable) +data UTF32 + = UTF32 -- ^ Detects big or little endian through the use of the BOM (Byte Order Mask) character. Defaults to big endian if not present. + | UTF32BE -- ^ Encodes and decodes using the big endian encoding. + | UTF32LE -- ^ Encodes and decodes using the little endian encoding. + deriving (Eq,Show,Typeable) + +bom :: Char +bom = '\xFEFF' instance Encoding UTF32 where - encode _ = encodeMultibyte encodeUTF32 - encodeLazy _ = encodeMultibyteLazy encodeUTF32 + encode UTF32 str = encodeMultibyte encodeUTF32be (bom:str) + encode UTF32LE str = encodeMultibyte encodeUTF32le str + encode UTF32BE str = encodeMultibyte encodeUTF32be str + encodeLazy UTF32 str = encodeMultibyteLazy encodeUTF32be (bom:str) + encodeLazy UTF32LE str = encodeMultibyteLazy encodeUTF32le str + encodeLazy UTF32BE str = encodeMultibyteLazy encodeUTF32be str encodable _ c = ord c < 0x0010FFFF - decode _ = decodeMultibyte decodeUTF32 - decodeLazy _ = decodeMultibyteLazy decodeUTF32 + decode UTF32 str = let + (start,rest) = BS.splitAt 4 str + in case BS.unpack start of + [0x00,0x00,0xFE,0xFF] -> decode UTF32BE rest + [0xFE,0xFF,0x00,0x00] -> decode UTF32LE rest + _ -> decode UTF32BE str + decode UTF32LE str = decodeMultibyte decodeUTF32le str + decode UTF32BE str = decodeMultibyte decodeUTF32be str + decodeLazy UTF32 str = let + (start,rest) = LBS.splitAt 4 str + in case LBS.unpack start of + [0x00,0x00,0xFE,0xFF] -> decodeLazy UTF32BE rest + [0xFE,0xFF,0x00,0x00] -> decodeLazy UTF32LE rest + _ -> decodeLazy UTF32BE str + decodeLazy UTF32LE str = decodeMultibyteLazy decodeUTF32le str + decodeLazy UTF32BE str = decodeMultibyteLazy decodeUTF32be str -encodeUTF32 :: Char -> (Word8,EncodeState) -encodeUTF32 ch = let +encodeUTF32be :: Char -> (Word8,EncodeState) +encodeUTF32be ch = let w = ord ch w1 = fromIntegral $ w `shiftR` 24 w2 = fromIntegral $ w `shiftR` 16 @@ -31,11 +58,33 @@ encodeUTF32 ch = let w4 = fromIntegral $ w in (w1,Put3 w2 w3 w4) -decodeUTF32 :: [Word8] -> (Char,[Word8]) -decodeUTF32 (w1:w2:w3:w4:rest) = (chr $ - (fromIntegral w1 `shiftL` 24) .|. - (fromIntegral w2 `shiftL` 16) .|. - (fromIntegral w3 `shiftL` 8) .|. - (fromIntegral w4),rest) -decodeUTF32 _ = throwDyn UnexpectedEnd +encodeUTF32le :: Char -> (Word8,EncodeState) +encodeUTF32le ch = let + w = ord ch + w1 = fromIntegral $ w `shiftR` 24 + w2 = fromIntegral $ w `shiftR` 16 + w3 = fromIntegral $ w `shiftR` 8 + w4 = fromIntegral $ w + in (w4,Put3 w3 w2 w1) + +decodeUTF32be :: [Word8] -> (Char,[Word8]) +decodeUTF32be (w1:w2:w3:w4:rest) = let + v = (fromIntegral w1 `shiftL` 24) .|. + (fromIntegral w2 `shiftL` 16) .|. + (fromIntegral w3 `shiftL` 8) .|. + (fromIntegral w4) + in if v < 0x0010FFFF + then (chr v,rest) + else throwDyn (IllegalRepresentation [w1,w2,w3,w4]) +decodeUTF32be _ = throwDyn UnexpectedEnd +decodeUTF32le :: [Word8] -> (Char,[Word8]) +decodeUTF32le (w1:w2:w3:w4:rest) = let + v = (fromIntegral w4 `shiftL` 24) .|. + (fromIntegral w3 `shiftL` 16) .|. + (fromIntegral w2 `shiftL` 8) .|. + (fromIntegral w1) + in if v < 0x0010FFFF + then (chr v,rest) + else throwDyn (IllegalRepresentation [w1,w2,w3,w4]) +decodeUTF32le _ = throwDyn UnexpectedEnd