encoding/Data/Encoding/UTF8.hs
Henning Guenther eeee054f1e Rewrite to support more sources and changing the encoding dynamically
Now it's possible to change the character encoding while de-/encoding.
Also, it's possible to use any data structure as a source or target of the de-/encoding process.

darcs-hash:20090221203100-a4fee-6da31f2e37c30a3f5cd5f10af71984209488bb0b
2009-02-21 12:31:00 -08:00

126 lines
5.5 KiB
Haskell

{-# LANGUAGE DeriveDataTypeable #-}
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
See <http://en.wikipedia.org/wiki/UTF-8> for more information.
-}
module Data.Encoding.UTF8 where
import Control.Throws
import Data.Char
import Data.Bits
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.Typeable
data UTF8 = UTF8 -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
| UTF8Strict -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
deriving (Eq,Show,Typeable)
instance Encoding UTF8 where
encodeChar _ c
| n <= 0x0000007F = p8 n
| n <= 0x000007FF = do
p8 $ 0xC0 .|. (n `shiftR` 6)
p8 $ 0x80 .|. (n .&. 0x3F)
| n <= 0x0000FFFF = do
p8 $ 0xE0 .|. (n `shiftR` 12)
p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
p8 $ 0x80 .|. (n .&. 0x3F)
| n <= 0x0010FFFF = do
p8 $ 0xF0 .|. (n `shiftR` 18)
p8 $ 0x80 .|. ((n `shiftR` 12) .&. 0x3F)
p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
p8 $ 0x80 .|. (n .&. 0x3F)
| otherwise = throwException (HasNoRepresentation c)
where
n = ord c
p8 = pushWord8.fromIntegral
decodeChar UTF8 = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
| w1 <= 0xDF -> do
w2 <- fetchWord8
return $ chr $
((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| w1 <= 0xEF -> do
w2 <- fetchWord8
w3 <- fetchWord8
let v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| w1 <= 0xF7 -> do
w2 <- fetchWord8
w3 <- fetchWord8
w4 <- fetchWord8
let v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
v4 = w4 .&. 0x3F
return $ chr $
((fromIntegral v1) `shiftL` 18)
.|. ((fromIntegral v2) `shiftL` 12)
.|. ((fromIntegral v3) `shiftL` 6)
.|. (fromIntegral v4)
| otherwise -> throwException (IllegalCharacter w1)
decodeChar UTF8Strict = do
w1 <- fetchWord8
case () of
_
| w1 <= 0x7F -> return $ chr $ fromIntegral w1
| w1 <= 0xBF -> throwException (IllegalCharacter w1)
| w1 <= 0xDF -> do
w2 <- fetchExtend8
let v1 = w1 .&. 0x1F
if v1 <= 1
then throwException (IllegalRepresentation [w1,w2])
else return $ chr $
((fromIntegral v1) `shiftL` 6)
.|. (fromIntegral $ w2 .&. 0x3F)
| w1 <= 0xEF -> do
w2 <- fetchExtend8
w3 <- fetchExtend8
let v1 = w1 .&. 0x0F
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
if v1 == 0 && v2 < 0x20
then throwException (IllegalRepresentation [w1,w2,w3])
else return $ chr $
((fromIntegral v1) `shiftL` 12)
.|. ((fromIntegral v2) `shiftL` 6)
.|. (fromIntegral v3)
| w1 <= 0xF7 -> do
w2 <- fetchExtend8
w3 <- fetchExtend8
w4 <- fetchExtend8
let v1 = w1 .&. 0x07
v2 = w2 .&. 0x3F
v3 = w3 .&. 0x3F
v4 = w4 .&. 0x3F
if v1 == 0 && v2 < 0x10
then throwException (IllegalRepresentation [w1,w2,w3,w4])
else return $ chr $
((fromIntegral v1) `shiftL` 18)
.|. ((fromIntegral v2) `shiftL` 12)
.|. ((fromIntegral v3) `shiftL` 6)
.|. (fromIntegral v4)
| otherwise -> throwException (IllegalCharacter w1)
where
invalidExtend wrd = wrd .&. 0xC0 /= 0x80
fetchExtend8 = do
w <- fetchWord8
if invalidExtend w
then throwException (IllegalCharacter w)
else return w