encoding/Data/Encoding/Base.hs
Henning Guenther 496761301f Initial commit
darcs-hash:20070802003505-a4fee-97432a586ed2453061ef246d1e7a81fec35d1bac
2007-08-01 17:35:05 -07:00

155 lines
5.0 KiB
Haskell

{-# OPTIONS -fglasgow-exts #-}
module Data.Encoding.Base
(Encoding(..)
,EncodeState(..)
,encodeMultibyte
,encodeMultibyteLazy
,decodeMultibyte
,decodeMultibyteLazy
,encodeSinglebyte
,encodeSinglebyteLazy
,decodeSinglebyte
,EncodingException(..)
,DecodingException(..)
,decodingArray
,encodingMap)
where
import Data.Array(array)
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
import qualified Data.ByteString.Lazy as LBS
import Data.ByteString.Base(unsafeIndex)
import Data.Map (Map,fromList,lookup)
import Data.Char(chr)
import Data.Maybe(mapMaybe)
import Data.Typeable
import Data.Word
import Prelude hiding (lookup,length)
import qualified Prelude
import Control.Exception
import Data.Dynamic(toDyn)
import Language.Haskell.TH
{- | Represents an encoding, supporting various methods of de- and encoding.
Minimal complete definition: encode, decode
-}
class Encoding enc where
-- | Encode a 'String' into a strict 'ByteString'. Throws the
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable
-- character.
encode :: enc -> String -> ByteString
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'.
encodeLazy :: enc -> String -> LBS.ByteString
encodeLazy e str = LBS.fromChunks [encode e str]
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
encodable :: enc -> Char -> Bool
encodable _ _ = True
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
-- decodable, a 'DecodingException' is thrown.
decode :: enc -> ByteString -> String
decodeLazy :: enc -> LBS.ByteString -> String
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
decodable :: enc -> ByteString -> Bool
decodable _ _ = True
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString
encodeMultibyte f str = unfoldr (\st -> case st of
(Done,[]) -> Nothing
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs))
(Put1 w1,xs) -> Just (w1,(Done,xs))
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs))
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str)
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of
Done -> case rest of
[] -> Nothing
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
Put1 w1 -> Just (w1,(Done,rest))
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String
decodeMultibyte f str = decode (unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
decodeMultibyteLazy f str = decode (LBS.unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
decodeSinglebyte f str = map f (unpack str)
data EncodeState
= Done
| Put1 !Word8
| Put2 !Word8 !Word8
| Put3 !Word8 !Word8 !Word8
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Show,Typeable)
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
deriving (Show,Typeable)
decodingArray :: FilePath -> Q Exp
-- Haddock hates template haskell...
#ifndef __HADDOCK__
decodingArray file = do
trans <- runIO (readTranslation file)
return $ AppE
(AppE
(VarE 'array)
(TupE [LitE $ IntegerL 0,LitE $ IntegerL 255]))
(ListE [ TupE [LitE $ IntegerL from,LitE $ CharL to]
| (from,to) <- trans ])
#endif
encodingMap :: FilePath -> Q Exp
#ifndef __HADDOCK__
encodingMap file = do
trans <- runIO (readTranslation file)
return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
#endif
readTranslation :: FilePath -> IO [(Integer,Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,chr $ read trg)
_ -> Nothing
) (lines cont)