encoding/Data/Encoding/Base.hs
Henning Guenther 971927c91f Made extensions explicit
darcs-hash:20071229003551-a4fee-f01193be732b8f3f5085232f922490cd5b48706a
2007-12-28 16:35:51 -08:00

156 lines
5.0 KiB
Haskell

{-# LANGUAGE CPP,TemplateHaskell,DeriveDataTypeable #-}
module Data.Encoding.Base
(Encoding(..)
,EncodeState(..)
,encodeMultibyte
,encodeMultibyteLazy
,decodeMultibyte
,decodeMultibyteLazy
,encodeSinglebyte
,encodeSinglebyteLazy
,decodeSinglebyte
,EncodingException(..)
,DecodingException(..)
,decodingArray
,encodingMap)
where
import Data.Array(array)
import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack)
import qualified Data.ByteString.Lazy as LBS
import Data.Encoding.Helper.Template
#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe(unsafeIndex)
#else
import Data.ByteString.Base(unsafeIndex)
#endif
import Data.Map (Map,fromList,lookup)
import Data.Char(chr)
import Data.Maybe(mapMaybe)
import Data.Typeable
import Data.Word
import Prelude hiding (lookup,length)
import qualified Prelude
import Control.Exception
import Data.Dynamic(toDyn)
import Language.Haskell.TH
{- | Represents an encoding, supporting various methods of de- and encoding.
Minimal complete definition: encode, decode
-}
class Encoding enc where
-- | Encode a 'String' into a strict 'ByteString'. Throws the
-- 'HasNoRepresentation'-Exception if it encounters an unrepresentable
-- character.
encode :: enc -> String -> ByteString
-- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'.
encodeLazy :: enc -> String -> LBS.ByteString
encodeLazy e str = LBS.fromChunks [encode e str]
-- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'.
encodable :: enc -> Char -> Bool
encodable _ _ = True
-- | Decode a strict 'ByteString' into a 'String'. If the string is not
-- decodable, a 'DecodingException' is thrown.
decode :: enc -> ByteString -> String
decodeLazy :: enc -> LBS.ByteString -> String
decodeLazy e str = concatMap (decode e) (LBS.toChunks str)
-- | Whether or no a given 'ByteString' is decodable. Default: 'True'.
decodable :: enc -> ByteString -> Bool
decodable _ _ = True
encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString
encodeMultibyte f str = unfoldr (\st -> case st of
(Done,[]) -> Nothing
(Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs))
(Put1 w1,xs) -> Just (w1,(Done,xs))
(Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs))
(Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str)
encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString
encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of
Done -> case rest of
[] -> Nothing
x:xs -> let ~(w,st) = f x in Just (w,(st,xs))
Put1 w1 -> Just (w1,(Done,rest))
Put2 w1 w2 -> Just (w1,(Put1 w2,rest))
Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str)
decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String
decodeMultibyte f str = decode (unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String
decodeMultibyteLazy f str = decode (LBS.unpack str)
where
decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst)
encodeSinglebyte :: (Char -> Word8) -> String -> ByteString
encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString
encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of
[] -> Nothing
(x:xs) -> Just (f x,xs)) str
decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String
decodeSinglebyte f str = map f (unpack str)
data EncodeState
= Done
| Put1 !Word8
| Put2 !Word8 !Word8
| Put3 !Word8 !Word8 !Word8
deriving Show
-- | This exception type is thrown whenever something went wrong during the
-- encoding-process.
data EncodingException
= HasNoRepresentation Char -- ^ Thrown if a specific character
-- is not representable in an encoding.
deriving (Show,Typeable)
-- | This exception type is thrown whenever something went wrong during the
-- decoding-process.
data DecodingException
= IllegalCharacter Word8 -- ^ The sequence contained an illegal
-- byte that couldn't be decoded.
| UnexpectedEnd -- ^ more bytes were needed to allow a
-- successfull decoding.
| OutOfRange -- ^ the decoded value was out of the unicode range
deriving (Show,Typeable)
decodingArray :: FilePath -> Q Exp
decodingArray file = do
trans <- runIO (readTranslation file)
createCharArray trans 0 255
encodingMap :: FilePath -> Q Exp
#ifndef __HADDOCK__
encodingMap file = do
trans <- runIO (readTranslation file)
return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
#endif
readTranslation :: FilePath -> IO [(Integer,Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this
(src:trg:_) -> Just (read src,chr $ read trg)
_ -> Nothing
) (lines cont)