JIS X 0208 encoding

darcs-hash:20090223182459-a4fee-98ced8f8b7bac594dc6510eeecb6bea8c51a6090
This commit is contained in:
Henning Guenther 2009-02-23 10:24:59 -08:00
parent 0398f66695
commit b95bfe9be4
6 changed files with 126 additions and 41 deletions

View File

@ -73,6 +73,7 @@ import Data.Encoding.KOI8U
import Data.Encoding.GB18030
import Data.Encoding.MacOSRoman
import Data.Encoding.JISX0201
import Data.Encoding.JISX0208
import Data.Char
import Text.Regex
@ -312,6 +313,8 @@ encodingFromStringMaybe codeName = case (normalizeEncoding codeName) of
"macintosh" -> Just $ DynEncoding MacOSRoman
-- JIS X 0201
"jis_x_0201" -> Just $ DynEncoding JISX0201
-- JIS x 0208
"jis_x_0208" -> Just $ DynEncoding JISX0208
-- defaults to nothing
_ -> Nothing
where

View File

@ -37,9 +37,24 @@ encodeWithMap mp c = case Map.lookup c mp of
Nothing -> throwException $ HasNoRepresentation c
Just v -> pushWord8 v
encodeWithMap2 :: ByteSink m => Map Char (Word8,Word8) -> Char -> m ()
encodeWithMap2 mp c = case Map.lookup c mp of
Nothing -> throwException $ HasNoRepresentation c
Just (w1,w2) -> do
pushWord8 w1
pushWord8 w2
decodeWithArray :: ByteSource m => Array Word8 (Maybe Char) -> m Char
decodeWithArray arr = do
w <- fetchWord8
case arr!w of
Nothing -> throwException $ IllegalCharacter w
Just c -> return c
decodeWithArray2 :: ByteSource m => Array (Word8,Word8) (Maybe Char) -> m Char
decodeWithArray2 arr = do
w1 <- fetchWord8
w2 <- fetchWord8
case arr!(w1,w2) of
Nothing -> throwException $ IllegalCharacter w1
Just c -> return c

View File

@ -2,6 +2,7 @@
module Data.Encoding.Helper.Template where
import Data.Encoding.Base
import Data.Bits
import Data.Char
import Data.Maybe (mapMaybe)
import Data.Map as Map (fromList,lookup)
@ -10,66 +11,112 @@ import Language.Haskell.TH
makeISOInstance :: String -> FilePath -> Q [Dec]
makeISOInstance name file = do
let rname = mkName name
trans <- runIO (readTranslation file)
mp <- encodingMap (validTranslations trans)
arr <- decodingArray (fillTranslations trans)
return [ DataD [] rname [] [NormalC rname []] [''Show]
, InstanceD [] (AppT (ConT ''Encoding) (ConT rname))
[FunD 'encodeChar
[Clause [WildP] (NormalB $ AppE (VarE 'encodeWithMap) (VarE $ mkName "mp"))
[ValD (VarP $ mkName "mp") (NormalB mp) []]
]
,FunD 'decodeChar
[Clause [WildP] (NormalB $ AppE (VarE 'decodeWithArray) (VarE $ mkName "arr"))
[ValD (VarP $ mkName "arr") (NormalB arr) []]
]
]
]
arr <- decodingArray (fillTranslations 0 255 trans)
return $ encodingInstance 'encodeWithMap 'decodeWithArray name mp arr
makeJISInstance :: String -> FilePath -> Q [Dec]
makeJISInstance name file = do
trans <- runIO (readJISTranslation file)
mp <- encodingMap2 (validTranslations trans)
arr <- decodingArray2 (fillTranslations (0x21,0x21) (0x7E,0x7E) trans)
return $ encodingInstance 'encodeWithMap2 'decodeWithArray2 name mp arr
encodingInstance :: Name -> Name -> String -> Exp -> Exp -> [Dec]
encodingInstance enc dec name mp arr
= [ DataD [] rname [] [NormalC rname []] [''Show]
, InstanceD [] (AppT (ConT ''Encoding) (ConT rname))
[FunD 'encodeChar
[Clause [WildP] (NormalB $ AppE (VarE enc) (VarE rmp))
[ValD (VarP rmp) (NormalB mp) []]
]
,FunD 'decodeChar
[Clause [WildP] (NormalB $ AppE (VarE dec) (VarE rarr))
[ValD (VarP rarr) (NormalB arr) []]
]
]
]
where
rname = mkName name
rarr = mkName "arr"
rmp = mkName "mp"
createCharArray :: [(Integer,Maybe Char)] -> Integer -> Integer -> Q Exp
createCharArray lst = createArray (map (\(x,y) -> (x,case y of
Nothing -> ConE 'Nothing
Just c -> AppE (ConE 'Just) (LitE $ CharL c))
) lst)
createCharArray lst f t = createArray (map (\(x,y) ->
(LitE $ IntegerL x,mbCharToExp y)
) lst) (LitE $ IntegerL f) (LitE $ IntegerL t)
createCharArray2 :: [((Integer,Integer),Maybe Char)] -> (Integer,Integer) -> (Integer,Integer) -> Q Exp
createCharArray2 lst (f1,f2) (t1,t2)
= createArray (map (\((x1,x2),y) ->
(TupE [integerExp x1,integerExp x2],mbCharToExp y)
) lst)
(TupE [integerExp f1,integerExp f2])
(TupE [integerExp t1,integerExp t2])
createArray :: [(Integer,Exp)] -> Integer -> Integer -> Q Exp
createArray lst from to = return $ AppE
(AppE
(VarE 'array)
(TupE [LitE $ IntegerL from,LitE $ IntegerL to]))
(ListE [ TupE [LitE $ IntegerL x,y]
| (x,y) <- lst ])
integerExp :: Integer -> Exp
integerExp i = LitE $ IntegerL i
mbCharToExp :: Maybe Char -> Exp
mbCharToExp Nothing = ConE 'Nothing
mbCharToExp (Just c) = AppE (ConE 'Just) (LitE $ CharL c)
createArray :: [(Exp,Exp)] -> Exp -> Exp -> Q Exp
createArray lst from to
= return $ AppE
(AppE
(VarE 'array)
(TupE [from,to]))
(ListE [TupE [x,y] | (x,y) <- lst])
decodingArray :: [(Integer,Maybe Char)] -> Q Exp
decodingArray trans = createCharArray trans 0 255
decodingArray2 :: [((Integer,Integer),Maybe Char)] -> Q Exp
decodingArray2 trans = createCharArray2 trans (0x21,0x21) (0x7E,0x7E)
encodingMap :: [(Integer,Char)] -> Q Exp
encodingMap trans = return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from]
| (from,to) <- trans])
encodingMap2 :: [((Integer,Integer),Char)] -> Q Exp
encodingMap2 trans = return $ AppE
(VarE 'fromList)
(ListE [ TupE [LitE $ CharL to,TupE [integerExp f1,integerExp f2]]
| ((f1,f2),to) <- trans])
readTranslation :: FilePath -> IO [(Integer,Maybe Char)]
readTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[] -> Nothing
('#':xs) -> Nothing
_ -> case words ln of
(src:"#UNDEFINED":_) -> Just (read src,Nothing)
(src:trg:_) -> Just (read src,Just $ chr $ read trg)
_ -> Nothing
) (lines cont)
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[src] -> Just (src,Nothing)
[src,trg] -> Just (src,Just $ chr $ fromIntegral trg)
_ -> Nothing) (parseTranslationTable cont)
fillTranslations :: [(Integer,Maybe Char)] -> [(Integer,Maybe Char)]
fillTranslations = fillTranslations' (-1)
readJISTranslation :: FilePath -> IO [((Integer,Integer),Maybe Char)]
readJISTranslation file = do
cont <- readFile file
return $ mapMaybe (\ln -> case ln of
[_,src] -> Just ((src `shiftR` 8,src .&. 0xFF),Nothing)
[_,src,trg] -> Just ((src `shiftR` 8,src .&. 0xFF),Just $ chr $ fromIntegral trg)
_ -> Nothing) (parseTranslationTable cont)
parseTranslationTable :: String -> [[Integer]]
parseTranslationTable cont = filter (not.null) (map (\ln -> map read (takeWhile ((/='#').head) (words ln))) (lines cont))
fillTranslations :: Ix a => a -> a -> [(a,Maybe Char)] -> [(a,Maybe Char)]
fillTranslations f t = merge (range (f,t))
where
fillTranslations' n ((n',c):cs) = (map (\i -> (i,Nothing)) [n+1..n'-1])++((n',c):fillTranslations' n' cs)
fillTranslations' n [] = map (\i -> (i,Nothing)) [n+1..255]
merge xs [] = map (\x -> (x,Nothing)) xs
merge [] _ = error "Data.Encoding.Helper.Template.fillTranslations: Character translations out of range"
merge (x:xs) (y:ys) = if x < fst y
then (x,Nothing):(merge xs (y:ys))
else y:(merge xs ys)
validTranslations :: [(Integer,Maybe Char)] -> [(Integer,Char)]
validTranslations :: [(a,Maybe Char)] -> [(a,Char)]
validTranslations = mapMaybe (\(n,mc) -> case mc of
Nothing -> Nothing
Just c -> Just (n,c))

View File

@ -0,0 +1,6 @@
{-# LANGUAGE TemplateHaskell #-}
module Data.Encoding.JISX0208 where
import Data.Encoding.Helper.Template (makeJISInstance)
$( makeJISInstance "JISX0208" "jis0208.txt" )

View File

@ -17,6 +17,7 @@ import Data.Encoding.ISO885910
import Data.Encoding.ISO885911
import Data.Encoding.ISO885913
import Data.Encoding.ISO885914
import Data.Encoding.JISX0208
import Data.Encoding.BootString
import Test.HUnit
import Test.QuickCheck hiding (test)
@ -227,4 +228,16 @@ punycodeTests = TestList $ map test $
\baz\xEF\xB8\x80\xEF\xB8\x88\xEF\xB8\x8F\xEF\xBB\xBF"
"foobarbaz"-}
]
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)
where punyTest str outp = EncodingTest punycode str (map (fromIntegral.ord) outp)
isoTests :: Test
isoTests = TestList $ map test $
[EncodingTest ISO88592 "\x104\x2D8\x141\xA4\x13D\x15A\xA7\xA8\x160\x15E\x164\x179\xAD\x17D\x17B\xB0\x105\x2DB\x142\xB4\x13E\x15B\x2C7\xB8\x161\x15F"
[0xA1..0xBA]
]
jisTests :: Test
jisTests = TestList $ map test $
[EncodingTest JISX0208 "\x4E9C"
[0x30,0x21]
]

View File

@ -61,4 +61,5 @@ Library
Data.Encoding.BootString
Data.Encoding.MacOSRoman
Data.Encoding.JISX0201
Data.Encoding.JISX0208
System.IO.Encoding