278 lines
11 KiB
Haskell
278 lines
11 KiB
Haskell
{-# LANGUAGE ExistentialQuantification,CPP #-}
|
|
module Data.Encoding
|
|
(Encoding(..)
|
|
,EncodingException(..)
|
|
,DecodingException(..)
|
|
,recode
|
|
,recodeLazy
|
|
,DynEncoding()
|
|
#ifndef USE_HPC
|
|
,encodingFromString
|
|
,encodingFromStringMaybe
|
|
#endif
|
|
)
|
|
where
|
|
|
|
import Data.ByteString (ByteString)
|
|
import qualified Data.ByteString.Lazy as Lazy (ByteString)
|
|
import Data.Typeable
|
|
import Data.Encoding.Base
|
|
|
|
#ifndef USE_HPC
|
|
import Data.Encoding.ASCII
|
|
import Data.Encoding.UTF8
|
|
import Data.Encoding.UTF16
|
|
import Data.Encoding.UTF32
|
|
import Data.Encoding.ISO88591
|
|
import Data.Encoding.ISO88592
|
|
import Data.Encoding.ISO88593
|
|
import Data.Encoding.ISO88594
|
|
import Data.Encoding.ISO88595
|
|
import Data.Encoding.ISO88596
|
|
import Data.Encoding.ISO88597
|
|
import Data.Encoding.ISO88598
|
|
import Data.Encoding.ISO88599
|
|
import Data.Encoding.ISO885910
|
|
import Data.Encoding.ISO885911
|
|
import Data.Encoding.ISO885913
|
|
import Data.Encoding.ISO885914
|
|
import Data.Encoding.ISO885915
|
|
import Data.Encoding.ISO885916
|
|
import Data.Encoding.CP1250
|
|
import Data.Encoding.CP1251
|
|
import Data.Encoding.CP1252
|
|
import Data.Encoding.CP1253
|
|
import Data.Encoding.CP1254
|
|
import Data.Encoding.CP1255
|
|
import Data.Encoding.CP1256
|
|
import Data.Encoding.CP1257
|
|
import Data.Encoding.CP1258
|
|
import Data.Encoding.KOI8R
|
|
import Data.Encoding.KOI8U
|
|
import Data.Encoding.GB18030
|
|
import Data.Char
|
|
import Text.Regex
|
|
#endif
|
|
|
|
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
--
-- The constructor hides the concrete encoding type; only the class
-- dictionaries listed in the context remain available after wrapping.
-- 'Typeable' and 'Eq' are what allow two wrapped encodings to be compared
-- even though their underlying types may differ.
data DynEncoding
    = forall enc.
      ( Encoding enc
      , Show enc
      , Typeable enc
      , Eq enc
      )
      => DynEncoding enc
|
|
|
|
-- | Every class method unwraps the existential and delegates to the
-- instance of the wrapped encoding.
instance Encoding DynEncoding where
    encode     (DynEncoding inner) = encode     inner
    encodeLazy (DynEncoding inner) = encodeLazy inner
    encodable  (DynEncoding inner) = encodable  inner
    decode     (DynEncoding inner) = decode     inner
    decodeLazy (DynEncoding inner) = decodeLazy inner
    decodable  (DynEncoding inner) = decodable  inner
|
|
|
|
-- | Renders as the constructor name followed by the wrapped encoding's own
-- 'show' output.
instance Show DynEncoding where
    show (DynEncoding inner) = showString "DynEncoding " (show inner)
|
|
|
|
-- | Two dynamic encodings are equal when the right-hand one can be 'cast'
-- to the type of the left-hand one and the two values then compare equal.
-- A failed cast (different underlying types) yields 'False'.
instance Eq DynEncoding where
    DynEncoding lhs == DynEncoding rhs = maybe False (lhs ==) (cast rhs)
|
|
|
|
-- | Converts a strict 'ByteString' from one encoding to another: the input
-- is first decoded with the source encoding and the result is then encoded
-- with the target encoding.
recode :: (Encoding from,Encoding to) => from -> to -> ByteString -> ByteString
recode source target = encode target . decode source
|
|
|
|
-- | Lazy-'Lazy.ByteString' counterpart of 'recode': decodes the input with
-- the source encoding via 'decodeLazy' and re-encodes it with the target
-- encoding via 'encodeLazy'.
recodeLazy :: (Encoding from,Encoding to) => from -> to -> Lazy.ByteString -> Lazy.ByteString
recodeLazy source target = encodeLazy target . decodeLazy source
|
|
|
|
#ifndef USE_HPC
|
|
-- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error
-- when the name is not recognised.
--
-- The name is normalised before it is looked up: every run of characters
-- other than ASCII letters and digits is replaced by a single underscore and
-- the result is lower-cased. As a consequence @UTF-8@, @utf_8@ and @Utf 8@
-- all select the same encoding.
encodingFromStringMaybe :: String -> Maybe DynEncoding
encodingFromStringMaybe codeName = case normalizeEncoding codeName of
    -- ASCII
    "ascii"              -> Just $ DynEncoding ASCII
    "646"                -> Just $ DynEncoding ASCII
    "ansi_x3_4_1968"     -> Just $ DynEncoding ASCII
    -- Keys must be in normalised form: a '.' in the input has already been
    -- turned into '_' by the time the match happens.
    "ansi_x3_4_1986"     -> Just $ DynEncoding ASCII
    "cp367"              -> Just $ DynEncoding ASCII
    "csascii"            -> Just $ DynEncoding ASCII
    "ibm367"             -> Just $ DynEncoding ASCII
    "iso646_us"          -> Just $ DynEncoding ASCII
    "iso_646_irv_1991"   -> Just $ DynEncoding ASCII
    "iso_ir_6"           -> Just $ DynEncoding ASCII
    "us"                 -> Just $ DynEncoding ASCII
    "us_ascii"           -> Just $ DynEncoding ASCII
    -- UTF-8
    "utf_8"              -> Just $ DynEncoding UTF8
    "u8"                 -> Just $ DynEncoding UTF8
    "utf"                -> Just $ DynEncoding UTF8
    "utf8"               -> Just $ DynEncoding UTF8
    "utf8_ucs2"          -> Just $ DynEncoding UTF8
    "utf8_ucs4"          -> Just $ DynEncoding UTF8
    -- UTF-16
    "utf_16"             -> Just $ DynEncoding UTF16
    "u16"                -> Just $ DynEncoding UTF16
    "utf16"              -> Just $ DynEncoding UTF16
    -- UTF-32 (same alias spellings as the UTF-8 and UTF-16 groups)
    "utf_32"             -> Just $ DynEncoding UTF32
    "u32"                -> Just $ DynEncoding UTF32
    "utf32"              -> Just $ DynEncoding UTF32
    -- KOI8-R
    "koi8_r"             -> Just $ DynEncoding KOI8R
    "cskoi8r"            -> Just $ DynEncoding KOI8R
    -- KOI8-U
    "koi8_u"             -> Just $ DynEncoding KOI8U
    -- ISO-8859-1
    "iso_8859_1"         -> Just $ DynEncoding ISO88591
    "iso8859_1"          -> Just $ DynEncoding ISO88591
    "8859"               -> Just $ DynEncoding ISO88591
    "cp819"              -> Just $ DynEncoding ISO88591
    "csisolatin1"        -> Just $ DynEncoding ISO88591
    "ibm819"             -> Just $ DynEncoding ISO88591
    "iso8859"            -> Just $ DynEncoding ISO88591
    "iso_8859_1_1987"    -> Just $ DynEncoding ISO88591
    "iso_ir_100"         -> Just $ DynEncoding ISO88591
    "l1"                 -> Just $ DynEncoding ISO88591
    "latin"              -> Just $ DynEncoding ISO88591
    "latin1"             -> Just $ DynEncoding ISO88591
    -- ISO-8859-2
    "iso_8859_2"         -> Just $ DynEncoding ISO88592
    "iso8859_2"          -> Just $ DynEncoding ISO88592
    "csisolatin2"        -> Just $ DynEncoding ISO88592
    "iso_8859_2_1987"    -> Just $ DynEncoding ISO88592
    "iso_ir_101"         -> Just $ DynEncoding ISO88592
    "l2"                 -> Just $ DynEncoding ISO88592
    "latin2"             -> Just $ DynEncoding ISO88592
    -- ISO-8859-3
    "iso_8859_3"         -> Just $ DynEncoding ISO88593
    "iso8859_3"          -> Just $ DynEncoding ISO88593
    "csisolatin3"        -> Just $ DynEncoding ISO88593
    "iso_8859_3_1988"    -> Just $ DynEncoding ISO88593
    "iso_ir_109"         -> Just $ DynEncoding ISO88593
    "l3"                 -> Just $ DynEncoding ISO88593
    "latin3"             -> Just $ DynEncoding ISO88593
    -- ISO-8859-4
    "iso_8859_4"         -> Just $ DynEncoding ISO88594
    "iso8859_4"          -> Just $ DynEncoding ISO88594
    "csisolatin4"        -> Just $ DynEncoding ISO88594
    "iso_8859_4_1988"    -> Just $ DynEncoding ISO88594
    "iso_ir_110"         -> Just $ DynEncoding ISO88594
    "l4"                 -> Just $ DynEncoding ISO88594
    "latin4"             -> Just $ DynEncoding ISO88594
    -- ISO-8859-5
    "iso_8859_5"         -> Just $ DynEncoding ISO88595
    "iso8859_5"          -> Just $ DynEncoding ISO88595
    "csisolatincyrillic" -> Just $ DynEncoding ISO88595
    "cyrillic"           -> Just $ DynEncoding ISO88595
    "iso_8859_5_1988"    -> Just $ DynEncoding ISO88595
    "iso_ir_144"         -> Just $ DynEncoding ISO88595
    -- ISO-8859-6
    "iso_8859_6"         -> Just $ DynEncoding ISO88596
    "iso8859_6"          -> Just $ DynEncoding ISO88596
    "arabic"             -> Just $ DynEncoding ISO88596
    "asmo_708"           -> Just $ DynEncoding ISO88596
    "csisolatinarabic"   -> Just $ DynEncoding ISO88596
    "ecma_114"           -> Just $ DynEncoding ISO88596
    "iso_8859_6_1987"    -> Just $ DynEncoding ISO88596
    "iso_ir_127"         -> Just $ DynEncoding ISO88596
    -- ISO-8859-7
    "iso_8859_7"         -> Just $ DynEncoding ISO88597
    "iso8859_7"          -> Just $ DynEncoding ISO88597
    "csisolatingreek"    -> Just $ DynEncoding ISO88597
    "ecma_118"           -> Just $ DynEncoding ISO88597
    "elot_928"           -> Just $ DynEncoding ISO88597
    "greek"              -> Just $ DynEncoding ISO88597
    "greek8"             -> Just $ DynEncoding ISO88597
    "iso_8859_7_1987"    -> Just $ DynEncoding ISO88597
    "iso_ir_126"         -> Just $ DynEncoding ISO88597
    -- ISO-8859-8
    "iso_8859_8"         -> Just $ DynEncoding ISO88598
    "iso8859_8"          -> Just $ DynEncoding ISO88598
    "csisolatinhebrew"   -> Just $ DynEncoding ISO88598
    "hebrew"             -> Just $ DynEncoding ISO88598
    "iso_8859_8_1988"    -> Just $ DynEncoding ISO88598
    "iso_ir_138"         -> Just $ DynEncoding ISO88598
    -- ISO-8859-9
    "iso_8859_9"         -> Just $ DynEncoding ISO88599
    "iso8859_9"          -> Just $ DynEncoding ISO88599
    "csisolatin5"        -> Just $ DynEncoding ISO88599
    "iso_8859_9_1989"    -> Just $ DynEncoding ISO88599
    "iso_ir_148"         -> Just $ DynEncoding ISO88599
    "l5"                 -> Just $ DynEncoding ISO88599
    "latin5"             -> Just $ DynEncoding ISO88599
    -- ISO-8859-10
    "iso_8859_10"        -> Just $ DynEncoding ISO885910
    "iso8859_10"         -> Just $ DynEncoding ISO885910
    "csisolatin6"        -> Just $ DynEncoding ISO885910
    "iso_8859_10_1992"   -> Just $ DynEncoding ISO885910
    "iso_ir_157"         -> Just $ DynEncoding ISO885910
    "l6"                 -> Just $ DynEncoding ISO885910
    "latin6"             -> Just $ DynEncoding ISO885910
    -- ISO-8859-11
    "iso_8859_11"        -> Just $ DynEncoding ISO885911
    "iso8859_11"         -> Just $ DynEncoding ISO885911
    "thai"               -> Just $ DynEncoding ISO885911
    "iso_8859_11_2001"   -> Just $ DynEncoding ISO885911
    -- ISO-8859-13
    "iso_8859_13"        -> Just $ DynEncoding ISO885913
    "iso8859_13"         -> Just $ DynEncoding ISO885913
    -- ISO-8859-14
    "iso_8859_14"        -> Just $ DynEncoding ISO885914
    "iso8859_14"         -> Just $ DynEncoding ISO885914
    "iso_8859_14_1998"   -> Just $ DynEncoding ISO885914
    "iso_celtic"         -> Just $ DynEncoding ISO885914
    "iso_ir_199"         -> Just $ DynEncoding ISO885914
    "l8"                 -> Just $ DynEncoding ISO885914
    "latin8"             -> Just $ DynEncoding ISO885914
    -- ISO-8859-15
    "iso_8859_15"        -> Just $ DynEncoding ISO885915
    "iso8859_15"         -> Just $ DynEncoding ISO885915
    "latin9"             -> Just $ DynEncoding ISO885915
    "l9"                 -> Just $ DynEncoding ISO885915
    -- ISO-8859-16
    "iso_8859_16"        -> Just $ DynEncoding ISO885916
    "iso8859_16"         -> Just $ DynEncoding ISO885916
    "iso_8859_16_2001"   -> Just $ DynEncoding ISO885916
    "iso_ir_226"         -> Just $ DynEncoding ISO885916
    "l10"                -> Just $ DynEncoding ISO885916
    "latin10"            -> Just $ DynEncoding ISO885916
    -- CP1250
    "cp1250"             -> Just $ DynEncoding CP1250
    "windows_1250"       -> Just $ DynEncoding CP1250
    -- CP1251
    "cp1251"             -> Just $ DynEncoding CP1251
    "windows_1251"       -> Just $ DynEncoding CP1251
    -- CP1252
    "cp1252"             -> Just $ DynEncoding CP1252
    "windows_1252"       -> Just $ DynEncoding CP1252
    -- CP1253
    "cp1253"             -> Just $ DynEncoding CP1253
    "windows_1253"       -> Just $ DynEncoding CP1253
    -- CP1254
    "cp1254"             -> Just $ DynEncoding CP1254
    "windows_1254"       -> Just $ DynEncoding CP1254
    -- CP1255
    "cp1255"             -> Just $ DynEncoding CP1255
    "windows_1255"       -> Just $ DynEncoding CP1255
    -- CP1256
    "cp1256"             -> Just $ DynEncoding CP1256
    "windows_1256"       -> Just $ DynEncoding CP1256
    -- CP1257
    "cp1257"             -> Just $ DynEncoding CP1257
    "windows_1257"       -> Just $ DynEncoding CP1257
    -- CP1258
    "cp1258"             -> Just $ DynEncoding CP1258
    "windows_1258"       -> Just $ DynEncoding CP1258
    -- GB18030
    "gb18030"            -> Just $ DynEncoding GB18030
    "gb18030_2000"       -> Just $ DynEncoding GB18030
    -- defaults to nothing
    _                    -> Nothing
  where
    -- Collapse each run of separator characters into one underscore, then
    -- lower-case, so that the table above needs only one spelling per alias.
    normalizeEncoding s = map toLower $ subRegex sep s "_"
    -- A separator is any run of characters that are not ASCII alphanumerics.
    sep = mkRegex "[^0-9A-Za-z]+"
|
|
|
|
|
|
|
|
-- | Takes the name of an encoding and creates a dynamic encoding from it.
--
-- The lookup is delegated to 'encodingFromStringMaybe'; when that yields
-- 'Nothing' this function calls 'error' with a message that includes the
-- requested name.
encodingFromString :: String -> DynEncoding
encodingFromString str = case encodingFromStringMaybe str of
    Just found -> found
    Nothing    -> error $ "Data.Encoding.encodingFromString: Unknown encoding: "++show str
|
|
#endif
|