Add support for a large number of encoding name aliases. The aliases are the ones used in Python, and they are normalized as in Python: lookups are case- and separator-insensitive.

darcs-hash:20080514130514-2fc9d-1b53b11141878a8651f3bde7e427c877172e6722
This commit is contained in:
gaetan.lehmann 2008-05-14 06:05:14 -07:00
parent 6405d95956
commit e3a3d5baed
2 changed files with 187 additions and 34 deletions

View File

@ -50,6 +50,8 @@ import Data.Encoding.CP1258
import Data.Encoding.KOI8R import Data.Encoding.KOI8R
import Data.Encoding.KOI8U import Data.Encoding.KOI8U
import Data.Encoding.GB18030 import Data.Encoding.GB18030
import Data.Char
import Text.Regex
#endif #endif
-- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'. -- | An untyped encoding. Used in 'System.IO.Encoding.getSystemEncoding'.
@ -82,38 +84,189 @@ recodeLazy enc_f enc_t bs = encodeLazy enc_t (decodeLazy enc_f bs)
#ifndef USE_HPC #ifndef USE_HPC
-- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error.
--
--   The name is normalized before it is looked up: it is converted to lower
--   case and every run of characters other than ASCII letters and digits is
--   replaced by a single underscore. As a consequence the lookup is case and
--   separator insensitive, e.g. @UTF-8@, @utf_8@ and @Utf 8@ all select 'UTF8'.
--
--   Every pattern below must be written in that normalized form (lower case,
--   underscore as the only separator); a pattern containing any other
--   separator, such as a dot, can never match.
encodingFromStringMaybe :: String -> Maybe DynEncoding
encodingFromStringMaybe codeName = case normalizeEncoding codeName of
    -- ASCII
    "ascii" -> Just $ DynEncoding ASCII
    "646" -> Just $ DynEncoding ASCII
    "ansi_x3_4_1968" -> Just $ DynEncoding ASCII
    -- normalized form of "ANSI_X3.4-1986": the dot becomes an underscore
    "ansi_x3_4_1986" -> Just $ DynEncoding ASCII
    "cp367" -> Just $ DynEncoding ASCII
    "csascii" -> Just $ DynEncoding ASCII
    "ibm367" -> Just $ DynEncoding ASCII
    "iso646_us" -> Just $ DynEncoding ASCII
    -- normalized form of "ISO_646.irv:1991": the dot becomes an underscore
    "iso_646_irv_1991" -> Just $ DynEncoding ASCII
    "iso_ir_6" -> Just $ DynEncoding ASCII
    "us" -> Just $ DynEncoding ASCII
    "us_ascii" -> Just $ DynEncoding ASCII
    -- UTF-8
    "utf_8" -> Just $ DynEncoding UTF8
    "u8" -> Just $ DynEncoding UTF8
    "utf" -> Just $ DynEncoding UTF8
    "utf8" -> Just $ DynEncoding UTF8
    "utf8_ucs2" -> Just $ DynEncoding UTF8
    "utf8_ucs4" -> Just $ DynEncoding UTF8
    -- UTF-16
    "utf_16" -> Just $ DynEncoding UTF16
    "u16" -> Just $ DynEncoding UTF16
    "utf16" -> Just $ DynEncoding UTF16
    -- UTF-32
    "utf_32" -> Just $ DynEncoding UTF32
    -- KOI8-R
    "koi8_r" -> Just $ DynEncoding KOI8R
    "cskoi8r" -> Just $ DynEncoding KOI8R
    -- KOI8-U
    "koi8_u" -> Just $ DynEncoding KOI8U
    -- ISO-8859-1
    "iso_8859_1" -> Just $ DynEncoding ISO88591
    "iso8859_1" -> Just $ DynEncoding ISO88591
    "8859" -> Just $ DynEncoding ISO88591
    "cp819" -> Just $ DynEncoding ISO88591
    "csisolatin1" -> Just $ DynEncoding ISO88591
    "ibm819" -> Just $ DynEncoding ISO88591
    "iso8859" -> Just $ DynEncoding ISO88591
    "iso_8859_1_1987" -> Just $ DynEncoding ISO88591
    "iso_ir_100" -> Just $ DynEncoding ISO88591
    "l1" -> Just $ DynEncoding ISO88591
    "latin" -> Just $ DynEncoding ISO88591
    "latin1" -> Just $ DynEncoding ISO88591
    -- ISO-8859-2
    "iso_8859_2" -> Just $ DynEncoding ISO88592
    "iso8859_2" -> Just $ DynEncoding ISO88592
    "csisolatin2" -> Just $ DynEncoding ISO88592
    "iso_8859_2_1987" -> Just $ DynEncoding ISO88592
    "iso_ir_101" -> Just $ DynEncoding ISO88592
    "l2" -> Just $ DynEncoding ISO88592
    "latin2" -> Just $ DynEncoding ISO88592
    -- ISO-8859-3
    "iso_8859_3" -> Just $ DynEncoding ISO88593
    "iso8859_3" -> Just $ DynEncoding ISO88593
    "csisolatin3" -> Just $ DynEncoding ISO88593
    "iso_8859_3_1988" -> Just $ DynEncoding ISO88593
    "iso_ir_109" -> Just $ DynEncoding ISO88593
    "l3" -> Just $ DynEncoding ISO88593
    "latin3" -> Just $ DynEncoding ISO88593
    -- ISO-8859-4
    "iso_8859_4" -> Just $ DynEncoding ISO88594
    "iso8859_4" -> Just $ DynEncoding ISO88594
    "csisolatin4" -> Just $ DynEncoding ISO88594
    "iso_8859_4_1988" -> Just $ DynEncoding ISO88594
    "iso_ir_110" -> Just $ DynEncoding ISO88594
    "l4" -> Just $ DynEncoding ISO88594
    "latin4" -> Just $ DynEncoding ISO88594
    -- ISO-8859-5
    "iso_8859_5" -> Just $ DynEncoding ISO88595
    "iso8859_5" -> Just $ DynEncoding ISO88595
    "csisolatincyrillic" -> Just $ DynEncoding ISO88595
    "cyrillic" -> Just $ DynEncoding ISO88595
    "iso_8859_5_1988" -> Just $ DynEncoding ISO88595
    "iso_ir_144" -> Just $ DynEncoding ISO88595
    -- ISO-8859-6
    "iso_8859_6" -> Just $ DynEncoding ISO88596
    "iso8859_6" -> Just $ DynEncoding ISO88596
    "arabic" -> Just $ DynEncoding ISO88596
    "asmo_708" -> Just $ DynEncoding ISO88596
    "csisolatinarabic" -> Just $ DynEncoding ISO88596
    "ecma_114" -> Just $ DynEncoding ISO88596
    "iso_8859_6_1987" -> Just $ DynEncoding ISO88596
    "iso_ir_127" -> Just $ DynEncoding ISO88596
    -- ISO-8859-7
    "iso_8859_7" -> Just $ DynEncoding ISO88597
    "iso8859_7" -> Just $ DynEncoding ISO88597
    "csisolatingreek" -> Just $ DynEncoding ISO88597
    "ecma_118" -> Just $ DynEncoding ISO88597
    "elot_928" -> Just $ DynEncoding ISO88597
    "greek" -> Just $ DynEncoding ISO88597
    "greek8" -> Just $ DynEncoding ISO88597
    "iso_8859_7_1987" -> Just $ DynEncoding ISO88597
    "iso_ir_126" -> Just $ DynEncoding ISO88597
    -- ISO-8859-8
    "iso_8859_8" -> Just $ DynEncoding ISO88598
    "iso8859_8" -> Just $ DynEncoding ISO88598
    "csisolatinhebrew" -> Just $ DynEncoding ISO88598
    "hebrew" -> Just $ DynEncoding ISO88598
    "iso_8859_8_1988" -> Just $ DynEncoding ISO88598
    "iso_ir_138" -> Just $ DynEncoding ISO88598
    -- ISO-8859-9
    "iso_8859_9" -> Just $ DynEncoding ISO88599
    "iso8859_9" -> Just $ DynEncoding ISO88599
    "csisolatin5" -> Just $ DynEncoding ISO88599
    "iso_8859_9_1989" -> Just $ DynEncoding ISO88599
    "iso_ir_148" -> Just $ DynEncoding ISO88599
    "l5" -> Just $ DynEncoding ISO88599
    "latin5" -> Just $ DynEncoding ISO88599
    -- ISO-8859-10
    "iso_8859_10" -> Just $ DynEncoding ISO885910
    "iso8859_10" -> Just $ DynEncoding ISO885910
    "csisolatin6" -> Just $ DynEncoding ISO885910
    "iso_8859_10_1992" -> Just $ DynEncoding ISO885910
    "iso_ir_157" -> Just $ DynEncoding ISO885910
    "l6" -> Just $ DynEncoding ISO885910
    "latin6" -> Just $ DynEncoding ISO885910
    -- ISO-8859-11
    "iso_8859_11" -> Just $ DynEncoding ISO885911
    "iso8859_11" -> Just $ DynEncoding ISO885911
    "thai" -> Just $ DynEncoding ISO885911
    "iso_8859_11_2001" -> Just $ DynEncoding ISO885911
    -- ISO-8859-13
    "iso_8859_13" -> Just $ DynEncoding ISO885913
    "iso8859_13" -> Just $ DynEncoding ISO885913
    -- ISO-8859-14
    "iso_8859_14" -> Just $ DynEncoding ISO885914
    "iso8859_14" -> Just $ DynEncoding ISO885914
    "iso_8859_14_1998" -> Just $ DynEncoding ISO885914
    "iso_celtic" -> Just $ DynEncoding ISO885914
    "iso_ir_199" -> Just $ DynEncoding ISO885914
    "l8" -> Just $ DynEncoding ISO885914
    "latin8" -> Just $ DynEncoding ISO885914
    -- ISO-8859-15
    "iso_8859_15" -> Just $ DynEncoding ISO885915
    "iso8859_15" -> Just $ DynEncoding ISO885915
    "latin9" -> Just $ DynEncoding ISO885915
    "l9" -> Just $ DynEncoding ISO885915
    -- ISO-8859-16
    "iso_8859_16" -> Just $ DynEncoding ISO885916
    "iso8859_16" -> Just $ DynEncoding ISO885916
    "iso_8859_16_2001" -> Just $ DynEncoding ISO885916
    "iso_ir_226" -> Just $ DynEncoding ISO885916
    "l10" -> Just $ DynEncoding ISO885916
    "latin10" -> Just $ DynEncoding ISO885916
    -- CP1250
    "cp1250" -> Just $ DynEncoding CP1250
    "windows_1250" -> Just $ DynEncoding CP1250
    -- CP1251
    "cp1251" -> Just $ DynEncoding CP1251
    "windows_1251" -> Just $ DynEncoding CP1251
    -- CP1252
    "cp1252" -> Just $ DynEncoding CP1252
    "windows_1252" -> Just $ DynEncoding CP1252
    -- CP1253
    "cp1253" -> Just $ DynEncoding CP1253
    "windows_1253" -> Just $ DynEncoding CP1253
    -- CP1254
    "cp1254" -> Just $ DynEncoding CP1254
    "windows_1254" -> Just $ DynEncoding CP1254
    -- CP1255
    "cp1255" -> Just $ DynEncoding CP1255
    "windows_1255" -> Just $ DynEncoding CP1255
    -- CP1256
    "cp1256" -> Just $ DynEncoding CP1256
    "windows_1256" -> Just $ DynEncoding CP1256
    -- CP1257
    "cp1257" -> Just $ DynEncoding CP1257
    "windows_1257" -> Just $ DynEncoding CP1257
    -- CP1258
    "cp1258" -> Just $ DynEncoding CP1258
    "windows_1258" -> Just $ DynEncoding CP1258
    -- GB18030
    "gb18030" -> Just $ DynEncoding GB18030
    "gb18030_2000" -> Just $ DynEncoding GB18030
    -- defaults to nothing
    _ -> Nothing
  where
    -- Lower-case the name after collapsing each separator run to one underscore.
    normalizeEncoding s = map toLower $ subRegex sep s "_"
    -- A separator is any non-empty run of characters that are not ASCII
    -- letters or digits.
    sep = mkRegex "[^0-9A-Za-z]+"
-- | Takes the name of an encoding and creates a dynamic encoding from it. -- | Takes the name of an encoding and creates a dynamic encoding from it.
encodingFromString :: String -> DynEncoding encodingFromString :: String -> DynEncoding

View File

@ -45,9 +45,9 @@ Flag splitBase
Library Library
if flag(splitBase) if flag(splitBase)
Build-Depends: bytestring, base >= 3, template-haskell, containers, array Build-Depends: bytestring, base >= 3, template-haskell, containers, array, regex-compat
else else
Build-Depends: base < 3, template-haskell Build-Depends: base < 3, template-haskell, regex-compat
Extensions: TemplateHaskell,CPP,ExistentialQuantification,ForeignFunctionInterface Extensions: TemplateHaskell,CPP,ExistentialQuantification,ForeignFunctionInterface
C-Sources: system_encoding.c C-Sources: system_encoding.c
Include-Dirs: . Include-Dirs: .