Allow for documentation on mapping files

Ignore-this: 5bcfdd21c16464e29fb7498dd6ec8915
Use the first commented lines in each mapping file as the docstring for the whole module.

darcs-hash:20090829125423-a4fee-c99efd2cccdaa24d3b283aca0d43cd7ff6970c7f
This commit is contained in:
Henning Guenther 2009-08-29 05:54:23 -07:00
parent 56867f5768
commit b12649a071
30 changed files with 27 additions and 741 deletions

View File

@ -1,21 +1,5 @@
#
# Name: cp1250 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1250 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1250 order
#
#Implements the Windows-1250 encoding.
#For details, refer to <http://en.wikipedia.org/wiki/Windows-1250>.
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1251 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1251 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1251 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1252 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1252 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1252 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1253 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1253 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1253 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1254 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1254 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1254 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1255 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 1/7/2000
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1255 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1255 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1256 to Unicode table
# Unicode version: 2.1
# Table version: 2.01
# Table format: Format A
# Date: 01/5/99
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1256 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1256 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1257 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1257 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1257 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp1258 to Unicode table
# Unicode version: 2.0
# Table version: 2.01
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp1258 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp1258 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp437_DOSLatinUS to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp437_DOSLatinUS code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp437_DOSLatinUS order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp737_DOSGreek to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp737_DOSGreek code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp737_DOSGreek order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp775_DOSBaltRim to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp775_DOSBaltRim code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp775_DOSBaltRim order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp850_DOSLatin1 to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp850_DOSLatin1 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp850_DOSLatin1 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp852_DOSLatin2 to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp852_DOSLatin2 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp852_DOSLatin2 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp855_DOSCyrillic to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp855_DOSCyrillic code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp855_DOSCyrillic order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp857_DOSTurkish to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp857_DOSTurkish code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp857_DOSTurkish order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp860_DOSPortuguese to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp860_DOSPortuguese code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp860_DOSPortuguese order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp861_DOSIcelandic to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp861_DOSIcelandic code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp861_DOSIcelandic order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp862_DOSHebrew to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp862_DOSHebrew code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp862_DOSHebrew order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp863_DOSCanadaF to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp863_DOSCanadaF code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp863_DOSCanadaF order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp864_DOSArabic to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp864_DOSArabic code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp864_DOSArabic order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp865_DOSNordic to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp865_DOSNordic code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp865_DOSNordic order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp866_DOSCyrillicRussian to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp866_DOSCyrillicRussian code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp866_DOSCyrillicRussian order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,20 +1,3 @@
#
# Name: cp869_DOSGreek2 to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/24/96
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp869_DOSGreek2 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp869_DOSGreek2 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,21 +1,3 @@
#
# Name: cp874 to Unicode table
# Unicode version: 2.0
# Table version: 2.00
# Table format: Format A
# Date: 04/15/98
#
# Contact: Shawn.Steele@microsoft.com
#
# General notes: none
#
# Format: Three tab-separated columns
# Column #1 is the cp874 code (in hex)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
#
# The entries are in cp874 order
#
0x00 0x0000 #NULL
0x01 0x0001 #START OF HEADING
0x02 0x0002 #START OF TEXT

View File

@ -1,53 +1,3 @@
#
# Name: JIS X 0201 (1976) to Unicode 1.1 Table
# Unicode version: 1.1
# Table version: 0.9
# Table format: Format A
# Date: 8 March 1994
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
#
# General notes:
#
#
# This table contains one set of mappings from JIS X 0201 into Unicode.
# Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of JIS X 0201 and Unicode, consult the
# VENDORS mapping data. Normative information on the mapping between
# JIS X 0201 and Unicode may be found in the Unihan.txt file in the
# latest Unicode Character Database.
#
# If you have carefully considered the fact that the mappings in
# this table are only one possible set of mappings between JIS X 0201 and
# Unicode and have no normative status, but still feel that you
# have located an error in the table that requires fixing, you may
# report any such error to errata@unicode.org.
#
#
# Format: Three tab-separated columns
# Column #1 is the shift JIS code (in hex as 0xXX)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode (ISO 10646) name (follows a comment sign)
#
# The entries are in JIS order
#
#
0x20 0x0020 # SPACE
0x21 0x0021 # EXCLAMATION MARK
0x22 0x0022 # QUOTATION MARK

View File

@ -1,75 +1,3 @@
#
# Name: JIS X 0208 (1990) to Unicode
# Unicode version: 1.1
# Table version: 0.9
# Table format: Format A
# Date: 8 March 1994
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
#
# General notes:
#
#
# This table contains one set of mappings from JIS X 0208 (1990) into Unicode.
# Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of JIS X 0208 (1990) and Unicode, consult the
# VENDORS mapping data. Normative information on the mapping between
# JIS X 0208 (1990) and Unicode may be found in the Unihan.txt file in the
# latest Unicode Character Database.
#
# If you have carefully considered the fact that the mappings in
# this table are only one possible set of mappings between JIS X 0208 (1990)
# and Unicode and have no normative status, but still feel that you
# have located an error in the table that requires fixing, you may
# report any such error to errata@unicode.org.
#
#
# Format: Three tab-separated columns
# Column #1 is the JIS X 0208 code (in hex as 0xXXXX)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
# The official names for Unicode characters U+4E00
# to U+9FA5, inclusive, are "CJK UNIFIED IDEOGRAPH-XXXX",
# where XXXX is the code point. Including all these
# names in this file increases its size substantially
# and needlessly. The token "<CJK>" is used for the
# name of these characters. If necessary, it can be
# expanded algorithmically by a parser or editor.
#
# The entries are in JIS X 0208 order
#
# The following algorithms can be used to change the hex form
# of JIS 0208 to other standard forms:
#
# To change hex to EUC form, add 0x8080
# To change hex to kuten form, first subtract 0x2020. Then
# the high and low bytes correspond to the ku and ten of
# the kuten form. For example, 0x2121 -> 0x0101 -> 0101;
# 0x7426 -> 0x5406 -> 8406
#
# The kanji mappings are a normative part of ISO/IEC 10646. The
# non-kanji mappings are provisional, pending definition of
# official mappings by Japanese standards bodies
#
# Any comments or problems, contact <John_Jenkins@taligent.com>
#
#
0x2121 0x3000 # IDEOGRAPHIC SPACE
0x2122 0x3001 # IDEOGRAPHIC COMMA
0x2123 0x3002 # IDEOGRAPHIC FULL STOP

View File

@ -1,89 +1,3 @@
#
# Name: JIS X 0212 (1990) to Unicode
# Unicode version: 1.1
# Table version: 0.9
# Table format: Format A
# Date: 8 March 1994
#
# Copyright (c) 1991-1994 Unicode, Inc. All Rights reserved.
#
# This file is provided as-is by Unicode, Inc. (The Unicode Consortium).
# No claims are made as to fitness for any particular purpose. No
# warranties of any kind are expressed or implied. The recipient
# agrees to determine applicability of information provided. If this
# file has been provided on magnetic media by Unicode, Inc., the sole
# remedy for any claim will be exchange of defective media within 90
# days of receipt.
#
# Recipient is granted the right to make copies in any form for
# internal distribution and to freely use the information supplied
# in the creation of products supporting Unicode. Unicode, Inc.
# specifically excludes the right to re-distribute this file directly
# to third parties or other organizations whether for profit or not.
#
# General notes:
#
#
# This table contains one set of mappings from JIS X 0212 into Unicode.
# Note that these data are *possible* mappings only and may not be the
# same as those used by actual products, nor may they be the best suited
# for all uses. For more information on the mappings between various code
# pages incorporating the repertoire of JIS X 0212 and Unicode, consult the
# VENDORS mapping data. Normative information on the mapping between
# JIS X 0212 and Unicode may be found in the Unihan.txt file in the
# latest Unicode Character Database.
#
# If you have carefully considered the fact that the mappings in
# this table are only one possible set of mappings between JIS X 0212 and
# Unicode and have no normative status, but still feel that you
# have located an error in the table that requires fixing, you may
# report any such error to errata@unicode.org.
#
#
# Format: Three tab-separated columns
# Column #1 is the JIS X 0212 code (in hex as 0xXXXX)
# Column #2 is the Unicode (in hex as 0xXXXX)
# Column #3 is the Unicode name (follows a comment sign, '#')
# The official names for Unicode characters U+4E00
# to U+9FA5, inclusive, are "CJK UNIFIED IDEOGRAPH-XXXX",
# where XXXX is the code point. Including all these
# names in this file increases its size substantially
# and needlessly. The token "<CJK>" is used for the
# name of these characters. If necessary, it can be
# expanded algorithmically by a parser or editor.
#
# The entries are in JIS X 0212 order
#
# The following algorithms can be used to change the hex form
# of JIS 0212 to other standard forms:
#
# To change hex to EUC form, add 0x8080
# To change hex to kuten form, first subtract 0x2020. Then
# the high and low bytes correspond to the ku and ten of
# the kuten form. For example, 0x2121 -> 0x0101 -> 0101;
# 0x6D63 -> 0x4D43 -> 7767
#
# The kanji mappings are a normative part of ISO/IEC 10646. The
# non-kanji mappings are provisional, pending definition of
# official mappings by Japanese standards bodies
#
# Any comments or problems, contact <John_Jenkins@taligent.com>
#
# Notes:
#
# 1. JIS X 0212 apparently unified the following two symbols
# into a single character at 0x2922:
#
# LATIN CAPITAL LETTER D WITH STROKE
# LATIN CAPITAL LETTER ETH
#
# However, JIS X 0212 maintains the distinction between
# the lowercase forms of these two elements at 0x2942 and 0x2943.
# Given the structure of these JIS encodings, it is clear that
# 0x2922 and 0x2942 are intended to be a capital/small pair.
# Consequently, in the Unicode mapping, 0x2922 is treated as
# LATIN CAPITAL LETTER D WITH STROKE.
#
0x222F 0x02D8 # BREVE
0x2230 0x02C7 # CARON (Mandarin Chinese third tone)
0x2231 0x00B8 # CEDILLA

View File

@ -1,83 +1,4 @@
#=======================================================================
# File name: ROMAN.TXT
#
# Contents: Map (external version) from Mac OS Roman
# character set to Unicode 2.1 and later.
#
# Copyright: (c) 1994-2002, 2005 by Apple Computer, Inc., all rights
# reserved.
#
# Contact: charsets@apple.com
#
# Changes:
#
# c02 2005-Apr-05 Update header comments. Matches internal xml
# <c1.1> and Text Encoding Converter 2.0.
# b4,c1 2002-Dec-19 Update URLs, notes. Matches internal
# utom<b5>.
# b03 1999-Sep-22 Update contact e-mail address. Matches
# internal utom<b4>, ufrm<b3>, and Text
# Encoding Converter version 1.5.
# b02 1998-Aug-18 Encoding changed for Mac OS 8.5; change
# mapping of 0xDB from CURRENCY SIGN to
# EURO SIGN. Matches internal utom<b3>,
# ufrm<b3>.
# n08 1998-Feb-05 Minor update to header comments
# n06 1997-Dec-14 Add warning about future changes to 0xDB
# from CURRENCY SIGN to EURO SIGN. Clarify
# some header information
# n04 1997-Dec-01 Update to match internal utom<n3>, ufrm<n22>:
# Change standard mapping for 0xBD from U+2126
# to its canonical decomposition, U+03A9.
# n03 1995-Apr-15 First version (after fixing some typos).
# Matches internal ufrm<n9>.
#
# Standard header:
# ----------------
#
# Apple, the Apple logo, and Macintosh are trademarks of Apple
# Computer, Inc., registered in the United States and other countries.
# Unicode is a trademark of Unicode Inc. For the sake of brevity,
# throughout this document, "Macintosh" can be used to refer to
# Macintosh computers and "Unicode" can be used to refer to the
# Unicode standard.
#
# Apple Computer, Inc. ("Apple") makes no warranty or representation,
# either express or implied, with respect to this document and the
# included data, its quality, accuracy, or fitness for a particular
# purpose. In no event will Apple be liable for direct, indirect,
# special, incidental, or consequential damages resulting from any
# defect or inaccuracy in this document or the included data.
#
# These mapping tables and character lists are subject to change.
# The latest tables should be available from the following:
#
# <http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/>
#
# For general information about Mac OS encodings and these mapping
# tables, see the file "README.TXT".
#
# Format:
# -------
#
# Three tab-separated columns;
# '#' begins a comment which continues to the end of the line.
# Column #1 is the Mac OS Roman code (in hex as 0xNN)
# Column #2 is the corresponding Unicode (in hex as 0xNNNN)
# Column #3 is a comment containing the Unicode name
#
# The entries are in Mac OS Roman code order.
#
# One of these mappings requires the use of a corporate character.
# See the file "CORPCHAR.TXT" and notes below.
#
# Control character mappings are not shown in this table, following
# the conventions of the standard UTC mapping tables. However, the
# Mac OS Roman character set uses the standard control characters at
# 0x00-0x1F and 0x7F.
#
# Notes on Mac OS Roman:
# ----------------------
# /Notes on Mac OS Roman:/
#
# This is a legacy Mac OS encoding; in the Mac OS X Carbon and Cocoa
# environments, it is only supported directly in programming
@ -117,8 +38,7 @@
# interpreted as associated with these glyphs; they are usually
# interpreted (if at all) as the control codes DC1-DC4.
#
# Unicode mapping issues and notes:
# ---------------------------------
# /Unicode mapping issues and notes:/
#
# The following corporate zone Unicode character is used in this
# mapping:
@ -129,20 +49,17 @@
# is not authorized for use without permission of Apple, and
# unauthorized use might constitute trademark infringement.
#
# Details of mapping changes in each version:
# -------------------------------------------
# /Details of mapping changes in each version:/
#
# Changes from version n08 to version b02:
#
# - Encoding changed for Mac OS 8.5; change mapping of 0xDB from
# * Encoding changed for Mac OS 8.5; change mapping of 0xDB from
# CURRENCY SIGN (U+00A4) to EURO SIGN (U+20AC).
#
# Changes from version n03 to version n04:
#
# - Change mapping of 0xBD from U+2126 to its canonical
# * Change mapping of 0xBD from U+2126 to its canonical
# decomposition, U+03A9.
#
##################
0x20 0x0020 # SPACE
0x21 0x0021 # EXCLAMATION MARK

View File

@ -18,16 +18,22 @@ data MappingType
| JISMapping
deriving (Eq,Ord,Show,Read)
readTranslation :: Int -> FilePath -> IO [(Integer,Maybe Char)]
readTranslation :: Int -> FilePath -> IO ([(Integer,Maybe Char)],[String])
readTranslation offset file = do
cont <- readFile file
return $ mapMaybe (\ln -> case drop offset ln of
[src] -> Just (src,Nothing)
[src,trg] -> Just (src,Just $ chr $ fromIntegral trg)
_ -> Nothing) (parseTranslationTable cont)
cont <- fmap parseTranslationTable $ readFile file
let docstr = mapMaybe snd (takeWhile (null.fst) cont)
let trans = mapMaybe (\(ln,comm) -> case drop offset ln of
[src] -> Just (src,Nothing)
[src,trg] -> Just (src,Just $ chr $ fromIntegral trg)
_ -> Nothing) cont
return (trans,docstr)
parseTranslationTable :: String -> [[Integer]]
parseTranslationTable cont = filter (not.null) (map (\ln -> map read (takeWhile ((/='#').head) (words ln))) (lines cont))
parseTranslationTable :: String -> [([Integer],Maybe String)]
parseTranslationTable cont = map (\ln -> let (trans,comm) = break (=='#') ln
in (map read (words trans),case comm of
"" -> Nothing
_ -> Just (tail comm))
) (lines cont)
{-fillTranslations :: (Ix a,Show a) => a -> a -> [(a,Maybe Char)] -> [(a,Maybe Char)]
fillTranslations f t = merge (range (f,t))
@ -70,7 +76,7 @@ mappingPreprocessor = PreProcessor
preprocessMapping :: MappingType -> FilePath -> FilePath -> [String] -> String -> IO ()
preprocessMapping tp src trg mods name = do
trans <- readTranslation 0 src
(trans,doc) <- readTranslation 0 src
let mod = concat $ intersperse "." (mods++[name])
let wsize = case tp of
ISOMapping -> 1
@ -106,7 +112,11 @@ preprocessMapping tp src trg mods name = do
writeFile trg $ unlines $
["{- This file has been auto-generated. Do not edit it. -}"
,"{-# LANGUAGE MagicHash,DeriveDataTypeable #-}"
,"module "++mod++"("++name++"(..)) where"
]++(case doc of
[] -> []
_ -> ("{- | "++head doc):(map (\ln -> " "++ln) (tail doc)) ++ [" -}"])
++
["module "++mod++"("++name++"(..)) where"
,""
,"import Data.Encoding.Base"
,"import Data.Encoding.ByteSource"