| #!/usr/bin/python3 |
| # -*- coding: utf-8 -*- |
| # Copyright (C) 2014-2025 Free Software Foundation, Inc. |
| # Copyright The GNU Toolchain Authors. |
| # This file is part of the GNU C Library. |
| # |
| # The GNU C Library is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # The GNU C Library is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with the GNU C Library; if not, see |
| # <https://www.gnu.org/licenses/>. |
| |
| '''glibc/localedata/charmaps/UTF-8 file generator script |
| |
| This script generates a glibc/localedata/charmaps/UTF-8 file |
| from Unicode data. |
| |
Usage: python3 utf8_gen.py --unicode_version <version>

By default, the input files UnicodeData.txt, DerivedCoreProperties.txt,
EastAsianWidth.txt, and HangulSyllableType.txt are read from the
current directory (see the command line options below), and the
generated charmap is written to a file named “UTF-8” in the current
directory.
| ''' |
| |
| import argparse |
| import re |
| import unicode_utils |
| |
| # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, |
| # sections 3.11 and 4.4. |
| |
| JAMO_INITIAL_SHORT_NAME = ( |
| 'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ', |
| 'C', 'K', 'T', 'P', 'H' |
| ) |
| |
| JAMO_MEDIAL_SHORT_NAME = ( |
| 'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE', |
| 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I' |
| ) |
| |
| JAMO_FINAL_SHORT_NAME = ( |
| '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS', |
| 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T', |
| 'P', 'H' |
| ) |
| |
| def process_range(start, end, outfile, name): |
| '''Writes a range of code points into the CHARMAP section of the |
| output file |
| |
| ''' |
| if 'Hangul Syllable' in name: |
| # from glibc/localedata/ChangeLog: |
| # |
| # 2000-09-24 Bruno Haible <haible@clisp.cons.org> |
| # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges, |
| # so they become printable and carry a width. Comment out surrogate |
| # ranges. Add a WIDTH table |
| # |
| # So we expand the Hangul Syllables here: |
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
| hangul_syllable_name = 'HANGUL SYLLABLE ' \ |
| + JAMO_INITIAL_SHORT_NAME[index1] \ |
| + JAMO_MEDIAL_SHORT_NAME[index2] \ |
| + JAMO_FINAL_SHORT_NAME[index3] |
| outfile.write('{:<11s} {:<12s} {:s}\n'.format( |
| unicode_utils.ucs_symbol(i), convert_to_hex(i), |
| hangul_syllable_name)) |
| return |
    # The UnicodeData.txt file contains code point ranges like this:
| # |
| # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; |
| # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; |
| # |
| # The glibc UTF-8 file splits ranges like these into shorter |
| # ranges of 64 code points each: |
| # |
| # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A> |
| # … |
| # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A> |
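    # A block size of 64 matches the structure of UTF-8: when a range
    # starts on a multiple of 64 (as most of these ranges do), all code
    # points of one block share every UTF-8 byte except the last
    # continuation byte, which runs exactly through /x80../xbf.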
    for i in range(int(start, 16), int(end, 16), 64):
| if i > (int(end, 16)-64): |
| outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format( |
| unicode_utils.ucs_symbol(i), |
| unicode_utils.ucs_symbol(int(end,16)), |
| convert_to_hex(i), |
| name)) |
| break |
| outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format( |
| unicode_utils.ucs_symbol(i), |
| unicode_utils.ucs_symbol(i+63), |
| convert_to_hex(i), |
| name)) |
| |
| def process_charmap(flines, outfile): |
    '''This function takes a list which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the
| |
| CHARMAP |
| … |
| END CHARMAP |
| |
| section of the UTF-8 file in glibc/localedata/charmaps/UTF-8. |
| |
| Samples for input lines: |
| |
| 0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;; |
| 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; |
| 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; |
| D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;; |
| DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;; |
| 100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;; |
| 10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;; |
| |
| Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name): |
| |
| <U0010> /x10 DATA LINK ESCAPE |
| <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A> |
| %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First> |
| %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last> |
| <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use> |
| |
| ''' |
| fields_start = [] |
| for line in flines: |
| fields = line.split(";") |
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (field 10, counting from 0, in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
| if fields[1] == "<control>" and fields[10]: |
| fields[1] = fields[10] |
| # Handling code point ranges like: |
| # |
| # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; |
| # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; |
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
| fields_start = fields |
| continue |
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
| process_range(fields_start[0], fields[0], |
| outfile, fields[1][:-7]+'>') |
| fields_start = [] |
| continue |
| fields_start = [] |
| if 'Surrogate,' in fields[1]: |
| # Comment out the surrogates in the UTF-8 file. |
| # One could of course skip them completely but |
| # the original UTF-8 file in glibc had them as |
| # comments, so we keep these comment lines. |
| outfile.write('%') |
| outfile.write('{:<11s} {:<12s} {:s}\n'.format( |
| unicode_utils.ucs_symbol(int(fields[0], 16)), |
| convert_to_hex(int(fields[0], 16)), |
| fields[1])) |
| |
| def convert_to_hex(code_point): |
| '''Converts a code point to a hexadecimal UTF-8 representation |
| like /x**/x**/x**.''' |
    # Get the UTF-8 byte sequence of the code point.
    # In Python 3, .encode('UTF-8') raises UnicodeEncodeError for
    # surrogate code points, therefore we use this conversion table
| surrogates = { |
| 0xD800: '/xed/xa0/x80', |
| 0xDB7F: '/xed/xad/xbf', |
| 0xDB80: '/xed/xae/x80', |
| 0xDBFF: '/xed/xaf/xbf', |
| 0xDC00: '/xed/xb0/x80', |
| 0xDFFF: '/xed/xbf/xbf', |
| } |
| if code_point in surrogates: |
| return surrogates[code_point] |
| return ''.join([ |
| '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8') |
| ]) |
| |
| def write_header_charmap(outfile): |
| '''Write the header on top of the CHARMAP section to the output file''' |
| outfile.write("<code_set_name> UTF-8\n") |
| outfile.write("<comment_char> %\n") |
| outfile.write("<escape_char> /\n") |
| outfile.write("<mb_cur_min> 1\n") |
| outfile.write("<mb_cur_max> 6\n\n") |
| outfile.write("% CHARMAP generated using utf8_gen.py\n") |
| outfile.write("% alias ISO-10646/UTF-8\n") |
| outfile.write("CHARMAP\n") |
| |
| def write_header_width(outfile, unicode_version): |
| '''Writes the header on top of the WIDTH section to the output file''' |
| outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version)) |
| outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n') |
| outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n') |
| outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n') |
| outfile.write('% This character stands in for an intentionally omitted leading consonant\n') |
| outfile.write('% in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n') |
| outfile.write('% of visible display to ensure that the complete block has the correct width.\n') |
| outfile.write('% (See below for more information on Hangul syllables.)\n') |
| outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n') |
| outfile.write('% "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n') |
| outfile.write('% One composed Hangul "syllable block" like 퓛 is made up of\n') |
| outfile.write('% two to three individual component characters called "jamo".\n') |
| outfile.write('% The complete block must have total width 2;\n') |
| outfile.write('% to achieve this, we assign a width of 2 to leading "choseong" jamo,\n') |
| outfile.write('% and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n') |
| outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n') |
| outfile.write('% "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n') |
| outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n') |
| outfile.write('% "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n') |
| outfile.write('% - Double-width characters have width 2; generated from\n') |
| outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n') |
| outfile.write('% - Default width for all other characters is 1.\n') |
| outfile.write("WIDTH\n") |
| |
| def process_width(outfile, ulines, dlines, elines, klines): |
    '''ulines are lines from UnicodeData.txt.
    dlines are lines from DerivedCoreProperties.txt which contain
    characters with the property “Default_Ignorable_Code_Point”.
    elines are lines from EastAsianWidth.txt containing characters with
    East Asian Width “W” or “F”.
    klines are lines from HangulSyllableType.txt which contain characters
    with Hangul syllable type “V” or “T”.
    '''
    # Wide and fullwidth characters have width 2
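    # A typical matching line in EastAsianWidth.txt looks like this
    # (the trailing comment varies between Unicode versions):
    #
    #   1100..115F;W   # Lo    [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER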
| width_dict = {} |
| for line in elines: |
| fields = line.split(";") |
| if not '..' in fields[0]: |
| code_points = (fields[0], fields[0]) |
| else: |
| code_points = fields[0].split("..") |
| for key in range(int(code_points[0], 16), |
| int(code_points[1], 16)+1): |
| width_dict[key] = 2 |
| |
| # Nonspacing and enclosing marks have width 0 |
| for line in ulines: |
| fields = line.split(";") |
| if fields[4] == "NSM" or fields[2] in ("Me", "Mn"): |
| width_dict[int(fields[0], 16)] = 0 |
| |
| # Conjoining vowel and trailing jamo have width 0 |
| for line in klines: |
| fields = line.split(";") |
| if not '..' in fields[0]: |
| code_points = (fields[0], fields[0]) |
| else: |
| code_points = fields[0].split("..") |
| for key in range(int(code_points[0], 16), |
| int(code_points[1], 16)+1): |
| width_dict[key] = 0 |
| |
| # “Default_Ignorable_Code_Point”s have width 0 |
| for line in dlines: |
| fields = line.split(";") |
| if not '..' in fields[0]: |
| code_points = (fields[0], fields[0]) |
| else: |
| code_points = fields[0].split("..") |
| for key in range(int(code_points[0], 16), |
| int(code_points[1], 16)+1): |
            width_dict[key] = 0
| |
| |
    # Special case: U+00AD SOFT HYPHEN. Removing it from width_dict
    # gives it the default width of 1 (it was set to 0 above because
    # it is a Default_Ignorable_Code_Point).
    if 0x00AD in width_dict:
        del width_dict[0x00AD]
| |
| # Special case: U+115F HANGUL CHOSEONG FILLER |
| width_dict[0x115F] = 2 |
| |
    for key in range(0x3248, 0x3250):
        # U+3248..U+324F have East Asian Width “A” (ambiguous), i.e. we
        # may decide whether to treat them as “W” or “N” depending on
        # context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        # U+4DC0..U+4DFF Yijing Hexagram Symbols are treated as wide
        # here as well.
        width_dict[key] = 2
| |
| same_width_lists = [] |
| current_width_list = [] |
| for key in sorted(width_dict): |
| if not current_width_list: |
| current_width_list = [key] |
| elif (key == current_width_list[-1] + 1 |
| and width_dict[key] == width_dict[current_width_list[0]]): |
| current_width_list.append(key) |
| else: |
| same_width_lists.append(current_width_list) |
| current_width_list = [key] |
| if current_width_list: |
| same_width_lists.append(current_width_list) |
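    # For example, if width_dict maps each of U+0300..U+036F (the
    # Combining Diacritical Marks block, all category Mn) to width 0,
    # the loop above collects them into a single run, and the loop
    # below emits one line: “<U0300>...<U036F>”, a tab, and “0”.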
| |
| for same_width_list in same_width_lists: |
| if len(same_width_list) == 1: |
| outfile.write('{:s}\t{:d}\n'.format( |
| unicode_utils.ucs_symbol(same_width_list[0]), |
| width_dict[same_width_list[0]])) |
| else: |
| outfile.write('{:s}...{:s}\t{:d}\n'.format( |
| unicode_utils.ucs_symbol(same_width_list[0]), |
| unicode_utils.ucs_symbol(same_width_list[-1]), |
| width_dict[same_width_list[0]])) |
| |
| if __name__ == "__main__": |
| PARSER = argparse.ArgumentParser( |
| description=''' |
| Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt, and HangulSyllableType.txt |
| ''') |
| PARSER.add_argument( |
| '-u', '--unicode_data_file', |
| nargs='?', |
| type=str, |
| default='UnicodeData.txt', |
| help=('The UnicodeData.txt file to read, ' |
| + 'default: %(default)s')) |
| PARSER.add_argument( |
| '-d', '--derived_core_properties_file', |
| nargs='?', |
| type=str, |
| default='DerivedCoreProperties.txt', |
| help=('The DerivedCoreProperties.txt file to read, ' |
| + 'default: %(default)s')) |
| PARSER.add_argument( |
        '-e', '--east_asian_width_file',
| nargs='?', |
| type=str, |
| default='EastAsianWidth.txt', |
| help=('The EastAsianWidth.txt file to read, ' |
| + 'default: %(default)s')) |
| PARSER.add_argument( |
| '-k', '--hangul_syllable_type_file', |
| nargs='?', |
| type=str, |
| default='HangulSyllableType.txt', |
| help=('The HangulSyllableType.txt file to read, ' |
| + 'default: %(default)s')) |
| PARSER.add_argument( |
| '--unicode_version', |
| nargs='?', |
| required=True, |
| type=str, |
| help='The Unicode version of the input files used.') |
| ARGS = PARSER.parse_args() |
| |
| unicode_utils.fill_attributes(ARGS.unicode_data_file) |
| with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE: |
| UNICODE_DATA_LINES = UNIDATA_FILE.readlines() |
| with open(ARGS.derived_core_properties_file, mode='r') as DERIVED_CORE_PROPERTIES_FILE: |
| DERIVED_CORE_PROPERTIES_LINES = [] |
| for LINE in DERIVED_CORE_PROPERTIES_FILE: |
| # If characters which are from reserved ranges |
| # (i.e. not yet assigned code points) |
| # are added to the WIDTH section of the UTF-8 file, then |
| # “make check” produces “Unknown Character” errors for |
| # these code points because such unassigned code points |
| # are not in the CHARMAP section of the UTF-8 file. |
| # |
| # Therefore, we skip all reserved code points. |
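            # A skipped line looks, for example, like this (the exact
            # column spacing varies):
            #
            #   2065  ; Default_Ignorable_Code_Point # Cn  <reserved-2065>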
| if re.match(r'.*<reserved-.+>', LINE): |
| continue |
| if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE): |
| DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip()) |
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
| EAST_ASIAN_WIDTH_LINES = [] |
| for LINE in EAST_ASIAN_WIDTH_FILE: |
| if re.match(r'.*<reserved-.+>', LINE): |
| continue |
| if re.match(r'^[^;]*;\s*[WF]', LINE): |
| EAST_ASIAN_WIDTH_LINES.append(LINE.strip()) |
| with open(ARGS.hangul_syllable_type_file, mode='r') as HANGUL_SYLLABLE_TYPE_FILE: |
| HANGUL_SYLLABLE_TYPE_LINES = [] |
| for LINE in HANGUL_SYLLABLE_TYPE_FILE: |
| if re.match(r'.*<reserved-.+>', LINE): |
| continue |
| if re.match(r'^[^;]*;\s*[VT]', LINE): |
| HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip()) |
| with open('UTF-8', mode='w') as OUTFILE: |
        # Process UnicodeData.txt and write the CHARMAP section to the UTF-8 file
| write_header_charmap(OUTFILE) |
| process_charmap(UNICODE_DATA_LINES, OUTFILE) |
| OUTFILE.write("END CHARMAP\n\n") |
        # Process the width-related input files and write the WIDTH section to the UTF-8 file
| write_header_width(OUTFILE, ARGS.unicode_version) |
| process_width(OUTFILE, |
| UNICODE_DATA_LINES, |
| DERIVED_CORE_PROPERTIES_LINES, |
| EAST_ASIAN_WIDTH_LINES, |
| HANGUL_SYLLABLE_TYPE_LINES) |
| OUTFILE.write("END WIDTH\n") |