| #!/usr/bin/env python3 |
| # |
| # Script to generate tables for libstdc++ std::format width estimation. |
| # |
| # This file is part of GCC. |
| # |
| # GCC is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 3, or (at your option) any later |
| # version. |
| # |
| # GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| # for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with GCC; see the file COPYING3. If not see |
| # <http://www.gnu.org/licenses/>. |
| |
| # To update the Libstdc++ static data in <bits/unicode-data.h> download the latest: |
| # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt |
| # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt |
| # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt |
| # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt |
| # Then run this script and save the output to |
| # ../../libstdc++-v3/include/bits/unicode-data.h |
| |
| import sys |
| import re |
| import math |
| import os |
| |
# Name of this script, used in the "Generated by" banner of the output.
self = os.path.basename(__file__)

# Licence text and Doxygen file comment emitted at the top of the header.
license_and_file_doc = """
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

/** @file bits/unicode-data.h
* This is an internal header file, included by other library headers.
* Do not attempt to use it directly. @headername{format}
*/
"""

# Preprocessor guard: the generated header may only be included by a file
# that defines _GLIBCXX_GET_UNICODE_DATA to the matching version number.
# The last entry ends in a newline so a blank line follows the guard.
version_guard = (
    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
    '# error "This is not a public header, do not include it directly"',
    "#elif _GLIBCXX_GET_UNICODE_DATA != 160000",
    '# error "Version mismatch for Unicode static data"',
    "#endif\n",
)

print("// Generated by contrib/unicode/{}, do not edit.".format(self))
print(license_and_file_doc)
for guard_line in version_guard:
    print(guard_line)
| |
def find_edges(vals, init = None):
    """Return the positions in vals where the value changes.

    The result is a list of (index, val) tuples, one for every element
    whose value differs from the element before it.  The first element is
    compared against init, so passing the expected starting value as init
    suppresses the leading edge.

    e.g. find_edges([a, a, b, b, c, b, b, d])
           is [(0,a), (2,b), (4,c), (5,b), (7,d)]
    and  find_edges([a, a, b, b, c, b, b, d], a)
           is [(2,b), (4,c), (5,b), (7,d)]
    """
    def transitions():
        # Yield each (index, value) pair that starts a new run of values.
        last_seen = init
        for position, current in enumerate(vals):
            if current != last_seen:
                last_seen = current
                yield (position, current)

    return list(transitions())
| |
# Table indexed by code point.  It is re-created with a per-property default
# value before each data file is processed.
all_code_points = []


def process_code_points(code_points, val):
    """Record val in all_code_points for a code point or range of them.

    code_points is hexadecimal text, either a single code point or an
    inclusive range written BEGIN..END; surrounding whitespace is accepted.
    Example arguments:
      1100..115F, x
      232A, y

    Raises ValueError if code_points is not of that form (or the range is
    reversed) and IndexError if any code point lies outside the table, so
    that bad input can never silently resize or skip part of the table.
    """
    bounds = code_points.split("..")
    if len(bounds) not in (1, 2):
        raise ValueError(
            "malformed code point range: {!r}".format(code_points))

    # A single code point is the one-element range [c, c + 1).
    begin = int(bounds[0], base=16)
    end = int(bounds[-1], base=16) + 1

    if begin >= end:
        raise ValueError(
            "reversed code point range: {!r}".format(code_points))
    # Slice assignment past the end of a list extends it instead of failing,
    # so check the bounds explicitly to keep the table a fixed size.
    if begin < 0 or end > len(all_code_points):
        raise IndexError(
            "code point range outside table: {!r}".format(code_points))

    all_code_points[begin:end] = [val] * (end - begin)
| |
# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F.
# The file is opened as UTF-8 explicitly (the Unicode data files are UTF-8,
# whatever the current locale is) and closed as soon as it has been read.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as width_file:
    for line in width_file:
        # Example lines:
        # 3000 ; F
        # 3001..3003 ; W
        line = line.split("#")[0]  # Discard any trailing comment.
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point. The initial width is 1, so the first edge is
# the first code point of width 2.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)

# NOTE(review): the script name in this banner is hard-coded, whereas the
# "Generated by" line at the top of the output uses the real file name;
# confirm that the two agree.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.")
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Start a new output line before every eighth entry.
    print(" " if i % 8 else "\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")
| |
# By default every code point has Grapheme_Cluster_Break=Other.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The file is opened as UTF-8 explicitly (the Unicode data files are UTF-8,
# whatever the current locale is) and closed as soon as it has been read.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as gcb_file:
    for line in gcb_file:
        # Example lines (after the trailing comment is removed):
        # 0600..0605 ; Prepend
        # 00AD ; Control
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)

# Give each property a number, in order of first appearance in the table.
# "Other" is the default and is always 0.
gcb_props = {"Other":0}
for _, prop in edges:
    gcb_props.setdefault(prop, len(gcb_props))

# Number of low-order bits needed to hold any of the property numbers.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property

print(" enum class _Gcb_property {")
for prop_name, prop_value in gcb_props.items():
    print(" _Gcb_{} = {},".format(prop_name, prop_value))
print(" };\n")

# Tables for std::__unicode::_Grapheme_cluster_state

# NOTE(review): the script name in this banner is hard-coded, whereas the
# "Generated by" line at the top of the output uses the real file name;
# confirm that the two agree.
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Start a new output line before every sixth entry.
    print(" " if i % 6 else "\n ", end="")
    # Pack the code point and its property number into a single value.
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
| |
# By default every code point has Indic_Conjunct_Break=None.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The file is opened as UTF-8 explicitly (the Unicode data files are UTF-8,
# whatever the current locale is) and closed as soon as it has been read.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as props_file:
    for line in props_file:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
linkers = [i for i, p in enumerate(all_code_points) if p == "Linker"]
for i in linkers:
    print(" 0x{:04x},".format(i), end="")
    # Linkers have their own table, so reset them to the default here to
    # keep them out of the edge table generated below.
    all_code_points[i] = None
print("\n };\n")

edges = find_edges(all_code_points)

# Property numbers stored in the low two bits of each table entry.
incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
# NOTE(review): the script name in this banner is hard-coded, whereas the
# "Generated by" line at the top of the output uses the real file name;
# confirm that the two agree.
print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.")
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Start a new output line before every sixth entry.
    print(" " if i % 6 else "\n ", end="")
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
| |
# By default every code point has Emoji=No.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The file is opened as UTF-8 explicitly: its comments contain non-ASCII
# characters, so decoding with a non-UTF-8 locale encoding could fail.
# The file is closed as soon as it has been read.
with open("emoji-data.txt", "r", encoding="utf-8") as emoji_file:
    for line in emoji_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

# The initial state is False, so the edges alternate between the start of a
# run of Extended_Pictographic code points and the first code point after it.
edges = find_edges(all_code_points, False)

# Table for std::__unicode::__is_extended_pictographic
# NOTE(review): the script name in this banner is hard-coded, whereas the
# "Generated by" line at the top of the output uses the real file name;
# confirm that the two agree.
print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
print(" // from emoji-data.txt from the Unicode standard.")
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Start a new output line before every eighth entry.
    print(" " if i % 8 else "\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")