contrib/unicode/gen_libstdcxx_unicode_data.py - rust-lang/gcc - Git at Google

 #!/usr/bin/env python3
 #
 # Script to generate tables for libstdc++ std::format width estimation.
 #
 # This file is part of GCC.
 #
 # GCC is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
 # Software Foundation; either version 3, or (at your option) any later
 # version.
 #
 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 # for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.

 # To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
 # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
 # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
 # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
 # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
 # Then run this script and save the output to
 # ../../libstdc++-v3/include/bits/unicode-data.h

 import sys
 import re
 import math
 import os

 # Base name of this script; it is spliced into the banner of the generated
 # header so the output records which generator produced it.
 self = os.path.basename(__file__)
 print("// Generated by contrib/unicode/" + self + ", do not edit.")

 # Licence text and Doxygen @file comment, emitted verbatim.
 print("""
 // Copyright The GNU Toolchain Authors.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
 // software; you can redistribute it and/or modify it under the
 // terms of the GNU General Public License as published by the
 // Free Software Foundation; either version 3, or (at your option)
 // any later version.

 // This library is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.

 // Under Section 7 of GPL version 3, you are granted additional
 // permissions described in the GCC Runtime Library Exception, version
 // 3.1, as published by the Free Software Foundation.

 // You should have received a copy of the GNU General Public License and
 // a copy of the GCC Runtime Library Exception along with this program;
 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 // <http://www.gnu.org/licenses/>.

 /** @file bits/unicode-data.h
  *  This is an internal header file, included by other library headers.
  *  Do not attempt to use it directly. @headername{format}
  */
 """)

 # Preprocessor guard: the generated header only compiles when
 # _GLIBCXX_GET_UNICODE_DATA is defined and equal to 160000.  The trailing
 # "\n" on the last directive leaves one blank line after the guard.
 print(
     "#ifndef _GLIBCXX_GET_UNICODE_DATA",
     '# error "This is not a public header, do not include it directly"',
     "#elif _GLIBCXX_GET_UNICODE_DATA != 160000",
     '# error "Version mismatch for Unicode static data"',
     "#endif\n",
     sep="\n",
 )

 def find_edges(vals, init = None):
     """Return the (index, value) pairs at which consecutive values change.

     `vals` is walked in order and every element that differs from the one
     before it is recorded; the very first element is compared with `init`.

     e.g. find_edges([a, a, b, b, c, b, b, d]) is
          [(0,a), (2,b), (4,c), (5,b), (7,d)]
     and  find_edges([a, a, b, b, c, b, b, d], a) is
          [(2,b), (4,c), (5,b), (7,d)]
     """
     boundaries = []
     last_seen = init
     for position, current in enumerate(vals):
         if current != last_seen:
             # A new run starts here: remember its value and where it began.
             last_seen = current
             boundaries.append((position, current))
     return boundaries

 # Table indexed by code point.  It starts out empty and is rebound to a list
 # with one entry per code point before process_code_points is used.
 all_code_points = []

 def process_code_points(code_points, val):
     """Set the property of one code point, or an inclusive range, to `val`.

     `code_points` is either a single hexadecimal code point or two
     hexadecimal code points separated by "..", for example:
         1100..115F, x
         232A, y
     The matching entries of the global `all_code_points` list are overwritten.

     Raises ValueError if `code_points` is not of one of those two forms, is
     not hexadecimal, or names a range that is reversed or extends past the
     end of the table.  Raises IndexError if a single code point lies outside
     the table.
     """
     r = code_points.split("..")
     if len(r) == 1:
         c = int(r[0], base=16)
         all_code_points[c] = val
         return
     if len(r) != 2:
         raise ValueError(
             "malformed code point range: {!r}".format(code_points))

     begin = int(r[0], base=16)
     last = int(r[1], base=16)
     # Slice assignment would silently grow the table for a range that runs
     # past its end, and silently do nothing for a reversed range, so reject
     # both explicitly.
     if begin > last or last >= len(all_code_points):
         raise ValueError(
             "code point range out of bounds: {!r}".format(code_points))
     end = last + 1
     all_code_points[begin:end] = [val] * (end - begin)

 # By default every code point has width 1. This is what the C++ standard says,
 # even though the Unicode standard says some code points have width 0.
 all_code_points = [1] * (1 + 0x10FFFF)

 # Extract all code points with East_Asian_Width=W or East_Asian_Width=F.
 # The data file is read inside a with-block so its handle is closed as soon as
 # parsing finishes, and it is decoded as UTF-8 rather than with whatever
 # encoding the current locale happens to select.
 with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
     for line in data_file:
         # Example lines:
         # 3000           ; F
         # 3001..3003     ; W
         line = line.split("#")[0]  # drop the trailing comment
         if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
             process_code_points(line.split(";")[0], 2)

 # The C++ standard also gives width 2 to the following ranges:
 # U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
 process_code_points("4DC0..4DFF", 2)
 # U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
 process_code_points("1F300..1F5FF", 2)
 # U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
 process_code_points("1F900..1F9FF", 2)

 # Create a list that only contains the code points that have a different width
 # to the previous code point.  The initial value 1 means no edge is recorded
 # for a leading run of width-1 code points.
 edges = find_edges(all_code_points, 1)

 # Table for std::__unicode::__format_width(char32_t)
 # NOTE(review): the emitted comment names gen_std_format_width.py, whereas the
 # banner at the top of the output uses this script's own file name -- confirm
 # which one is intended before changing the generated text.
 print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
 print("  // from EastAsianWidth.txt from the Unicode standard.")
 print("  inline constexpr char32_t __width_edges[] = {", end="")
 for i, e in enumerate(edges):
     # Eight entries per output line: start a new indented line on every
     # eighth entry, otherwise separate entries with a single space.
     if i % 8:
         print(" ", end="")
     else:
         print("\n    ", end="")
     # Only the code point of each edge is emitted; the width is dropped.
     c,_ = e
     print("{:#x},".format(c), end="")
 print("\n  };\n")

 # By default every code point has Grapheme_Cluster_Break=Other.
 all_code_points = ["Other"] * (1 + 0x10FFFF)

 # Extract Grapheme_Cluster_Break property for all code points.
 # The data file is read inside a with-block so its handle is closed as soon as
 # parsing finishes, and it is decoded as UTF-8 rather than with whatever
 # encoding the current locale happens to select.
 with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
     for line in data_file:
         # Example (code points, property) pairs taken from matching lines:
         # "0600..0605", "Prepend"
         # "00AD", "Control"
         line = line.split("#")[0]  # drop the trailing comment
         if re.match(r'^[\dA-Fa-f][^;]+;', line):
             code_points, grapheme_property = line.split(";")
             process_code_points(code_points, grapheme_property.strip())

 edges = find_edges(all_code_points)

 # Number the properties in order of first appearance, with "Other" fixed at 0.
 gcb_props = {"Other":0}
 for c, p in edges:
     if p not in gcb_props:
         gcb_props[p] = len(gcb_props)
 # Number of low bits reserved for the property number in each packed entry.
 shift_bits = int(math.ceil(math.log2(len(gcb_props))))

 # Enum definition for std::__unicode::_Gcb_property

 print("  enum class _Gcb_property {")
 for name, number in gcb_props.items():
     print("    _Gcb_{} = {},".format(name, number))
 print("  };\n")

 # Tables for std::__unicode::_Grapheme_cluster_state

 print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
 print("  // from GraphemeBreakProperty.txt from the Unicode standard.")
 print("  // Entries are (code_point << shift_bits) + property.")
 print("  inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
 print("  inline constexpr uint32_t __gcb_edges[] = {", end="")
 for i, e in enumerate(edges):
     # Six entries per output line.
     if i % 6:
         print(" ", end="")
     else:
         print("\n    ", end="")
     c, p = e
     # Pack the code point above the property number.
     x = (c << shift_bits) + gcb_props[p]
     print("{0:#x},".format(x), end="")
 print("\n  };\n")

 # By default every code point has Indic_Conjunct_Break=None.
 all_code_points = [None] * (1 + 0x10FFFF)

 # Extract Indic_Conjunct_Break property for all code points.
 # The data file is read inside a with-block so its handle is closed as soon as
 # parsing finishes, and it is decoded as UTF-8 rather than with whatever
 # encoding the current locale happens to select.
 with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
     for line in data_file:
         # Example lines:
         # 094D       ; InCB; Linker
         # 0B71       ; InCB; Consonant
         # 0300..034E ; InCB; Extend
         line = line.split("#")[0]  # drop the trailing comment
         if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
             code_points, _, incb_property = line.split(";")
             process_code_points(code_points, incb_property.strip())

 # Table for std::__unicode::__is_incb_linker
 # This table is tiny, so just contains the list of code points.
 print("  inline constexpr char32_t __incb_linkers[] = {\n   ", end="")
 for i in [i for i,p in enumerate(all_code_points) if p == "Linker"]:
     print(" 0x{:04x},".format(i), end="")
     # Linkers live in their own table, so clear them here to keep them out of
     # the edge table built below.
     all_code_points[i] = None
 print("\n  };\n")

 edges = find_edges(all_code_points)

 # Property numbers packed into the low two bits of each edge entry; they
 # match the enumerators of the _InCB enum printed next.
 incb_props = {None:0, "Consonant":1, "Extend":2}
 print("  enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
 # Table for std::__unicode::__incb_property
 print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
 print("  // from DerivedCoreProperties.txt from the Unicode standard.")
 print("  // Entries are (code_point << 2) + property.")
 print("  inline constexpr uint32_t __incb_edges[] = {", end="")
 for i, e in enumerate(edges):
     # Six entries per output line.
     if i % 6:
         print(" ", end="")
     else:
         print("\n    ", end="")
     c, p = e
     x = (c << 2) + incb_props[p]
     print("{0:#x},".format(x), end="")
 print("\n  };\n")

 # By default every code point has Emoji=No.
 all_code_points = [False] * (1 + 0x10FFFF)

 # Extract Emoji=Extended_Pictographic for all code points.
 # The data file is read inside a with-block so its handle is closed as soon as
 # parsing finishes, and it is decoded as UTF-8 rather than with whatever
 # encoding the current locale happens to select.
 with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
     for line in data_file:
         # Example lines:
         # 1100..115F ; Extended_Pictographic
         # 232A       ; Extended_Pictographic
         line = line.split("#")[0]  # drop the trailing comment
         if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
             process_code_points(line.split(";")[0], True)

 # The initial value False means no edge is recorded for a leading run of
 # code points that lack the property.
 edges = find_edges(all_code_points, False)

 # Table for std::__unicode::__is_extended_pictographic
 print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
 print("  // from emoji-data.txt from the Unicode standard.")
 print("  inline constexpr char32_t __xpicto_edges[] = {", end="")
 for i, e in enumerate(edges):
     # Eight entries per output line.
     if i % 8:
         print(" ", end="")
     else:
         print("\n    ", end="")
     # Only the code point of each edge is emitted.
     c,_ = e
     print("{:#x},".format(c), end="")
 print("\n  };\n")

 # <bits/unicode.h> gives an error if this macro is left defined.
 # Do this last, so that the generated output is not usable unless we reach here.
 print("#undef _GLIBCXX_GET_UNICODE_DATA")
	#!/usr/bin/env python3
	#
	# Script to generate tables for libstdc++ std::format width estimation.
	#
	# This file is part of GCC.
	#
	# GCC is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 3, or (at your option) any later
	# version.
	#
	# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	# WARRANTY; without even the implied warranty of MERCHANTABILITY or
	# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	# for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with GCC; see the file COPYING3. If not see
	# <http://www.gnu.org/licenses/>.

	# To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
	# ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
	# ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
	# ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
	# ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
	# Then run this script and save the output to
	# ../../libstdc++-v3/include/bits/unicode-data.h

	import sys
	import re
	import math
	import os

	# Base name of this script; it is spliced into the banner of the generated
	# header so the output records which generator produced it.
	self = os.path.basename(__file__)
	print("// Generated by contrib/unicode/" + self + ", do not edit.")

	# Licence text and Doxygen @file comment, emitted verbatim.
	print("""
	// Copyright The GNU Toolchain Authors.
	//
	// This file is part of the GNU ISO C++ Library. This library is free
	// software; you can redistribute it and/or modify it under the
	// terms of the GNU General Public License as published by the
	// Free Software Foundation; either version 3, or (at your option)
	// any later version.

	// This library is distributed in the hope that it will be useful,
	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	// GNU General Public License for more details.

	// Under Section 7 of GPL version 3, you are granted additional
	// permissions described in the GCC Runtime Library Exception, version
	// 3.1, as published by the Free Software Foundation.

	// You should have received a copy of the GNU General Public License and
	// a copy of the GCC Runtime Library Exception along with this program;
	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	// <http://www.gnu.org/licenses/>.

	/** @file bits/unicode-data.h
	* This is an internal header file, included by other library headers.
	* Do not attempt to use it directly. @headername{format}
	*/
	""")

	# Preprocessor guard: the generated header only compiles when
	# _GLIBCXX_GET_UNICODE_DATA is defined and equal to 160000.  The trailing
	# "\n" on the last directive leaves one blank line after the guard.
	print(
	    "#ifndef _GLIBCXX_GET_UNICODE_DATA",
	    '# error "This is not a public header, do not include it directly"',
	    "#elif _GLIBCXX_GET_UNICODE_DATA != 160000",
	    '# error "Version mismatch for Unicode static data"',
	    "#endif\n",
	    sep="\n",
	)

	# Process a list and return a list of tuples (index, val) which are the elements
	# in the list that have a different val from the previous element.
	# e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
	# and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
	def find_edges(vals, init = None):
	    """Return the (index, value) pairs at which consecutive values change.

	    The first element of `vals` is compared against `init`; every later
	    element is compared against the element before it.
	    """
	    edges = []
	    prev_val = init
	    for i, v in enumerate(vals):
	        if v != prev_val:
	            # A new run of equal values starts at index i.
	            edges.append((i, v))
	            prev_val = v
	    return edges

	# Table indexed by code point.  It starts out empty and is rebound to a list
	# with one entry per code point before process_code_points is used.
	all_code_points = []

	def process_code_points(code_points, val):
	    """Set the property of one code point, or an inclusive range, to `val`.

	    `code_points` is either a single hexadecimal code point or two
	    hexadecimal code points separated by "..", for example:
	        1100..115F, x
	        232A, y
	    The matching entries of the global `all_code_points` list are overwritten.

	    Raises ValueError if `code_points` is not of one of those two forms, is
	    not hexadecimal, or names a range that is reversed or extends past the
	    end of the table.  Raises IndexError if a single code point lies outside
	    the table.
	    """
	    r = code_points.split("..")
	    if len(r) == 1:
	        c = int(r[0], base=16)
	        all_code_points[c] = val
	    elif len(r) == 2:
	        begin = int(r[0], base=16)
	        end = int(r[1], base=16) + 1
	        # Slice assignment would silently grow the table for a range that
	        # runs past its end, and silently do nothing for a reversed range,
	        # so reject both explicitly.
	        if begin >= end or end > len(all_code_points):
	            raise ValueError(
	                "code point range out of bounds: {!r}".format(code_points))
	        all_code_points[begin:end] = [val] * (end - begin)
	    else:
	        raise ValueError(
	            "malformed code point range: {!r}".format(code_points))

	# By default every code point has width 1. This is what the C++ standard says,
	# even though the Unicode standard says some code points have width 0.
	all_code_points = [1] * (1 + 0x10FFFF)

	# Extract all code points with East_Asian_Width=W or East_Asian_Width=F.
	# The data file is read inside a with-block so its handle is closed as soon as
	# parsing finishes, and it is decoded as UTF-8 rather than with whatever
	# encoding the current locale happens to select.
	with open("EastAsianWidth.txt", "r", encoding="utf-8") as data_file:
	    for line in data_file:
	        # Example lines:
	        # 3000 ; F
	        # 3001..3003 ; W
	        line = line.split("#")[0]  # drop the trailing comment
	        # Any amount of whitespace may surround the width class, so the
	        # \s tokens need the * quantifier to match padded columns.
	        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
	            process_code_points(line.split(";")[0], 2)

	# The C++ standard also gives width 2 to the following ranges:
	# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
	process_code_points("4DC0..4DFF", 2)
	# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
	process_code_points("1F300..1F5FF", 2)
	# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
	process_code_points("1F900..1F9FF", 2)

	# Create a list that only contains the code points that have a different width
	# to the previous code point.  The initial value 1 means no edge is recorded
	# for a leading run of width-1 code points.
	edges = find_edges(all_code_points, 1)

	# Table for std::__unicode::__format_width(char32_t)
	# NOTE(review): the emitted comment names gen_std_format_width.py, whereas the
	# banner at the top of the output uses this script's own file name -- confirm
	# which one is intended before changing the generated text.
	print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
	print(" // from EastAsianWidth.txt from the Unicode standard.")
	print(" inline constexpr char32_t __width_edges[] = {", end="")
	for i, e in enumerate(edges):
	    # Eight entries per output line.
	    if i % 8:
	        print(" ", end="")
	    else:
	        print("\n ", end="")
	    # Only the code point of each edge is emitted; the width is dropped.
	    c,_ = e
	    print("{:#x},".format(c), end="")
	print("\n };\n")

	# By default every code point has Grapheme_Cluster_Break=Other.
	all_code_points = ["Other"] * (1 + 0x10FFFF)

	# Extract Grapheme_Cluster_Break property for all code points.
	# The data file is read inside a with-block so its handle is closed as soon as
	# parsing finishes, and it is decoded as UTF-8 rather than with whatever
	# encoding the current locale happens to select.
	with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as data_file:
	    for line in data_file:
	        # Example (code points, property) pairs taken from matching lines:
	        # "0600..0605", "Prepend"
	        # "00AD", "Control"
	        line = line.split("#")[0]  # drop the trailing comment
	        if re.match(r'^[\dA-Fa-f][^;]+;', line):
	            code_points, grapheme_property = line.split(";")
	            process_code_points(code_points, grapheme_property.strip())

	edges = find_edges(all_code_points)

	# Number the properties in order of first appearance, with "Other" fixed at 0.
	gcb_props = {"Other":0}
	for c, p in edges:
	    if p not in gcb_props:
	        gcb_props[p] = len(gcb_props)
	# Number of low bits reserved for the property number in each packed entry.
	shift_bits = int(math.ceil(math.log2(len(gcb_props))))

	# Enum definition for std::__unicode::_Gcb_property

	print(" enum class _Gcb_property {")
	for name, number in gcb_props.items():
	    print(" _Gcb_{} = {},".format(name, number))
	print(" };\n")

	# Tables for std::__unicode::_Grapheme_cluster_state

	print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
	print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
	print(" // Entries are (code_point << shift_bits) + property.")
	print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
	print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
	for i, e in enumerate(edges):
	    # Six entries per output line.
	    if i % 6:
	        print(" ", end="")
	    else:
	        print("\n ", end="")
	    c, p = e
	    # Pack the code point above the property number.
	    x = (c << shift_bits) + gcb_props[p]
	    print("{0:#x},".format(x), end="")
	print("\n };\n")

	# By default every code point has Indic_Conjunct_Break=None.
	all_code_points = [None] * (1 + 0x10FFFF)

	# Extract Indic_Conjunct_Break property for all code points.
	# The data file is read inside a with-block so its handle is closed as soon as
	# parsing finishes, and it is decoded as UTF-8 rather than with whatever
	# encoding the current locale happens to select.
	with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as data_file:
	    for line in data_file:
	        # Example lines:
	        # 094D ; InCB; Linker
	        # 0B71 ; InCB; Consonant
	        # 0300..034E ; InCB; Extend
	        line = line.split("#")[0]  # drop the trailing comment
	        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
	            code_points, _, incb_property = line.split(";")
	            process_code_points(code_points, incb_property.strip())

	# Table for std::__unicode::__is_incb_linker
	# This table is tiny, so just contains the list of code points.
	print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
	for i in [i for i,p in enumerate(all_code_points) if p == "Linker"]:
	    print(" 0x{:04x},".format(i), end="")
	    # Linkers live in their own table, so clear them here to keep them out
	    # of the edge table built below.
	    all_code_points[i] = None
	print("\n };\n")

	edges = find_edges(all_code_points)

	# Property numbers packed into the low two bits of each edge entry; they
	# match the enumerators of the _InCB enum printed next.
	incb_props = {None:0, "Consonant":1, "Extend":2}
	print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
	# Table for std::__unicode::__incb_property
	print(" // Values generated by contrib/unicode/gen_std_format_width.py,")
	print(" // from DerivedCoreProperties.txt from the Unicode standard.")
	print(" // Entries are (code_point << 2) + property.")
	print(" inline constexpr uint32_t __incb_edges[] = {", end="")
	for i, e in enumerate(edges):
	    # Six entries per output line.
	    if i % 6:
	        print(" ", end="")
	    else:
	        print("\n ", end="")
	    c, p = e
	    x = (c << 2) + incb_props[p]
	    print("{0:#x},".format(x), end="")
	print("\n };\n")

	# By default every code point has Emoji=No.
	all_code_points = [False] * (1 + 0x10FFFF)

	# Extract Emoji=Extended_Pictographic for all code points.
	# The data file is read inside a with-block so its handle is closed as soon as
	# parsing finishes, and it is decoded as UTF-8 rather than with whatever
	# encoding the current locale happens to select.
	with open("emoji-data.txt", "r", encoding="utf-8") as data_file:
	    for line in data_file:
	        # Example lines:
	        # 1100..115F ; Extended_Pictographic
	        # 232A ; Extended_Pictographic
	        line = line.split("#")[0]  # drop the trailing comment
	        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
	            process_code_points(line.split(";")[0], True)

	# The initial value False means no edge is recorded for a leading run of
	# code points that lack the property.
	edges = find_edges(all_code_points, False)

	# Table for std::__unicode::__is_extended_pictographic
	print(" // Table generated by contrib/unicode/gen_std_format_width.py,")
	print(" // from emoji-data.txt from the Unicode standard.")
	print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
	for i, e in enumerate(edges):
	    # Eight entries per output line.
	    if i % 8:
	        print(" ", end="")
	    else:
	        print("\n ", end="")
	    # Only the code point of each edge is emitted.
	    c,_ = e
	    print("{:#x},".format(c), end="")
	print("\n };\n")

	# <bits/unicode.h> gives an error if this macro is left defined.
	# Do this last, so that the generated output is not usable unless we reach here.
	print("#undef _GLIBCXX_GET_UNICODE_DATA")