libstdc++-v3/scripts/gen_text_encoding_data.py - rust-lang/gcc - Git at Google

 #!/usr/bin/env python3
 #
 # Script to generate tables for libstdc++ std::text_encoding.
 #
 # This file is part of GCC.
 #
 # GCC is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
 # Software Foundation; either version 3, or (at your option) any later
 # version.
 #
 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 # for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.

 # To update the Libstdc++ static data in <bits/text_encoding-data.h> download
 # the latest:
 # https://www.iana.org/assignments/character-sets/character-sets-1.csv
 # Then run this script and save the output to
 # include/bits/text_encoding-data.h

 import sys
 import csv
 import os

 # The path to the IANA character sets CSV is the only accepted argument;
 # anything else is reported as a usage error on stderr with exit status 1.
 if len(sys.argv) != 2:
     usage = "Usage: %s <character sets csv>" % sys.argv[0]
     sys.stderr.write(usage + "\n")
     raise SystemExit(1)

 # Base name of this script, quoted in the "Generated by" banner.
 self = os.path.basename(__file__)
 banner = "// Generated by scripts/{}, do not edit.".format(self)
 print(banner)
 # License text and Doxygen @file comment, written out verbatim.
 print("""

 // Copyright The GNU Toolchain Authors.
 //
 // This file is part of the GNU ISO C++ Library.  This library is free
 // software; you can redistribute it and/or modify it under the
 // terms of the GNU General Public License as published by the
 // Free Software Foundation; either version 3, or (at your option)
 // any later version.

 // This library is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.

 // Under Section 7 of GPL version 3, you are granted additional
 // permissions described in the GCC Runtime Library Exception, version
 // 3.1, as published by the Free Software Foundation.

 // You should have received a copy of the GNU General Public License and
 // a copy of the GCC Runtime Library Exception along with this program;
 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 // <http://www.gnu.org/licenses/>.

 /** @file bits/text_encoding-data.h
  *  This is an internal header file, included by other library headers.
  *  Do not attempt to use it directly. @headername{text_encoding}
  */
 """)
 # The generated header emits a preprocessor #error unless the includer has
 # defined _GLIBCXX_GET_ENCODING_DATA.  The last line keeps its embedded
 # newline so that a blank line follows the #endif.
 for guard_line in (
     "#ifndef _GLIBCXX_GET_ENCODING_DATA",
     '# error "This is not a public header, do not include it directly"',
     "#endif\n",
 ):
     print(guard_line)

 # We need to generate a list of initializers of the form { mib, alias }, e.g.,
 # { 3, "US-ASCII" },
 # { 3, "ISO646-US" },
 # { 3, "csASCII" },
 # { 4, "ISO_8859-1:1987" },
 # { 4, "latin1" },
 # The initializers must be sorted by the mib value. The first entry for
 # a given mib must be the primary name for the encoding. Any aliases for
 # the encoding come after the primary name.
 # We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
 # offset into the list of the mib=106, alias="UTF-8" entry. This is used
 # to optimize the common case, so we don't need to search for "UTF-8".

 # Maps each MIBenum value (int) to its list of names, primary name first.
 charsets = {}
 # Decode as UTF-8 explicitly so that parsing does not depend on the locale
 # of the machine running the script.  newline='' is what the csv module
 # asks for when it is handed a file object.
 with open(sys.argv[1], newline='', encoding='utf-8') as f:
     reader = csv.reader(f)
     next(reader) # skip header row
     for row in reader:
         if not row:
             # csv.reader yields an empty list for a blank line; there is
             # nothing to record, and indexing it below would fail.
             continue
         mib = int(row[2])
         if mib in charsets:
             raise ValueError("Multiple rows for mibEnum={}".format(mib))
         name = row[1]
         # row[5] is treated as a whitespace-separated list of aliases.
         aliases = row[5].split()
         # Ensure primary name comes first (and is not listed again after it).
         if name in aliases:
             aliases.remove(name)
         charsets[mib] = [name] + aliases

 # The C++ standard specifies that "NATS-DANO" and "NATS-DANO-ADD" are left
 # out, so drop MIBenum 33 and 34 (tolerating their absence from the CSV).
 for excluded_mib in (33, 34):
     charsets.pop(excluded_mib, None)

 # "ASCII" is not an official IANA alias, but it is included in the
 # implementation-defined superset of aliases for US-ASCII (MIBenum 3).
 # See also LWG 4043.
 extra_aliases = {3: ["ASCII"]}

 # Number of initializers written so far, i.e. the index of the next one.
 count = 0
 # MIBenum keys are unique, so sorting the items orders them by key alone.
 for mib, official_names in sorted(charsets.items()):
     if official_names[0] == "UTF-8":
         # The primary "UTF-8" entry is about to be written at index `count`.
         print("#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET {}".format(count))
     for entry in official_names:
         print('  {{ {:4}, "{}" }},'.format(mib, entry))
     count += len(official_names)
     # Non-IANA aliases follow the registered names for the same MIBenum.
     extensions = extra_aliases.get(mib, [])
     for entry in extensions:
         print('  {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, entry))
     count += len(extensions)

 # <text_encoding> gives an error if this macro is left defined.
 # Do this last, so that the generated output is not usable unless we reach here.
 # Final line of output: a blank line followed by the #undef.
 sys.stdout.write("\n#undef _GLIBCXX_GET_ENCODING_DATA" + "\n")
	#!/usr/bin/env python3
	#
	# Script to generate tables for libstdc++ std::text_encoding.
	#
	# This file is part of GCC.
	#
	# GCC is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 3, or (at your option) any later
	# version.
	#
	# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	# WARRANTY; without even the implied warranty of MERCHANTABILITY or
	# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	# for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with GCC; see the file COPYING3. If not see
	# <http://www.gnu.org/licenses/>.

	# To update the Libstdc++ static data in <bits/text_encoding-data.h> download
	# the latest:
	# https://www.iana.org/assignments/character-sets/character-sets-1.csv
	# Then run this script and save the output to
	# include/bits/text_encoding-data.h

	import sys
	import csv
	import os

	# The path to the IANA character sets CSV is the only accepted argument.
	# The usage message and the exit belong to the body of the `if`, so they
	# are indented under it; otherwise the script would always exit here.
	if len(sys.argv) != 2:
	    print("Usage: %s <character sets csv>" % sys.argv[0], file=sys.stderr)
	    sys.exit(1)

	# Base name of this script, quoted in the "Generated by" banner.
	self = os.path.basename(__file__)
	banner = "// Generated by scripts/{}, do not edit.".format(self)
	print(banner)
	# License text and Doxygen @file comment, written out verbatim.
	print("""

	// Copyright The GNU Toolchain Authors.
	//
	// This file is part of the GNU ISO C++ Library. This library is free
	// software; you can redistribute it and/or modify it under the
	// terms of the GNU General Public License as published by the
	// Free Software Foundation; either version 3, or (at your option)
	// any later version.

	// This library is distributed in the hope that it will be useful,
	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	// GNU General Public License for more details.

	// Under Section 7 of GPL version 3, you are granted additional
	// permissions described in the GCC Runtime Library Exception, version
	// 3.1, as published by the Free Software Foundation.

	// You should have received a copy of the GNU General Public License and
	// a copy of the GCC Runtime Library Exception along with this program;
	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
	// <http://www.gnu.org/licenses/>.

	/** @file bits/text_encoding-data.h
	* This is an internal header file, included by other library headers.
	* Do not attempt to use it directly. @headername{text_encoding}
	*/
	""")
	# The generated header emits a preprocessor #error unless the includer has
	# defined _GLIBCXX_GET_ENCODING_DATA.  The last line keeps its embedded
	# newline so that a blank line follows the #endif.
	for guard_line in (
	    "#ifndef _GLIBCXX_GET_ENCODING_DATA",
	    '# error "This is not a public header, do not include it directly"',
	    "#endif\n",
	):
	    print(guard_line)

	# We need to generate a list of initializers of the form { mib, alias }, e.g.,
	# { 3, "US-ASCII" },
	# { 3, "ISO646-US" },
	# { 3, "csASCII" },
	# { 4, "ISO_8859-1:1987" },
	# { 4, "latin1" },
	# The initializers must be sorted by the mib value. The first entry for
	# a given mib must be the primary name for the encoding. Any aliases for
	# the encoding come after the primary name.
	# We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
	# offset into the list of the mib=106, alias="UTF-8" entry. This is used
	# to optimize the common case, so we don't need to search for "UTF-8".

	# Maps each MIBenum value (int) to its list of names, primary name first.
	charsets = {}
	# Decode as UTF-8 explicitly so that parsing does not depend on the locale
	# of the machine running the script.  newline='' is what the csv module
	# asks for when it is handed a file object.
	with open(sys.argv[1], newline='', encoding='utf-8') as f:
	    reader = csv.reader(f)
	    next(reader) # skip header row
	    for row in reader:
	        if not row:
	            # csv.reader yields an empty list for a blank line; there is
	            # nothing to record, and indexing it below would fail.
	            continue
	        mib = int(row[2])
	        if mib in charsets:
	            raise ValueError("Multiple rows for mibEnum={}".format(mib))
	        name = row[1]
	        # row[5] is treated as a whitespace-separated list of aliases.
	        aliases = row[5].split()
	        # Ensure primary name comes first (and is not listed again after it).
	        if name in aliases:
	            aliases.remove(name)
	        charsets[mib] = [name] + aliases

	# The C++ standard specifies that "NATS-DANO" and "NATS-DANO-ADD" are left
	# out, so drop MIBenum 33 and 34 (tolerating their absence from the CSV).
	for excluded_mib in (33, 34):
	    charsets.pop(excluded_mib, None)

	# "ASCII" is not an official IANA alias, but it is included in the
	# implementation-defined superset of aliases for US-ASCII (MIBenum 3).
	# See also LWG 4043.
	extra_aliases = {3: ["ASCII"]}

	# Number of initializers written so far, i.e. the index of the next one.
	count = 0
	# Walk the encodings in increasing MIBenum order, as the table requires.
	for mib in sorted(charsets.keys()):
	    names = charsets[mib]
	    if names[0] == "UTF-8":
	        # The primary "UTF-8" entry is about to be written at index `count`.
	        print("#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET {}".format(count))
	    for name in names:
	        print(' {{ {:4}, "{}" }},'.format(mib, name))
	    count += len(names)
	    # Non-IANA aliases follow the registered names for the same MIBenum
	    # and are counted too, so later offsets stay correct.
	    if mib in extra_aliases:
	        names = extra_aliases[mib]
	        for name in names:
	            print(' {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, name))
	        count += len(names)

	# <text_encoding> gives an error if this macro is left defined.
	# Do this last, so that the generated output is not usable unless we reach here.
	# Final line of output: a blank line followed by the #undef.
	sys.stdout.write("\n#undef _GLIBCXX_GET_ENCODING_DATA" + "\n")