contrib/check-internal-format-escaping.py - rust-lang/gcc - Git at Google

 #!/usr/bin/env python3
 #
 # Check gcc.pot file for stylistic issues as described in
 # https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
 # especially in gcc-internal-format messages.
 #
 # This file is part of GCC.
 #
 # GCC is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
 # Software Foundation; either version 3, or (at your option) any later
 # version.
 #
 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 # for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.

 import argparse
 import re
 from collections import Counter
 from typing import Dict, Match

 import polib

 # Tally of emitted diagnostics, keyed by the diagnostic text.  warn()
 # increments it and main() prints the summary from it.
 seen_warnings = Counter()


 def location(msg: polib.POEntry):
     """
     Returns the first source occurrence of msg formatted as
     "<file>:<line>", or '<unknown location>' when msg.occurrences
     is empty.
     """

     if not msg.occurrences:
         return '<unknown location>'

     first = msg.occurrences[0]
     return f'{first[0]}:{first[1]}'


 def warn(msg: polib.POEntry,
          diagnostic_id: str, diagnostic: str, include_msgid=True):
     """
     Prints one diagnostic for msg and counts it in seen_warnings.

     To suppress a warning for a particular message,
     add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
     Suppressed warnings are neither printed nor counted.

     If include_msgid is true, the repr of msg.msgid is appended to
     the printed line.
     """

     suppression_flag = f'gcclint:ignore:{diagnostic_id}'
     if suppression_flag in msg.flags:
         return

     seen_warnings[diagnostic] += 1

     where = location(msg)
     if include_msgid:
         line = f'{where}: {diagnostic} in {repr(msg.msgid)}'
     else:
         line = f'{where}: {diagnostic}'
     print(line)


 def lint_gcc_internal_format(msg: polib.POEntry):
     """
     Checks a single message that has the gcc-internal-format flag.

     These messages use a variety of placeholders like %qs, %<quotes%>
     and %q#E.  Each nested check inspects msg.msgid (and, for some
     checks, msg.msgstr) and reports its findings through warn();
     nothing is returned.
     """

     msgid: str = msg.msgid

     def outside_quotes(m: Match[str]):
         """
         Returns True if match m starts outside of %<quotes%>, that is,
         if the text before it contains as many %< as %>.
         """
         before = msgid[:m.start(0)]
         return before.count("%<") == before.count("%>")

     def lint_matching_placeholders():
         """
         Warns when literal values in placeholders are not exactly equal
         in the translation. This can happen when doing copy-and-paste
         translations of similar messages.

         To avoid these mismatches in the first place,
         structurally equal messages are found by
         lint_diagnostics_differing_only_in_placeholders.

         This check only applies when checking a finished translation
         such as de.po, not gcc.pot.
         """

         if not msg.translated():
             return

         quoted_literal = '%<[^%]+%>'
         in_msgid = re.findall(quoted_literal, msgid)
         in_msgstr = re.findall(quoted_literal, msg.msgstr)

         # Compared as sets, so order and repetition are ignored.
         if set(in_msgid) != set(in_msgstr):
             warn(msg,
                  'placeholder-mismatch',
                  f'placeholder mismatch: msgid has {in_msgid}, '
                  f'msgstr has {in_msgstr}',
                  include_msgid=False)

     def lint_option_outside_quotes():
         """
         Warns about whitespace-separated words outside %<quotes%> that
         look like a command line option ("-" followed by a letter) or
         start with "__builtin_".
         """

         for match in re.finditer(r'\S+', msgid):
             if not outside_quotes(match):
                 continue

             part = match.group()

             # The word '-INF' is explicitly exempt from the option check.
             if (part.startswith('-') and len(part) >= 2
                     and part[1].isalpha() and part != '-INF'):
                 warn(msg,
                      'option-outside-quotes',
                      'command line option outside %<quotes%>')

             if part.startswith('__builtin_'):
                 warn(msg,
                      'builtin-outside-quotes',
                      'builtin function outside %<quotes%>')

     def lint_plain_apostrophe():
         """
         Warns about every apostrophe outside %<quotes%> that is not
         directly preceded by %.
         """

         # The lookbehind keeps the match on the apostrophe itself.
         # Consuming the preceding character would miss an apostrophe at
         # the very start of the message, and would split an adjacent
         # %< or %> so that outside_quotes miscounts it.
         for match in re.finditer(r"(?<!%)'", msgid):
             if outside_quotes(match):
                 warn(msg, 'apostrophe', 'apostrophe without leading %')

     def lint_space_before_quote():
         """
         A missing space before %< is often the result of string
         literals that are joined by the C compiler and neither literal
         has a space to separate the words.
         """

         for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
             # A %s placeholder directly before %< is not reported.
             if match.group(1) != '%s':
                 warn(msg,
                      'no-space-before-quote',
                      '%< directly following a letter or digit')

     def lint_underscore_outside_quotes():
         """
         An underscore outside of quotes is used in several contexts,
         and many of them violate the GCC Guidelines for Diagnostics:

         * names of GCC-internal compiler functions
         * names of GCC-internal data structures
         * static_cast and the like (which are legitimate)

         At most one warning is issued per message.
         """

         for match in re.finditer("_", msgid):
             if outside_quotes(match):
                 warn(msg,
                      'underscore-outside-quotes',
                      'underscore outside of %<quotes%>')
                 return

     def lint_may_not():
         """
         The term "may not" may either mean "it could be the case"
         or "should not". These two different meanings are sometimes
         hard to tell apart.
         """

         if re.search(r'\bmay not\b', msgid):
             warn(msg,
                  'ambiguous-may-not',
                  'the term "may not" is ambiguous')

     def lint_unbalanced_quotes():
         """
         Warns when the number of %< differs from the number of %> in
         the msgid and, for translated messages, in the msgstr.
         """

         if msgid.count("%<") != msgid.count("%>"):
             warn(msg,
                  'unbalanced-quotes',
                  'unbalanced %< and %> quotes')

         if msg.translated():
             if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
                 warn(msg,
                      'unbalanced-quotes',
                      'unbalanced %< and %> quotes')

     def lint_single_space_after_sentence():
         """
         After a sentence there should be two spaces.
         """

         if re.search(r'[.] [A-Z]', msgid):
             warn(msg,
                  'single-space-after-sentence',
                  'single space after sentence')

     def lint_non_canonical_quotes():
         """
         Catches %<%s%>, '%s', "%s" and `%s', which can be written in
         the shorter form %qs.  Only the first such placeholder of a
         message is reported.
         """

         match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
         if match:
             warn(msg,
                  'non-canonical-quotes',
                  f'placeholder {match.group()} should be written as %qs')

     lint_option_outside_quotes()
     lint_plain_apostrophe()
     lint_space_before_quote()
     lint_underscore_outside_quotes()
     lint_may_not()
     lint_unbalanced_quotes()
     lint_matching_placeholders()
     lint_single_space_after_sentence()
     lint_non_canonical_quotes()


 def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
     """
     Reports messages whose text is identical once every plain string
     inside %<quotes%> has been replaced by %qs. Such messages can be
     merged in order to prevent copy-and-paste mistakes by the
     translators.

     The first message of each pattern is remembered under both its
     normalized form and its original msgid; every later message with
     an already known normalized form is reported together with the
     remembered one.

     See bug 90119.
     """

     first_by_pattern: Dict[str, polib.POEntry] = {}

     for entry in po:
         pattern = re.sub('%<[^%]+%>', '%qs', entry.msgid)

         if pattern in first_by_pattern:
             earlier = first_by_pattern[pattern]
             warn(entry,
                  'same-pattern',
                  f'same pattern for {repr(entry.msgid)} and '
                  f'{repr(earlier.msgid)} in {location(earlier)}',
                  include_msgid=False)
         else:
             first_by_pattern[pattern] = entry
             first_by_pattern[entry.msgid] = entry


 def lint_file(po: polib.POFile):
     """
     Runs the per-message checks on every entry of po that is neither
     obsolete nor fuzzy and carries the gcc-internal-format flag, then
     runs the cross-message pattern check on the whole file.
     """

     for entry in po:
         if entry.obsolete or entry.fuzzy:
             continue

         if 'gcc-internal-format' in entry.flags:
             lint_gcc_internal_format(entry)

     lint_diagnostics_differing_only_in_placeholders(po)


 def main():
     """
     Lints the po/pot file named on the command line and then prints
     a summary of all diagnostics that were issued more than once.
     """

     arg_parser = argparse.ArgumentParser(description='')
     arg_parser.add_argument('file', help='pot file')
     options = arg_parser.parse_args()

     lint_file(polib.pofile(options.file))

     print()
     print('summary:')
     for diagnostic, count in seen_warnings.most_common():
         if count > 1:
             print(f'{count}\t{diagnostic}')


 # Run the linter only when this file is executed as a script.
 if __name__ == '__main__':
     main()
	#!/usr/bin/env python3
	#
	# Check gcc.pot file for stylistic issues as described in
	# https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
	# especially in gcc-internal-format messages.
	#
	# This file is part of GCC.
	#
	# GCC is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 3, or (at your option) any later
	# version.
	#
	# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	# WARRANTY; without even the implied warranty of MERCHANTABILITY or
	# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	# for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with GCC; see the file COPYING3. If not see
	# <http://www.gnu.org/licenses/>.

	import argparse
	import re
	from collections import Counter
	from typing import Dict, Match

	import polib

	# Tally of emitted diagnostics, keyed by the diagnostic text.  warn()
	# increments it and main() prints the summary from it.
	seen_warnings = Counter()


	def location(msg: polib.POEntry):
	    """
	    Formats the first entry of msg.occurrences as "<file>:<line>".
	    Falls back to '<unknown location>' if there are no occurrences.
	    """

	    occurrences = msg.occurrences
	    if occurrences:
	        head = occurrences[0]
	        return f'{head[0]}:{head[1]}'
	    return '<unknown location>'


	def warn(msg: polib.POEntry,
	         diagnostic_id: str, diagnostic: str, include_msgid=True):
	    """
	    Reports a diagnostic for msg on standard output and records it
	    in seen_warnings.

	    To suppress a warning for a particular message,
	    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.

	    With include_msgid set, the repr of the msgid follows the
	    diagnostic text.
	    """

	    if f'gcclint:ignore:{diagnostic_id}' not in msg.flags:
	        seen_warnings[diagnostic] += 1

	        prefix = location(msg)
	        if include_msgid:
	            print(f'{prefix}: {diagnostic} in {repr(msg.msgid)}')
	        else:
	            print(f'{prefix}: {diagnostic}')


	def lint_gcc_internal_format(msg: polib.POEntry):
	    """
	    Checks a single message that has the gcc-internal-format flag.

	    These messages use a variety of placeholders like %qs,
	    %<quotes%> and %q#E.  All findings are reported through warn();
	    the function returns nothing.
	    """

	    msgid: str = msg.msgid

	    def outside_quotes(m: Match[str]):
	        """
	        Returns True if the text before the start of match m has
	        equally many %< and %>, i.e. m starts outside of quotes.
	        """
	        prefix = msgid[:m.start(0)]
	        return prefix.count("%<") == prefix.count("%>")

	    def lint_matching_placeholders():
	        """
	        Warns when literal values in placeholders are not exactly
	        equal in the translation. This can happen when doing
	        copy-and-paste translations of similar messages.

	        To avoid these mismatches in the first place,
	        structurally equal messages are found by
	        lint_diagnostics_differing_only_in_placeholders.

	        This check only applies when checking a finished translation
	        such as de.po, not gcc.pot.
	        """

	        if not msg.translated():
	            return

	        in_msgid = re.findall('%<[^%]+%>', msgid)
	        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

	        if set(in_msgid) != set(in_msgstr):
	            warn(msg,
	                 'placeholder-mismatch',
	                 f'placeholder mismatch: msgid has {in_msgid}, '
	                 f'msgstr has {in_msgstr}',
	                 include_msgid=False)

	    def lint_option_outside_quotes():
	        """
	        Warns about words outside %<quotes%> that start with "-" and
	        a letter (except '-INF') or that start with "__builtin_".
	        """

	        for match in re.finditer(r'\S+', msgid):
	            part = match.group()
	            if not outside_quotes(match):
	                continue

	            if part.startswith('-'):
	                if len(part) >= 2 and part[1].isalpha():
	                    # '-INF' is deliberately not treated as an option.
	                    if part == '-INF':
	                        continue

	                    warn(msg,
	                         'option-outside-quotes',
	                         'command line option outside %<quotes%>')

	            if part.startswith('__builtin_'):
	                warn(msg,
	                     'builtin-outside-quotes',
	                     'builtin function outside %<quotes%>')

	    def lint_plain_apostrophe():
	        """
	        Warns about each apostrophe that is outside %<quotes%> and
	        not directly preceded by %.
	        """

	        # Match only the apostrophe, using a lookbehind for the %.
	        # This also finds an apostrophe at the start of the message,
	        # and lets outside_quotes see a complete %< or %> directly
	        # in front of the apostrophe.
	        for match in re.finditer(r"(?<!%)'", msgid):
	            if outside_quotes(match):
	                warn(msg, 'apostrophe', 'apostrophe without leading %')

	    def lint_space_before_quote():
	        """
	        A missing space before %< is often the result of string
	        literals that are joined by the C compiler and neither
	        literal has a space to separate the words.
	        """

	        for match in re.finditer("(.?[a-zA-Z0-9])%<", msgid):
	            # "%s%<" is accepted; anything else ending in a letter
	            # or digit directly before %< is reported.
	            if match.group(1) != '%s':
	                warn(msg,
	                     'no-space-before-quote',
	                     '%< directly following a letter or digit')

	    def lint_underscore_outside_quotes():
	        """
	        An underscore outside of quotes is used in several contexts,
	        and many of them violate the GCC Guidelines for Diagnostics:

	        * names of GCC-internal compiler functions
	        * names of GCC-internal data structures
	        * static_cast and the like (which are legitimate)

	        Only the first such underscore of a message is reported.
	        """

	        for match in re.finditer("_", msgid):
	            if outside_quotes(match):
	                warn(msg,
	                     'underscore-outside-quotes',
	                     'underscore outside of %<quotes%>')
	                return

	    def lint_may_not():
	        """
	        The term "may not" may either mean "it could be the case"
	        or "should not". These two different meanings are sometimes
	        hard to tell apart.
	        """

	        if re.search(r'\bmay not\b', msgid):
	            warn(msg,
	                 'ambiguous-may-not',
	                 'the term "may not" is ambiguous')

	    def lint_unbalanced_quotes():
	        """
	        Warns when %< and %> do not occur equally often in the msgid
	        or, for translated messages, in the msgstr.
	        """

	        if msgid.count("%<") != msgid.count("%>"):
	            warn(msg,
	                 'unbalanced-quotes',
	                 'unbalanced %< and %> quotes')

	        if msg.translated():
	            if msg.msgstr.count("%<") != msg.msgstr.count("%>"):
	                warn(msg,
	                     'unbalanced-quotes',
	                     'unbalanced %< and %> quotes')

	    def lint_single_space_after_sentence():
	        """
	        After a sentence there should be two spaces.
	        """

	        if re.search(r'[.] [A-Z]', msgid):
	            warn(msg,
	                 'single-space-after-sentence',
	                 'single space after sentence')

	    def lint_non_canonical_quotes():
	        """
	        Catches %<%s%>, '%s', "%s" and `%s', which can be written in
	        the shorter form %qs.
	        """

	        # The bars must stay unescaped so that they separate the four
	        # alternatives; an escaped bar would match a literal "|".
	        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
	        if match:
	            warn(msg,
	                 'non-canonical-quotes',
	                 f'placeholder {match.group()} should be written as %qs')

	    lint_option_outside_quotes()
	    lint_plain_apostrophe()
	    lint_space_before_quote()
	    lint_underscore_outside_quotes()
	    lint_may_not()
	    lint_unbalanced_quotes()
	    lint_matching_placeholders()
	    lint_single_space_after_sentence()
	    lint_non_canonical_quotes()


	def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
	    """
	    Finds messages that share one structure and differ only in the
	    plain strings inside %<quotes%>. These messages can be merged in
	    order to prevent copy-and-paste mistakes by the translators.

	    Each msgid is normalized by replacing its %<...%> parts with
	    %qs. A message whose normalized form was seen before is reported
	    together with the message that introduced it.

	    See bug 90119.
	    """

	    known: Dict[str, polib.POEntry] = {}

	    for current in po:
	        text = current.msgid
	        key = re.sub('%<[^%]+%>', '%qs', text)

	        if key not in known:
	            # Remember the entry under the normalized and the raw text.
	            known[key] = current
	            known[text] = current
	            continue

	        original = known[key]
	        report = (f'same pattern for {repr(text)} and '
	                  f'{repr(original.msgid)} in {location(original)}')
	        warn(current, 'same-pattern', report, include_msgid=False)


	def lint_file(po: polib.POFile):
	    """
	    Lints every entry of po that is not obsolete, not fuzzy and has
	    the gcc-internal-format flag, and afterwards checks the whole
	    file for messages differing only in their placeholders.
	    """

	    for entry in po:
	        active = not (entry.obsolete or entry.fuzzy)
	        if active and 'gcc-internal-format' in entry.flags:
	            lint_gcc_internal_format(entry)

	    lint_diagnostics_differing_only_in_placeholders(po)


	def main():
	    """
	    Parses the command line, lints the given file and prints a
	    summary listing each diagnostic that occurred more than once.
	    """

	    parser = argparse.ArgumentParser(description='')
	    parser.add_argument('file', help='pot file')
	    path = parser.parse_args().file

	    lint_file(polib.pofile(path))

	    print()
	    print('summary:')
	    repeated = [(text, count)
	                for text, count in seen_warnings.most_common()
	                if count > 1]
	    for text, count in repeated:
	        print(f'{count}\t{text}')


	# Run the linter only when this file is executed as a script.
	if __name__ == '__main__':
	main()