// blob: 6fcaed1ab6aa2ecc5f3d0846c9de086391e6eefa [file] [log] [blame]
// <text_encoding> -*- C++ -*-
// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file include/text_encoding
* This is a Standard C++ Library header.
*/
#ifndef _GLIBCXX_TEXT_ENCODING
#define _GLIBCXX_TEXT_ENCODING
#ifdef _GLIBCXX_SYSHDR
#pragma GCC system_header
#endif
#include <bits/requires_hosted.h>
#define __glibcxx_want_text_encoding
#include <bits/version.h>
#ifdef __cpp_lib_text_encoding
#include <compare>
#include <string_view>
#include <bits/functional_hash.h> // hash
#include <bits/ranges_util.h> // view_interface
#include <bits/unicode.h> // __charset_alias_match
#include <ext/numeric_traits.h> // __int_traits
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
/**
* @brief An interface for accessing the IANA Character Sets registry.
* @ingroup locales
* @since C++23
*/
struct text_encoding
{
private:
// One row of the encoding table: a numeric identifier together with one
// of the names (primary name or alias) listed for that identifier.
struct _Rep
{
  // Integer type of the identifiers; also the underlying type of enum id.
  using id = __INT_LEAST32_TYPE__;

  id _M_id;
  const char* _M_name;

  // Order a row against a bare identifier, which is the comparison the
  // std::lower_bound call in _S_find_id relies on.
  friend constexpr bool
  operator<(const _Rep& __entry, id __mib) noexcept
  { return __mib > __entry._M_id; }

  // Exact, character-for-character comparison of the row's name.
  friend constexpr bool
  operator==(const _Rep& __entry, string_view __wanted) noexcept
  { return string_view(__entry._M_name) == __wanted; }
};
public:
// Largest number of characters of a name that is stored in an object.
// The _M_name buffer has one extra element beyond this, and the
// constructors copy at most this many characters into it.
static constexpr size_t max_name_length = 63;
// Identifiers of the known character encodings.
//
// The underlying type is _Rep::id and the value of an enumerator is the
// number that mib() reports for that encoding (presumably the MIBenum
// value from the IANA registry named in the class documentation --
// confirm against the registry).
//
// Notes on the list itself:
// - `other` (1) and `unknown` (2) are the two identifiers whose entries
//   in _S_reps have an empty name.
// - Not every integer is used: for example 33, 34, 107 and 108 have no
//   enumerator.
// - Enumerators are not strictly in ascending numeric order; see
//   PC8CodePage437 (2011) and PCp852 (2010) among the IBM entries.
enum class id : _Rep::id
{
other = 1,
unknown = 2,
ASCII = 3,
ISOLatin1 = 4,
ISOLatin2 = 5,
ISOLatin3 = 6,
ISOLatin4 = 7,
ISOLatinCyrillic = 8,
ISOLatinArabic = 9,
ISOLatinGreek = 10,
ISOLatinHebrew = 11,
ISOLatin5 = 12,
ISOLatin6 = 13,
ISOTextComm = 14,
HalfWidthKatakana = 15,
JISEncoding = 16,
ShiftJIS = 17,
EUCPkdFmtJapanese = 18,
EUCFixWidJapanese = 19,
ISO4UnitedKingdom = 20,
ISO11SwedishForNames = 21,
ISO15Italian = 22,
ISO17Spanish = 23,
ISO21German = 24,
ISO60DanishNorwegian = 25,
ISO69French = 26,
ISO10646UTF1 = 27,
ISO646basic1983 = 28,
INVARIANT = 29,
ISO2IntlRefVersion = 30,
NATSSEFI = 31,
NATSSEFIADD = 32,
ISO10Swedish = 35,
KSC56011987 = 36,
ISO2022KR = 37,
EUCKR = 38,
ISO2022JP = 39,
ISO2022JP2 = 40,
ISO13JISC6220jp = 41,
ISO14JISC6220ro = 42,
ISO16Portuguese = 43,
ISO18Greek7Old = 44,
ISO19LatinGreek = 45,
ISO25French = 46,
ISO27LatinGreek1 = 47,
ISO5427Cyrillic = 48,
ISO42JISC62261978 = 49,
ISO47BSViewdata = 50,
ISO49INIS = 51,
ISO50INIS8 = 52,
ISO51INISCyrillic = 53,
ISO54271981 = 54,
ISO5428Greek = 55,
ISO57GB1988 = 56,
ISO58GB231280 = 57,
ISO61Norwegian2 = 58,
ISO70VideotexSupp1 = 59,
ISO84Portuguese2 = 60,
ISO85Spanish2 = 61,
ISO86Hungarian = 62,
ISO87JISX0208 = 63,
ISO88Greek7 = 64,
ISO89ASMO449 = 65,
ISO90 = 66,
ISO91JISC62291984a = 67,
ISO92JISC62991984b = 68,
ISO93JIS62291984badd = 69,
ISO94JIS62291984hand = 70,
ISO95JIS62291984handadd = 71,
ISO96JISC62291984kana = 72,
ISO2033 = 73,
ISO99NAPLPS = 74,
ISO102T617bit = 75,
ISO103T618bit = 76,
ISO111ECMACyrillic = 77,
ISO121Canadian1 = 78,
ISO122Canadian2 = 79,
ISO123CSAZ24341985gr = 80,
ISO88596E = 81,
ISO88596I = 82,
ISO128T101G2 = 83,
ISO88598E = 84,
ISO88598I = 85,
ISO139CSN369103 = 86,
ISO141JUSIB1002 = 87,
ISO143IECP271 = 88,
ISO146Serbian = 89,
ISO147Macedonian = 90,
ISO150 = 91,
ISO151Cuba = 92,
ISO6937Add = 93,
ISO153GOST1976874 = 94,
ISO8859Supp = 95,
ISO10367Box = 96,
ISO158Lap = 97,
ISO159JISX02121990 = 98,
ISO646Danish = 99,
USDK = 100,
DKUS = 101,
KSC5636 = 102,
Unicode11UTF7 = 103,
ISO2022CN = 104,
ISO2022CNEXT = 105,
UTF8 = 106,
ISO885913 = 109,
ISO885914 = 110,
ISO885915 = 111,
ISO885916 = 112,
GBK = 113,
GB18030 = 114,
OSDEBCDICDF0415 = 115,
OSDEBCDICDF03IRV = 116,
OSDEBCDICDF041 = 117,
ISO115481 = 118,
KZ1048 = 119,
UCS2 = 1000,
UCS4 = 1001,
UnicodeASCII = 1002,
UnicodeLatin1 = 1003,
UnicodeJapanese = 1004,
UnicodeIBM1261 = 1005,
UnicodeIBM1268 = 1006,
UnicodeIBM1276 = 1007,
UnicodeIBM1264 = 1008,
UnicodeIBM1265 = 1009,
Unicode11 = 1010,
SCSU = 1011,
UTF7 = 1012,
UTF16BE = 1013,
UTF16LE = 1014,
UTF16 = 1015,
CESU8 = 1016,
UTF32 = 1017,
UTF32BE = 1018,
UTF32LE = 1019,
BOCU1 = 1020,
UTF7IMAP = 1021,
Windows30Latin1 = 2000,
Windows31Latin1 = 2001,
Windows31Latin2 = 2002,
Windows31Latin5 = 2003,
HPRoman8 = 2004,
AdobeStandardEncoding = 2005,
VenturaUS = 2006,
VenturaInternational = 2007,
DECMCS = 2008,
PC850Multilingual = 2009,
PC8DanishNorwegian = 2012,
PC862LatinHebrew = 2013,
PC8Turkish = 2014,
IBMSymbols = 2015,
IBMThai = 2016,
HPLegal = 2017,
HPPiFont = 2018,
HPMath8 = 2019,
HPPSMath = 2020,
HPDesktop = 2021,
VenturaMath = 2022,
MicrosoftPublishing = 2023,
Windows31J = 2024,
GB2312 = 2025,
Big5 = 2026,
Macintosh = 2027,
IBM037 = 2028,
IBM038 = 2029,
IBM273 = 2030,
IBM274 = 2031,
IBM275 = 2032,
IBM277 = 2033,
IBM278 = 2034,
IBM280 = 2035,
IBM281 = 2036,
IBM284 = 2037,
IBM285 = 2038,
IBM290 = 2039,
IBM297 = 2040,
IBM420 = 2041,
IBM423 = 2042,
IBM424 = 2043,
// Out of numeric order: value 2011 is listed between 2043 and 2044.
PC8CodePage437 = 2011,
IBM500 = 2044,
IBM851 = 2045,
// Out of numeric order: value 2010 is listed between 2045 and 2046.
PCp852 = 2010,
IBM855 = 2046,
IBM857 = 2047,
IBM860 = 2048,
IBM861 = 2049,
IBM863 = 2050,
IBM864 = 2051,
IBM865 = 2052,
IBM868 = 2053,
IBM869 = 2054,
IBM870 = 2055,
IBM871 = 2056,
IBM880 = 2057,
IBM891 = 2058,
IBM903 = 2059,
IBM904 = 2060,
IBM905 = 2061,
IBM918 = 2062,
IBM1026 = 2063,
IBMEBCDICATDE = 2064,
EBCDICATDEA = 2065,
EBCDICCAFR = 2066,
EBCDICDKNO = 2067,
EBCDICDKNOA = 2068,
EBCDICFISE = 2069,
EBCDICFISEA = 2070,
EBCDICFR = 2071,
EBCDICIT = 2072,
EBCDICPT = 2073,
EBCDICES = 2074,
EBCDICESA = 2075,
EBCDICESS = 2076,
EBCDICUK = 2077,
EBCDICUS = 2078,
Unknown8BiT = 2079,
Mnemonic = 2080,
Mnem = 2081,
VISCII = 2082,
VIQR = 2083,
KOI8R = 2084,
HZGB2312 = 2085,
IBM866 = 2086,
PC775Baltic = 2087,
KOI8U = 2088,
IBM00858 = 2089,
IBM00924 = 2090,
IBM01140 = 2091,
IBM01141 = 2092,
IBM01142 = 2093,
IBM01143 = 2094,
IBM01144 = 2095,
IBM01145 = 2096,
IBM01146 = 2097,
IBM01147 = 2098,
IBM01148 = 2099,
IBM01149 = 2100,
Big5HKSCS = 2101,
IBM1047 = 2102,
PTCP154 = 2103,
Amiga1251 = 2104,
KOI7switched = 2105,
BRF = 2106,
TSCII = 2107,
CP51932 = 2108,
windows874 = 2109,
windows1250 = 2250,
windows1251 = 2251,
windows1252 = 2252,
windows1253 = 2253,
windows1254 = 2254,
windows1255 = 2255,
windows1256 = 2256,
windows1257 = 2257,
windows1258 = 2258,
TIS620 = 2259,
CP50220 = 2260
};
// Make every enumerator nameable as a member of text_encoding as well,
// e.g. text_encoding::UTF8 in addition to text_encoding::id::UTF8.
using enum id;
// Default constructor. Only the default member initializers apply, so
// the object refers to the id::unknown entry and its name is empty.
constexpr text_encoding() = default;
// Construct from an encoding name.
// The table is searched for __enc (see _S_find_name); when no entry
// matches, the object refers to the id::other entry.  At most
// max_name_length characters of __enc are stored.  The remainder of
// _M_name keeps its zero initializer, so the stored name stays
// null-terminated.
constexpr explicit
text_encoding(string_view __enc) noexcept
: _M_rep(_S_find_name(__enc))
{
  // substr(0, n) cannot throw: position 0 is valid for every view.
  const string_view __kept = __enc.substr(0, max_name_length);
  __kept.copy(_M_name, __kept.size());
}
// Construct from an identifier.
// @pre __i has the value of one of the enumerators of id.
// The name stored is the name of the table entry found for __i; for an
// entry with an empty name (other, unknown) nothing is copied and the
// stored name stays empty.
constexpr
text_encoding(id __i) noexcept
: _M_rep(_S_find_id(__i))
{
  const string_view __primary(_M_rep->_M_name);
  if (__primary.empty())
    return;
  __primary.copy(_M_name, max_name_length);
}
// The identifier of the table entry this object refers to.
constexpr id
mib() const noexcept
{ return static_cast<id>(_M_rep->_M_id); }
// Pointer to the name stored in this object (the _M_name buffer), so it
// is only valid while the object is alive. The buffer is
// zero-initialized, and empty unless a constructor copied a name in.
constexpr const char* name() const noexcept { return _M_name; }
// A view of the names listed in the table for one encoding.
// The view stores a single pointer to a table row (or null for an
// empty view). Only text_encoding can create a non-default instance.
struct aliases_view : ranges::view_interface<aliases_view>
{
private:
  class _Iterator;        // Defined out of line.
  struct _Sentinel { };   // Stateless end marker compared with _Iterator.

public:
  constexpr _Iterator begin() const noexcept;

  constexpr _Sentinel
  end() const noexcept
  { return _Sentinel{}; }

private:
  friend struct text_encoding;

  // __first is the row to start from, or null for an empty view.
  constexpr explicit
  aliases_view(const _Rep* __first) : _M_begin{__first} { }

  const _Rep* _M_begin = nullptr;
};
// The names listed for this encoding, starting at the entry this object
// refers to. An entry with an empty name (other, unknown) yields an
// empty view.
constexpr aliases_view
aliases() const noexcept
{
  const bool __nameless = _M_rep->_M_name[0] == '\0';
  if (__nameless)
    return aliases_view{nullptr};
  return aliases_view(_M_rep);
}
// Equality of two encodings.
// Identifiers decide the result, except when both sides are id::other:
// such objects are told apart by their stored names, compared with
// _S_comp.
friend constexpr bool
operator==(const text_encoding& __a,
	   const text_encoding& __b) noexcept
{
  const id __lhs = __a.mib();
  const id __rhs = __b.mib();
  if (__lhs != id::other || __rhs != id::other) [[likely]]
    return __lhs == __rhs;
  return _S_comp(__a._M_name, __b._M_name);
}
// Compare an encoding with a bare identifier: true when mib() equals
// __i. Unlike the two-object comparison, names are never consulted, so
// any id::other object compares equal to id::other.
friend constexpr bool
operator==(const text_encoding& __encoding, id __i) noexcept
{ return __encoding.mib() == __i; }
#if __CHAR_BIT__ == 8
// The encoding used for literals, evaluated at compile time.
// The name comes from a compiler-predefined macro: first choice is
// __GNUC_EXECUTION_CHARSET_NAME, second is __clang_literal_encoding__.
// When neither macro is defined the result is a default-constructed
// object, i.e. one that refers to the id::unknown entry.
static consteval text_encoding
literal() noexcept
{
#ifdef __GNUC_EXECUTION_CHARSET_NAME
return text_encoding(__GNUC_EXECUTION_CHARSET_NAME);
#elif defined __clang_literal_encoding__
return text_encoding(__clang_literal_encoding__);
#else
return text_encoding();
#endif
}
// Declaration only: the definition is not in this header.
// NOTE(review): presumably reports the encoding of the runtime
// environment (locale); confirm against the out-of-line definition.
static text_encoding
environment();
template<id _Id>
static bool
environment_is()
{ return text_encoding(_Id)._M_is_environment(); }
#else
// When __CHAR_BIT__ is not 8 these three functions are declared as
// deleted, so any use of them is a compile-time error.
static text_encoding literal() = delete;
static text_encoding environment() = delete;
template<id> static bool environment_is() = delete;
#endif
private:
// Table entry describing this encoding. The default is the second
// element of _S_reps, which is the nameless entry with identifier 2.
const _Rep* _M_rep = _S_reps + 1; // id::unknown
// The object's own copy of its name. One element larger than
// max_name_length and zero-initialized, so a name of up to
// max_name_length characters is always followed by a null character.
char _M_name[max_name_length + 1] = {0};
// Declaration only: the definition is not in this header. Called by
// environment_is<_Id>() on a temporary constructed from _Id.
// NOTE(review): presumably reports whether this encoding matches the
// environment's encoding; confirm against the out-of-line definition.
bool
_M_is_environment() const;
// The table of identifier/name rows that all lookups use.
//
// Layout:
// - element 0 is id::other and element 1 is id::unknown; both have an
//   empty name;
// - the named rows come from <bits/text_encoding-data.h>; the lookup
//   functions rely on rows being ordered by identifier, with all rows
//   for one identifier adjacent and the first of them taken as the
//   entry for that identifier;
// - the last element is a sentinel with identifier 9999 and a null name,
//   which the lookup functions exclude from their search ranges.
//
// _GLIBCXX_GET_ENCODING_DATA is defined to request the rows from the
// data header. The #error below fires if the macro is still defined
// after the include, so the data header is evidently expected to
// undefine it once it has produced the rows.
static inline constexpr _Rep _S_reps[] = {
{ 1, "" }, { 2, "" },
#define _GLIBCXX_GET_ENCODING_DATA
#include <bits/text_encoding-data.h>
#ifdef _GLIBCXX_GET_ENCODING_DATA
# error "Invalid text_encoding data"
#endif
{ 9999, nullptr }, // sentinel
};
// Name comparison used for table lookups and for comparing two
// id::other objects. Forwards to __unicode::__charset_alias_match.
// NOTE(review): the matching rules (which differences between names are
// ignored) are defined by that helper in <bits/unicode.h>, not here.
static constexpr bool
_S_comp(string_view __a, string_view __b)
{ return __unicode::__charset_alias_match(__a, __b); }
// Find the table entry for an encoding name.
// Returns the first row for the identifier whose name (or alias)
// matches __name according to _S_comp, or the id::other row (the first
// element of _S_reps) when nothing matches.
static constexpr const _Rep*
_S_find_name(string_view __name) noexcept
{
#ifdef _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET
  // Fast path for the common case: an exact "UTF-8" is answered
  // directly instead of running _S_comp over the rows that precede it.
  if (__name == "UTF-8")
    return _S_reps + 2 + _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET;
#endif

  // Named rows begin after the nameless other/unknown pair and end
  // before the sentinel, which can never match anything.
  const _Rep* const __named_begin = _S_reps + 2;
  const _Rep* const __named_end = std::end(_S_reps) - 1;

  const _Rep* __hit = __named_begin;
  while (__hit != __named_end && !_S_comp(__hit->_M_name, __name))
    ++__hit;

  if (__hit == __named_end)
    return _S_reps; // id::other

  // The match may be an alias. Step back to the first row that carries
  // the same identifier. The row for id::unknown has a different
  // identifier, so the walk stops before leaving the named rows.
  for (const auto __id = __hit->_M_id; __hit[-1]._M_id == __id;)
    --__hit;
  return __hit;
}
// Find the table entry for an identifier.
// A binary search over every row except the sentinel yields the first
// row whose identifier is not less than the requested one. If that row
// does not carry exactly the requested identifier, the precondition was
// violated: the assertion reports it, and otherwise the id::unknown row
// is returned.
static constexpr const _Rep*
_S_find_id(id __id) noexcept
{
  const auto __i = static_cast<_Rep::id>(__id);
  const auto __r = std::lower_bound(_S_reps, std::end(_S_reps) - 1, __i);
  if (__r->_M_id != __i) [[unlikely]]
    {
      // Preconditions: i has the value of one of the enumerators of id.
      __glibcxx_assert(__r->_M_id == __i);
      return _S_reps + 1; // id::unknown
    }
  return __r;
}
};
template<>
struct hash<text_encoding>
{
size_t
operator()(const text_encoding& __enc) const noexcept
{ return std::hash<text_encoding::id>()(__enc.mib()); }
};
// Iterator over the rows of _S_reps that share one identifier.
//
// The iterator remembers the identifier of the row it was created on
// (_M_id) and a current row pointer (_M_rep). It is dereferenceable
// while the current row still carries that identifier. A
// value-initialized iterator (null row, identifier 0) is the "invalid"
// state: every operation that would leave the rows of the remembered
// identifier asserts and, if execution continues, resets the iterator to
// that state.
class text_encoding::aliases_view::_Iterator
{
public:
  using value_type = const char*;
  using reference = const char*;
  using difference_type = int;

  constexpr _Iterator() = default;

  // The name in the current row, or "" if not dereferenceable.
  constexpr value_type
  operator*() const
  {
    if (!_M_dereferenceable()) [[unlikely]]
      {
	__glibcxx_assert(_M_dereferenceable());
	return "";
      }
    return _M_rep->_M_name;
  }

  // Advance by one row; only valid on a dereferenceable iterator.
  constexpr _Iterator&
  operator++()
  {
    if (!_M_dereferenceable()) [[unlikely]]
      {
	__glibcxx_assert(_M_dereferenceable());
	*this = _Iterator{};
	return *this;
      }
    ++_M_rep;
    return *this;
  }

  // Step back by one row; only valid if the preceding row carries the
  // remembered identifier.
  constexpr _Iterator&
  operator--()
  {
    const bool __decrementable
      = _M_rep != nullptr && _M_rep[-1]._M_id == _M_id;
    if (!__decrementable) [[unlikely]]
      {
	__glibcxx_assert(__decrementable);
	*this = _Iterator{};
	return *this;
      }
    --_M_rep;
    return *this;
  }

  constexpr _Iterator
  operator++(int)
  {
    _Iterator __before(*this);
    ++*this;
    return __before;
  }

  constexpr _Iterator
  operator--(int)
  {
    _Iterator __before(*this);
    --*this;
    return __before;
  }

  // The name __n rows away from the current one.
  constexpr value_type
  operator[](difference_type __n) const
  {
    _Iterator __target(*this);
    __target += __n;
    return *__target;
  }

  // Move by __n rows. A move is accepted only if it stays inside the
  // table and inside the rows of the remembered identifier:
  // - forwards, the row just before the destination must carry the
  //   identifier (the destination itself may be one past the last such
  //   row);
  // - backwards, the destination row must carry it.
  // Any other non-zero move resets the iterator and asserts.
  constexpr _Iterator&
  operator+=(difference_type __n)
  {
    if (_M_rep != nullptr && __n != 0)
      {
	const bool __reachable
	  = __n > 0
	    ? (__n < (std::end(_S_reps) - _M_rep)
		 && _M_rep[__n - 1]._M_id == _M_id)
	    : (__n > (_S_reps - _M_rep)
		 && _M_rep[__n]._M_id == _M_id);
	if (__reachable) [[likely]]
	  _M_rep += __n;
	else
	  *this = _Iterator{};
      }
    if (__n != 0)
      __glibcxx_assert(_M_rep != nullptr);
    return *this;
  }

  // Move by -__n rows. The most negative value cannot be negated, so it
  // is mapped to the largest positive step instead.
  constexpr _Iterator&
  operator-=(difference_type __n)
  {
    using _Traits = __gnu_cxx::__int_traits<difference_type>;
    const difference_type __step
      = (__n == _Traits::__min) ? _Traits::__max : -__n;
    return operator+=(__step);
  }

  // Distance in rows between two iterators with the same remembered
  // identifier. Otherwise asserts and yields the largest
  // difference_type value.
  constexpr difference_type
  operator-(const _Iterator& __i) const
  {
    if (_M_id != __i._M_id)
      {
	__glibcxx_assert(_M_id == __i._M_id);
	return __gnu_cxx::__int_traits<difference_type>::__max;
      }
    return _M_rep - __i._M_rep;
  }

  // Memberwise: same row and same remembered identifier.
  constexpr bool
  operator==(const _Iterator&) const = default;

  // An iterator equals the sentinel once it is no longer dereferenceable.
  constexpr bool
  operator==(_Sentinel) const noexcept
  { return _M_rep == nullptr || _M_rep->_M_id != _M_id; }

  // Orders by row position; both sides must share the identifier.
  constexpr strong_ordering
  operator<=>(const _Iterator& __i) const
  {
    __glibcxx_assert(_M_id == __i._M_id);
    return _M_rep <=> __i._M_rep;
  }

  friend constexpr _Iterator
  operator+(_Iterator __i, difference_type __n)
  { return __i += __n; }

  friend constexpr _Iterator
  operator+(difference_type __n, _Iterator __i)
  { return __i += __n; }

  friend constexpr _Iterator
  operator-(_Iterator __i, difference_type __n)
  { return __i -= __n; }

private:
  friend struct text_encoding;

  // Start at row __r and remember its identifier. A null __r gives the
  // same state as the default constructor.
  constexpr explicit
  _Iterator(const _Rep* __r) noexcept
  : _M_rep(__r), _M_id(__r != nullptr ? __r->_M_id : 0)
  { }

  // True while the current row exists and carries the remembered
  // identifier.
  constexpr bool
  _M_dereferenceable() const noexcept
  {
    if (_M_rep == nullptr)
      return false;
    return _M_rep->_M_id == _M_id;
  }

  const _Rep* _M_rep = nullptr;
  _Rep::id _M_id = 0;
};
// Out-of-line definition, placed here because _Iterator must be a
// complete type to be constructed and returned. The iterator starts at
// the row stored in the view; for a view holding a null pointer the
// iterator holds a null row and so compares equal to the sentinel.
constexpr auto
text_encoding::aliases_view::begin() const noexcept
-> _Iterator
{ return _Iterator(_M_begin); }
namespace ranges
{
// Opt-in to borrowed_range concept
// The iterators of an aliases_view hold a pointer to a table row and an
// identifier, not a reference to the view object, so they do not depend
// on the view they were obtained from staying alive.
template<>
inline constexpr bool
enable_borrowed_range<std::text_encoding::aliases_view> = true;
}
_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif // __cpp_lib_text_encoding
#endif // _GLIBCXX_TEXT_ENCODING