blob: b7b385542ef5367147e6d57ae99c83461ac1fafa [file]
//! Generates lookup tables (LUTs) for "case-mapping": mapping a `char` to its
//! uppercase or lowercase equivalent(s).
//!
//! # Lookup algorithm
//!
//! The LUT is split into two levels:
//!
//! The 1st level LUT is simply an array of `L2Lut`s. The correct `L2Lut` is
//! found by splitting the input `codepoint` into an `(input_high, input_low)`
//! pair (Unicode plane and lower 16 bits, respectively), and using `input_high`
//! as the index.
//!
//! The 2nd level LUT is further split into two separate tables:
//! - `singles`, for characters that map to a single codepoint.
//! - `multis`, for characters that map to 2 or 3 codepoints (eg `ß` -> `SS`).
//!
//! `singles` is a sorted slice of `(input_low, output_delta)` pairs. The correct
//! pair is found by binary search. The final output codepoint is reconstructed
//! with `input_high << 16 | (input_low + output_delta)`.
//!
//! `multis` is a sorted slice of `(input_low, [output_low1, output_low2,
//! output_low3])` pairs. The correct pair is found by binary search. The final
//! output codepoints are reconstructed with `input_high << 16 | output_low`.
//!
//! # Size optimizations
//!
//! Splitting the LUT into `L1Lut` and `L2Lut` means we do not need to store the
//! upper 16 bits of every entry in the `L2Lut`, since every character stays
//! within the same plane when case-mapped. This cuts the total size of the LUT
//! in half.
//!
//! Storing the delta between the input codepoint and output codepoint in
//! `singles` allows us to combine contiguous ranges which all map to codepoints
//! the same distance away. For example, all of the accented letters in Latin-1
//! that map to a single codepoint can be case-mapped by adding or subtracting
//! 32.
//!
//! We also combine adjacent entries which differ by exactly 2, and indicate
//! this by setting the `parity` bit in the `Range` struct. An input codepoint
//! belongs to the `Range` if `input_low & range.parity == range.start &
//! range.parity`. This allows us to combine even more entries, without
//! increasing the size of each entry.
use std::collections::BTreeMap;
use std::fmt;
use std::ops::RangeInclusive;
use crate::fmt_helpers::Hex;
use crate::{UnicodeData, fmt_list};
/// Renders the full case-mapping module: the fixed header (type definitions
/// plus lookup code) followed by the lowercase, uppercase, and titlecase
/// tables. Returns the generated source along with a `(description, size)`
/// pair for each of the three tables, in that order.
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [(String, usize); 3]) {
    let mut file = String::from("\n\n");
    file.push_str(HEADER.trim_start());
    file.push('\n');

    let mappings = [
        ("LOWER", &data.to_lower),
        ("UPPER", &data.to_upper),
        ("TITLE", &data.to_title),
    ];
    let mut stats = Vec::with_capacity(mappings.len());
    for (i, (case, mapping)) in mappings.into_iter().enumerate() {
        // Tables are separated (not terminated) by a blank line.
        if i != 0 {
            file.push_str("\n\n");
        }
        let (table, desc, size) = generate_tables(case, mapping);
        file.push_str(&table);
        stats.push((desc, size));
    }
    let stats: [(String, usize); 3] = stats.try_into().expect("exactly three case mappings");
    (file, stats)
}
// So far, only planes 0 and 1 (Basic Multilingual Plane and Supplementary
// Multilingual Plane, respectively) have case mappings. If that changes, increment this.
const NUM_PLANES: u16 = 2;

/// Split a codepoint into its plane (upper 16 bits) and position within the plane (lower 16 bits).
///
/// # Panics
///
/// Panics if the codepoint lies outside the `NUM_PLANES` planes that currently
/// have case mappings.
fn deconstruct(c: u32) -> (u16, u16) {
    let high = (c >> 16) as u16;
    // Plane indices are 0-based, so a valid plane is *strictly* less than
    // NUM_PLANES; `high == NUM_PLANES` would index out of bounds in
    // `L1Lut::l2_luts` (the original `<=` let that case through).
    assert!(high < NUM_PLANES);
    let low = c as u16;
    (high, low)
}
/// First-level lookup table: one `L2Lut` per Unicode plane that currently has
/// case mappings (see `NUM_PLANES`). The `Debug` impl (derived here, via the
/// `L2Lut` Debug impl below) renders the table as valid Rust source.
#[derive(Debug, Clone)]
struct L1Lut {
/// Keyed by bits 16..=31 of the code point. So far, only planes 0 and 1
/// have case mappings, so we don't need to bother storing the plane numbers.
l2_luts: [L2Lut; NUM_PLANES as usize],
}
impl Default for L1Lut {
    /// An `L1Lut` whose per-plane tables are all empty.
    fn default() -> Self {
        Self { l2_luts: std::array::from_fn(|_| L2Lut::default()) }
    }
}
impl L1Lut {
    /// Total size in bytes of the table data across all planes.
    fn size(&self) -> usize {
        let mut total = 0;
        for l2 in &self.l2_luts {
            total += l2.size();
        }
        total
    }
}
/// Second-level lookup table covering a single Unicode plane. Entries are
/// pushed in `BTreeMap` key order by `generate_tables`, so both slices are
/// sorted by input code point (required by the binary search in the generated
/// lookup code).
#[derive(Default, Clone)]
struct L2Lut {
/// Keyed by bits 0..=15 of the code point, see `Range` struct for how
/// membership is tested.
singles: Vec<(Range, i16)>,
/// Keyed by bits 0..=15 of the code point, value is the lower 16-bits of the 2 or 3 code points that the char expands to.
/// (For 2-codepoint expansions the last slot is presumably 0, mirroring the
/// `[output, 0, 0]` pattern used to detect singles — confirm against the data source.)
multis: Vec<(Hex<u16>, [Hex<u16>; 3])>,
}
/// A compact encoding of a `Range<u16>` in only 4 bytes.
/// If `parity` is false, the range represents code points `start..=start+len`,
/// with step 1.
/// If `parity` is true, the range represents code points `start..=start+len`,
/// with step 2.
/// Membership can be tested with `range.start() <= codepoint && codepoint <=
/// range.end() && (codepoint & parity == start & parity)`
/// (with `parity` widened to a `u16` mask of 0 or 1).
#[derive(Copy, Clone)]
struct Range {
/// First code point of the range (inclusive).
start: u16,
/// `end - start`, i.e. one less than the number of values spanned.
len: u8,
/// When set, only code points whose low bit matches `start`'s low bit
/// belong to the range (step-2 ranges).
parity: bool,
// 7 bits left for any more flags we may want in the future.
}
impl Range {
    /// Encodes `range`, stepping by 2 when `parity` is set.
    ///
    /// Panics if the range is reversed or spans more than 256 values.
    const fn new(range: RangeInclusive<u16>, parity: bool) -> Self {
        let lo = *range.start();
        let hi = *range.end();
        assert!(lo <= hi);
        // `len` is stored biased by -1, so it must fit in a byte.
        let span = hi - lo;
        assert!(span <= 255);
        Self { start: lo, len: span as u8, parity }
    }

    /// A range containing exactly one code point.
    const fn singleton(start: u16) -> Self {
        Self::new(start..=start, false)
    }

    /// Inclusive lower bound.
    const fn start(&self) -> u16 {
        self.start
    }

    /// Inclusive upper bound.
    const fn end(&self) -> u16 {
        self.len as u16 + self.start
    }

    /// Number of values spanned (ignoring parity); always at least 1.
    const fn len(&self) -> u16 {
        1 + self.len as u16
    }
}
impl fmt::Debug for Range {
    /// Renders the range as a call to one of the `Range` constructors that the
    /// generated file's header defines, so the table debug-prints as source.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.len() == 1 {
            write!(f, "Range::singleton({})", Hex(self.start))
        } else {
            let span = Hex(self.start)..=Hex(self.end());
            let ctor = if self.parity { "Range::step_by_2" } else { "Range::step_by_1" };
            write!(f, "{ctor}({span:?})")
        }
    }
}
impl L2Lut {
    /// Size in bytes of this plane's table data (`singles` plus `multis`).
    fn size(&self) -> usize {
        let singles_bytes = size_of_val(self.singles.as_slice());
        let multis_bytes = size_of_val(self.multis.as_slice());
        singles_bytes + multis_bytes
    }
}
impl fmt::Debug for L2Lut {
    /// Renders this table as an `L2Lut` struct literal whose slice fields each
    /// carry a comment stating their entry count and byte size.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let singles = fmt::from_fn(|f| {
            let entries = self.singles.as_slice();
            write!(
                f,
                "&[ // {} entries, {} bytes{}]",
                entries.len(),
                size_of_val(entries),
                fmt_list(entries)
            )
        });
        let multis = fmt::from_fn(|f| {
            let entries = self.multis.as_slice();
            write!(
                f,
                "&[ // {} entries, {} bytes{}]",
                entries.len(),
                size_of_val(entries),
                fmt_list(entries)
            )
        });
        f.debug_struct("L2Lut").field("singles", &singles).field("multis", &multis).finish()
    }
}
/// Builds the two-level LUT for one case mapping (`case` is the
/// `"LOWER"`/`"UPPER"`/`"TITLE"` prefix of the generated static's name).
///
/// `data` maps each input code point to the code point(s) it case-maps to.
/// Returns `(table_source, description, size)`: the Rust source for the
/// generated `static`, a one-line human-readable summary, and the table's
/// total size in bytes. Panics if `data` is empty or contains ASCII inputs.
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, String, usize) {
let mut l1_lut = L1Lut::default();
// Pass 1: record every mapping as either a singleton `singles` entry or a
// `multis` entry. BTreeMap iterates in key order, so both vectors end up
// sorted by input code point, as the generated binary search requires.
for (&input, &output) in data.iter() {
// ASCII is handled separately in the generated `to_*` functions, so it
// must never appear in the tables.
assert!(input > 0x7f);
let (input_high, input_low) = deconstruct(input);
let l2_lut = &mut l1_lut.l2_luts[input_high as usize];
match output {
[output, 0, 0] => {
let (output_high, output_low) = deconstruct(output);
assert_eq!(
output_high, input_high,
"Case-mapping a character should not change its plane"
);
// Store the distance to the output rather than the output itself
// so that runs of inputs with a common delta can be merged below.
let delta = output_low as i16 - input_low as i16;
let range = Range::singleton(input_low);
l2_lut.singles.push((range, delta));
}
_ => {
let output_lows = output.map(|output| {
let (output_high, output_low) = deconstruct(output);
assert_eq!(
output_high, input_high,
"Case-mapping a character should not change its plane"
);
Hex(output_low)
});
l2_lut.multis.push((Hex(input_low), output_lows));
}
}
}
// Combine adjacent contiguous ranges which apply the same delta.
// eg `(0x1234..=0x1234, +1), (0x1235..=0x1235, +1), (0x1236..=0x1236, +1)`
// becomes `(Range::step_by_1(0x1234..=0x1236), +1)`.
for l2_lut in &mut l1_lut.l2_luts {
l2_lut.singles = l2_lut
.singles
.chunk_by(|(left_range, left_delta), (right_range, right_delta)| {
// At this point every entry is still a singleton, so two
// consecutive entries are adjacent iff their starts differ by 1.
left_delta == right_delta && right_range.end() - left_range.start() == 1
})
.map(|chunk| {
// A chunk is a maximal run of adjacent same-delta singletons;
// collapse it into one step-1 range.
let &(first_range, _delta) = chunk.first().unwrap();
let &(last_range, delta) = chunk.last().unwrap();
(Range::new(first_range.start()..=last_range.end(), false), delta)
})
.collect();
}
// Combine adjacent ranges which differ by 2, since we can encode that with the parity bit in the Range struct.
// eg `(0x1234..=0x1234, +1), (0x1236..=0x1236, +1), (0x1238..=0x1238, +1)`
// becomes `(Range::step_by_2(0x1234..=0x1238), +1)`.
for l2_lut in &mut l1_lut.l2_luts {
l2_lut.singles = l2_lut
.singles
.chunk_by(|(left_range, left_delta), (right_range, right_delta)| {
// Only remaining singletons qualify: ranges already merged in
// pass 1 have `len() > 1` and are left untouched here.
left_delta == right_delta
&& left_range.len() == 1
&& right_range.len() == 1
&& right_range.end() - left_range.start() == 2
})
.map(|chunk| {
// Set the parity flag only when something was actually merged,
// so lone entries stay plain singletons.
let &(first_range, _delta) = chunk.first().unwrap();
let &(last_range, delta) = chunk.last().unwrap();
let parity = chunk.len() > 1;
(Range::new(first_range.start()..=last_range.end(), parity), delta)
})
.collect();
}
let size = l1_lut.size();
let num_ranges =
l1_lut.l2_luts.iter().map(|l2| l2.singles.len() + l2.multis.len()).sum::<usize>();
// The custom Debug impls above render the LUT as valid Rust source.
let table = format!("static {case}CASE_LUT: L1Lut = {l1_lut:#?};");
let desc = format!(
"{:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using 2-level LUT",
data.len(),
num_ranges,
// Panics on an empty map; every case mapping has at least one entry.
data.first_key_value().map(|(&k, _)| k).unwrap(),
data.last_key_value().map(|(&k, _)| k).unwrap(),
);
(table, desc, size)
}
/// Code emitted verbatim (after `trim_start`) at the top of the generated
/// file: the runtime LUT type definitions and the lookup algorithm itself.
/// The `crate::` paths indicate it targets the crate that embeds the output,
/// not this generator, so nothing inside this string may be altered without
/// changing the generated file.
static HEADER: &str = r"
use crate::ops::RangeInclusive;
struct L1Lut {
l2_luts: [L2Lut; 2],
}
struct L2Lut {
singles: &'static [(Range, i16)],
multis: &'static [(u16, [u16; 3])],
}
#[derive(Copy, Clone)]
struct Range {
start: u16,
len: u8,
parity: bool,
}
impl Range {
const fn new(range: RangeInclusive<u16>, parity: bool) -> Self {
let start = *range.start();
let end = *range.end();
assert!(start <= end);
let len = end - start;
assert!(len <= 255);
Self { start, len: len as u8, parity }
}
const fn singleton(start: u16) -> Self {
Self::new(start..=start, false)
}
const fn step_by_1(range: RangeInclusive<u16>) -> Self {
Self::new(range, false)
}
const fn step_by_2(range: RangeInclusive<u16>) -> Self {
Self::new(range, true)
}
const fn start(&self) -> u16 {
self.start
}
const fn end(&self) -> u16 {
self.start + self.len as u16
}
}
fn deconstruct(c: char) -> (u16, u16) {
let c = c as u32;
let plane = (c >> 16) as u16;
let low = c as u16;
(plane, low)
}
unsafe fn reconstruct(plane: u16, low: u16) -> char {
// SAFETY: The caller must ensure that the result is a valid `char`.
unsafe { char::from_u32_unchecked(((plane as u32) << 16) | (low as u32)) }
}
fn lookup(input: char, l1_lut: &L1Lut) -> Option<[char; 3]> {
let (input_high, input_low) = deconstruct(input);
let Some(l2_lut) = l1_lut.l2_luts.get(input_high as usize) else {
return None;
};
let idx = l2_lut.singles.binary_search_by(|(range, _)| {
use crate::cmp::Ordering;
if input_low < range.start() {
Ordering::Greater
} else if input_low > range.end() {
Ordering::Less
} else {
Ordering::Equal
}
});
if let Ok(idx) = idx {
// SAFETY: binary search guarantees that the index is in bounds.
let &(range, output_delta) = unsafe { l2_lut.singles.get_unchecked(idx) };
let mask = range.parity as u16;
if input_low & mask == range.start() & mask {
let output_low = input_low.wrapping_add_signed(output_delta);
// SAFETY: Table data are guaranteed to be valid Unicode.
let output = unsafe { reconstruct(input_high, output_low) };
return Some([output, '\0', '\0']);
}
};
if let Ok(idx) = l2_lut.multis.binary_search_by_key(&input_low, |&(p, _)| p) {
// SAFETY: binary search guarantees that the index is in bounds.
let &(_, output_lows) = unsafe { l2_lut.multis.get_unchecked(idx) };
// SAFETY: Table data are guaranteed to be valid Unicode.
let output = output_lows.map(|output_low| unsafe { reconstruct(input_high, output_low) });
return Some(output);
};
None
}
pub fn to_lower(c: char) -> [char; 3] {
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Lowercased%253A%5D-%5B%253AASCII%253A%5D&abb=on
if c < '\u{C0}' {
return [c.to_ascii_lowercase(), '\0', '\0'];
}
lookup(c, &LOWERCASE_LUT).unwrap_or([c, '\0', '\0'])
}
pub fn to_upper(c: char) -> [char; 3] {
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Uppercased%253A%5D-%5B%253AASCII%253A%5D&abb=on
if c < '\u{B5}' {
return [c.to_ascii_uppercase(), '\0', '\0'];
}
lookup(c, &UPPERCASE_LUT).unwrap_or([c, '\0', '\0'])
}
pub fn to_title(c: char) -> [char; 3] {
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Titlecased%253A%5D-%5B%253AASCII%253A%5D&abb=on
if c < '\u{B5}' {
return [c.to_ascii_uppercase(), '\0', '\0'];
}
lookup(c, &TITLECASE_LUT).or_else(|| lookup(c, &UPPERCASE_LUT)).unwrap_or([c, '\0', '\0'])
}
";