blob: b7b385542ef5367147e6d57ae99c83461ac1fafa [file]
//! Generates lookup tables (LUTs) for "case-mapping": mapping a `char` to its
//! uppercase or lowercase equivalent(s).
//!
//! # Lookup algorithm
//!
//! The LUT is split into two levels:
//!
//! The 1st level LUT is simply an array of `L2Lut`s. The correct `L2Lut` is
//! found by splitting the input `codepoint` into an `(input_high, input_low)`
//! pair (Unicode plane and lower 16 bits, respectively), and using `input_high`
//! as the index.
//!
//! The 2nd level LUT is further split into two separate tables:
//! - `singles`, for characters that map to a single codepoint.
//! - `multis`, for characters that map to 2 or 3 codepoints (eg `ß` -> `SS`).
//!
//! `singles` is a sorted slice of `(input_low, output_delta)` pairs. The correct
//! pair is found by binary search. The final output codepoint is reconstructed
//! with `input_high << 16 | (input_low + output_delta)`.
//!
//! `multis` is a sorted slice of `(input_low, [output_low1, output_low2,
//! output_low3])` pairs. The correct pair is found by binary search. The final
//! output codepoints are reconstructed with `input_high << 16 | output_low`.
//!
//! # Size optimizations
//!
//! Splitting the LUT into `L1Lut` and `L2Lut` means we do not need to store the
//! upper 16 bits of every entry in the `L2Lut`, since every character stays
//! within the same plane when case-mapped. This cuts the total size of the LUT
//! in half.
//!
//! Storing the delta between the input codepoint and output codepoint in
//! `singles` allows us to combine contiguous ranges which all map to codepoints
//! the same distance away. For example, all of the accented letters in Latin-1
//! that map to a single codepoint can be case-mapped by adding or subtracting
//! 32.
//!
//! We also combine adjacent entries which differ by exactly 2, and indicate
//! this by setting the `parity` bit in the `Range` struct. An input codepoint
//! belongs to the `Range` if `input_low & range.parity == range.start &
//! range.parity`. This allows us to combine even more entries, without
//! increasing the size of each entry.
use std::collections::BTreeMap;
use std::fmt;
use std::ops::RangeInclusive;
use crate::fmt_helpers::Hex;
use crate::{UnicodeData, fmt_list};
/// Renders the full case-mapping module: the fixed header (type definitions
/// plus lookup code) followed by the lowercase, uppercase, and titlecase
/// tables. Returns the generated source along with a `(description, size)`
/// pair for each of the three tables, in that order.
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [(String, usize); 3]) {
    let mut file = String::from("\n\n");
    file.push_str(HEADER.trim_start());
    file.push('\n');

    let mappings = [
        ("LOWER", &data.to_lower),
        ("UPPER", &data.to_upper),
        ("TITLE", &data.to_title),
    ];
    let mut stats = Vec::with_capacity(mappings.len());
    for (i, (case, mapping)) in mappings.into_iter().enumerate() {
        // Tables are separated (not terminated) by a blank line.
        if i != 0 {
            file.push_str("\n\n");
        }
        let (table, desc, size) = generate_tables(case, mapping);
        file.push_str(&table);
        stats.push((desc, size));
    }
    let stats: [(String, usize); 3] = stats.try_into().expect("exactly three case mappings");
    (file, stats)
}
// So far, only planes 0 and 1 (Basic Multilingual Plane and Supplementary
// Multilingual Plane, respectively) have case mappings. If that changes, increment this.
const NUM_PLANES: u16 = 2;

/// Split a codepoint into its plane (upper 16 bits) and position within the plane (lower 16 bits).
///
/// # Panics
///
/// Panics if the codepoint lies outside the `NUM_PLANES` planes that currently
/// have case mappings.
fn deconstruct(c: u32) -> (u16, u16) {
    let high = (c >> 16) as u16;
    // Plane indices are 0-based, so a valid plane is *strictly* less than
    // NUM_PLANES; `high == NUM_PLANES` would index out of bounds in
    // `L1Lut::l2_luts` (the original `<=` let that case through).
    assert!(high < NUM_PLANES);
    let low = c as u16;
    (high, low)
}
/// First-level lookup table: one `L2Lut` per Unicode plane that currently has
/// case mappings (see `NUM_PLANES`). The `Debug` impl (derived here, via the
/// `L2Lut` Debug impl below) renders the table as valid Rust source.
#[derive(Debug, Clone)]
struct L1Lut {
/// Keyed by bits 16..=31 of the code point. So far, only planes 0 and 1
/// have case mappings, so we don't need to bother storing the plane numbers.
l2_luts: [L2Lut; NUM_PLANES as usize],
}
impl Default for L1Lut {
    /// An `L1Lut` whose per-plane tables are all empty.
    fn default() -> Self {
        Self { l2_luts: std::array::from_fn(|_| L2Lut::default()) }
    }
}
impl L1Lut {
    /// Total size in bytes of the table data across all planes.
    fn size(&self) -> usize {
        let mut total = 0;
        for l2 in &self.l2_luts {
            total += l2.size();
        }
        total
    }
}
/// Second-level lookup table covering a single Unicode plane. Entries are
/// pushed in `BTreeMap` key order by `generate_tables`, so both slices are
/// sorted by input code point (required by the binary search in the generated
/// lookup code).
#[derive(Default, Clone)]
struct L2Lut {
/// Keyed by bits 0..=15 of the code point, see `Range` struct for how
/// membership is tested.
singles: Vec<(Range, i16)>,
/// Keyed by bits 0..=15 of the code point, value is the lower 16-bits of the 2 or 3 code points that the char expands to.
/// (For 2-codepoint expansions the last slot is presumably 0, mirroring the
/// `[output, 0, 0]` pattern used to detect singles — confirm against the data source.)
multis: Vec<(Hex<u16>, [Hex<u16>; 3])>,
}
/// A compact encoding of a `Range<u16>` in only 4 bytes.
/// If `parity` is false, the range represents code points `start..=start+len`,
/// with step 1.
/// If `parity` is true, the range represents code points `start..=start+len`,
/// with step 2.
/// Membership can be tested with `range.start() <= codepoint && codepoint <=
/// range.end() && (codepoint & parity == start & parity)`
/// (with `parity` widened to a `u16` mask of 0 or 1).
#[derive(Copy, Clone)]
struct Range {
/// First code point of the range (inclusive).
start: u16,
/// `end - start`, i.e. one less than the number of values spanned.
len: u8,
/// When set, only code points whose low bit matches `start`'s low bit
/// belong to the range (step-2 ranges).
parity: bool,
// 7 bits left for any more flags we may want in the future.
}
impl Range {
    /// Encodes `range`, stepping by 2 when `parity` is set.
    ///
    /// Panics if the range is reversed or spans more than 256 values.
    const fn new(range: RangeInclusive<u16>, parity: bool) -> Self {
        let lo = *range.start();
        let hi = *range.end();
        assert!(lo <= hi);
        // `len` is stored biased by -1, so it must fit in a byte.
        let span = hi - lo;
        assert!(span <= 255);
        Self { start: lo, len: span as u8, parity }
    }

    /// A range containing exactly one code point.
    const fn singleton(start: u16) -> Self {
        Self::new(start..=start, false)
    }

    /// Inclusive lower bound.
    const fn start(&self) -> u16 {
        self.start
    }

    /// Inclusive upper bound.
    const fn end(&self) -> u16 {
        self.len as u16 + self.start
    }

    /// Number of values spanned (ignoring parity); always at least 1.
    const fn len(&self) -> u16 {
        1 + self.len as u16
    }
}
impl fmt::Debug for Range {
    /// Renders the range as a call to one of the `Range` constructors that the
    /// generated file's header defines, so the table debug-prints as source.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.len() == 1 {
            write!(f, "Range::singleton({})", Hex(self.start))
        } else {
            let span = Hex(self.start)..=Hex(self.end());
            let ctor = if self.parity { "Range::step_by_2" } else { "Range::step_by_1" };
            write!(f, "{ctor}({span:?})")
        }
    }
}
impl L2Lut {
    /// Size in bytes of this plane's table data (`singles` plus `multis`).
    fn size(&self) -> usize {
        let singles_bytes = size_of_val(self.singles.as_slice());
        let multis_bytes = size_of_val(self.multis.as_slice());
        singles_bytes + multis_bytes
    }
}
impl fmt::Debug for L2Lut {
    /// Renders this table as an `L2Lut` struct literal whose slice fields each
    /// carry a comment stating their entry count and byte size.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let singles = fmt::from_fn(|f| {
            let entries = self.singles.as_slice();
            write!(
                f,
                "&[ // {} entries, {} bytes{}]",
                entries.len(),
                size_of_val(entries),
                fmt_list(entries)
            )
        });
        let multis = fmt::from_fn(|f| {
            let entries = self.multis.as_slice();
            write!(
                f,
                "&[ // {} entries, {} bytes{}]",
                entries.len(),
                size_of_val(entries),
                fmt_list(entries)
            )
        });
        f.debug_struct("L2Lut").field("singles", &singles).field("multis", &multis).finish()
    }
}
/// Builds the two-level LUT for one case mapping (`case` is the
/// `"LOWER"`/`"UPPER"`/`"TITLE"` prefix of the generated static's name).
///
/// `data` maps each input code point to the code point(s) it case-maps to.
/// Returns `(table_source, description, size)`: the Rust source for the
/// generated `static`, a one-line human-readable summary, and the table's
/// total size in bytes. Panics if `data` is empty or contains ASCII inputs.
fn generate_tables(case: &str, data: &BTreeMap<u32, [u32; 3]>) -> (String, String, usize) {
let mut l1_lut = L1Lut::default();
// Pass 1: record every mapping as either a singleton `singles` entry or a
// `multis` entry. BTreeMap iterates in key order, so both vectors end up
// sorted by input code point, as the generated binary search requires.
for (&input, &output) in data.iter() {
// ASCII is handled separately in the generated `to_*` functions, so it
// must never appear in the tables.
assert!(input > 0x7f);
let (input_high, input_low) = deconstruct(input);
let l2_lut = &mut l1_lut.l2_luts[input_high as usize];
match output {
[output, 0, 0] => {
let (output_high, output_low) = deconstruct(output);
assert_eq!(
output_high, input_high,
"Case-mapping a character should not change its plane"
);
// Store the distance to the output rather than the output itself
// so that runs of inputs with a common delta can be merged below.
let delta = output_low as i16 - input_low as i16;
let range = Range::singleton(input_low);
l2_lut.singles.push((range, delta));
}
_ => {
let output_lows = output.map(|output| {
let (output_high, output_low) = deconstruct(output);
assert_eq!(
output_high, input_high,
"Case-mapping a character should not change its plane"
);
Hex(output_low)
});
l2_lut.multis.push((Hex(input_low), output_lows));
}
}
}
// Combine adjacent contiguous ranges which apply the same delta.
// eg `(0x1234..=0x1234, +1), (0x1235..=0x1235, +1), (0x1236..=0x1236, +1)`
// becomes `(Range::step_by_1(0x1234..=0x1236), +1)`.
for l2_lut in &mut l1_lut.l2_luts {
l2_lut.singles = l2_lut
.singles
.chunk_by(|(left_range, left_delta), (right_range, right_delta)| {
// At this point every entry is still a singleton, so two
// consecutive entries are adjacent iff their starts differ by 1.
left_delta == right_delta && right_range.end() - left_range.start() == 1
})
.map(|chunk| {
// A chunk is a maximal run of adjacent same-delta singletons;
// collapse it into one step-1 range.
let &(first_range, _delta) = chunk.first().unwrap();
let &(last_range, delta) = chunk.last().unwrap();
(Range::new(first_range.start()..=last_range.end(), false), delta)
})
.collect();
}
// Combine adjacent ranges which differ by 2, since we can encode that with the parity bit in the Range struct.
// eg `(0x1234..=0x1234, +1), (0x1236..=0x1236, +1), (0x1238..=0x1238, +1)`
// becomes `(Range::step_by_2(0x1234..=0x1238), +1)`.
for l2_lut in &mut l1_lut.l2_luts {
l2_lut.singles = l2_lut
.singles
.chunk_by(|(left_range, left_delta), (right_range, right_delta)| {
// Only remaining singletons qualify: ranges already merged in
// pass 1 have `len() > 1` and are left untouched here.
left_delta == right_delta
&& left_range.len() == 1
&& right_range.len() == 1
&& right_range.end() - left_range.start() == 2
})
.map(|chunk| {
// Set the parity flag only when something was actually merged,
// so lone entries stay plain singletons.
let &(first_range, _delta) = chunk.first().unwrap();
let &(last_range, delta) = chunk.last().unwrap();
let parity = chunk.len() > 1;
(Range::new(first_range.start()..=last_range.end(), parity), delta)
})
.collect();
}
let size = l1_lut.size();
let num_ranges =
l1_lut.l2_luts.iter().map(|l2| l2.singles.len() + l2.multis.len()).sum::<usize>();
// The custom Debug impls above render the LUT as valid Rust source.
let table = format!("static {case}CASE_LUT: L1Lut = {l1_lut:#?};");
let desc = format!(
"{:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using 2-level LUT",
data.len(),
num_ranges,
// Panics on an empty map; every case mapping has at least one entry.
data.first_key_value().map(|(&k, _)| k).unwrap(),
data.last_key_value().map(|(&k, _)| k).unwrap(),
);
(table, desc, size)
}
/// Code emitted verbatim (after `trim_start`) at the top of the generated
/// file: the runtime LUT type definitions and the lookup algorithm itself.
/// The `crate::` paths indicate it targets the crate that embeds the output,
/// not this generator, so nothing inside this string may be altered without
/// changing the generated file.
static HEADER: &str = r"
use crate::ops::RangeInclusive;
struct L1Lut {
l2_luts: [L2Lut; 2],
}
struct L2Lut {
singles: &'static [(Range, i16)],
multis: &'static [(u16, [u16; 3])],
}
#[derive(Copy, Clone)]
struct Range {
start: u16,
len: u8,
parity: bool,
}
impl Range {
const fn new(range: RangeInclusive<u16>, parity: bool) -> Self {
let start = *range.start();
let end = *range.end();
assert!(start <= end);
let len = end - start;
assert!(len <= 255);
Self { start, len: len as u8, parity }
}
const fn singleton(start: u16) -> Self {
Self::new(start..=start, false)
}
const fn step_by_1(range: RangeInclusive<u16>) -> Self {
Self::new(range, false)
}
const fn step_by_2(range: RangeInclusive<u16>) -> Self {
Self::new(range, true)
}
const fn start(&self) -> u16 {
self.start
}
const fn end(&self) -> u16 {
self.start + self.len as u16
}
}
fn deconstruct(c: char) -> (u16, u16) {
let c = c as u32;
let plane = (c >> 16) as u16;
let low = c as u16;
(plane, low)
}
unsafe fn reconstruct(plane: u16, low: u16) -> char {
// SAFETY: The caller must ensure that the result is a valid `char`.
unsafe { char::from_u32_unchecked(((plane as u32) << 16) | (low as u32)) }
}
fn lookup(input: char, l1_lut: &L1Lut) -> Option<[char; 3]> {
let (input_high, input_low) = deconstruct(input);
let Some(l2_lut) = l1_lut.l2_luts.get(input_high as usize) else {
return None;
};
let idx = l2_lut.singles.binary_search_by(|(range, _)| {
use crate::cmp::Ordering;
if input_low < range.start() {
Ordering::Greater
} else if input_low > range.end() {
Ordering::Less
} else {
Ordering::Equal
}
});
if let Ok(idx) = idx {
// SAFETY: binary search guarantees that the index is in bounds.
let &(range, output_delta) = unsafe { l2_lut.singles.get_unchecked(idx) };
let mask = range.parity as u16;
if input_low & mask == range.start() & mask {
let output_low = input_low.wrapping_add_signed(output_delta);
// SAFETY: Table data are guaranteed to be valid Unicode.
let output = unsafe { reconstruct(input_high, output_low) };
return Some([output, '\0', '\0']);
}
};
if let Ok(idx) = l2_lut.multis.binary_search_by_key(&input_low, |&(p, _)| p) {
// SAFETY: binary search guarantees that the index is in bounds.
let &(_, output_lows) = unsafe { l2_lut.multis.get_unchecked(idx) };
// SAFETY: Table data are guaranteed to be valid Unicode.
let output = output_lows.map(|output_low| unsafe { reconstruct(input_high, output_low) });
return Some(output);
};
None
}
pub fn to_lower(c: char) -> [char; 3] {
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Lowercased%253A%5D-%5B%253AASCII%253A%5D&abb=on
if c < '\u{C0}' {
return [c.to_ascii_lowercase(), '\0', '\0'];
}
lookup(c, &LOWERCASE_LUT).unwrap_or([c, '\0', '\0'])
}
pub fn to_upper(c: char) -> [char; 3] {
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Uppercased%253A%5D-%5B%253AASCII%253A%5D&abb=on
if c < '\u{B5}' {
return [c.to_ascii_uppercase(), '\0', '\0'];
}
lookup(c, &UPPERCASE_LUT).unwrap_or([c, '\0', '\0'])
}
pub fn to_title(c: char) -> [char; 3] {
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%253AChanges_When_Titlecased%253A%5D-%5B%253AASCII%253A%5D&abb=on
if c < '\u{B5}' {
return [c.to_ascii_uppercase(), '\0', '\0'];
}
lookup(c, &TITLECASE_LUT).or_else(|| lookup(c, &UPPERCASE_LUT)).unwrap_or([c, '\0', '\0'])
}
";