| //! This implements the core logic of the compression scheme used to compactly |
| //! encode Unicode properties. |
| //! |
| //! We have two primary goals with the encoding: we want to be compact, because |
| //! these tables often end up in ~every Rust program (especially the |
| //! grapheme_extend table, used for str debugging), including those for embedded |
| //! targets (where space is important). We also want to be relatively fast, |
//! though this is more of a nice-to-have than a key design constraint.
| //! It is expected that libraries/applications which are performance-sensitive |
| //! to Unicode property lookups are extremely rare, and those that care may find |
| //! the tradeoff of the raw bitsets worth it. For most applications, a |
| //! relatively fast but much smaller (and as such less cache-impacting, etc.) |
| //! data set is likely preferable. |
| //! |
| //! We have two separate encoding schemes: a skiplist-like approach, and a |
| //! compressed bitset. The datasets we consider mostly use the skiplist (it's |
| //! smaller) but the lowercase and uppercase sets are sufficiently sparse for |
| //! the bitset to be worthwhile -- for those sets the bitset is a 2x size win. |
| //! Since the bitset is also faster, this seems an obvious choice. (As a |
| //! historical note, the bitset was also the prior implementation, so its |
| //! relative complexity had already been paid). |
| //! |
| //! ## The bitset |
| //! |
//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
//! one bit per codepoint from 0 to 0x10FFFF -- roughly 136 kilobytes of data
//! per character set, way too much for our purposes.
| //! |
//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
//! not skipping the small 'gap') is chunked into 64-bit words (`u64`) and
//! deduplicated. On random data, this would be useless; on our data, it is
//! incredibly beneficial -- our data sets have (far) fewer than 256 unique
//! words.
| //! |
//! This gives us an array that maps `u8 -> word`; the current algorithm does
//! not handle the case of more than 256 unique words, but our data sets are
//! nowhere near that limit.
| //! |
| //! With that scheme, we now have a single byte for every 64 codepoints. |
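//!
//! A minimal sketch of the flattening and deduplication (the names here are
//! illustrative, not the actual generator code, which lives in `raw_emitter`):
//!
//! ```ignore (illustrative sketch, not the real generator code)
//! /// Flatten sorted, disjoint codepoint ranges into 64-bit words, then
//! /// deduplicate the words into a canonical list plus per-word byte indices.
//! fn dedup_words(ranges: &[std::ops::Range<u32>]) -> (Vec<u64>, Vec<u8>) {
//!     let mut words = vec![0u64; 0x110000 / 64];
//!     for range in ranges {
//!         for cp in range.clone() {
//!             words[(cp / 64) as usize] |= 1 << (cp % 64);
//!         }
//!     }
//!     let mut unique: Vec<u64> = Vec::new();
//!     let indices = words
//!         .iter()
//!         .map(|w| {
//!             let idx = unique.iter().position(|u| u == w).unwrap_or_else(|| {
//!                 unique.push(*w);
//!                 unique.len() - 1
//!             });
//!             u8::try_from(idx).expect("more than 256 unique words")
//!         })
//!         .collect();
//!     (unique, indices)
//! }
//! ```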
| //! |
//! We further chunk these bytes into groups of some constant N (between 1 and
//! 64 per group, dynamically chosen for the smallest total size), and again
//! deduplicate and store them in an array (`u8 -> [u8; N]`).
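//!
//! A sketch of this chunking step, reusing the per-word byte indices from the
//! sketch above (again with illustrative names):
//!
//! ```ignore (illustrative sketch, not the real generator code)
//! /// Group the per-word byte indices into chunks of N and deduplicate those.
//! fn dedup_chunks<const N: usize>(indices: &[u8]) -> (Vec<[u8; N]>, Vec<u8>) {
//!     let mut unique: Vec<[u8; N]> = Vec::new();
//!     let chunk_ids = indices
//!         .chunks(N)
//!         .map(|c| {
//!             let mut chunk = [0u8; N]; // a short final chunk is zero-padded
//!             chunk[..c.len()].copy_from_slice(c);
//!             let idx = unique.iter().position(|u| *u == chunk).unwrap_or_else(|| {
//!                 unique.push(chunk);
//!                 unique.len() - 1
//!             });
//!             u8::try_from(idx).expect("more than 256 unique chunks")
//!         })
//!         .collect();
//!     (unique, chunk_ids)
//! }
//! ```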
| //! |
| //! The bytes of this array map into the words from the bitset above, but we |
| //! apply another trick here: some of these words are similar enough that they |
| //! can be represented by some function of another word. The particular |
| //! functions chosen are rotation, inversion, and shifting (right). |
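//!
//! At lookup time, a mapped word is reconstructed from its canonical word.
//! The sketch below assumes one plausible bit layout for the mapping byte
//! (low 6 bits for the rotate/shift amount, bit 6 selecting inversion, bit 7
//! selecting shift instead of rotate); see `range_search.rs` for the actual
//! decoding:
//!
//! ```ignore (illustrative sketch, not the real generated code)
//! fn decode_word(canonical: u64, mapping: u8) -> u64 {
//!     let mut word = canonical;
//!     if mapping & (1 << 6) != 0 {
//!         word = !word; // inversion
//!     }
//!     let amount = u32::from(mapping & 0b11_1111);
//!     if mapping & (1 << 7) != 0 {
//!         word >> amount // shift (right)
//!     } else {
//!         word.rotate_left(amount) // rotation
//!     }
//! }
//! ```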
| //! |
| //! ## The skiplist |
| //! |
//! The skiplist arose out of the desire for an even smaller encoding than the
//! bitset, and was the answer to the question "what is the smallest
//! representation we can imagine?". However, it is not necessarily the
//! smallest, and if you have a better proposal, please do suggest it!
| //! |
//! This is a relatively straightforward encoding. First, we break up all the
//! ranges in the input data into offsets from each other, essentially a gap
//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
//! store those directly. The larger gaps, which are already nicely
//! interspersed throughout the data, are used to index into it.
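//!
//! A sketch of the gap encoding (illustrative names; the real code lives in
//! the `skiplist` module):
//!
//! ```ignore (illustrative sketch, not the real generator code)
//! /// Encode sorted, disjoint ranges as the gaps between consecutive range
//! /// boundaries (each range contributes its start and its end).
//! fn gap_encode(ranges: &[std::ops::Range<u32>]) -> Vec<u32> {
//!     let mut prev = 0;
//!     let mut offsets = Vec::new();
//!     for r in ranges {
//!         offsets.push(r.start - prev); // gap before the range starts
//!         offsets.push(r.end - r.start); // length of the range
//!         prev = r.end;
//!     }
//!     offsets
//! }
//! ```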
| //! |
//! In particular, each run of small gaps (terminating in a large gap) is
//! indexed in a separate data set. That data set stores an index into the
//! primary offset list and a prefix sum of that offset list. These are packed
//! into a single u32 (11 bits for the index, 21 bits for the prefix sum).
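//!
//! Packing one such index entry might look like this (a sketch matching the
//! layout described above, with the index in the high bits):
//!
//! ```ignore (illustrative sketch, not the real generator code)
//! fn pack_header(offset_index: u32, prefix_sum: u32) -> u32 {
//!     assert!(offset_index < (1 << 11), "index must fit in 11 bits");
//!     assert!(prefix_sum < (1 << 21), "prefix sum must fit in 21 bits");
//!     (offset_index << 21) | prefix_sum
//! }
//! ```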
| //! |
//! Lookup proceeds via a binary search in the index and then a straightforward
//! linear scan (adding up the offsets) until we reach the needle; whether the
//! index of the offset we stop at is even or odd tells us whether the needle
//! is in the set.
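//!
//! A sketch of the scan step over the raw offsets from `gap_encode` above,
//! eliding the binary search (the real code first locates the right run via
//! the packed index and scans only within it):
//!
//! ```ignore (illustrative sketch, not the real generated code)
//! fn in_set(needle: u32, offsets: &[u32]) -> bool {
//!     let mut pos = 0u32;
//!     let mut idx = 0usize;
//!     for &gap in offsets {
//!         pos += gap;
//!         if pos > needle {
//!             break;
//!         }
//!         idx += 1;
//!     }
//!     // Offsets alternate between "gap before a range" and "length of a
//!     // range", so stopping at an odd index means the needle fell inside
//!     // a range.
//!     idx % 2 == 1
//! }
//! ```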
| |
| use std::collections::{BTreeMap, HashMap}; |
| use std::fmt; |
| use std::fmt::Write; |
| use std::ops::Range; |
| |
| use ucd_parse::Codepoints; |
| |
| mod cascading_map; |
| mod case_mapping; |
| mod raw_emitter; |
| mod skiplist; |
| mod unicode_download; |
| |
| use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace}; |
| |
/// The properties for which we generate lookup tables. `N` is not a Unicode
/// property name; we use it to group the `Nd`, `Nl`, and `No` general
/// categories (see `load_data`).
static PROPERTIES: &[&str] = &[
| "Alphabetic", |
| "Lowercase", |
| "Uppercase", |
| "Cased", |
| "Case_Ignorable", |
| "Grapheme_Extend", |
| "White_Space", |
| "N", |
| ]; |
| |
struct UnicodeData {
    /// Codepoint ranges (ASCII excluded) for each property in `PROPERTIES`.
    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
    /// Uppercase mappings: up to three codepoints, zero-padded.
    to_upper: BTreeMap<u32, [u32; 3]>,
    /// Lowercase mappings: up to three codepoints, zero-padded.
    to_lower: BTreeMap<u32, [u32; 3]>,
}
| |
/// Collapse a mapping to (up to three) codepoints into a zero-padded
/// `[u32; 3]`, or `None` if the mapping contains `origin` itself (identity
/// mappings are not stored).
fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<[u32; 3]> {
| let mut a = None; |
| let mut b = None; |
| let mut c = None; |
| |
| for codepoint in codepoints { |
| if origin == codepoint.value() { |
| return None; |
| } |
| |
| if a.is_none() { |
| a = Some(codepoint.value()); |
| } else if b.is_none() { |
| b = Some(codepoint.value()); |
| } else if c.is_none() { |
| c = Some(codepoint.value()); |
| } else { |
| panic!("more than 3 mapped codepoints") |
| } |
| } |
| |
| Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) |
| } |
| |
| static UNICODE_DIRECTORY: &str = "unicode-downloads"; |
| |
/// Fetch (if needed) and parse the Unicode Character Database, returning the
/// codepoint ranges for each property in `PROPERTIES` along with the case
/// mappings.
fn load_data() -> UnicodeData {
| unicode_download::fetch_latest(); |
| |
| let mut properties = HashMap::new(); |
| for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() { |
| if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { |
| properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); |
| } |
| } |
| for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() { |
| if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { |
| properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); |
| } |
| } |
| |
| let mut to_lower = BTreeMap::new(); |
| let mut to_upper = BTreeMap::new(); |
| for row in ucd_parse::UnicodeDataExpander::new( |
| ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), |
| ) { |
| let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) { |
| "N" |
| } else { |
| row.general_category.as_str() |
| }; |
| if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) { |
| properties |
| .entry(*name) |
| .or_insert_with(Vec::new) |
| .push(Codepoints::Single(row.codepoint)); |
| } |
| |
| if let Some(mapped) = row.simple_lowercase_mapping |
| && mapped != row.codepoint |
| { |
| to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]); |
| } |
| if let Some(mapped) = row.simple_uppercase_mapping |
| && mapped != row.codepoint |
| { |
| to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); |
| } |
| } |
| |
| for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { |
| if !row.conditions.is_empty() { |
| // Skip conditional case mappings |
| continue; |
| } |
| |
| let key = row.codepoint.value(); |
| if let Some(lower) = to_mapping(key, row.lowercase) { |
| to_lower.insert(key, lower); |
| } |
| if let Some(upper) = to_mapping(key, row.uppercase) { |
| to_upper.insert(key, upper); |
| } |
| } |
| |
| let mut properties: Vec<(&'static str, Vec<Range<u32>>)> = properties |
| .into_iter() |
| .map(|(prop, codepoints)| { |
| let codepoints = codepoints |
| .into_iter() |
| .flatten() |
| .flat_map(|cp| cp.scalar()) |
| .filter(|c| !c.is_ascii()) |
| .map(u32::from) |
| .collect::<Vec<_>>(); |
| (prop, ranges_from_set(&codepoints)) |
| }) |
| .collect(); |
| |
| properties.sort_by_key(|p| p.0); |
| UnicodeData { ranges: properties, to_lower, to_upper } |
| } |
| |
| fn main() { |
| let write_location = std::env::args().nth(1).unwrap_or_else(|| { |
| eprintln!("Must provide path to write unicode tables to"); |
| eprintln!( |
| "e.g. {} library/core/src/unicode/unicode_data.rs", |
| std::env::args().next().unwrap_or_default() |
| ); |
| std::process::exit(1); |
| }); |
| |
| // Optional test path, which is a Rust source file testing that the unicode |
| // property lookups are correct. |
| let test_path = std::env::args().nth(2); |
| |
| let unicode_data = load_data(); |
| let ranges_by_property = &unicode_data.ranges; |
| |
| if let Some(path) = test_path { |
| std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap(); |
| } |
| |
| let mut table_file = String::new(); |
| table_file.push_str( |
| "//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", |
| ); |
| |
| let mut total_bytes = 0; |
| let mut modules = Vec::new(); |
| for (property, ranges) in ranges_by_property { |
| let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>(); |
| |
| let mut emitter = RawEmitter::new(); |
| if property == &"White_Space" { |
| emit_whitespace(&mut emitter, ranges); |
| } else { |
| emit_codepoints(&mut emitter, ranges); |
| } |
| |
| modules.push((property.to_lowercase().to_string(), emitter.file)); |
| table_file.push_str(&format!( |
| "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", |
| property, |
| emitter.bytes_used, |
| datapoints, |
| ranges.len(), |
| ranges.first().unwrap().start, |
| ranges.last().unwrap().end, |
| emitter.desc, |
| )); |
| total_bytes += emitter.bytes_used; |
| } |
| let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); |
| for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) { |
| table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size)); |
| total_bytes += size; |
| } |
| table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); |
| |
| // Include the range search function |
| table_file.push('\n'); |
| table_file.push_str(include_str!("range_search.rs")); |
| table_file.push('\n'); |
| |
| table_file.push_str(&version()); |
| |
| table_file.push('\n'); |
| |
| modules.push((String::from("conversions"), conversions)); |
| |
| for (name, contents) in modules { |
| table_file.push_str("#[rustfmt::skip]\n"); |
| table_file.push_str(&format!("pub mod {name} {{\n")); |
| for line in contents.lines() { |
| if !line.trim().is_empty() { |
| table_file.push_str(" "); |
| table_file.push_str(line); |
| } |
| table_file.push('\n'); |
| } |
| table_file.push_str("}\n\n"); |
| } |
| |
| std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); |
| } |
| |
/// Emit the `UNICODE_VERSION` constant, with the version parsed from the
/// downloaded `ReadMe.txt`.
fn version() -> String {
| let mut out = String::new(); |
| out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = "); |
| |
| let readme = |
| std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt")) |
| .unwrap(); |
| |
| let prefix = "for Version "; |
| let start = readme.find(prefix).unwrap() + prefix.len(); |
| let end = readme.find(" of the Unicode Standard.").unwrap(); |
| let version = |
| readme[start..end].split('.').map(|v| v.parse::<u32>().expect(v)).collect::<Vec<_>>(); |
| let [major, minor, micro] = [version[0], version[1], version[2]]; |
| |
| out.push_str(&format!("({major}, {minor}, {micro});\n")); |
| out |
| } |
| |
/// Format the values as a comma-separated list, broken into lines of roughly
/// 100 characters.
fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
| let pieces = values.into_iter().map(|b| format!("{b:?}, ")).collect::<Vec<_>>(); |
| let mut out = String::new(); |
| let mut line = String::from("\n "); |
| for piece in pieces { |
| if line.len() + piece.len() < 98 { |
| line.push_str(&piece); |
| } else { |
| out.push_str(line.trim_end()); |
| out.push('\n'); |
| line = format!(" {piece}"); |
| } |
| } |
| out.push_str(line.trim_end()); |
| out.push('\n'); |
| out |
| } |
| |
/// Generate a test program that exhaustively checks the generated tables
/// against the source data, for every non-ASCII codepoint.
fn generate_tests(data: &UnicodeData) -> Result<String, fmt::Error> {
| let mut s = String::new(); |
| writeln!(s, "#![feature(core_intrinsics)]")?; |
| writeln!(s, "#![allow(internal_features, dead_code)]")?; |
| writeln!(s, "// ignore-tidy-filelength")?; |
| writeln!(s, "use std::intrinsics;")?; |
| writeln!(s, "mod unicode_data;")?; |
| writeln!(s, "fn main() {{")?; |
| for (property, ranges) in &data.ranges { |
| let prop = property.to_lowercase(); |
| writeln!(s, r#" println!("Testing {prop}");"#)?; |
| writeln!(s, " {prop}_true();")?; |
| writeln!(s, " {prop}_false();")?; |
| let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX) |
| .filter(|c| !c.is_ascii()) |
| .map(u32::from) |
| .partition(|c| ranges.iter().any(|r| r.contains(c))); |
| |
| writeln!(s, " fn {prop}_true() {{")?; |
| generate_asserts(&mut s, &prop, &is_true, true)?; |
| writeln!(s, " }}")?; |
| |
| writeln!(s, " fn {prop}_false() {{")?; |
| generate_asserts(&mut s, &prop, &is_false, false)?; |
| writeln!(s, " }}")?; |
| } |
| |
| for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper]) |
| { |
| writeln!(s, r#" println!("Testing {name}");"#)?; |
| for (c, mapping) in conversion { |
| let c = char::from_u32(*c).unwrap(); |
| let mapping = mapping.map(|c| char::from_u32(c).unwrap()); |
| writeln!( |
| s, |
| r#" assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});"# |
| )?; |
| } |
| let unmapped: Vec<_> = (char::MIN..=char::MAX) |
| .filter(|c| !c.is_ascii()) |
| .map(u32::from) |
| .filter(|c| !conversion.contains_key(c)) |
| .collect(); |
| let unmapped_ranges = ranges_from_set(&unmapped); |
| for range in unmapped_ranges { |
| let start = char::from_u32(range.start).unwrap(); |
| let end = char::from_u32(range.end - 1).unwrap(); |
| writeln!(s, " for c in {start:?}..={end:?} {{")?; |
| writeln!( |
| s, |
| r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"# |
| )?; |
| |
| writeln!(s, " }}")?; |
| } |
| } |
| |
| writeln!(s, "}}")?; |
| Ok(s) |
| } |
| |
/// Emit `assert!`s (grouped into ranges where possible) checking that lookup
/// of every codepoint in `points` returns `truthy`.
fn generate_asserts(
| s: &mut String, |
| prop: &str, |
| points: &[u32], |
| truthy: bool, |
| ) -> Result<(), fmt::Error> { |
| let truthy = if truthy { "" } else { "!" }; |
| for range in ranges_from_set(points) { |
| let start = char::from_u32(range.start).unwrap(); |
| let end = char::from_u32(range.end - 1).unwrap(); |
| match range.len() { |
| 1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?, |
| _ => { |
| writeln!(s, " for c in {start:?}..={end:?} {{")?; |
| writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?; |
| writeln!(s, " }}")?; |
| } |
| } |
| } |
| Ok(()) |
| } |
| |
/// Group the sorted elements of `set` into contiguous ranges.
| fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> { |
| set.chunk_by(|a, b| a + 1 == *b) |
| .map(|chunk| { |
| let start = *chunk.first().unwrap(); |
| let end = *chunk.last().unwrap(); |
| start..(end + 1) |
| }) |
| .collect() |
| } |