src/tools/unicode-table-generator/src/main.rs - rust - Git at Google

 //! This implements the core logic of the compression scheme used to compactly
 //! encode Unicode properties.
 //!
 //! We have two primary goals with the encoding: we want to be compact, because
 //! these tables often end up in ~every Rust program (especially the
 //! grapheme_extend table, used for str debugging), including those for embedded
 //! targets (where space is important). We also want to be relatively fast,
 //! though this is more of a nice to have rather than a key design constraint.
 //! It is expected that libraries/applications which are performance-sensitive
 //! to Unicode property lookups are extremely rare, and those that care may find
 //! the tradeoff of the raw bitsets worth it. For most applications, a
 //! relatively fast but much smaller (and as such less cache-impacting, etc.)
 //! data set is likely preferable.
 //!
 //! We have two separate encoding schemes: a skiplist-like approach, and a
 //! compressed bitset. The datasets we consider mostly use the skiplist (it's
 //! smaller) but the lowercase and uppercase sets are sufficiently sparse for
 //! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
 //! Since the bitset is also faster, this seems an obvious choice. (As a
 //! historical note, the bitset was also the prior implementation, so its
 //! relative complexity had already been paid).
 //!
 //! ## The bitset
 //!
 //! The primary idea is that we 'flatten' the Unicode ranges into an enormous
 //! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
 //! over 17 kilobytes of data per character set -- way too much for our
 //! purposes.
 //!
 //! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
 //! not skipping the small 'gap') is associated into words (u64) and
 //! deduplicated. On random data, this would be useless; on our data, this is
 //! incredibly beneficial -- our data sets have (far) less than 256 unique
 //! words.
 //!
 //! This gives us an array that maps `u8 -> word`; the current algorithm does
 //! not handle the case of more than 256 unique words, but we are relatively far
 //! from coming that close.
 //!
 //! With that scheme, we now have a single byte for every 64 codepoints.
 //!
 //! We further chunk these by some constant N (between 1 and 64 per group,
 //! dynamically chosen for smallest size), and again deduplicate and store in an
 //! array (u8 -> [u8; N]).
 //!
 //! The bytes of this array map into the words from the bitset above, but we
 //! apply another trick here: some of these words are similar enough that they
 //! can be represented by some function of another word. The particular
 //! functions chosen are rotation, inversion, and shifting (right).
 //!
 //! ## The skiplist
 //!
 //! The skip list arose out of the desire for an even smaller encoding than the
 //! bitset -- and was the answer to the question "what is the smallest
 //! representation we can imagine?". However, it is not necessarily the
 //! smallest, and if you have a better proposal, please do suggest it!
 //!
 //! This is a relatively straightforward encoding. First, we break up all the
 //! ranges in the input data into offsets from each other, essentially a gap
 //! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
 //! store those directly. We make use of the larger gaps (which are nicely
 //! interspersed already) throughout the dataset to index this data set.
 //!
 //! In particular, each run of small gaps (terminating in a large gap) is
 //! indexed in a separate dataset. That data set stores an index into the
 //! primary offset list and a prefix sum of that offset list. These are packed
 //! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
 //!
 //! Lookup proceeds via a binary search in the index and then a straightforward
 //! linear scan (adding up the offsets) until we reach the needle, and then the
 //! index of that offset is utilized as the answer to whether we're in the set
 //! or not.

 use std::collections::{BTreeMap, HashMap};
 use std::fmt::Write;
 use std::ops::Range;

 use ucd_parse::Codepoints;

 mod cascading_map;
 mod case_mapping;
 mod fmt_helpers;
 mod raw_emitter;
 mod skiplist;
 mod unicode_download;

 use fmt_helpers::CharEscape;
 use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace};

 static PROPERTIES: &[&str] = &[
     "Alphabetic",
     "Lowercase",
     "Uppercase",
     "Case_Ignorable",
     "Grapheme_Extend",
     "White_Space",
     "N",
     "Lt",
 ];

 struct UnicodeData {
     ranges: Vec<(&'static str, Vec<Range<u32>>)>,
     /// Only stores mappings that are not to self
     to_upper: BTreeMap<u32, [u32; 3]>,
     /// Only stores mappings that differ from `to_upper`
     to_title: BTreeMap<u32, [u32; 3]>,
     /// Only stores mappings that are not to self
     to_lower: BTreeMap<u32, [u32; 3]>,
 }

 fn to_mapping(
     if_different_from: &[ucd_parse::Codepoint],
     codepoints: &[ucd_parse::Codepoint],
 ) -> Option<[u32; 3]> {
     if codepoints == if_different_from {
         return None;
     }

     let mut ret = [ucd_parse::Codepoint::default(); 3];
     ret[0..codepoints.len()].copy_from_slice(codepoints);
     Some(ret.map(ucd_parse::Codepoint::value))
 }

 static UNICODE_DIRECTORY: &str = "unicode-downloads";

 fn load_data() -> UnicodeData {
     unicode_download::fetch_latest();

     let mut properties = HashMap::new();
     for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
             properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
         }
     }
     for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
             properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
         }
     }

     let [mut to_lower, mut to_upper, mut to_title] = [const { BTreeMap::new() }; 3];
     for row in ucd_parse::UnicodeDataExpander::new(
         ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
     ) {
         let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
             "N"
         } else {
             row.general_category.as_str()
         };
         if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
             properties
                 .entry(*name)
                 .or_insert_with(Vec::new)
                 .push(Codepoints::Single(row.codepoint));
         }

         if let Some(mapped) = row.simple_lowercase_mapping
             && mapped != row.codepoint
         {
             to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]);
         }
         if let Some(mapped) = row.simple_uppercase_mapping
             && mapped != row.codepoint
         {
             to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]);
         }
         if let Some(mapped) = row.simple_titlecase_mapping
             && Some(mapped) != row.simple_uppercase_mapping
         {
             to_title.insert(row.codepoint.value(), [mapped.value(), 0, 0]);
         }
     }

     for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
         if !row.conditions.is_empty() {
             // Skip conditional case mappings
             continue;
         }

         let key = row.codepoint.value();
         if let Some(lower) = to_mapping(&[row.codepoint], &row.lowercase) {
             to_lower.insert(key, lower);
         }
         if let Some(upper) = to_mapping(&[row.codepoint], &row.uppercase) {
             to_upper.insert(key, upper);
         }
         if let Some(title) = to_mapping(&row.uppercase, &row.titlecase) {
             to_title.insert(key, title);
         }
     }

     // Filter out ASCII codepoints.
     to_lower.retain(|&c, _| c > 0x7f);
     to_upper.retain(|&c, _| c > 0x7f);
     let mut properties: Vec<(&'static str, Vec<Range<u32>>)> = properties
         .into_iter()
         .map(|(prop, codepoints)| {
             let codepoints = codepoints
                 .into_iter()
                 .flatten()
                 .flat_map(|cp| cp.scalar())
                 .filter(|c| !c.is_ascii())
                 .map(u32::from)
                 .collect::<Vec<_>>();
             (prop, ranges_from_set(&codepoints))
         })
         .collect();

     properties.sort_by_key(|p| p.0);
     UnicodeData { ranges: properties, to_lower, to_title, to_upper }
 }

 fn main() {
     let args = std::env::args().collect::<Vec<_>>();

     if args.len() != 3 {
         eprintln!("Must provide paths to write unicode tables and tests to");
         eprintln!(
             "e.g. {} library/core/src/unicode/unicode_data.rs library/coretests/tests/unicode/test_data.rs",
             args[0]
         );
         std::process::exit(1);
     }

     let data_path = &args[1];
     let test_path = &args[2];

     let unicode_data = load_data();
     let ranges_by_property = &unicode_data.ranges;

     let mut table_file = String::new();
     table_file.push_str(
         "//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n",
     );

     let mut total_bytes = 0;
     let mut modules = Vec::new();
     for (property, ranges) in ranges_by_property {
         let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();

         let mut emitter = RawEmitter::new();
         if property == &"White_Space" {
             emit_whitespace(&mut emitter, ranges);
         } else {
             emit_codepoints(&mut emitter, ranges);
         }

         modules.push((property.to_lowercase().to_string(), emitter.file));
         table_file.push_str(&format!(
             "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n",
             property,
             emitter.bytes_used,
             datapoints,
             ranges.len(),
             ranges.first().unwrap().start,
             ranges.last().unwrap().end,
             emitter.desc,
         ));
         total_bytes += emitter.bytes_used;
     }
     let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data);
     for (name, (desc, size)) in ["to_lower", "to_upper", "to_title"].iter().zip(sizes) {
         table_file.push_str(&format!("// {:16}: {:5} bytes, {desc}\n", name, size,));
         total_bytes += size;
     }
     table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes));

     // Include the range search function
     table_file.push('\n');
     table_file.push_str(include_str!("range_search.rs"));
     table_file.push('\n');

     table_file.push_str(&version());

     table_file.push('\n');

     modules.push((String::from("conversions"), conversions));

     for (name, contents) in modules {
         table_file.push_str("#[rustfmt::skip]\n");
         table_file.push_str(&format!("pub mod {name} {{\n"));
         for line in contents.lines() {
             if !line.trim().is_empty() {
                 table_file.push_str("    ");
                 table_file.push_str(line);
             }
             table_file.push('\n');
         }
         table_file.push_str("}\n\n");
     }

     let test_file = generate_tests(&unicode_data);
     std::fs::write(&test_path, test_file).unwrap();
     std::fs::write(&data_path, table_file).unwrap();
     eprintln!("Unicode data was generated. Remember to run \"x fmt\"!");
 }

 fn version() -> String {
     let mut out = String::new();
     out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = ");

     let readme =
         std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
             .unwrap();

     let prefix = "for Version ";
     let start = readme.find(prefix).unwrap() + prefix.len();
     let end = readme.find(" of the Unicode Standard.").unwrap();
     let version =
         readme[start..end].split('.').map(|v| v.parse::<u32>().expect(v)).collect::<Vec<_>>();
     let [major, minor, micro] = [version[0], version[1], version[2]];

     out.push_str(&format!("({major}, {minor}, {micro});\n"));
     out
 }

 fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
     let pieces = values.into_iter().map(|b| format!("{b:?}, ")).collect::<Vec<_>>();
     let mut out = String::new();
     let mut line = String::from("\n    ");
     for piece in pieces {
         if line.len() + piece.len() < 98 {
             line.push_str(&piece);
         } else {
             out.push_str(line.trim_end());
             out.push('\n');
             line = format!("    {piece}");
         }
     }
     out.push_str(line.trim_end());
     out.push('\n');
     out
 }

 fn generate_tests(data: &UnicodeData) -> String {
     let mut out = String::from(
         "\
 //! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
 // ignore-tidy-filelength

 use std::ops::RangeInclusive;
 ",
     );
     for (property, ranges) in &data.ranges {
         let prop_upper = property.to_uppercase();
         let is_true = (char::MIN..=char::MAX)
             .filter(|c| !c.is_ascii())
             .map(u32::from)
             .filter(|c| ranges.iter().any(|r| r.contains(c)))
             .collect::<Vec<_>>();
         let is_true = ranges_from_set(&is_true)
             .into_iter()
             .map(|r| {
                 let start = char::from_u32(r.start).unwrap();
                 let end = char::from_u32(r.end - 1).unwrap();
                 CharEscape(start)..=CharEscape(end)
             })
             .collect::<Vec<_>>();

         writeln!(
             out,
             r#"
 #[rustfmt::skip]
 pub(super) static {prop_upper}: &[RangeInclusive<char>; {is_true_len}] = &[{is_true}];
 "#,
             is_true_len = is_true.len(),
             is_true = fmt_list(is_true),
         )
         .unwrap();
     }

     for (name, lut) in ["TO_LOWER", "TO_UPPER", "TO_TITLE"].iter().zip([
         &data.to_lower,
         &data.to_upper,
         &data.to_title,
     ]) {
         let lut = lut
             .iter()
             .map(|(key, values)| {
                 let key = char::from_u32(*key).unwrap();
                 let values = values.map(|c| char::from_u32(c).unwrap());
                 (CharEscape(key), values.map(CharEscape))
             })
             .collect::<Vec<_>>();

         writeln!(
             out,
             r#"
 #[rustfmt::skip]
 pub(super) static {name}: &[(char, [char; 3]); {len}] = &[{lut}];
 "#,
             len = lut.len(),
             lut = fmt_list(lut),
         )
         .unwrap();
     }

     out
 }

 /// Group the elements of `set` into contigous ranges
 fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
     set.chunk_by(|a, b| a + 1 == *b)
         .map(|chunk| {
             let start = *chunk.first().unwrap();
             let end = *chunk.last().unwrap();
             start..(end + 1)
         })
         .collect()
 }
	//! This implements the core logic of the compression scheme used to compactly
	//! encode Unicode properties.
	//!
	//! We have two primary goals with the encoding: we want to be compact, because
	//! these tables often end up in ~every Rust program (especially the
	//! grapheme_extend table, used for str debugging), including those for embedded
	//! targets (where space is important). We also want to be relatively fast,
	//! though this is more of a nice to have rather than a key design constraint.
	//! It is expected that libraries/applications which are performance-sensitive
	//! to Unicode property lookups are extremely rare, and those that care may find
	//! the tradeoff of the raw bitsets worth it. For most applications, a
	//! relatively fast but much smaller (and as such less cache-impacting, etc.)
	//! data set is likely preferable.
	//!
	//! We have two separate encoding schemes: a skiplist-like approach, and a
	//! compressed bitset. The datasets we consider mostly use the skiplist (it's
	//! smaller) but the lowercase and uppercase sets are sufficiently sparse for
	//! the bitset to be worthwhile -- for those sets the bitset is a 2x size win.
	//! Since the bitset is also faster, this seems an obvious choice. (As a
	//! historical note, the bitset was also the prior implementation, so its
	//! relative complexity had already been paid).
	//!
	//! ## The bitset
	//!
	//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
	//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
	//! over 17 kilobytes of data per character set -- way too much for our
	//! purposes.
	//!
	//! First, the raw bitset (one bit for every valid `char`, from 0 to 0x10FFFF,
	//! not skipping the small 'gap') is associated into words (u64) and
	//! deduplicated. On random data, this would be useless; on our data, this is
	//! incredibly beneficial -- our data sets have (far) less than 256 unique
	//! words.
	//!
	//! This gives us an array that maps `u8 -> word`; the current algorithm does
	//! not handle the case of more than 256 unique words, but we are relatively far
	//! from coming that close.
	//!
	//! With that scheme, we now have a single byte for every 64 codepoints.
	//!
	//! We further chunk these by some constant N (between 1 and 64 per group,
	//! dynamically chosen for smallest size), and again deduplicate and store in an
	//! array (u8 -> [u8; N]).
	//!
	//! The bytes of this array map into the words from the bitset above, but we
	//! apply another trick here: some of these words are similar enough that they
	//! can be represented by some function of another word. The particular
	//! functions chosen are rotation, inversion, and shifting (right).
	//!
	//! ## The skiplist
	//!
	//! The skip list arose out of the desire for an even smaller encoding than the
	//! bitset -- and was the answer to the question "what is the smallest
	//! representation we can imagine?". However, it is not necessarily the
	//! smallest, and if you have a better proposal, please do suggest it!
	//!
	//! This is a relatively straightforward encoding. First, we break up all the
	//! ranges in the input data into offsets from each other, essentially a gap
	//! encoding. In practice, most gaps are small -- less than u8::MAX -- so we
	//! store those directly. We make use of the larger gaps (which are nicely
	//! interspersed already) throughout the dataset to index this data set.
	//!
	//! In particular, each run of small gaps (terminating in a large gap) is
	//! indexed in a separate dataset. That data set stores an index into the
	//! primary offset list and a prefix sum of that offset list. These are packed
	//! into a single u32 (11 bits for the offset, 21 bits for the prefix sum).
	//!
	//! Lookup proceeds via a binary search in the index and then a straightforward
	//! linear scan (adding up the offsets) until we reach the needle, and then the
	//! index of that offset is utilized as the answer to whether we're in the set
	//! or not.

	use std::collections::{BTreeMap, HashMap};
	use std::fmt::Write;
	use std::ops::Range;

	use ucd_parse::Codepoints;

	mod cascading_map;
	mod case_mapping;
	mod fmt_helpers;
	mod raw_emitter;
	mod skiplist;
	mod unicode_download;

	use fmt_helpers::CharEscape;
	use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace};

	static PROPERTIES: &[&str] = &[
	"Alphabetic",
	"Lowercase",
	"Uppercase",
	"Case_Ignorable",
	"Grapheme_Extend",
	"White_Space",
	"N",
	"Lt",
	];

	struct UnicodeData {
	ranges: Vec<(&'static str, Vec<Range<u32>>)>,
	/// Only stores mappings that are not to self
	to_upper: BTreeMap<u32, [u32; 3]>,
	/// Only stores mappings that differ from `to_upper`
	to_title: BTreeMap<u32, [u32; 3]>,
	/// Only stores mappings that are not to self
	to_lower: BTreeMap<u32, [u32; 3]>,
	}

	fn to_mapping(
	if_different_from: &[ucd_parse::Codepoint],
	codepoints: &[ucd_parse::Codepoint],
	) -> Option<[u32; 3]> {
	if codepoints == if_different_from {
	return None;
	}

	let mut ret = [ucd_parse::Codepoint::default(); 3];
	ret[0..codepoints.len()].copy_from_slice(codepoints);
	Some(ret.map(ucd_parse::Codepoint::value))
	}

	static UNICODE_DIRECTORY: &str = "unicode-downloads";

	fn load_data() -> UnicodeData {
	unicode_download::fetch_latest();

	let mut properties = HashMap::new();
	for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
	if let Some(name) = PROPERTIES.iter().find(\|prop\| **prop == row.property.as_str()) {
	properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
	}
	}
	for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
	if let Some(name) = PROPERTIES.iter().find(\|prop\| **prop == row.property.as_str()) {
	properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
	}
	}

	let [mut to_lower, mut to_upper, mut to_title] = [const { BTreeMap::new() }; 3];
	for row in ucd_parse::UnicodeDataExpander::new(
	ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
	) {
	let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
	"N"
	} else {
	row.general_category.as_str()
	};
	if let Some(name) = PROPERTIES.iter().find(\|prop\| **prop == general_category) {
	properties
	.entry(*name)
	.or_insert_with(Vec::new)
	.push(Codepoints::Single(row.codepoint));
	}

	if let Some(mapped) = row.simple_lowercase_mapping
	&& mapped != row.codepoint
	{
	to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]);
	}
	if let Some(mapped) = row.simple_uppercase_mapping
	&& mapped != row.codepoint
	{
	to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]);
	}
	if let Some(mapped) = row.simple_titlecase_mapping
	&& Some(mapped) != row.simple_uppercase_mapping
	{
	to_title.insert(row.codepoint.value(), [mapped.value(), 0, 0]);
	}
	}

	for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
	if !row.conditions.is_empty() {
	// Skip conditional case mappings
	continue;
	}

	let key = row.codepoint.value();
	if let Some(lower) = to_mapping(&[row.codepoint], &row.lowercase) {
	to_lower.insert(key, lower);
	}
	if let Some(upper) = to_mapping(&[row.codepoint], &row.uppercase) {
	to_upper.insert(key, upper);
	}
	if let Some(title) = to_mapping(&row.uppercase, &row.titlecase) {
	to_title.insert(key, title);
	}
	}

	// Filter out ASCII codepoints.
	to_lower.retain(\|&c, _\| c > 0x7f);
	to_upper.retain(\|&c, _\| c > 0x7f);
	let mut properties: Vec<(&'static str, Vec<Range<u32>>)> = properties
	.into_iter()
	.map(\|(prop, codepoints)\| {
	let codepoints = codepoints
	.into_iter()
	.flatten()
	.flat_map(\|cp\| cp.scalar())
	.filter(\|c\| !c.is_ascii())
	.map(u32::from)
	.collect::<Vec<_>>();
	(prop, ranges_from_set(&codepoints))
	})
	.collect();

	properties.sort_by_key(\|p\| p.0);
	UnicodeData { ranges: properties, to_lower, to_title, to_upper }
	}

	fn main() {
	let args = std::env::args().collect::<Vec<_>>();

	if args.len() != 3 {
	eprintln!("Must provide paths to write unicode tables and tests to");
	eprintln!(
	"e.g. {} library/core/src/unicode/unicode_data.rs library/coretests/tests/unicode/test_data.rs",
	args[0]
	);
	std::process::exit(1);
	}

	let data_path = &args[1];
	let test_path = &args[2];

	let unicode_data = load_data();
	let ranges_by_property = &unicode_data.ranges;

	let mut table_file = String::new();
	table_file.push_str(
	"//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n",
	);

	let mut total_bytes = 0;
	let mut modules = Vec::new();
	for (property, ranges) in ranges_by_property {
	let datapoints = ranges.iter().map(\|r\| r.end - r.start).sum::<u32>();

	let mut emitter = RawEmitter::new();
	if property == &"White_Space" {
	emit_whitespace(&mut emitter, ranges);
	} else {
	emit_codepoints(&mut emitter, ranges);
	}

	modules.push((property.to_lowercase().to_string(), emitter.file));
	table_file.push_str(&format!(
	"// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n",
	property,
	emitter.bytes_used,
	datapoints,
	ranges.len(),
	ranges.first().unwrap().start,
	ranges.last().unwrap().end,
	emitter.desc,
	));
	total_bytes += emitter.bytes_used;
	}
	let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data);
	for (name, (desc, size)) in ["to_lower", "to_upper", "to_title"].iter().zip(sizes) {
	table_file.push_str(&format!("// {:16}: {:5} bytes, {desc}\n", name, size,));
	total_bytes += size;
	}
	table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes));

	// Include the range search function
	table_file.push('\n');
	table_file.push_str(include_str!("range_search.rs"));
	table_file.push('\n');

	table_file.push_str(&version());

	table_file.push('\n');

	modules.push((String::from("conversions"), conversions));

	for (name, contents) in modules {
	table_file.push_str("#[rustfmt::skip]\n");
	table_file.push_str(&format!("pub mod {name} {{\n"));
	for line in contents.lines() {
	if !line.trim().is_empty() {
	table_file.push_str(" ");
	table_file.push_str(line);
	}
	table_file.push('\n');
	}
	table_file.push_str("}\n\n");
	}

	let test_file = generate_tests(&unicode_data);
	std::fs::write(&test_path, test_file).unwrap();
	std::fs::write(&data_path, table_file).unwrap();
	eprintln!("Unicode data was generated. Remember to run \"x fmt\"!");
	}

	fn version() -> String {
	let mut out = String::new();
	out.push_str("pub const UNICODE_VERSION: (u8, u8, u8) = ");

	let readme =
	std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
	.unwrap();

	let prefix = "for Version ";
	let start = readme.find(prefix).unwrap() + prefix.len();
	let end = readme.find(" of the Unicode Standard.").unwrap();
	let version =
	readme[start..end].split('.').map(\|v\| v.parse::<u32>().expect(v)).collect::<Vec<_>>();
	let [major, minor, micro] = [version[0], version[1], version[2]];

	out.push_str(&format!("({major}, {minor}, {micro});\n"));
	out
	}

	fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
	let pieces = values.into_iter().map(\|b\| format!("{b:?}, ")).collect::<Vec<_>>();
	let mut out = String::new();
	let mut line = String::from("\n ");
	for piece in pieces {
	if line.len() + piece.len() < 98 {
	line.push_str(&piece);
	} else {
	out.push_str(line.trim_end());
	out.push('\n');
	line = format!(" {piece}");
	}
	}
	out.push_str(line.trim_end());
	out.push('\n');
	out
	}

	fn generate_tests(data: &UnicodeData) -> String {
	let mut out = String::from(
	"\
	//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
	// ignore-tidy-filelength

	use std::ops::RangeInclusive;
	",
	);
	for (property, ranges) in &data.ranges {
	let prop_upper = property.to_uppercase();
	let is_true = (char::MIN..=char::MAX)
	.filter(\|c\| !c.is_ascii())
	.map(u32::from)
	.filter(\|c\| ranges.iter().any(\|r\| r.contains(c)))
	.collect::<Vec<_>>();
	let is_true = ranges_from_set(&is_true)
	.into_iter()
	.map(\|r\| {
	let start = char::from_u32(r.start).unwrap();
	let end = char::from_u32(r.end - 1).unwrap();
	CharEscape(start)..=CharEscape(end)
	})
	.collect::<Vec<_>>();

	writeln!(
	out,
	r#"
	#[rustfmt::skip]
	pub(super) static {prop_upper}: &[RangeInclusive<char>; {is_true_len}] = &[{is_true}];
	"#,
	is_true_len = is_true.len(),
	is_true = fmt_list(is_true),
	)
	.unwrap();
	}

	for (name, lut) in ["TO_LOWER", "TO_UPPER", "TO_TITLE"].iter().zip([
	&data.to_lower,
	&data.to_upper,
	&data.to_title,
	]) {
	let lut = lut
	.iter()
	.map(\|(key, values)\| {
	let key = char::from_u32(*key).unwrap();
	let values = values.map(\|c\| char::from_u32(c).unwrap());
	(CharEscape(key), values.map(CharEscape))
	})
	.collect::<Vec<_>>();

	writeln!(
	out,
	r#"
	#[rustfmt::skip]
	pub(super) static {name}: &[(char, [char; 3]); {len}] = &[{lut}];
	"#,
	len = lut.len(),
	lut = fmt_list(lut),
	)
	.unwrap();
	}

	out
	}

	/// Group the elements of `set` into contigous ranges
	fn ranges_from_set(set: &[u32]) -> Vec<Range<u32>> {
	set.chunk_by(\|a, b\| a + 1 == *b)
	.map(\|chunk\| {
	let start = *chunk.first().unwrap();
	let end = *chunk.last().unwrap();
	start..(end + 1)
	})
	.collect()
	}