src/tools/coverage-dump/src/llvm_utils.rs - rust - Git at Google

 use std::borrow::Cow;
 use std::sync::OnceLock;

 use anyhow::{anyhow, ensure};
 use regex::bytes;

 use crate::parser::Parser;

 #[cfg(test)]
 mod tests;

 /// Decodes the backslash escapes in the raw contents of an LLVM IR string
 /// literal and returns the resulting byte string.
 ///
 /// Two escape forms are recognised:
 /// - `\\`, which produces a single backslash byte
 /// - `\xx`, where `xx` is exactly two hexadecimal digits (either case),
 ///   which produces the byte with that value
 ///
 /// All other input, including a backslash that is not followed by a second
 /// backslash or by two hex digits, is copied to the output unchanged.
 pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
     let escape_re = {
         static RE: OnceLock<bytes::Regex> = OnceLock::new();
         // LLVM IR supports two string escapes: `\\` and `\xx`.
         // The `xx` group must only match hex digits: accepting arbitrary
         // alphanumerics would let input such as `\zz` reach the hex parser
         // below and panic.
         RE.get_or_init(|| {
             bytes::Regex::new(r"\\\\|\\([0-9A-Fa-f]{2})").expect("escape regex is valid")
         })
     };

     /// Converts exactly two ASCII hex digits into the byte they denote.
     fn u8_from_hex_digits(digits: &[u8]) -> u8 {
         // The regex only ever captures two ASCII hex digits, so none of
         // these steps can fail.
         assert_eq!(digits.len(), 2);
         let digits = std::str::from_utf8(digits).expect("hex digits are ASCII");
         u8::from_str_radix(digits, 16).expect("regex captured valid hex digits")
     }

     escape_re
         .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
             // Capture group 1 is present only for the `\xx` alternative; its
             // absence means the match was the `\\` escape.
             let byte = match captures.get(1) {
                 None => b'\\',
                 Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
             };
             [byte]
         })
         .into_owned()
 }

 /// Computes the MD5 digest of `bytes` and returns its first eight bytes as a
 /// `u64`.
 ///
 /// LLVM's profiler/coverage metadata often uses this 64-bit truncation as a
 /// key for associating data stored in different tables/sections.
 pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 {
     use md5::{Digest, Md5};

     let digest = Md5::digest(bytes);

     let mut low_half = [0u8; 8];
     low_half.copy_from_slice(&digest.as_slice()[..8]);

     // The truncated hash is always read as little-endian, independent of the
     // host or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
     u64::from_le_bytes(low_half)
 }

 impl<'a> Parser<'a> {
     /// Reads one length-prefixed chunk and returns its payload in
     /// uncompressed form.
     ///
     /// The chunk is laid out as:
     /// - Length of the uncompressed data in bytes, as ULEB128
     /// - Length of the compressed data in bytes (or 0), as ULEB128
     /// - The payload itself, of whichever length applies
     ///
     /// A compressed length of 0 means the payload is stored uncompressed and
     /// is returned as a borrow. Otherwise the payload is zlib-decompressed
     /// (with the output capped at the declared uncompressed length) and
     /// returned as an owned buffer.
     ///
     /// # Errors
     ///
     /// Fails if any read fails, if decompression fails, or if the
     /// decompressed size differs from the declared uncompressed length.
     pub(crate) fn read_chunk_to_uncompressed_bytes(&mut self) -> anyhow::Result<Cow<'a, [u8]>> {
         let uncompressed_len = self.read_uleb128_usize()?;
         let stored_len = self.read_uleb128_usize()?;

         // Zero compressed bytes marks a payload that was stored verbatim.
         if stored_len == 0 {
             let raw = self.read_n_bytes(uncompressed_len)?;
             return Ok(Cow::Borrowed(raw));
         }

         // Otherwise the payload is compressed: read it, then inflate it.
         let deflated = self.read_n_bytes(stored_len)?;
         let inflate_result = miniz_oxide::inflate::decompress_to_vec_zlib_with_limit(
             deflated,
             uncompressed_len,
         );
         let uncompressed_bytes = match inflate_result {
             Ok(bytes) => bytes,
             Err(e) => return Err(anyhow!("{e:?}")),
         };
         ensure!(uncompressed_bytes.len() == uncompressed_len);

         Ok(Cow::Owned(uncompressed_bytes))
     }
 }
	use std::borrow::Cow;
	use std::sync::OnceLock;

	use anyhow::{anyhow, ensure};
	use regex::bytes;

	use crate::parser::Parser;

	#[cfg(test)]
	mod tests;

	/// Decodes the backslash escapes in the raw contents of an LLVM IR string
	/// literal and returns the resulting byte string.
	///
	/// Two escape forms are recognised:
	/// - `\\`, which produces a single backslash byte
	/// - `\xx`, where `xx` is exactly two hexadecimal digits (either case),
	///   which produces the byte with that value
	///
	/// All other input, including a backslash that is not followed by a second
	/// backslash or by two hex digits, is copied to the output unchanged.
	pub(crate) fn unescape_llvm_string_contents(contents: &str) -> Vec<u8> {
	    let escape_re = {
	        static RE: OnceLock<bytes::Regex> = OnceLock::new();
	        // LLVM IR supports two string escapes: `\\` and `\xx`.
	        // The `xx` group must only match hex digits: accepting arbitrary
	        // alphanumerics would let input such as `\zz` reach the hex parser
	        // below and panic.
	        RE.get_or_init(|| {
	            bytes::Regex::new(r"\\\\|\\([0-9A-Fa-f]{2})").expect("escape regex is valid")
	        })
	    };

	    /// Converts exactly two ASCII hex digits into the byte they denote.
	    fn u8_from_hex_digits(digits: &[u8]) -> u8 {
	        // The regex only ever captures two ASCII hex digits, so none of
	        // these steps can fail.
	        assert_eq!(digits.len(), 2);
	        let digits = std::str::from_utf8(digits).expect("hex digits are ASCII");
	        u8::from_str_radix(digits, 16).expect("regex captured valid hex digits")
	    }

	    escape_re
	        .replace_all(contents.as_bytes(), |captures: &bytes::Captures<'_>| {
	            // Capture group 1 is present only for the `\xx` alternative; its
	            // absence means the match was the `\\` escape.
	            let byte = match captures.get(1) {
	                None => b'\\',
	                Some(hex_digits) => u8_from_hex_digits(hex_digits.as_bytes()),
	            };
	            [byte]
	        })
	        .into_owned()
	}

	/// Computes the MD5 digest of `bytes` and returns its first eight bytes as a
	/// `u64`.
	///
	/// LLVM's profiler/coverage metadata often uses this 64-bit truncation as a
	/// key for associating data stored in different tables/sections.
	pub(crate) fn truncated_md5(bytes: &[u8]) -> u64 {
	    use md5::{Digest, Md5};

	    let digest = Md5::digest(bytes);

	    let mut low_half = [0u8; 8];
	    low_half.copy_from_slice(&digest.as_slice()[..8]);

	    // The truncated hash is always read as little-endian, independent of the
	    // host or target platform. (See `MD5Result::low` in LLVM's `MD5.h`.)
	    u64::from_le_bytes(low_half)
	}

	impl<'a> Parser<'a> {
	    /// Reads one length-prefixed chunk and returns its payload in
	    /// uncompressed form.
	    ///
	    /// The chunk is laid out as:
	    /// - Length of the uncompressed data in bytes, as ULEB128
	    /// - Length of the compressed data in bytes (or 0), as ULEB128
	    /// - The payload itself, of whichever length applies
	    ///
	    /// A compressed length of 0 means the payload is stored uncompressed and
	    /// is returned as a borrow. Otherwise the payload is zlib-decompressed
	    /// (with the output capped at the declared uncompressed length) and
	    /// returned as an owned buffer.
	    ///
	    /// # Errors
	    ///
	    /// Fails if any read fails, if decompression fails, or if the
	    /// decompressed size differs from the declared uncompressed length.
	    pub(crate) fn read_chunk_to_uncompressed_bytes(&mut self) -> anyhow::Result<Cow<'a, [u8]>> {
	        let uncompressed_len = self.read_uleb128_usize()?;
	        let compressed_len = self.read_uleb128_usize()?;

	        if compressed_len == 0 {
	            // The bytes are uncompressed, so read them directly.
	            let uncompressed_bytes = self.read_n_bytes(uncompressed_len)?;
	            Ok(Cow::Borrowed(uncompressed_bytes))
	        } else {
	            // The bytes are compressed, so read and decompress them.
	            let compressed_bytes = self.read_n_bytes(compressed_len)?;

	            let uncompressed_bytes = miniz_oxide::inflate::decompress_to_vec_zlib_with_limit(
	                compressed_bytes,
	                uncompressed_len,
	            )
	            .map_err(|e| anyhow!("{e:?}"))?;
	            // Reject payloads that inflate to a size other than the one
	            // declared in the chunk header.
	            ensure!(uncompressed_bytes.len() == uncompressed_len);

	            Ok(Cow::Owned(uncompressed_bytes))
	        }
	    }
	}