compiler/rustc_span/src/analyze_source_file.rs - rust-lang/rust - Git at Google

 use super::*;

 #[cfg(test)]
 mod tests;

 /// Finds all newlines, multi-byte characters, and non-narrow characters in a
 /// SourceFile.
 ///
 /// This function will use an SSE2 enhanced implementation if hardware support
 /// is detected at runtime.
 pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
     let mut lines = vec![RelativeBytePos::from_u32(0)];
     let mut multi_byte_chars = vec![];

     // Calls the right implementation, depending on hardware support available.
     analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);

     // The code above optimistically registers a new line *after* each \n
     // it encounters. If that point is already outside the source_file, remove
     // it again.
     if let Some(&last_line_start) = lines.last() {
         let source_file_end = RelativeBytePos::from_usize(src.len());
         assert!(source_file_end >= last_line_start);
         if last_line_start == source_file_end {
             lines.pop();
         }
     }

     (lines, multi_byte_chars)
 }

 cfg_select! {
     any(target_arch = "x86", target_arch = "x86_64") => {
         fn analyze_source_file_dispatch(
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
         ) {
             if is_x86_feature_detected!("sse2") {
                 unsafe {
                     analyze_source_file_sse2(src, lines, multi_byte_chars);
                 }
             } else {
                 analyze_source_file_generic(
                     src,
                     src.len(),
                     RelativeBytePos::from_u32(0),
                     lines,
                     multi_byte_chars,
                 );
             }
         }

         /// Checks 16 byte chunks of text at a time. If the chunk contains
         /// something other than printable ASCII characters and newlines, the
         /// function falls back to the generic implementation. Otherwise it uses
         /// SSE2 intrinsics to quickly find all newlines.
         #[target_feature(enable = "sse2")]
         unsafe fn analyze_source_file_sse2(
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
         ) {
             #[cfg(target_arch = "x86")]
             use std::arch::x86::*;
             #[cfg(target_arch = "x86_64")]
             use std::arch::x86_64::*;

             const CHUNK_SIZE: usize = 16;

             let (chunks, tail) = src.as_bytes().as_chunks::<CHUNK_SIZE>();

             // This variable keeps track of where we should start decoding a
             // chunk. If a multi-byte character spans across chunk boundaries,
             // we need to skip that part in the next chunk because we already
             // handled it.
             let mut intra_chunk_offset = 0;

             for (chunk_index, chunk) in chunks.iter().enumerate() {
                 // We don't know if the pointer is aligned to 16 bytes, so we
                 // use `loadu`, which supports unaligned loading.
                 let chunk = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };

                 // For character in the chunk, see if its byte value is < 0, which
                 // indicates that it's part of a UTF-8 char.
                 let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
                 // Create a bit mask from the comparison results.
                 let multibyte_mask = _mm_movemask_epi8(multibyte_test);

                 // If the bit mask is all zero, we only have ASCII chars here:
                 if multibyte_mask == 0 {
                     assert!(intra_chunk_offset == 0);

                     // Check for newlines in the chunk
                     let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
                     let mut newlines_mask = _mm_movemask_epi8(newlines_test);

                     let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);

                     while newlines_mask != 0 {
                         let index = newlines_mask.trailing_zeros();

                         lines.push(RelativeBytePos(index) + output_offset);

                         // Clear the bit, so we can find the next one.
                         newlines_mask &= newlines_mask - 1;
                     }
                 } else {
                     // The slow path.
                     // There are multibyte chars in here, fallback to generic decoding.
                     let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
                     intra_chunk_offset = analyze_source_file_generic(
                         &src[scan_start..],
                         CHUNK_SIZE - intra_chunk_offset,
                         RelativeBytePos::from_usize(scan_start),
                         lines,
                         multi_byte_chars,
                     );
                 }
             }

             // There might still be a tail left to analyze
             let tail_start = src.len() - tail.len() + intra_chunk_offset;
             if tail_start < src.len() {
                 analyze_source_file_generic(
                     &src[tail_start..],
                     src.len() - tail_start,
                     RelativeBytePos::from_usize(tail_start),
                     lines,
                     multi_byte_chars,
                 );
             }
         }
     }
     _ => {
         // The target (or compiler version) does not support SSE2 ...
         fn analyze_source_file_dispatch(
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
         ) {
             analyze_source_file_generic(
                 src,
                 src.len(),
                 RelativeBytePos::from_u32(0),
                 lines,
                 multi_byte_chars,
             );
         }
     }
 }

 // `scan_len` determines the number of bytes in `src` to scan. Note that the
 // function can read past `scan_len` if a multi-byte character start within the
 // range but extends past it. The overflow is returned by the function.
 fn analyze_source_file_generic(
     src: &str,
     scan_len: usize,
     output_offset: RelativeBytePos,
     lines: &mut Vec<RelativeBytePos>,
     multi_byte_chars: &mut Vec<MultiByteChar>,
 ) -> usize {
     assert!(src.len() >= scan_len);
     let mut i = 0;
     let src_bytes = src.as_bytes();

     while i < scan_len {
         let byte = unsafe {
             // We verified that i < scan_len <= src.len()
             *src_bytes.get_unchecked(i)
         };

         // How much to advance in order to get to the next UTF-8 char in the
         // string.
         let mut char_len = 1;

         if byte == b'\n' {
             let pos = RelativeBytePos::from_usize(i) + output_offset;
             lines.push(pos + RelativeBytePos(1));
         } else if byte >= 128 {
             // This is the beginning of a multibyte char. Just decode to `char`.
             let c = src[i..].chars().next().unwrap();
             char_len = c.len_utf8();

             let pos = RelativeBytePos::from_usize(i) + output_offset;
             assert!((2..=4).contains(&char_len));
             let mbc = MultiByteChar { pos, bytes: char_len as u8 };
             multi_byte_chars.push(mbc);
         }

         i += char_len;
     }

     i - scan_len
 }
	use super::*;

	#[cfg(test)]
	mod tests;

	/// Finds all newlines, multi-byte characters, and non-narrow characters in a
	/// SourceFile.
	///
	/// This function will use an SSE2 enhanced implementation if hardware support
	/// is detected at runtime.
	pub(crate) fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
	let mut lines = vec![RelativeBytePos::from_u32(0)];
	let mut multi_byte_chars = vec![];

	// Calls the right implementation, depending on hardware support available.
	analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);

	// The code above optimistically registers a new line after each \n
	// it encounters. If that point is already outside the source_file, remove
	// it again.
	if let Some(&last_line_start) = lines.last() {
	let source_file_end = RelativeBytePos::from_usize(src.len());
	assert!(source_file_end >= last_line_start);
	if last_line_start == source_file_end {
	lines.pop();
	}
	}

	(lines, multi_byte_chars)
	}

	cfg_select! {
	any(target_arch = "x86", target_arch = "x86_64") => {
	fn analyze_source_file_dispatch(
	src: &str,
	lines: &mut Vec<RelativeBytePos>,
	multi_byte_chars: &mut Vec<MultiByteChar>,
	) {
	if is_x86_feature_detected!("sse2") {
	unsafe {
	analyze_source_file_sse2(src, lines, multi_byte_chars);
	}
	} else {
	analyze_source_file_generic(
	src,
	src.len(),
	RelativeBytePos::from_u32(0),
	lines,
	multi_byte_chars,
	);
	}
	}

	/// Checks 16 byte chunks of text at a time. If the chunk contains
	/// something other than printable ASCII characters and newlines, the
	/// function falls back to the generic implementation. Otherwise it uses
	/// SSE2 intrinsics to quickly find all newlines.
	#[target_feature(enable = "sse2")]
	unsafe fn analyze_source_file_sse2(
	src: &str,
	lines: &mut Vec<RelativeBytePos>,
	multi_byte_chars: &mut Vec<MultiByteChar>,
	) {
	#[cfg(target_arch = "x86")]
	use std::arch::x86::*;
	#[cfg(target_arch = "x86_64")]
	use std::arch::x86_64::*;

	const CHUNK_SIZE: usize = 16;

	let (chunks, tail) = src.as_bytes().as_chunks::<CHUNK_SIZE>();

	// This variable keeps track of where we should start decoding a
	// chunk. If a multi-byte character spans across chunk boundaries,
	// we need to skip that part in the next chunk because we already
	// handled it.
	let mut intra_chunk_offset = 0;

	for (chunk_index, chunk) in chunks.iter().enumerate() {
	// We don't know if the pointer is aligned to 16 bytes, so we
	// use `loadu`, which supports unaligned loading.
	let chunk = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };

	// For character in the chunk, see if its byte value is < 0, which
	// indicates that it's part of a UTF-8 char.
	let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0));
	// Create a bit mask from the comparison results.
	let multibyte_mask = _mm_movemask_epi8(multibyte_test);

	// If the bit mask is all zero, we only have ASCII chars here:
	if multibyte_mask == 0 {
	assert!(intra_chunk_offset == 0);

	// Check for newlines in the chunk
	let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8));
	let mut newlines_mask = _mm_movemask_epi8(newlines_test);

	let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1);

	while newlines_mask != 0 {
	let index = newlines_mask.trailing_zeros();

	lines.push(RelativeBytePos(index) + output_offset);

	// Clear the bit, so we can find the next one.
	newlines_mask &= newlines_mask - 1;
	}
	} else {
	// The slow path.
	// There are multibyte chars in here, fallback to generic decoding.
	let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
	intra_chunk_offset = analyze_source_file_generic(
	&src[scan_start..],
	CHUNK_SIZE - intra_chunk_offset,
	RelativeBytePos::from_usize(scan_start),
	lines,
	multi_byte_chars,
	);
	}
	}

	// There might still be a tail left to analyze
	let tail_start = src.len() - tail.len() + intra_chunk_offset;
	if tail_start < src.len() {
	analyze_source_file_generic(
	&src[tail_start..],
	src.len() - tail_start,
	RelativeBytePos::from_usize(tail_start),
	lines,
	multi_byte_chars,
	);
	}
	}
	}
	_ => {
	// The target (or compiler version) does not support SSE2 ...
	fn analyze_source_file_dispatch(
	src: &str,
	lines: &mut Vec<RelativeBytePos>,
	multi_byte_chars: &mut Vec<MultiByteChar>,
	) {
	analyze_source_file_generic(
	src,
	src.len(),
	RelativeBytePos::from_u32(0),
	lines,
	multi_byte_chars,
	);
	}
	}
	}

	// `scan_len` determines the number of bytes in `src` to scan. Note that the
	// function can read past `scan_len` if a multi-byte character start within the
	// range but extends past it. The overflow is returned by the function.
	fn analyze_source_file_generic(
	src: &str,
	scan_len: usize,
	output_offset: RelativeBytePos,
	lines: &mut Vec<RelativeBytePos>,
	multi_byte_chars: &mut Vec<MultiByteChar>,
	) -> usize {
	assert!(src.len() >= scan_len);
	let mut i = 0;
	let src_bytes = src.as_bytes();

	while i < scan_len {
	let byte = unsafe {
	// We verified that i < scan_len <= src.len()
	*src_bytes.get_unchecked(i)
	};

	// How much to advance in order to get to the next UTF-8 char in the
	// string.
	let mut char_len = 1;

	if byte == b'\n' {
	let pos = RelativeBytePos::from_usize(i) + output_offset;
	lines.push(pos + RelativeBytePos(1));
	} else if byte >= 128 {
	// This is the beginning of a multibyte char. Just decode to `char`.
	let c = src[i..].chars().next().unwrap();
	char_len = c.len_utf8();

	let pos = RelativeBytePos::from_usize(i) + output_offset;
	assert!((2..=4).contains(&char_len));
	let mbc = MultiByteChar { pos, bytes: char_len as u8 };
	multi_byte_chars.push(mbc);
	}

	i += char_len;
	}

	i - scan_len
	}