//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"
#if defined(LIBC_TARGET_ARCH_IS_X86)
#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#if defined(__AVX512BW__) || defined(__AVX512F__) || defined(__AVX2__) || \
defined(__SSE2__)
#include <immintrin.h>
#endif
// Define fake intrinsics (as macros) to prevent the compiler from failing on
// otherwise undefined functions when the corresponding CPU extension is not
// present.
#if !defined(__AVX512BW__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm512_cmpneq_epi8_mask
#define _mm512_cmpneq_epi8_mask(A, B) 0
#endif
#if !defined(__AVX2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm256_movemask_epi8
#define _mm256_movemask_epi8(A) 0
#endif
#if !defined(__SSE2__) && (defined(_MSC_VER) || defined(__SCE__))
#undef _mm_movemask_epi8
#define _mm_movemask_epi8(A) 0
#endif
namespace LIBC_NAMESPACE_DECL {
namespace x86 {
// A set of constants to check CPU features at compile time.
LIBC_INLINE_VAR constexpr bool K_SSE2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
LIBC_INLINE_VAR constexpr bool K_SSE41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
LIBC_INLINE_VAR constexpr bool K_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
LIBC_INLINE_VAR constexpr bool K_AVX2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
LIBC_INLINE_VAR constexpr bool K_AVX512_F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
LIBC_INLINE_VAR constexpr bool K_AVX512_BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
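// These flags are meant for 'if constexpr' style dispatch so that unsupported
// code paths are discarded at compile time, e.g. (illustrative sketch, not a
// function from this file):
//   if constexpr (x86::K_AVX512_BW)
//     return generic::cmp_neq<__m512i>(p1, p2, offset);
//   else if constexpr (x86::K_AVX2)
//     return generic::cmp_neq<__m256i>(p1, p2, offset);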
///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
LIBC_INLINE static void repmovsb(void *dst, const void *src, size_t count) {
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}
};
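// Illustrative use, assuming 'dst' and 'src' point to buffers of at least
// 'count' bytes:
//   x86::Memcpy::repmovsb(dst, src, count);
// The "+D", "+S" and "+c" constraints pin 'dst', 'src' and 'count' to the
// destination-index, source-index and count registers (rdi/rsi/rcx on x86-64)
// that 'rep movsb' implicitly uses.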
} // namespace x86
} // namespace LIBC_NAMESPACE_DECL
namespace LIBC_NAMESPACE_DECL {
namespace generic {
// Not equals: returns non-zero iff values at head or tail differ.
// This function typically loads more data than necessary when the two buffers
// differ.
template <typename T>
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
static_assert(cpp::is_integral_v<T>);
return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
}
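// For instance, with T == uint32_t and count == 6, the head load compares
// bytes [0, 4) and the tail load bytes [2, 6): together they cover the whole
// buffer at the cost of re-reading the overlapping bytes [2, 4).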
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
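// The big endian loads give the subtraction memcmp semantics: e.g. for
// p1 = "ab" and p2 = "ac", load_be yields 0x6162 and 0x6163, so the result is
// negative, matching "ab" < "ac". Both operands fit in 16 bits, so the
// int32_t difference cannot overflow.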
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load_be<uint32_t>(p1, offset);
const auto b = load_be<uint32_t>(p2, offset);
return cmp_uint32_t(a, b);
}
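// Note that the uint16_t subtraction trick does not carry over: the
// difference of two uint32_t values can overflow an int32_t, hence the
// dedicated cmp_uint32_t helper that reduces 'a' and 'b' to a
// negative/zero/positive value.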
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
size_t offset) {
const auto a = load_be<uint64_t>(p1, offset);
const auto b = load_be<uint64_t>(p2, offset);
return cmp_neq_uint64_t(a, b);
}
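// cmp_is_expensive<uint64_t> is true, so callers are expected to test neq
// first and call cmp_neq only on values known to differ; cmp_neq_uint64_t
// therefore does not need to handle the equal case.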
// SIMD types are defined with attributes. e.g., '__m128i' is defined as
// long long __attribute__((__vector_size__(16), __aligned__(16)))
// When we use these SIMD types in template specialization GCC complains:
// "ignoring attributes on template argument ā€˜__m128i’ [-Wignored-attributes]"
// Therefore, we disable this warning in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
return _mm_xor_si128(a, b);
}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return static_cast<uint16_t>(
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
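// After the byte reversal, bit 15 of the returned mask corresponds to byte 0
// of the vector (the lowest address) and bit 0 to byte 15, so two masks
// compare as integers the way the underlying buffers compare in memory order.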
LIBC_INLINE bool is_zero(__m128i value) {
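  // '_mm_testz_si128(a, b)' returns 1 when (a AND b) is all zeros, so testing
  // 'value' against itself checks that the whole 128-bit value is zero.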
return _mm_testz_si128(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
return !is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
size_t count) {
const __m128i head = load_and_xor_m128i(p1, p2, 0);
const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
return !is_zero(_mm_or_si128(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
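// Why this works: 'le' has a bit set where 'b' equals the max (i.e. p1 <= p2
// bytewise) and 'ge' where 'a' does (i.e. p1 >= p2). Equal bytes set both
// masks, so the most significant bit where 'ge' and 'le' disagree is the
// first differing byte in memory order, and the sign of 'ge - le' matches
// memcmp.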
#endif // __SSE4_1__
///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
return _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
return _mm256_castps_si256(
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
}
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
return xor_m256i(a, b);
}
LIBC_INLINE bool is_zero(__m256i value) {
return _mm256_testz_si256(value, value) == 1;
}
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
return is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
return !is_zero(load_and_xor_m256i(p1, p2, offset));
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
size_t count) {
const __m256i head = load_and_xor_m256i(p1, p2, 0);
const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__
#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
// Bytewise comparison of 'max' and 'value'.
const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
// Because x86 is little endian, bytes in the vector must be reversed before
// using movemask.
#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
// When AVX512VBMI is available we can completely reverse the vector through
// VPERMB: __m256i _mm256_permutexvar_epi8(__m256i idx, __m256i a);
const __m256i big_endian_byte_mask =
_mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31),
little_endian_byte_mask);
// And turn the byte vector mask into a 'uint32_t' for direct scalar
// comparison.
return _mm256_movemask_epi8(big_endian_byte_mask);
#else
// We can't byte-reverse '__m256i' in a single instruction with AVX2.
// '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
// leading to:
// ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
// 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
// So we first shuffle each 16-byte lane, leading to a half-reversed vector
// mask.
const __m256i half_reversed = _mm256_shuffle_epi8(
little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15));
// Then we turn the vector into a uint32_t.
const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
// And swap the lower and upper 16-bit halves, i.e. rotate the 32-bit mask by
// 16; this is optimized into a single `rorx` instruction.
return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
#endif
}
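// Both paths produce a mask in which more significant bits correspond to
// lower addresses: with the full VPERMB reversal, movemask bit 31 holds the
// comparison of byte 0; in the fallback, the in-lane shuffle plus the 16-bit
// rotation achieve the same bit order.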
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__
///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
// The AVX512VBMI version is disabled due to bad codegen.
// https://github.com/llvm/llvm-project/issues/77459
// https://github.com/llvm/llvm-project/pull/77081
// TODO: Re-enable once the minimum supported clang version includes the fix.
#if false && defined(__AVX512VBMI__)
// When AVX512VBMI is available we can completely reverse the vector through
// VPERMB: __m512i _mm512_permutexvar_epi8(__m512i idx, __m512i a);
const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31, //
32, 33, 34, 35, 36, 37, 38, 39, //
40, 41, 42, 43, 44, 45, 46, 47, //
48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63);
// Then we compute the mask for equal bytes.
return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
_mm512_permutexvar_epi8(indices, value));
#else
// We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
// '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
// So we only reverse groups of 8 bytes; these groups necessarily lie within a
// 16-byte lane.
// zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
// zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
const __m512i indices = _mm512_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7);
// Then we compute the mask for equal bytes. In this mask the bits within each
// byte are already in the right order, but the bytes of the mask themselves
// still need to be reversed; this is done with a bswap instruction.
return __builtin_bswap64(
_mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
_mm512_shuffle_epi8(value, indices)));
#endif
}
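// In both variants the returned mask is "big endian": its most significant
// bit reflects byte 0 of the vectors (the lowest address), so masks compare
// as integers the way the underlying buffers compare in memory order.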
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) != 0;
}
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_xor_epi64(a, b);
}
LIBC_INLINE bool is_zero(__m512i value) {
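  // '_mm512_test_epi32_mask(a, b)' sets one mask bit per 32-bit element with
  // (a AND b) non-zero; a zero mask therefore means the entire 512-bit value
  // is zero.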
return _mm512_test_epi32_mask(value, value) == 0;
}
template <>
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
size_t count) {
const __m512i head = load_and_xor_m512i(p1, p2, 0);
const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__
#pragma GCC diagnostic pop
} // namespace generic
} // namespace LIBC_NAMESPACE_DECL
#endif // LIBC_TARGET_ARCH_IS_X86
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H