examples/gaussian.rs - stdarch - Git at Google

 //! Hexagon HVX Gaussian 3x3 blur example
 //!
 //! This example demonstrates the use of Hexagon HVX intrinsics to implement
 //! a 3x3 Gaussian blur filter on unsigned 8-bit images.
 //!
 //! The 3x3 Gaussian kernel is:
 //!     1 2 1
 //!     2 4 2  / 16
 //!     1 2 1
 //!
 //! This is a separable filter: `[1 2 1]^T * [1 2 1] / 16`.
 //!
 //! On Hexagon targets, this implementation uses `HvxVectorPair` for widening
 //! arithmetic to achieve full precision in the Gaussian computation, avoiding
 //! the approximation errors of byte-averaging approaches. On other targets,
 //! it runs a reference implementation in pure Rust.
 //!
 //! # Building and Running (Hexagon)
 //!
 //! To build (requires Hexagon toolchain):
 //!
 //!     RUSTFLAGS="-C target-feature=+hvxv62,+hvx-length128b \
 //!         -C linker=hexagon-unknown-linux-musl-clang" \
 //!         cargo +nightly build -p stdarch_examples --bin gaussian \
 //!         --target hexagon-unknown-linux-musl \
 //!         -Zbuild-std -Zbuild-std-features=llvm-libunwind
 //!
 //! To run under QEMU:
 //!
 //!     qemu-hexagon -L <sysroot>/target/hexagon-unknown-linux-musl \
 //!         target/hexagon-unknown-linux-musl/debug/gaussian
 //!
 //! # Building and Running (Other targets)
 //!
 //!     cargo +nightly run -p stdarch_examples --bin gaussian

 #![cfg_attr(target_arch = "hexagon", feature(stdarch_hexagon))]
 #![cfg_attr(target_arch = "hexagon", feature(hexagon_target_feature))]
 #![allow(
     unsafe_op_in_unsafe_fn,
     clippy::unwrap_used,
     clippy::print_stdout,
     clippy::missing_docs_in_private_items,
     clippy::cast_possible_wrap,
     clippy::cast_ptr_alignment
 )]

 /// Image width - must be multiple of HVX vector length on Hexagon
 const WIDTH: usize = 256;
 const HEIGHT: usize = 16;

 // ============================================================================
 // Hexagon HVX implementation
 // ============================================================================

 #[cfg(target_arch = "hexagon")]
 mod hvx {
     #[cfg(not(target_feature = "hvx-length128b"))]
     use core_arch::arch::hexagon::v64::*;
     #[cfg(target_feature = "hvx-length128b")]
     use core_arch::arch::hexagon::v128::*;

     /// Vector length in bytes for HVX 128-byte mode
     #[cfg(target_feature = "hvx-length128b")]
     const VLEN: usize = 128;

     /// Vector length in bytes for HVX 64-byte mode
     #[cfg(not(target_feature = "hvx-length128b"))]
     const VLEN: usize = 64;

     /// Vertical 1-2-1 filter pass using HvxVectorPair widening arithmetic
     ///
     /// Computes: dst[x] = (row_above[x] + 2*center[x] + row_below[x] + 2) >> 2
     ///
     /// Uses HvxVectorPair to widen u8 to u16 for precise arithmetic, avoiding
     /// the rounding errors of byte-averaging approximations.
     ///
     /// # Safety
     ///
     /// - `src` must point to the center row with valid data at -stride and +stride
     /// - `dst` must point to a valid output buffer for `width` bytes
     /// - `width` must be a multiple of VLEN
     /// - All pointers must be HVX-aligned (128-byte for 128B mode)
     #[target_feature(enable = "hvxv62")]
     unsafe fn vertical_121_pass(src: *const u8, stride: isize, width: usize, dst: *mut u8) {
         let inp0 = src.offset(-stride) as *const HvxVector;
         let inp1 = src as *const HvxVector;
         let inp2 = src.offset(stride) as *const HvxVector;
         let outp = dst as *mut HvxVector;

         let n_chunks = width / VLEN;
         for i in 0..n_chunks {
             let above = *inp0.add(i);
             let center = *inp1.add(i);
             let below = *inp2.add(i);

             // Widen above + below to 16-bit using HvxVectorPair
             // q6_wh_vadd_vubvub: adds two u8 vectors, producing u16 results in a pair
             let above_plus_below: HvxVectorPair = q6_wh_vadd_vubvub(above, below);

             // Widen center * 2 (add center to itself)
             let center_x2: HvxVectorPair = q6_wh_vadd_vubvub(center, center);

             // Add them: (above + below) + (center * 2) = above + 2*center + below
             let sum: HvxVectorPair = q6_wh_vadd_whwh(above_plus_below, center_x2);

             // Extract high and low vectors from the pair (each contains u16 values)
             let sum_lo = q6_v_lo_w(sum); // Lower 64 elements as i16
             let sum_hi = q6_v_hi_w(sum); // Upper 64 elements as i16

             // Arithmetic right shift by 2 (divide by 4) with rounding
             // Add 2 for rounding before shift: (sum + 2) >> 2
             let two = q6_vh_vsplat_r(2);
             let sum_lo_rounded = q6_vh_vadd_vhvh(sum_lo, two);
             let sum_hi_rounded = q6_vh_vadd_vhvh(sum_hi, two);
             let shifted_lo = q6_vh_vasr_vhvh(sum_lo_rounded, two);
             let shifted_hi = q6_vh_vasr_vhvh(sum_hi_rounded, two);

             // Pack back to u8 with saturation: takes hi and lo halfword vectors,
             // saturates to u8, and interleaves them back to original order
             let result = q6_vub_vsat_vhvh(shifted_hi, shifted_lo);

             *outp.add(i) = result;
         }
     }

     /// Horizontal 1-2-1 filter pass using HvxVectorPair widening arithmetic
     ///
     /// Computes: dst[x] = (src[x-1] + 2*src[x] + src[x+1] + 2) >> 2
     ///
     /// Uses `valign` and `vlalign` to shift vectors by 1 byte for neighbor access,
     /// then HvxVectorPair for precise widening arithmetic.
     ///
     /// # Safety
     ///
     /// - `src` and `dst` must point to valid buffers of `width` bytes
     /// - `width` must be a multiple of VLEN
     /// - All pointers must be HVX-aligned
     #[target_feature(enable = "hvxv62")]
     unsafe fn horizontal_121_pass(src: *const u8, width: usize, dst: *mut u8) {
         let inp = src as *const HvxVector;
         let outp = dst as *mut HvxVector;

         let n_chunks = width / VLEN;
         let mut prev = q6_v_vzero();

         for i in 0..n_chunks {
             let curr = *inp.add(i);
             let next = if i + 1 < n_chunks {
                 *inp.add(i + 1)
             } else {
                 q6_v_vzero()
             };

             // Left neighbor (x-1): shift curr right by 1 byte, filling from prev
             let left = q6_v_vlalign_vvr(curr, prev, 1);

             // Right neighbor (x+1): shift curr left by 1 byte, filling from next
             let right = q6_v_valign_vvr(next, curr, 1);

             // Widen left + right to 16-bit
             let left_plus_right: HvxVectorPair = q6_wh_vadd_vubvub(left, right);

             // Widen center * 2
             let center_x2: HvxVectorPair = q6_wh_vadd_vubvub(curr, curr);

             // Add: left + 2*center + right
             let sum: HvxVectorPair = q6_wh_vadd_whwh(left_plus_right, center_x2);

             // Extract high and low vectors
             let sum_lo = q6_v_lo_w(sum);
             let sum_hi = q6_v_hi_w(sum);

             // Arithmetic right shift by 2 with rounding
             let two = q6_vh_vsplat_r(2);
             let sum_lo_rounded = q6_vh_vadd_vhvh(sum_lo, two);
             let sum_hi_rounded = q6_vh_vadd_vhvh(sum_hi, two);
             let shifted_lo = q6_vh_vasr_vhvh(sum_lo_rounded, two);
             let shifted_hi = q6_vh_vasr_vhvh(sum_hi_rounded, two);

             // Pack back to u8 with saturation
             let result = q6_vub_vsat_vhvh(shifted_hi, shifted_lo);

             *outp.add(i) = result;

             prev = curr;
         }
     }

     /// Apply Gaussian 3x3 blur to an entire image using separable filtering
     ///
     /// Two-pass approach:
     /// 1. Vertical pass: apply 1-2-1 filter across rows
     /// 2. Horizontal pass: apply 1-2-1 filter across columns
     ///
     /// Combined effect: 3x3 Gaussian kernel [1 2 1; 2 4 2; 1 2 1] / 16
     ///
     /// # Safety
     ///
     /// - `src` and `dst` must point to valid image buffers of `stride * height` bytes
     /// - `tmp` must point to a valid temporary buffer of `width` bytes, HVX-aligned
     /// - `width` must be a multiple of VLEN and >= VLEN
     /// - `stride` must be >= `width`
     /// - All buffers must be HVX-aligned (128-byte for 128B mode)
     #[target_feature(enable = "hvxv62")]
     pub unsafe fn gaussian3x3u8(
         src: *const u8,
         stride: usize,
         width: usize,
         height: usize,
         dst: *mut u8,
         tmp: *mut u8,
     ) {
         let stride_i = stride as isize;

         // Process interior rows (skip first and last which lack vertical neighbors)
         for y in 1..height - 1 {
             let row_src = src.offset(y as isize * stride_i);
             let row_dst = dst.offset(y as isize * stride_i);

             // Pass 1: vertical 1-2-1 into tmp
             vertical_121_pass(row_src, stride_i, width, tmp);

             // Pass 2: horizontal 1-2-1 from tmp into dst
             horizontal_121_pass(tmp, width, row_dst);
         }
     }
 }

 // ============================================================================
 // Reference implementation (works on all targets)
 // ============================================================================

 /// Reference implementation of Gaussian 3x3 blur
 ///
 /// Kernel:
 ///     1 2 1
 ///     2 4 2  / 16
 ///     1 2 1
 fn gaussian3x3u8_reference(src: &[u8], stride: usize, width: usize, height: usize, dst: &mut [u8]) {
     for y in 1..height - 1 {
         for x in 1..width - 1 {
             // Compute column sums (vertical 1-2-1 weights)
             let mut col = [0u32; 3];
             for i in 0..3 {
                 col[i] = 1 * src[(y - 1) * stride + x - 1 + i] as u32
                     + 2 * src[y * stride + x - 1 + i] as u32
                     + 1 * src[(y + 1) * stride + x - 1 + i] as u32;
             }
             // Apply horizontal 1-2-1 weights and normalize
             // (1*col[0] + 2*col[1] + 1*col[2] + 8) / 16
             dst[y * stride + x] = ((1 * col[0] + 2 * col[1] + 1 * col[2] + 8) >> 4) as u8;
         }
     }
 }

 /// Generate deterministic test pattern
 fn generate_test_pattern(buf: &mut [u8], width: usize, height: usize) {
     for y in 0..height {
         for x in 0..width {
             buf[y * width + x] = ((x + y * 7) % 256) as u8;
         }
     }
 }

 // ============================================================================
 // Main: runs HVX + reference on Hexagon, reference-only on other targets
 // ============================================================================

 #[cfg(target_arch = "hexagon")]
 fn main() {
     // Aligned buffers for HVX
     #[repr(align(128))]
     struct AlignedBuf<const N: usize>([u8; N]);

     let mut src = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
     let mut dst_hvx = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
     let mut tmp = AlignedBuf::<{ WIDTH }>([0u8; WIDTH]);
     let mut dst_ref = vec![0u8; WIDTH * HEIGHT];

     // Generate test pattern
     generate_test_pattern(&mut src.0, WIDTH, HEIGHT);

     // Run HVX implementation
     unsafe {
         hvx::gaussian3x3u8(
             src.0.as_ptr(),
             WIDTH,
             WIDTH,
             HEIGHT,
             dst_hvx.0.as_mut_ptr(),
             tmp.0.as_mut_ptr(),
         );
     }

     // Run reference
     gaussian3x3u8_reference(&src.0, WIDTH, WIDTH, HEIGHT, &mut dst_ref);

     // Verify HVX matches reference (allowing small rounding differences)
     let mut max_diff = 0i32;
     for y in 1..HEIGHT - 1 {
         for x in 1..WIDTH - 1 {
             let idx = y * WIDTH + x;
             let diff = (dst_hvx.0[idx] as i32 - dst_ref[idx] as i32).abs();
             max_diff = max_diff.max(diff);
             // Allow up to 1 LSB difference due to rounding
             assert!(
                 diff <= 1,
                 "HVX differs from reference at ({}, {}): hvx={}, ref={}, diff={}",
                 x,
                 y,
                 dst_hvx.0[idx],
                 dst_ref[idx],
                 diff
             );
         }
     }

     println!(
         "Gaussian 3x3 HVX test passed! Max difference from reference: {}",
         max_diff
     );
 }

 #[cfg(not(target_arch = "hexagon"))]
 fn main() {
     let mut src = vec![0u8; WIDTH * HEIGHT];
     let mut dst = vec![0u8; WIDTH * HEIGHT];

     // Generate test pattern
     generate_test_pattern(&mut src, WIDTH, HEIGHT);

     // Run reference implementation
     gaussian3x3u8_reference(&src, WIDTH, WIDTH, HEIGHT, &mut dst);

     // Verify output is non-trivial (blurred values differ from input)
     let mut changed = 0;
     for y in 1..HEIGHT - 1 {
         for x in 1..WIDTH - 1 {
             let idx = y * WIDTH + x;
             if src[idx] != dst[idx] {
                 changed += 1;
             }
         }
     }

     println!(
         "Gaussian 3x3 reference test passed! {} pixels changed by blur",
         changed
     );
 }
	//! Hexagon HVX Gaussian 3x3 blur example
	//!
	//! This example demonstrates the use of Hexagon HVX intrinsics to implement
	//! a 3x3 Gaussian blur filter on unsigned 8-bit images.
	//!
	//! The 3x3 Gaussian kernel is:
	//! 1 2 1
	//! 2 4 2 / 16
	//! 1 2 1
	//!
	//! This is a separable filter: `[1 2 1]^T * [1 2 1] / 16`.
	//!
	//! On Hexagon targets, this implementation uses `HvxVectorPair` for widening
	//! arithmetic to achieve full precision in the Gaussian computation, avoiding
	//! the approximation errors of byte-averaging approaches. On other targets,
	//! it runs a reference implementation in pure Rust.
	//!
	//! # Building and Running (Hexagon)
	//!
	//! To build (requires Hexagon toolchain):
	//!
	//! RUSTFLAGS="-C target-feature=+hvxv62,+hvx-length128b \
	//! -C linker=hexagon-unknown-linux-musl-clang" \
	//! cargo +nightly build -p stdarch_examples --bin gaussian \
	//! --target hexagon-unknown-linux-musl \
	//! -Zbuild-std -Zbuild-std-features=llvm-libunwind
	//!
	//! To run under QEMU:
	//!
	//! qemu-hexagon -L <sysroot>/target/hexagon-unknown-linux-musl \
	//! target/hexagon-unknown-linux-musl/debug/gaussian
	//!
	//! # Building and Running (Other targets)
	//!
	//! cargo +nightly run -p stdarch_examples --bin gaussian

	#![cfg_attr(target_arch = "hexagon", feature(stdarch_hexagon))]
	#![cfg_attr(target_arch = "hexagon", feature(hexagon_target_feature))]
	#![allow(
	unsafe_op_in_unsafe_fn,
	clippy::unwrap_used,
	clippy::print_stdout,
	clippy::missing_docs_in_private_items,
	clippy::cast_possible_wrap,
	clippy::cast_ptr_alignment
	)]

	/// Image width - must be multiple of HVX vector length on Hexagon
	const WIDTH: usize = 256;
	const HEIGHT: usize = 16;

	// ============================================================================
	// Hexagon HVX implementation
	// ============================================================================

	#[cfg(target_arch = "hexagon")]
	mod hvx {
	#[cfg(not(target_feature = "hvx-length128b"))]
	use core_arch::arch::hexagon::v64::*;
	#[cfg(target_feature = "hvx-length128b")]
	use core_arch::arch::hexagon::v128::*;

	/// Vector length in bytes for HVX 128-byte mode
	#[cfg(target_feature = "hvx-length128b")]
	const VLEN: usize = 128;

	/// Vector length in bytes for HVX 64-byte mode
	#[cfg(not(target_feature = "hvx-length128b"))]
	const VLEN: usize = 64;

	/// Vertical 1-2-1 filter pass using HvxVectorPair widening arithmetic
	///
	/// Computes: dst[x] = (row_above[x] + 2*center[x] + row_below[x] + 2) >> 2
	///
	/// Uses HvxVectorPair to widen u8 to u16 for precise arithmetic, avoiding
	/// the rounding errors of byte-averaging approximations.
	///
	/// # Safety
	///
	/// - `src` must point to the center row with valid data at -stride and +stride
	/// - `dst` must point to a valid output buffer for `width` bytes
	/// - `width` must be a multiple of VLEN
	/// - All pointers must be HVX-aligned (128-byte for 128B mode)
	#[target_feature(enable = "hvxv62")]
	unsafe fn vertical_121_pass(src: const u8, stride: isize, width: usize, dst: mut u8) {
	let inp0 = src.offset(-stride) as *const HvxVector;
	let inp1 = src as *const HvxVector;
	let inp2 = src.offset(stride) as *const HvxVector;
	let outp = dst as *mut HvxVector;

	let n_chunks = width / VLEN;
	for i in 0..n_chunks {
	let above = *inp0.add(i);
	let center = *inp1.add(i);
	let below = *inp2.add(i);

	// Widen above + below to 16-bit using HvxVectorPair
	// q6_wh_vadd_vubvub: adds two u8 vectors, producing u16 results in a pair
	let above_plus_below: HvxVectorPair = q6_wh_vadd_vubvub(above, below);

	// Widen center * 2 (add center to itself)
	let center_x2: HvxVectorPair = q6_wh_vadd_vubvub(center, center);

	// Add them: (above + below) + (center * 2) = above + 2*center + below
	let sum: HvxVectorPair = q6_wh_vadd_whwh(above_plus_below, center_x2);

	// Extract high and low vectors from the pair (each contains u16 values)
	let sum_lo = q6_v_lo_w(sum); // Lower 64 elements as i16
	let sum_hi = q6_v_hi_w(sum); // Upper 64 elements as i16

	// Arithmetic right shift by 2 (divide by 4) with rounding
	// Add 2 for rounding before shift: (sum + 2) >> 2
	let two = q6_vh_vsplat_r(2);
	let sum_lo_rounded = q6_vh_vadd_vhvh(sum_lo, two);
	let sum_hi_rounded = q6_vh_vadd_vhvh(sum_hi, two);
	let shifted_lo = q6_vh_vasr_vhvh(sum_lo_rounded, two);
	let shifted_hi = q6_vh_vasr_vhvh(sum_hi_rounded, two);

	// Pack back to u8 with saturation: takes hi and lo halfword vectors,
	// saturates to u8, and interleaves them back to original order
	let result = q6_vub_vsat_vhvh(shifted_hi, shifted_lo);

	*outp.add(i) = result;
	}
	}

	/// Horizontal 1-2-1 filter pass using HvxVectorPair widening arithmetic
	///
	/// Computes: dst[x] = (src[x-1] + 2*src[x] + src[x+1] + 2) >> 2
	///
	/// Uses `valign` and `vlalign` to shift vectors by 1 byte for neighbor access,
	/// then HvxVectorPair for precise widening arithmetic.
	///
	/// # Safety
	///
	/// - `src` and `dst` must point to valid buffers of `width` bytes
	/// - `width` must be a multiple of VLEN
	/// - All pointers must be HVX-aligned
	#[target_feature(enable = "hvxv62")]
	unsafe fn horizontal_121_pass(src: const u8, width: usize, dst: mut u8) {
	let inp = src as *const HvxVector;
	let outp = dst as *mut HvxVector;

	let n_chunks = width / VLEN;
	let mut prev = q6_v_vzero();

	for i in 0..n_chunks {
	let curr = *inp.add(i);
	let next = if i + 1 < n_chunks {
	*inp.add(i + 1)
	} else {
	q6_v_vzero()
	};

	// Left neighbor (x-1): shift curr right by 1 byte, filling from prev
	let left = q6_v_vlalign_vvr(curr, prev, 1);

	// Right neighbor (x+1): shift curr left by 1 byte, filling from next
	let right = q6_v_valign_vvr(next, curr, 1);

	// Widen left + right to 16-bit
	let left_plus_right: HvxVectorPair = q6_wh_vadd_vubvub(left, right);

	// Widen center * 2
	let center_x2: HvxVectorPair = q6_wh_vadd_vubvub(curr, curr);

	// Add: left + 2*center + right
	let sum: HvxVectorPair = q6_wh_vadd_whwh(left_plus_right, center_x2);

	// Extract high and low vectors
	let sum_lo = q6_v_lo_w(sum);
	let sum_hi = q6_v_hi_w(sum);

	// Arithmetic right shift by 2 with rounding
	let two = q6_vh_vsplat_r(2);
	let sum_lo_rounded = q6_vh_vadd_vhvh(sum_lo, two);
	let sum_hi_rounded = q6_vh_vadd_vhvh(sum_hi, two);
	let shifted_lo = q6_vh_vasr_vhvh(sum_lo_rounded, two);
	let shifted_hi = q6_vh_vasr_vhvh(sum_hi_rounded, two);

	// Pack back to u8 with saturation
	let result = q6_vub_vsat_vhvh(shifted_hi, shifted_lo);

	*outp.add(i) = result;

	prev = curr;
	}
	}

	/// Apply Gaussian 3x3 blur to an entire image using separable filtering
	///
	/// Two-pass approach:
	/// 1. Vertical pass: apply 1-2-1 filter across rows
	/// 2. Horizontal pass: apply 1-2-1 filter across columns
	///
	/// Combined effect: 3x3 Gaussian kernel [1 2 1; 2 4 2; 1 2 1] / 16
	///
	/// # Safety
	///
	/// - `src` and `dst` must point to valid image buffers of `stride * height` bytes
	/// - `tmp` must point to a valid temporary buffer of `width` bytes, HVX-aligned
	/// - `width` must be a multiple of VLEN and >= VLEN
	/// - `stride` must be >= `width`
	/// - All buffers must be HVX-aligned (128-byte for 128B mode)
	#[target_feature(enable = "hvxv62")]
	pub unsafe fn gaussian3x3u8(
	src: *const u8,
	stride: usize,
	width: usize,
	height: usize,
	dst: *mut u8,
	tmp: *mut u8,
	) {
	let stride_i = stride as isize;

	// Process interior rows (skip first and last which lack vertical neighbors)
	for y in 1..height - 1 {
	let row_src = src.offset(y as isize * stride_i);
	let row_dst = dst.offset(y as isize * stride_i);

	// Pass 1: vertical 1-2-1 into tmp
	vertical_121_pass(row_src, stride_i, width, tmp);

	// Pass 2: horizontal 1-2-1 from tmp into dst
	horizontal_121_pass(tmp, width, row_dst);
	}
	}
	}

	// ============================================================================
	// Reference implementation (works on all targets)
	// ============================================================================

	/// Reference implementation of Gaussian 3x3 blur
	///
	/// Kernel:
	/// 1 2 1
	/// 2 4 2 / 16
	/// 1 2 1
	fn gaussian3x3u8_reference(src: &[u8], stride: usize, width: usize, height: usize, dst: &mut [u8]) {
	for y in 1..height - 1 {
	for x in 1..width - 1 {
	// Compute column sums (vertical 1-2-1 weights)
	let mut col = [0u32; 3];
	for i in 0..3 {
	col[i] = 1 * src[(y - 1) * stride + x - 1 + i] as u32
	+ 2 * src[y * stride + x - 1 + i] as u32
	+ 1 * src[(y + 1) * stride + x - 1 + i] as u32;
	}
	// Apply horizontal 1-2-1 weights and normalize
	// (1col[0] + 2col[1] + 1*col[2] + 8) / 16
	dst[y * stride + x] = ((1 * col[0] + 2 * col[1] + 1 * col[2] + 8) >> 4) as u8;
	}
	}
	}

	/// Generate deterministic test pattern
	fn generate_test_pattern(buf: &mut [u8], width: usize, height: usize) {
	for y in 0..height {
	for x in 0..width {
	buf[y * width + x] = ((x + y * 7) % 256) as u8;
	}
	}
	}

	// ============================================================================
	// Main: runs HVX + reference on Hexagon, reference-only on other targets
	// ============================================================================

	#[cfg(target_arch = "hexagon")]
	fn main() {
	// Aligned buffers for HVX
	#[repr(align(128))]
	struct AlignedBuf<const N: usize>([u8; N]);

	let mut src = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
	let mut dst_hvx = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
	let mut tmp = AlignedBuf::<{ WIDTH }>([0u8; WIDTH]);
	let mut dst_ref = vec![0u8; WIDTH * HEIGHT];

	// Generate test pattern
	generate_test_pattern(&mut src.0, WIDTH, HEIGHT);

	// Run HVX implementation
	unsafe {
	hvx::gaussian3x3u8(
	src.0.as_ptr(),
	WIDTH,
	WIDTH,
	HEIGHT,
	dst_hvx.0.as_mut_ptr(),
	tmp.0.as_mut_ptr(),
	);
	}

	// Run reference
	gaussian3x3u8_reference(&src.0, WIDTH, WIDTH, HEIGHT, &mut dst_ref);

	// Verify HVX matches reference (allowing small rounding differences)
	let mut max_diff = 0i32;
	for y in 1..HEIGHT - 1 {
	for x in 1..WIDTH - 1 {
	let idx = y * WIDTH + x;
	let diff = (dst_hvx.0[idx] as i32 - dst_ref[idx] as i32).abs();
	max_diff = max_diff.max(diff);
	// Allow up to 1 LSB difference due to rounding
	assert!(
	diff <= 1,
	"HVX differs from reference at ({}, {}): hvx={}, ref={}, diff={}",
	x,
	y,
	dst_hvx.0[idx],
	dst_ref[idx],
	diff
	);
	}
	}

	println!(
	"Gaussian 3x3 HVX test passed! Max difference from reference: {}",
	max_diff
	);
	}

	#[cfg(not(target_arch = "hexagon"))]
	fn main() {
	let mut src = vec![0u8; WIDTH * HEIGHT];
	let mut dst = vec![0u8; WIDTH * HEIGHT];

	// Generate test pattern
	generate_test_pattern(&mut src, WIDTH, HEIGHT);

	// Run reference implementation
	gaussian3x3u8_reference(&src, WIDTH, WIDTH, HEIGHT, &mut dst);

	// Verify output is non-trivial (blurred values differ from input)
	let mut changed = 0;
	for y in 1..HEIGHT - 1 {
	for x in 1..WIDTH - 1 {
	let idx = y * WIDTH + x;
	if src[idx] != dst[idx] {
	changed += 1;
	}
	}
	}

	println!(
	"Gaussian 3x3 reference test passed! {} pixels changed by blur",
	changed
	);
	}