; llvm/test/Transforms/CodeGenPrepare/AArch64/zext-to-shuffle.ll - rust-lang/llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -codegenprepare -S %s | FileCheck %s

 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-ios"

 ; It's profitable to convert the zext to a shuffle, which in turn will be
 ; lowered to 4 tbl instructions. The masks are materialized outside the loop.
 ;
 ; Test input: a single-block loop that loads a <16 x i8> vector from %src at
 ; byte offset %iv, zero-extends it to <16 x i32>, and stores the result to
 ; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
 ; exits once the incremented value equals 128.
 ;
 ; NOTE(review): the assertions below expect the zext to be left in place (no
 ; shufflevector appears in the expected output). Presumably they record the
 ; baseline behavior before the transform described above is implemented --
 ; confirm against the pass and regenerate if needed.
 define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: @zext_v16i8_to_v16i32_in_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
 ; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>*
 ; CHECK-NEXT:    store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop

 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
   %src.gep = getelementptr i8, i8* %src, i64 %iv
   %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
   ; No explicit alignment is written here; the expected output shows align 16.
   %load = load <16 x i8>, <16 x i8>* %src.gep.cast
   ; The widening operation under test.
   %ext = zext <16 x i8> %load to <16 x i32>
   ; Destination address: %dst advanced by %iv i32 elements.
   %dst.gep = getelementptr i32, i32* %dst, i64 %iv
   %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
   ; No explicit alignment is written here; the expected output shows align 64.
   store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
   %iv.next = add nuw i64 %iv, 16
   %ec = icmp eq i64 %iv.next, 128
   br i1 %ec, label %exit, label %loop

 exit:
   ret void
 }

 ; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
 ; require more instructions than lowering zext directly.
 ;
 ; Test input: straight-line code (a single entry block, no loop) that loads a
 ; <16 x i8> vector from %src, zero-extends it to <16 x i32>, and stores the
 ; result to %dst. The assertions expect the zext to be kept as-is.
 define void @zext_v16i8_to_v16i32_no_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: @zext_v16i8_to_v16i32_no_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SRC_CAST:%.*]] = bitcast i8* [[SRC:%.*]] to <16 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_CAST]], align 16
 ; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[DST_CAST:%.*]] = bitcast i32* [[DST:%.*]] to <16 x i32>*
 ; CHECK-NEXT:    store <16 x i32> [[EXT]], <16 x i32>* [[DST_CAST]], align 64
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %src.cast = bitcast i8* %src to <16 x i8>*
   ; No explicit alignment is written here; the expected output shows align 16.
   %load = load <16 x i8>, <16 x i8>* %src.cast
   ; The widening operation under test.
   %ext = zext <16 x i8> %load to <16 x i32>
   %dst.cast = bitcast i32* %dst to <16 x i32>*
   ; No explicit alignment is written here; the expected output shows align 64.
   store <16 x i32> %ext, <16 x i32>* %dst.cast
   ret void
 }

 ; Test input: a single-block loop that loads a <16 x i8> vector from %src at
 ; byte offset %iv, zero-extends it to <16 x i16>, and stores the result to
 ; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
 ; exits once the incremented value equals 128.
 ; The assertions expect the zext to be kept as-is.
 define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) {
 ; CHECK-LABEL: @zext_v16i8_to_v16i16_in_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
 ; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i16>
 ; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i16, i16* [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i16* [[DST_GEP]] to <16 x i16>*
 ; CHECK-NEXT:    store <16 x i16> [[EXT]], <16 x i16>* [[DST_GEP_CAST]], align 32
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop

 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
   %src.gep = getelementptr i8, i8* %src, i64 %iv
   %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
   ; No explicit alignment is written here; the expected output shows align 16.
   %load = load <16 x i8>, <16 x i8>* %src.gep.cast
   ; The widening operation under test (i8 -> i16 lanes).
   %ext = zext <16 x i8> %load to <16 x i16>
   ; Destination address: %dst advanced by %iv i16 elements.
   %dst.gep = getelementptr i16, i16* %dst, i64 %iv
   %dst.gep.cast = bitcast i16* %dst.gep to <16 x i16>*
   ; No explicit alignment is written here; the expected output shows align 32.
   store <16 x i16> %ext, <16 x i16>* %dst.gep.cast
   %iv.next = add nuw i64 %iv, 16
   %ec = icmp eq i64 %iv.next, 128
   br i1 %ec, label %exit, label %loop

 exit:
   ret void
 }

 ; Test input: a single-block loop that loads a <8 x i8> vector from %src at
 ; byte offset %iv, zero-extends it to <8 x i32>, and stores the result to
 ; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
 ; exits once the incremented value equals 128.
 ; The assertions expect the zext to be kept as-is.
 ;
 ; NOTE(review): the induction step is 16 although each iteration handles only
 ; 8 lanes. This is likely irrelevant to what the test exercises -- confirm.
 define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
 ; CHECK-LABEL: @zext_v8i8_to_v8i32_in_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <8 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[SRC_GEP_CAST]], align 8
 ; CHECK-NEXT:    [[EXT:%.*]] = zext <8 x i8> [[LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[EXT]], <8 x i32>* [[DST_GEP_CAST]], align 32
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop

 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
   %src.gep = getelementptr i8, i8* %src, i64 %iv
   %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
   ; No explicit alignment is written here; the expected output shows align 8.
   %load = load <8 x i8>, <8 x i8>* %src.gep.cast
   ; The widening operation under test (8 lanes, i8 -> i32).
   %ext = zext <8 x i8> %load to <8 x i32>
   ; Destination address: %dst advanced by %iv i32 elements.
   %dst.gep = getelementptr i32, i32* %dst, i64 %iv
   %dst.gep.cast = bitcast i32* %dst.gep to <8 x i32>*
   ; No explicit alignment is written here; the expected output shows align 32.
   store <8 x i32> %ext, <8 x i32>* %dst.gep.cast
   %iv.next = add nuw i64 %iv, 16
   %ec = icmp eq i64 %iv.next, 128
   br i1 %ec, label %exit, label %loop

 exit:
   ret void
 }

 ; Test input: a single-block loop that loads a <16 x i8> vector from %src at
 ; byte offset %iv, zero-extends it to <16 x i64>, and stores the result to
 ; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
 ; exits once the incremented value equals 128.
 ; The assertions expect the zext to be kept as-is.
 define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) {
 ; CHECK-LABEL: @zext_v16i8_to_v16i64_in_loop(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
 ; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64>
 ; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i64, i64* [[DST:%.*]], i64 [[IV]]
 ; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i64* [[DST_GEP]] to <16 x i64>*
 ; CHECK-NEXT:    store <16 x i64> [[EXT]], <16 x i64>* [[DST_GEP_CAST]], align 128
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop

 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
   %src.gep = getelementptr i8, i8* %src, i64 %iv
   %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
   ; No explicit alignment is written here; the expected output shows align 16.
   %load = load <16 x i8>, <16 x i8>* %src.gep.cast
   ; The widening operation under test (i8 -> i64 lanes).
   %ext = zext <16 x i8> %load to <16 x i64>
   ; Destination address: %dst advanced by %iv i64 elements.
   %dst.gep = getelementptr i64, i64* %dst, i64 %iv
   %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
   ; No explicit alignment is written here; the expected output shows align 128.
   store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
   %iv.next = add nuw i64 %iv, 16
   %ec = icmp eq i64 %iv.next, 128
   br i1 %ec, label %exit, label %loop

 exit:
   ret void
 }
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -codegenprepare -S %s | FileCheck %s

	target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
	target triple = "arm64-apple-ios"

	; It's profitable to convert the zext to a shuffle, which in turn will be
	; lowered to 4 tbl instructions. The masks are materialized outside the loop.
	;
	; Test input: a single-block loop that loads a <16 x i8> vector from %src at
	; byte offset %iv, zero-extends it to <16 x i32>, and stores the result to
	; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
	; exits once the incremented value equals 128.
	;
	; NOTE(review): the assertions below expect the zext to be left in place (no
	; shufflevector appears in the expected output). Presumably they record the
	; baseline behavior before the transform described above is implemented --
	; confirm against the pass.
	; NOTE(review): a function with this name is also defined earlier in this
	; file; a module cannot define the same function twice, so one of the two
	; copies has to be removed or renamed before the file will parse.
	define void @zext_v16i8_to_v16i32_in_loop(i8* %src, i32* %dst) {
	; CHECK-LABEL: @zext_v16i8_to_v16i32_in_loop(
	; CHECK-NEXT:  entry:
	; CHECK-NEXT:    br label [[LOOP:%.*]]
	; CHECK:       loop:
	; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
	; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
	; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
	; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
	; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <16 x i32>*
	; CHECK-NEXT:    store <16 x i32> [[EXT]], <16 x i32>* [[DST_GEP_CAST]], align 64
	; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
	; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
	; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
	; CHECK:       exit:
	; CHECK-NEXT:    ret void
	;
	entry:
	  br label %loop

	loop:
	  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	  ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
	  %src.gep = getelementptr i8, i8* %src, i64 %iv
	  %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
	  ; No explicit alignment is written here; the expected output shows align 16.
	  %load = load <16 x i8>, <16 x i8>* %src.gep.cast
	  ; The widening operation under test.
	  %ext = zext <16 x i8> %load to <16 x i32>
	  ; Destination address: %dst advanced by %iv i32 elements.
	  %dst.gep = getelementptr i32, i32* %dst, i64 %iv
	  %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
	  ; No explicit alignment is written here; the expected output shows align 64.
	  store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
	  %iv.next = add nuw i64 %iv, 16
	  %ec = icmp eq i64 %iv.next, 128
	  br i1 %ec, label %exit, label %loop

	exit:
	  ret void
	}

	; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
	; require more instructions than lowering zext directly.
	;
	; Test input: straight-line code (a single entry block, no loop) that loads a
	; <16 x i8> vector from %src, zero-extends it to <16 x i32>, and stores the
	; result to %dst. The assertions expect the zext to be kept as-is.
	;
	; NOTE(review): a function with this name is also defined earlier in this
	; file; a module cannot define the same function twice, so one of the two
	; copies has to be removed or renamed before the file will parse.
	define void @zext_v16i8_to_v16i32_no_loop(i8* %src, i32* %dst) {
	; CHECK-LABEL: @zext_v16i8_to_v16i32_no_loop(
	; CHECK-NEXT:  entry:
	; CHECK-NEXT:    [[SRC_CAST:%.*]] = bitcast i8* [[SRC:%.*]] to <16 x i8>*
	; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_CAST]], align 16
	; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
	; CHECK-NEXT:    [[DST_CAST:%.*]] = bitcast i32* [[DST:%.*]] to <16 x i32>*
	; CHECK-NEXT:    store <16 x i32> [[EXT]], <16 x i32>* [[DST_CAST]], align 64
	; CHECK-NEXT:    ret void
	;
	entry:
	  %src.cast = bitcast i8* %src to <16 x i8>*
	  ; No explicit alignment is written here; the expected output shows align 16.
	  %load = load <16 x i8>, <16 x i8>* %src.cast
	  ; The widening operation under test.
	  %ext = zext <16 x i8> %load to <16 x i32>
	  %dst.cast = bitcast i32* %dst to <16 x i32>*
	  ; No explicit alignment is written here; the expected output shows align 64.
	  store <16 x i32> %ext, <16 x i32>* %dst.cast
	  ret void
	}

	; Test input: a single-block loop that loads a <16 x i8> vector from %src at
	; byte offset %iv, zero-extends it to <16 x i16>, and stores the result to
	; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
	; exits once the incremented value equals 128.
	; The assertions expect the zext to be kept as-is.
	;
	; NOTE(review): a function with this name is also defined earlier in this
	; file; a module cannot define the same function twice, so one of the two
	; copies has to be removed or renamed before the file will parse.
	define void @zext_v16i8_to_v16i16_in_loop(i8* %src, i16* %dst) {
	; CHECK-LABEL: @zext_v16i8_to_v16i16_in_loop(
	; CHECK-NEXT:  entry:
	; CHECK-NEXT:    br label [[LOOP:%.*]]
	; CHECK:       loop:
	; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
	; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
	; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
	; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i16>
	; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i16, i16* [[DST:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i16* [[DST_GEP]] to <16 x i16>*
	; CHECK-NEXT:    store <16 x i16> [[EXT]], <16 x i16>* [[DST_GEP_CAST]], align 32
	; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
	; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
	; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
	; CHECK:       exit:
	; CHECK-NEXT:    ret void
	;
	entry:
	  br label %loop

	loop:
	  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	  ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
	  %src.gep = getelementptr i8, i8* %src, i64 %iv
	  %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
	  ; No explicit alignment is written here; the expected output shows align 16.
	  %load = load <16 x i8>, <16 x i8>* %src.gep.cast
	  ; The widening operation under test (i8 -> i16 lanes).
	  %ext = zext <16 x i8> %load to <16 x i16>
	  ; Destination address: %dst advanced by %iv i16 elements.
	  %dst.gep = getelementptr i16, i16* %dst, i64 %iv
	  %dst.gep.cast = bitcast i16* %dst.gep to <16 x i16>*
	  ; No explicit alignment is written here; the expected output shows align 32.
	  store <16 x i16> %ext, <16 x i16>* %dst.gep.cast
	  %iv.next = add nuw i64 %iv, 16
	  %ec = icmp eq i64 %iv.next, 128
	  br i1 %ec, label %exit, label %loop

	exit:
	  ret void
	}

	; Test input: a single-block loop that loads a <8 x i8> vector from %src at
	; byte offset %iv, zero-extends it to <8 x i32>, and stores the result to
	; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
	; exits once the incremented value equals 128.
	; The assertions expect the zext to be kept as-is.
	;
	; NOTE(review): the induction step is 16 although each iteration handles only
	; 8 lanes. This is likely irrelevant to what the test exercises -- confirm.
	; NOTE(review): a function with this name is also defined earlier in this
	; file; a module cannot define the same function twice, so one of the two
	; copies has to be removed or renamed before the file will parse.
	define void @zext_v8i8_to_v8i32_in_loop(i8* %src, i32* %dst) {
	; CHECK-LABEL: @zext_v8i8_to_v8i32_in_loop(
	; CHECK-NEXT:  entry:
	; CHECK-NEXT:    br label [[LOOP:%.*]]
	; CHECK:       loop:
	; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
	; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <8 x i8>*
	; CHECK-NEXT:    [[LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[SRC_GEP_CAST]], align 8
	; CHECK-NEXT:    [[EXT:%.*]] = zext <8 x i8> [[LOAD]] to <8 x i32>
	; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i32* [[DST_GEP]] to <8 x i32>*
	; CHECK-NEXT:    store <8 x i32> [[EXT]], <8 x i32>* [[DST_GEP_CAST]], align 32
	; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
	; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
	; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
	; CHECK:       exit:
	; CHECK-NEXT:    ret void
	;
	entry:
	  br label %loop

	loop:
	  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	  ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
	  %src.gep = getelementptr i8, i8* %src, i64 %iv
	  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
	  ; No explicit alignment is written here; the expected output shows align 8.
	  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
	  ; The widening operation under test (8 lanes, i8 -> i32).
	  %ext = zext <8 x i8> %load to <8 x i32>
	  ; Destination address: %dst advanced by %iv i32 elements.
	  %dst.gep = getelementptr i32, i32* %dst, i64 %iv
	  %dst.gep.cast = bitcast i32* %dst.gep to <8 x i32>*
	  ; No explicit alignment is written here; the expected output shows align 32.
	  store <8 x i32> %ext, <8 x i32>* %dst.gep.cast
	  %iv.next = add nuw i64 %iv, 16
	  %ec = icmp eq i64 %iv.next, 128
	  br i1 %ec, label %exit, label %loop

	exit:
	  ret void
	}

	; Test input: a single-block loop that loads a <16 x i8> vector from %src at
	; byte offset %iv, zero-extends it to <16 x i64>, and stores the result to
	; %dst at element index %iv. %iv starts at 0 and advances by 16; the loop
	; exits once the incremented value equals 128.
	; The assertions expect the zext to be kept as-is.
	;
	; NOTE(review): a function with this name is also defined earlier in this
	; file; a module cannot define the same function twice, so one of the two
	; copies has to be removed or renamed before the file will parse.
	define void @zext_v16i8_to_v16i64_in_loop(i8* %src, i64* %dst) {
	; CHECK-LABEL: @zext_v16i8_to_v16i64_in_loop(
	; CHECK-NEXT:  entry:
	; CHECK-NEXT:    br label [[LOOP:%.*]]
	; CHECK:       loop:
	; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
	; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[SRC_GEP_CAST:%.*]] = bitcast i8* [[SRC_GEP]] to <16 x i8>*
	; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[SRC_GEP_CAST]], align 16
	; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64>
	; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i64, i64* [[DST:%.*]], i64 [[IV]]
	; CHECK-NEXT:    [[DST_GEP_CAST:%.*]] = bitcast i64* [[DST_GEP]] to <16 x i64>*
	; CHECK-NEXT:    store <16 x i64> [[EXT]], <16 x i64>* [[DST_GEP_CAST]], align 128
	; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
	; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
	; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
	; CHECK:       exit:
	; CHECK-NEXT:    ret void
	;
	entry:
	  br label %loop

	loop:
	  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
	  ; Source address: %src advanced by %iv bytes, reinterpreted as a vector pointer.
	  %src.gep = getelementptr i8, i8* %src, i64 %iv
	  %src.gep.cast = bitcast i8* %src.gep to <16 x i8>*
	  ; No explicit alignment is written here; the expected output shows align 16.
	  %load = load <16 x i8>, <16 x i8>* %src.gep.cast
	  ; The widening operation under test (i8 -> i64 lanes).
	  %ext = zext <16 x i8> %load to <16 x i64>
	  ; Destination address: %dst advanced by %iv i64 elements.
	  %dst.gep = getelementptr i64, i64* %dst, i64 %iv
	  %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
	  ; No explicit alignment is written here; the expected output shows align 128.
	  store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
	  %iv.next = add nuw i64 %iv, 16
	  %ec = icmp eq i64 %iv.next, 128
	  br i1 %ec, label %exit, label %loop

	exit:
	  ret void
	}