; llvm/test/CodeGen/X86/pr33954.ll - rust-lang/llvm-project - Git at Google

 ; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This test checks that x86-cmov-converter optimization does not transform CMOV
 ;; instruction when the gain (in cycles) of converting to branch is less than
 ;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
 ;;
 ;; Test was created using the following command line:
 ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
 ;; Where foo.c is:
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;int bar(int *a, int *b, int n) {
 ;;  int sum = 0;
 ;;  for (int i = 0; i < n; ++i) {
 ;;    int x = a[i] * a[i+1] * a[i+2];
 ;;    int y = b[i] * b[i+1];
 ;;    sum += y > x ? x : 0;
 ;;  }
 ;;  return sum;
 ;;}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Adding a test to the above function shows code with CMOV is 25% faster than
 ;; the code with branch.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;#define N 10000
 ;;int A[N];
 ;;int B[N];
 ;;
 ;;
 ;;
 ;;int main () {
 ;;  for (int i=0; i< N; ++i) {
 ;;    A[i] = i%4;
 ;;    B[i] = i%5;
 ;;  }
 ;;  int sum = 0;
 ;;  for (int i=0; i< N*10; ++i)
 ;;    sum += bar(A, B, N);
 ;;  return sum;
 ;;}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ; CHECK-NOT: jg
 ; CHECK: cmovle
 ; i32 @bar(ptr %a, ptr %b, i32 %n)
 ;
 ; Returns 0 when %n <= 0 (signed). Otherwise, for each i in [0, %n), forms
 ;   x = a[i] * a[i+1] * a[i+2]
 ;   y = b[i] * b[i+1]
 ; and adds x to a running i32 sum when y > x (signed compare), else adds 0.
 ; The final sum is returned. Both arrays are only read.
 define i32 @bar(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) #0 {
 entry:
   ; Skip the loop entirely when there are no iterations to run.
   %cmp30 = icmp sgt i32 %n, 0
   br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup

 for.body.preheader:                               ; preds = %entry
   ; Preload a[0], a[1] and b[0]. They seed the loop-carried phis, so each
   ; iteration only loads one new element from each array.
   %.pre = load i32, ptr %a, align 4
   %arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 1
   %.pre34 = load i32, ptr %arrayidx2.phi.trans.insert, align 4
   %.pre35 = load i32, ptr %b, align 4
   ; Trip count widened to i64 to match the induction variable.
   %wide.trip.count = zext i32 %n to i64
   br label %for.body

 for.cond.cleanup:                                 ; preds = %for.body, %entry
   ; Result is 0 when the loop was skipped, otherwise the final running sum.
   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
   ret i32 %sum.0.lcssa

 for.body:                                         ; preds = %for.body, %for.body.preheader
   ; Loop-carried values at iteration i (i = %indvars.iv):
   ;   %0 = b[i], %1 = a[i+1], %2 = a[i], %sum.032 = sum so far.
   %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
   %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
   %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   ; x = a[i+1] * a[i] * a[i+2]; only a[i+2] is loaded here.
   %mul = mul nsw i32 %1, %2
   %3 = add nuw nsw i64 %indvars.iv, 2
   %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %3
   %4 = load i32, ptr %arrayidx5, align 4
   %mul6 = mul nsw i32 %mul, %4
   ; y = b[i+1] * b[i]; only b[i+1] is loaded here.
   %arrayidx11 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv.next
   %5 = load i32, ptr %arrayidx11, align 4
   %mul12 = mul nsw i32 %5, %0
   ; The select this test is about: contributes x when y > x, otherwise 0.
   ; The FileCheck directives preceding this function expect it to be emitted
   ; as a conditional move rather than a conditional jump.
   %cmp13 = icmp sgt i32 %mul12, %mul6
   %cond = select i1 %cmp13, i32 %mul6, i32 0
   %add14 = add nsw i32 %cond, %sum.032
   ; Leave the loop once i+1 equals the widened trip count.
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }

 ; Attribute group #0, attached to @bar: selects "skylake" as the target CPU.
 attributes #0 = {"target-cpu"="skylake"}

 ; Named module-level metadata: the module flags list and the producer ident.
 !llvm.module.flags = !{!0, !1}
 !llvm.ident = !{!2}

 ; !0 and !1 are the module flag entries ("wchar_size" and "PIC Level", both
 ; with value 2); !2 is the producing compiler's version string.
 !0 = !{i32 1, !"wchar_size", i32 2}
 !1 = !{i32 7, !"PIC Level", i32 2}
 !2 = !{!"clang version 5.0.0 (trunk)"}
	; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	;; This test checks that x86-cmov-converter optimization does not transform CMOV
	;; instruction when the gain (in cycles) of converting to branch is less than
	;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
	;;
	;; Test was created using the following command line:
	;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
	;; Where foo.c is:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	;;int bar(int *a, int *b, int n) {
	;; int sum = 0;
	;; for (int i = 0; i < n; ++i) {
	;; int x = a[i] * a[i+1] * a[i+2];
	;; int y = b[i] * b[i+1];
	;; sum += y > x ? x : 0;
	;; }
	;; return sum;
	;;}
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	;; Adding a test to the above function shows code with CMOV is 25% faster than
	;; the code with branch.
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	;;#define N 10000
	;;int A[N];
	;;int B[N];
	;;
	;;
	;;
	;;int main () {
	;; for (int i=0; i< N; ++i) {
	;; A[i] = i%4;
	;; B[i] = i%5;
	;; }
	;; int sum = 0;
	;; for (int i=0; i< N*10; ++i)
	;; sum += bar(A, B, N);
	;; return sum;
	;;}
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	; CHECK-NOT: jg
	; CHECK: cmovle
	; NOTE(review): @bar is also defined earlier in this file. A module cannot
	; define the same symbol twice, so this second copy looks like an
	; extraction duplicate - confirm before feeding this file to llc.
	;
	; i32 @bar(ptr %a, ptr %b, i32 %n)
	;
	; Returns 0 when %n <= 0 (signed). Otherwise, for each i in [0, %n), forms
	;   x = a[i] * a[i+1] * a[i+2]
	;   y = b[i] * b[i+1]
	; and adds x to a running i32 sum when y > x (signed compare), else adds 0.
	; The final sum is returned. Both arrays are only read.
	define i32 @bar(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) #0 {
	entry:
	; Skip the loop entirely when there are no iterations to run.
	%cmp30 = icmp sgt i32 %n, 0
	br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup

	for.body.preheader: ; preds = %entry
	; Preload a[0], a[1] and b[0]. They seed the loop-carried phis, so each
	; iteration only loads one new element from each array.
	%.pre = load i32, ptr %a, align 4
	%arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 1
	%.pre34 = load i32, ptr %arrayidx2.phi.trans.insert, align 4
	%.pre35 = load i32, ptr %b, align 4
	; Trip count widened to i64 to match the induction variable.
	%wide.trip.count = zext i32 %n to i64
	br label %for.body

	for.cond.cleanup: ; preds = %for.body, %entry
	; Result is 0 when the loop was skipped, otherwise the final running sum.
	%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
	ret i32 %sum.0.lcssa

	for.body: ; preds = %for.body, %for.body.preheader
	; Loop-carried values at iteration i (i = %indvars.iv):
	;   %0 = b[i], %1 = a[i+1], %2 = a[i], %sum.032 = sum so far.
	%0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
	%1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
	%2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
	%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
	%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
	%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
	; x = a[i+1] * a[i] * a[i+2]; only a[i+2] is loaded here.
	%mul = mul nsw i32 %1, %2
	%3 = add nuw nsw i64 %indvars.iv, 2
	%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %3
	%4 = load i32, ptr %arrayidx5, align 4
	%mul6 = mul nsw i32 %mul, %4
	; y = b[i+1] * b[i]; only b[i+1] is loaded here.
	%arrayidx11 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv.next
	%5 = load i32, ptr %arrayidx11, align 4
	%mul12 = mul nsw i32 %5, %0
	; The select this test is about: contributes x when y > x, otherwise 0.
	; The FileCheck directives preceding this function expect it to be emitted
	; as a conditional move rather than a conditional jump.
	%cmp13 = icmp sgt i32 %mul12, %mul6
	%cond = select i1 %cmp13, i32 %mul6, i32 0
	%add14 = add nsw i32 %cond, %sum.032
	; Leave the loop once i+1 equals the widened trip count.
	%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
	br i1 %exitcond, label %for.cond.cleanup, label %for.body
	}

	; NOTE(review): attribute group #0, the named metadata and !0-!2 below repeat
	; definitions that already appear earlier in this file - likely an
	; extraction duplicate; confirm before use.
	;
	; Attribute group #0, attached to @bar: selects "skylake" as the target CPU.
	attributes #0 = {"target-cpu"="skylake"}

	; Named module-level metadata: the module flags list and the producer ident.
	!llvm.module.flags = !{!0, !1}
	!llvm.ident = !{!2}

	; !0 and !1 are the module flag entries ("wchar_size" and "PIC Level", both
	; with value 2); !2 is the producing compiler's version string.
	!0 = !{i32 1, !"wchar_size", i32 2}
	!1 = !{i32 7, !"PIC Level", i32 2}
	!2 = !{!"clang version 5.0.0 (trunk)"}