| ; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s | 
 |  | 
 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | 
 | ;; This test checks that x86-cmov-converter optimization does not transform CMOV | 
 | ;; instruction when the gain (in cycles) of converting to branch is less than | 
 | ;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4"). | 
 | ;; | 
 | ;; Test was created using the following command line: | 
 | ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o - | 
 | ;; Where foo.c is: | 
 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | 
 | ;;int bar(int *a, int *b, int n) { | 
 | ;;  int sum = 0; | 
 | ;;  for (int i = 0; i < n; ++i) { | 
 | ;;    int x = a[i] * a[i+1] * a[i+2]; | 
 | ;;    int y = b[i] * b[i+1]; | 
 | ;;    sum += y > x ? x : 0; | 
 | ;;  } | 
 | ;;  return sum; | 
 | ;;} | 
 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | 
 | ;; Adding a test to the above function shows code with CMOV is 25% faster than | 
 | ;; the code with branch. | 
 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | 
 | ;;#define N 10000 | 
 | ;;int A[N]; | 
 | ;;int B[N]; | 
 | ;; | 
 | ;; | 
 | ;; | 
 | ;;int main () { | 
 | ;;  for (int i=0; i< N; ++i) { | 
 | ;;    A[i] = i%4; | 
 | ;;    B[i] = i%5; | 
 | ;;  } | 
 | ;;  int sum = 0; | 
 | ;;  for (int i=0; i< N*10; ++i) | 
 | ;;    sum += bar(A, B, N); | 
 | ;;  return sum; | 
 | ;;} | 
 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | 
 |  | 
 | ; CHECK-NOT: jg | 
 | ; CHECK: cmovle | 
 | ; IR for the C function bar() quoted in the header comment. Each loop | 
 | ; iteration forms x = a[i]*a[i+1]*a[i+2] and y = b[i]*b[i+1], then adds x to | 
 | ; the running sum when y > x (signed) and adds 0 otherwise. The select in | 
 | ; for.body is the only conditional value here, so it is presumably the | 
 | ; instruction the FileCheck directives above expect to remain a cmov. | 
 | define i32 @bar(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) #0 { | 
 | entry: | 
 |   ; Skip the loop entirely when n <= 0; the result is then 0. | 
 |   %cmp30 = icmp sgt i32 %n, 0 | 
 |   br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup | 
 |  | 
 | for.body.preheader:                               ; preds = %entry | 
 |   ; Preload a[0], a[1] and b[0] so the loop only loads one new element of | 
 |   ; each array per iteration. | 
 |   %.pre = load i32, ptr %a, align 4 | 
 |   %arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 1 | 
 |   %.pre34 = load i32, ptr %arrayidx2.phi.trans.insert, align 4 | 
 |   %.pre35 = load i32, ptr %b, align 4 | 
 |   ; Loop bound: n zero-extended to match the 64-bit induction variable. | 
 |   %wide.trip.count = zext i32 %n to i64 | 
 |   br label %for.body | 
 |  | 
 | for.cond.cleanup:                                 ; preds = %for.body, %entry | 
 |   ; Returns 0 when coming from entry, otherwise the final accumulated sum. | 
 |   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ] | 
 |   ret i32 %sum.0.lcssa | 
 |  | 
 | for.body:                                         ; preds = %for.body, %for.body.preheader | 
 |   ; Values carried across iterations: %0 = b[i], %1 = a[i+1], %2 = a[i]. | 
 |   %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ] | 
 |   %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ] | 
 |   %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ] | 
 |   ; %indvars.iv is the loop index i; %sum.032 is the running sum. | 
 |   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] | 
 |   %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ] | 
 |   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | 
 |   ; x = a[i+1] * a[i] * a[i+2]; only a[i+2] is loaded in this iteration. | 
 |   %mul = mul nsw i32 %1, %2 | 
 |   %3 = add nuw nsw i64 %indvars.iv, 2 | 
 |   %arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %3 | 
 |   %4 = load i32, ptr %arrayidx5, align 4 | 
 |   %mul6 = mul nsw i32 %mul, %4 | 
 |   ; y = b[i+1] * b[i]; only b[i+1] is loaded in this iteration. | 
 |   %arrayidx11 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv.next | 
 |   %5 = load i32, ptr %arrayidx11, align 4 | 
 |   %mul12 = mul nsw i32 %5, %0 | 
 |   ; sum += (y > x) ? x : 0 | 
 |   %cmp13 = icmp sgt i32 %mul12, %mul6 | 
 |   %cond = select i1 %cmp13, i32 %mul6, i32 0 | 
 |   %add14 = add nsw i32 %cond, %sum.032 | 
 |   ; Leave the loop once i+1 reaches the trip count. | 
 |   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count | 
 |   br i1 %exitcond, label %for.cond.cleanup, label %for.body | 
 | } | 
 |  | 
 | ; Attribute group #0 is attached to @bar and pins its target CPU to skylake. | 
 | attributes #0 = {"target-cpu"="skylake"} | 
 |  | 
 | ; Module-level metadata: two module flags (!0, !1) and the producer ident (!2). | 
 | !llvm.module.flags = !{!0, !1} | 
 | !llvm.ident = !{!2} | 
 |  | 
 | ; Module flag entries: "wchar_size" = 2 and "PIC Level" = 2. | 
 | !0 = !{i32 1, !"wchar_size", i32 2} | 
 | !1 = !{i32 7, !"PIC Level", i32 2} | 
 | ; Producer string recorded for the module. | 
 | !2 = !{!"clang version 5.0.0 (trunk)"} | 