| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py |
| ; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE |
| ; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,AVX |
| ; |
| ; This file tests the look-ahead operand reordering heuristic. |
| ; |
| ; |
| ; This checks that operand reordering will reorder the operands of the adds |
| ; by taking into consideration the instructions beyond the immediate |
| ; predecessors. |
| ; |
;  A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
;    \   /     \   /     \   /     \   /
;      -         -         -         -
;        \     /             \     /
;           +                   +
;           |                   |
;         S[0]                S[1]
| ; |
| define void @lookahead_basic(ptr %array) { |
| ; CHECK-LABEL: @lookahead_basic( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2 |
| ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4 |
| ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6 |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8 |
| ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8 |
| ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]] |
| ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]] |
| ; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %idx1 = getelementptr inbounds double, ptr %array, i64 1 |
| %idx2 = getelementptr inbounds double, ptr %array, i64 2 |
| %idx3 = getelementptr inbounds double, ptr %array, i64 3 |
| %idx4 = getelementptr inbounds double, ptr %array, i64 4 |
| %idx5 = getelementptr inbounds double, ptr %array, i64 5 |
| %idx6 = getelementptr inbounds double, ptr %array, i64 6 |
| %idx7 = getelementptr inbounds double, ptr %array, i64 7 |
| |
| %A_0 = load double, ptr %array, align 8 |
| %A_1 = load double, ptr %idx1, align 8 |
| %B_0 = load double, ptr %idx2, align 8 |
| %B_1 = load double, ptr %idx3, align 8 |
| %C_0 = load double, ptr %idx4, align 8 |
| %C_1 = load double, ptr %idx5, align 8 |
| %D_0 = load double, ptr %idx6, align 8 |
| %D_1 = load double, ptr %idx7, align 8 |
| |
| %subAB_0 = fsub fast double %A_0, %B_0 |
| %subCD_0 = fsub fast double %C_0, %D_0 |
| |
| %subAB_1 = fsub fast double %A_1, %B_1 |
| %subCD_1 = fsub fast double %C_1, %D_1 |
| |
| %addABCD_0 = fadd fast double %subAB_0, %subCD_0 |
| %addCDAB_1 = fadd fast double %subCD_1, %subAB_1 |
| |
| store double %addABCD_0, ptr %array, align 8 |
| store double %addCDAB_1, ptr %idx1, align 8 |
| ret void |
| } |
| |
| |
| ; Check whether the look-ahead operand reordering heuristic will avoid |
| ; bundling the alt opcodes. The vectorized code should have no shuffles. |
| ; |
;  A[0] B[0] A[0] B[0] A[1] B[1] A[1] B[1]
;    \   /     \   /     \   /     \   /
;      +         -         -         +
;        \     /             \     /
;           +                   +
;           |                   |
;         S[0]                S[1]
| ; |
| define void @lookahead_alt1(ptr %array) { |
| ; CHECK-LABEL: @lookahead_alt1( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2 |
| ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4 |
| ; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5 |
| ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6 |
| ; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7 |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8 |
| ; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]] |
| ; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]] |
| ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %idx1 = getelementptr inbounds double, ptr %array, i64 1 |
| %idx2 = getelementptr inbounds double, ptr %array, i64 2 |
| %idx3 = getelementptr inbounds double, ptr %array, i64 3 |
| %idx4 = getelementptr inbounds double, ptr %array, i64 4 |
| %idx5 = getelementptr inbounds double, ptr %array, i64 5 |
| %idx6 = getelementptr inbounds double, ptr %array, i64 6 |
| %idx7 = getelementptr inbounds double, ptr %array, i64 7 |
| |
| %A_0 = load double, ptr %array, align 8 |
| %A_1 = load double, ptr %idx1, align 8 |
| %B_0 = load double, ptr %idx2, align 8 |
| %B_1 = load double, ptr %idx3, align 8 |
| |
| %addAB_0_L = fadd fast double %A_0, %B_0 |
| %subAB_0_R = fsub fast double %A_0, %B_0 |
| |
| %subAB_1_L = fsub fast double %A_1, %B_1 |
| %addAB_1_R = fadd fast double %A_1, %B_1 |
| |
| %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R |
| %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R |
| |
| store double %addABCD_0, ptr %array, align 8 |
| store double %addCDAB_1, ptr %idx1, align 8 |
| ret void |
| } |
| |
| |
| ; This code should get vectorized all the way to the loads with shuffles for |
| ; the alt opcodes. |
| ; |
;  A[0] B[0] C[0] D[0] C[1] D[1] A[1] B[1]
;    \   /     \   /     \   /     \   /
;      +         -         +         -
;        \     /             \     /
;           +                   +
;           |                   |
;         S[0]                S[1]
| ; |
| define void @lookahead_alt2(ptr %array) { |
| ; CHECK-LABEL: @lookahead_alt2( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2 |
| ; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4 |
| ; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6 |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8 |
| ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8 |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8 |
| ; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3> |
| ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3> |
| ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] |
| ; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| %idx1 = getelementptr inbounds double, ptr %array, i64 1 |
| %idx2 = getelementptr inbounds double, ptr %array, i64 2 |
| %idx3 = getelementptr inbounds double, ptr %array, i64 3 |
| %idx4 = getelementptr inbounds double, ptr %array, i64 4 |
| %idx5 = getelementptr inbounds double, ptr %array, i64 5 |
| %idx6 = getelementptr inbounds double, ptr %array, i64 6 |
| %idx7 = getelementptr inbounds double, ptr %array, i64 7 |
| |
| %A_0 = load double, ptr %array, align 8 |
| %A_1 = load double, ptr %idx1, align 8 |
| %B_0 = load double, ptr %idx2, align 8 |
| %B_1 = load double, ptr %idx3, align 8 |
| %C_0 = load double, ptr %idx4, align 8 |
| %C_1 = load double, ptr %idx5, align 8 |
| %D_0 = load double, ptr %idx6, align 8 |
| %D_1 = load double, ptr %idx7, align 8 |
| |
| %addAB_0 = fadd fast double %A_0, %B_0 |
| %subCD_0 = fsub fast double %C_0, %D_0 |
| |
| %addCD_1 = fadd fast double %C_1, %D_1 |
| %subAB_1 = fsub fast double %A_1, %B_1 |
| |
| %addABCD_0 = fadd fast double %addAB_0, %subCD_0 |
| %addCDAB_1 = fadd fast double %addCD_1, %subAB_1 |
| |
| store double %addABCD_0, ptr %array, align 8 |
| store double %addCDAB_1, ptr %idx1, align 8 |
| ret void |
| } |
| |
| |
| ; |
;  A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
;    \   /     \   /   / \   /     \   /
;      -         -    U    -         -
;        \     /             \     /
;           +                   +
;           |                   |
;         S[0]                S[1]
| ; |
; SLP should reorder the operands of the RHS add, taking into consideration
; the cost of external uses. It is more profitable to reorder because A[1]
; has an external use.
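;
; Roughly, in C (hypothetical sketch):
;
;   void lookahead_external_uses(double *A, double *B, double *C, double *D,
;                                double *S, double *Ext1, double *Ext2) {
;     S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;     S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;     *Ext1 = A[1]; /* the external use that should bias the reordering */
;   }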
| |
| define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) { |
| ; CHECK-LABEL: @lookahead_external_uses( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 |
| ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 |
| ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 |
| ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 |
| ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 |
| ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 |
| ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 |
| ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 |
| ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 |
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 |
| ; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] |
| ; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 |
| ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| |
| %IdxA1 = getelementptr inbounds double, ptr %A, i64 1 |
| %IdxB2 = getelementptr inbounds double, ptr %B, i64 2 |
| %IdxA2 = getelementptr inbounds double, ptr %A, i64 2 |
| %IdxB1 = getelementptr inbounds double, ptr %B, i64 1 |
| |
| %A0 = load double, ptr %A, align 8 |
| %B0 = load double, ptr %B, align 8 |
| %C0 = load double, ptr %C, align 8 |
| %D0 = load double, ptr %D, align 8 |
| |
| %A1 = load double, ptr %IdxA1, align 8 |
| %B2 = load double, ptr %IdxB2, align 8 |
| %A2 = load double, ptr %IdxA2, align 8 |
| %B1 = load double, ptr %IdxB1, align 8 |
| |
| %subA0B0 = fsub fast double %A0, %B0 |
| %subC0D0 = fsub fast double %C0, %D0 |
| |
| %subA1B2 = fsub fast double %A1, %B2 |
| %subA2B1 = fsub fast double %A2, %B1 |
| |
| %add0 = fadd fast double %subA0B0, %subC0D0 |
| %add1 = fadd fast double %subA1B2, %subA2B1 |
| |
| %IdxS1 = getelementptr inbounds double, ptr %S, i64 1 |
| |
| store double %add0, ptr %S, align 8 |
| store double %add1, ptr %IdxS1, align 8 |
| |
| ; External use |
| store double %A1, ptr %Ext1, align 8 |
| ret void |
| } |
| |
;  A[0] B[0] C[0] D[0] A[1] B[2] A[2] B[1]
;    \   /     \   /   / \   /     \   / \
;      -         - U1,U2,U3 -        -  U4,U5
;        \     /             \     /
;           +                   +
;           |                   |
;         S[0]                S[1]
| ; |
| ; |
| ; If we limit the users budget for the look-ahead heuristic to 2, then the |
| ; look-ahead heuristic has no way of choosing B[1] (with 2 external users) |
| ; over A[1] (with 3 external users). |
; The result is that the operands of the add are not reordered and the loads
; from A get vectorized instead of the loads from B.
| ; |
| define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) { |
| ; CHECK-LABEL: @lookahead_limit_users_budget( |
| ; CHECK-NEXT: entry: |
| ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 |
| ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 |
| ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 |
| ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 |
| ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 |
| ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 |
| ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 |
| ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 |
| ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 |
| ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 |
| ; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 |
| ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 |
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 |
| ; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] |
| ; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 |
| ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 |
| ; CHECK-NEXT: store double [[A1]], ptr [[EXT2:%.*]], align 8 |
| ; CHECK-NEXT: store double [[A1]], ptr [[EXT3:%.*]], align 8 |
| ; CHECK-NEXT: store double [[B1]], ptr [[EXT4:%.*]], align 8 |
| ; CHECK-NEXT: store double [[B1]], ptr [[EXT5:%.*]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| |
| %IdxA1 = getelementptr inbounds double, ptr %A, i64 1 |
| %IdxB2 = getelementptr inbounds double, ptr %B, i64 2 |
| %IdxA2 = getelementptr inbounds double, ptr %A, i64 2 |
| %IdxB1 = getelementptr inbounds double, ptr %B, i64 1 |
| |
| %A0 = load double, ptr %A, align 8 |
| %B0 = load double, ptr %B, align 8 |
| %C0 = load double, ptr %C, align 8 |
| %D0 = load double, ptr %D, align 8 |
| |
| %A1 = load double, ptr %IdxA1, align 8 |
| %B2 = load double, ptr %IdxB2, align 8 |
| %A2 = load double, ptr %IdxA2, align 8 |
| %B1 = load double, ptr %IdxB1, align 8 |
| |
| %subA0B0 = fsub fast double %A0, %B0 |
| %subC0D0 = fsub fast double %C0, %D0 |
| |
| %subA1B2 = fsub fast double %A1, %B2 |
| %subA2B1 = fsub fast double %A2, %B1 |
| |
| %add0 = fadd fast double %subA0B0, %subC0D0 |
| %add1 = fadd fast double %subA1B2, %subA2B1 |
| |
| %IdxS1 = getelementptr inbounds double, ptr %S, i64 1 |
| |
| store double %add0, ptr %S, align 8 |
| store double %add1, ptr %IdxS1, align 8 |
| |
| ; External uses of A1 |
| store double %A1, ptr %Ext1, align 8 |
| store double %A1, ptr %Ext2, align 8 |
| store double %A1, ptr %Ext3, align 8 |
| |
| ; External uses of B1 |
| store double %B1, ptr %Ext4, align 8 |
| store double %B1, ptr %Ext5, align 8 |
| |
| ret void |
| } |
| |
; This checks that the look-ahead code does not crash when instructions with
; the same opcode have different numbers of operands (in this case, the calls).
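;
; In C-like pseudocode (hypothetical; ay/ax below stand in for the mangled
; declarations used by the test):
;
;   S[0] = A[0] + ay(Arg0);  /* call with one argument  */
;   S[1] = A[1] + ax();      /* call with no arguments  */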
| |
| %Class = type { i8 } |
| declare double @_ZN1i2ayEv(ptr) |
| declare double @_ZN1i2axEv() |
| |
| define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) { |
| ; CHECK-LABEL: @lookahead_crash( |
| ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 |
| ; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(ptr [[ARG0:%.*]]) |
| ; CHECK-NEXT: [[C1:%.*]] = call double @_ZN1i2axEv() |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 |
| ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1 |
| ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] |
| ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[S:%.*]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| %IdxA1 = getelementptr inbounds double, ptr %A, i64 1 |
| |
| %A0 = load double, ptr %A, align 8 |
| %A1 = load double, ptr %IdxA1, align 8 |
| |
| %C0 = call double @_ZN1i2ayEv(ptr %Arg0) |
| %C1 = call double @_ZN1i2axEv() |
| |
| %add0 = fadd fast double %A0, %C0 |
| %add1 = fadd fast double %A1, %C1 |
| |
| %IdxS1 = getelementptr inbounds double, ptr %S, i64 1 |
| store double %add0, ptr %S, align 8 |
| store double %add1, ptr %IdxS1, align 8 |
| ret void |
| } |
| |
| ; This checks that we choose to group consecutive extracts from the same vectors. |
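; In C-like pseudocode (hypothetical), where v1 and v2 are the two loaded
; <2 x double> vectors and A is the scalar array:
;
;   store[0] = v1[0] * A[0] + v2[0] * A[1];
;   store[1] = v1[1] * A[0] + v2[1] * A[1];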
| define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) { |
| ; CHECK-LABEL: @ChecksExtractScores( |
| ; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1 |
| ; CHECK-NEXT: [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4 |
| ; CHECK-NEXT: [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4 |
| ; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4 |
| ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4 |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 |
| ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] |
| ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8 |
| ; CHECK-NEXT: ret void |
| ; |
| %idx1 = getelementptr inbounds double, ptr %array, i64 1 |
| %loadA0 = load double, ptr %array, align 4 |
| %loadA1 = load double, ptr %idx1, align 4 |
| |
| %loadVec = load <2 x double>, ptr %vecPtr1, align 4 |
| %extrA0 = extractelement <2 x double> %loadVec, i32 0 |
| %extrA1 = extractelement <2 x double> %loadVec, i32 1 |
| %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4 |
| %extrB0 = extractelement <2 x double> %loadVec2, i32 0 |
| %extrB1 = extractelement <2 x double> %loadVec2, i32 1 |
| |
| %mul0 = fmul double %extrA0, %loadA0 |
| %mul1 = fmul double %extrA1, %loadA0 |
| %mul3 = fmul double %extrB0, %loadA1 |
| %mul4 = fmul double %extrB1, %loadA1 |
| %add0 = fadd double %mul0, %mul3 |
| %add1 = fadd double %mul1, %mul4 |
| |
| %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1 |
| store double %add0, ptr %storeArray, align 8 |
| store double %add1, ptr %sidx1, align 8 |
| ret void |
| } |
| |
| |
| define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { |
| ; SSE-LABEL: @ExtractIdxNotConstantInt1( |
| ; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef |
| ; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] |
| ; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] |
| ; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 |
| ; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 |
| ; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 |
| ; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 |
| ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] |
| ; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0 |
| ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] |
| ; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00> |
| ; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 |
| ; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 |
| ; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] |
| ; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 |
| ; SSE-NEXT: ret i1 [[CMP_I185]] |
| ; |
| ; AVX-LABEL: @ExtractIdxNotConstantInt1( |
| ; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef |
| ; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] |
| ; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] |
| ; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] |
| ; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] |
| ; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 |
| ; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] |
| ; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 |
| ; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] |
| ; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] |
| ; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 |
| ; AVX-NEXT: ret i1 [[CMP_I185]] |
| ; |
| %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef |
| %sub14.i167 = fsub float undef, %vecext.i291.i166 |
| %fm = fmul float %a, %sub14.i167 |
| %sub25.i168 = fsub float %fm, %b |
| %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2 |
| %add36.i173 = fadd float %sub25.i168, 10.0 |
| %mul72.i179 = fmul float %c, %vecext.i276.i169 |
| %add78.i180 = fsub float %mul72.i179, 30.0 |
| %add79.i181 = fadd float 2.0, %add78.i180 |
| %mul123.i184 = fmul float %add36.i173, %add79.i181 |
| %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00 |
| ret i1 %cmp.i185 |
| } |
| |
| |
| define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { |
| ; SSE-LABEL: @ExtractIdxNotConstantInt2( |
| ; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 |
| ; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] |
| ; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] |
| ; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 |
| ; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 |
| ; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0 |
| ; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 |
| ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] |
| ; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0 |
| ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] |
| ; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00> |
| ; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 |
| ; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 |
| ; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] |
| ; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 |
| ; SSE-NEXT: ret i1 [[CMP_I185]] |
| ; |
| ; AVX-LABEL: @ExtractIdxNotConstantInt2( |
| ; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1 |
| ; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] |
| ; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]] |
| ; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]] |
| ; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]] |
| ; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01 |
| ; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]] |
| ; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01 |
| ; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]] |
| ; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]] |
| ; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 |
| ; AVX-NEXT: ret i1 [[CMP_I185]] |
| ; |
| %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1 |
| %sub14.i167 = fsub float undef, %vecext.i291.i166 |
| %fm = fmul float %a, %sub14.i167 |
| %sub25.i168 = fsub float %fm, %b |
| %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2 |
| %add36.i173 = fadd float %sub25.i168, 10.0 |
| %mul72.i179 = fmul float %c, %vecext.i276.i169 |
| %add78.i180 = fsub float %mul72.i179, 30.0 |
| %add79.i181 = fadd float 2.0, %add78.i180 |
| %mul123.i184 = fmul float %add36.i173, %add79.i181 |
| %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00 |
| ret i1 %cmp.i185 |
| } |
| |
| |
| define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { |
| ; CHECK-LABEL: @foo( |
| ; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 |
| ; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 |
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 |
| ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1> |
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0 |
| ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0 |
| ; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] |
| ; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00> |
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 |
| ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 |
| ; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]] |
| ; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00 |
| ; CHECK-NEXT: ret i1 [[CMP_I185]] |
| ; |
| %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0 |
| %sub14.i167 = fsub float undef, %vecext.i291.i166 |
| %fm = fmul float %a, %sub14.i167 |
| %sub25.i168 = fsub float %fm, %b |
| %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1 |
| %add36.i173 = fadd float %sub25.i168, 10.0 |
| %mul72.i179 = fmul float %c, %vecext.i276.i169 |
| %add78.i180 = fsub float %mul72.i179, 30.0 |
| %add79.i181 = fadd float 2.0, %add78.i180 |
| %mul123.i184 = fmul float %add36.i173, %add79.i181 |
| %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00 |
| ret i1 %cmp.i185 |
| } |
| |
; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
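; In the same pseudocode (hypothetical), the extracts now come from four
; different vectors v1..v4:
;
;   store[0] = v1[0] * A[0] + v3[0] * A[1];
;   store[1] = v2[1] * A[0] + v4[1] * A[1];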
| define void @ChecksExtractScores_different_vectors(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2, ptr %vecPtr3, ptr %vecPtr4) { |
| ; |
| ; SSE-LABEL: @ChecksExtractScores_different_vectors( |
| ; SSE-NEXT: [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4 |
| ; SSE-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4 |
| ; SSE-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4 |
| ; SSE-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4 |
| ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4 |
| ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> <i32 1, i32 2> |
| ; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]] |
| ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0> |
| ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3> |
| ; SSE-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]] |
| ; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]] |
| ; SSE-NEXT: store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8 |
| ; SSE-NEXT: ret void |
| ; |
| ; AVX-LABEL: @ChecksExtractScores_different_vectors( |
| ; AVX-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1 |
| ; AVX-NEXT: [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4 |
| ; AVX-NEXT: [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4 |
| ; AVX-NEXT: [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4 |
| ; AVX-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4 |
| ; AVX-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4 |
| ; AVX-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4 |
| ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> <i32 0, i32 3> |
| ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 |
| ; AVX-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] |
| ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3> |
| ; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0 |
| ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] |
| ; AVX-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]] |
| ; AVX-NEXT: store <2 x double> [[TMP9]], ptr [[STOREARRAY:%.*]], align 8 |
| ; AVX-NEXT: ret void |
| ; |
| %idx1 = getelementptr inbounds double, ptr %array, i64 1 |
| %loadA0 = load double, ptr %array, align 4 |
| %loadA1 = load double, ptr %idx1, align 4 |
| |
| %loadVec = load <2 x double>, ptr %vecPtr1, align 4 |
| %loadVec2 = load <2 x double>, ptr %vecPtr2, align 4 |
| %extrA0 = extractelement <2 x double> %loadVec, i32 0 |
| %extrA1 = extractelement <2 x double> %loadVec2, i32 1 |
  %loadVec3 = load <2 x double>, ptr %vecPtr3, align 4
| %loadVec4 = load <2 x double>, ptr %vecPtr4, align 4 |
| %extrB0 = extractelement <2 x double> %loadVec3, i32 0 |
| %extrB1 = extractelement <2 x double> %loadVec4, i32 1 |
| |
| %mul0 = fmul double %extrA0, %loadA0 |
| %mul1 = fmul double %extrA1, %loadA0 |
| %mul3 = fmul double %extrB0, %loadA1 |
| %mul4 = fmul double %extrB1, %loadA1 |
| %add0 = fadd double %mul0, %mul3 |
| %add1 = fadd double %mul1, %mul4 |
| |
| %sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1 |
| store double %add0, ptr %storeArray, align 8 |
| store double %add1, ptr %sidx1, align 8 |
| ret void |
| } |
| |
; This checks that we prefer splats rather than reverse load vectors + shuffles.
; 2-wide splat loads on x86 use a single instruction, so they are quite cheap.
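; In C terms (hypothetical sketch), every product reuses one scalar of
; array2, so broadcasting array2[0] and array2[1] beats shuffling:
;
;   return (a1[0]*a2[0] + a1[0]*a2[1]) + (a1[1]*a2[0] + a1[1]*a2[1]);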
| define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) { |
| ; SSE-LABEL: @splat_loads( |
| ; SSE-NEXT: entry: |
| ; SSE-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8 |
| ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8 |
| ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0> |
| ; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]] |
| ; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] |
| ; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]] |
| ; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 |
| ; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 |
| ; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]] |
| ; SSE-NEXT: ret double [[ADD3]] |
| ; |
| ; AVX-LABEL: @splat_loads( |
| ; AVX-NEXT: entry: |
| ; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1 |
| ; AVX-NEXT: [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8 |
| ; AVX-NEXT: [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8 |
| ; AVX-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8 |
| ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 |
| ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]] |
| ; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 |
| ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]] |
| ; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] |
| ; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 |
| ; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 |
| ; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]] |
| ; AVX-NEXT: ret double [[ADD3]] |
| ; |
| entry: |
| %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1 |
| %ld_1_0 = load double, ptr %array1, align 8 |
| %ld_1_1 = load double, ptr %gep_1_1, align 8 |
| |
| %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1 |
| %ld_2_0 = load double, ptr %array2, align 8 |
| %ld_2_1 = load double, ptr %gep_2_1, align 8 |
| |
| %mul1 = fmul double %ld_1_0, %ld_2_0 |
| %mul2 = fmul double %ld_1_1, %ld_2_0 |
| |
| %mul3 = fmul double %ld_1_0, %ld_2_1 |
| %mul4 = fmul double %ld_1_1, %ld_2_1 |
| |
| %add1 = fadd double %mul1, %mul3 |
| %add2 = fadd double %mul2, %mul4 |
| |
| %add3 = fadd double %add1, %add2 |
| ret double %add3 |
| } |
| |
| |
; Same as splat_loads(), but the splat load has internal uses in the SLP graph.
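; Hypothetical C sketch; the extra subtractions give the broadcast of
; array2[0] an internal user inside the SLP graph:
;
;   double s1 = (a1[0]*a2[0] + a1[0]*a2[1]) - a2[0];
;   double s2 = (a1[1]*a2[0] + a1[1]*a2[1]) - a2[0];
;   return s1 + s2;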
| define double @splat_loads_with_internal_uses(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) { |
| ; SSE-LABEL: @splat_loads_with_internal_uses( |
| ; SSE-NEXT: entry: |
| ; SSE-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8 |
| ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8 |
| ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0> |
| ; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]] |
| ; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]] |
| ; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]] |
| ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] |
| ; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 |
| ; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 |
| ; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]] |
| ; SSE-NEXT: ret double [[RES]] |
| ; |
| ; AVX-LABEL: @splat_loads_with_internal_uses( |
| ; AVX-NEXT: entry: |
| ; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1 |
| ; AVX-NEXT: [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8 |
| ; AVX-NEXT: [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8 |
| ; AVX-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8 |
| ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0 |
| ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]] |
| ; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0 |
| ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer |
| ; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]] |
| ; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] |
| ; AVX-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]] |
| ; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 |
| ; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 |
| ; AVX-NEXT: [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]] |
| ; AVX-NEXT: ret double [[RES]] |
| ; |
| entry: |
| %gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1 |
| %ld_1_0 = load double, ptr %array1, align 8 |
| %ld_1_1 = load double, ptr %gep_1_1, align 8 |
| |
| %gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1 |
| %ld_2_0 = load double, ptr %array2, align 8 |
| %ld_2_1 = load double, ptr %gep_2_1, align 8 |
| |
| %mul1 = fmul double %ld_1_0, %ld_2_0 |
| %mul2 = fmul double %ld_1_1, %ld_2_0 |
| |
| %mul3 = fmul double %ld_1_0, %ld_2_1 |
| %mul4 = fmul double %ld_1_1, %ld_2_1 |
| |
| %add1 = fadd double %mul1, %mul3 |
| %add2 = fadd double %mul2, %mul4 |
| |
| ; One more user for the broadcast of %ld_2_0 |
| %sub1 = fsub double %add1, %ld_2_0 |
| %sub2 = fsub double %add2, %ld_2_0 |
| |
| %res = fadd double %sub1, %sub2 |
| |
| ret double %res |
| } |