| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3              | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops    | FileCheck %s --check-prefixes=SSE3,SSE3-FAST | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl           | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW | 
 | ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512-FAST | 
 |  | 
 | ; 128-bit vectors, 16/32-bit, add/sub | 
 |  | 
 | define i32 @extract_extract01_v4i32_add_i32(<4 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v4i32_add_i32(<4 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v4i32_add_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 2 | 
 |   %x1 = extractelement <4 x i32> %x, i32 3 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v4i32_add_i32_commute(<4 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v4i32_add_i32_commute(<4 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v4i32_add_i32_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 2 | 
 |   %x1 = extractelement <4 x i32> %x, i32 3 | 
 |   %x01 = add i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v8i16_add_i16(<8 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v8i16_add_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v8i16_add_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v8i16_add_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v8i16_add_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 0 | 
 |   %x1 = extractelement <8 x i16> %x, i32 1 | 
 |   %x01 = add i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract45_v8i16_add_i16(<8 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract45_v8i16_add_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pextrw $4, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $5, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract45_v8i16_add_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pextrw $2, %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract45_v8i16_add_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $5, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract45_v8i16_add_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrw $2, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 4 | 
 |   %x1 = extractelement <8 x i16> %x, i32 5 | 
 |   %x01 = add i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v8i16_add_i16_commute(<8 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v8i16_add_i16_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v8i16_add_i16_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v8i16_add_i16_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v8i16_add_i16_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 0 | 
 |   %x1 = extractelement <8 x i16> %x, i32 1 | 
 |   %x01 = add i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract45_v8i16_add_i16_commute(<8 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract45_v8i16_add_i16_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pextrw $4, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $5, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract45_v8i16_add_i16_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pextrw $2, %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract45_v8i16_add_i16_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $5, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract45_v8i16_add_i16_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrw $2, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 4 | 
 |   %x1 = extractelement <8 x i16> %x, i32 5 | 
 |   %x01 = add i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v4i32_sub_i32(<4 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v4i32_sub_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v4i32_sub_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v4i32_sub_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v4i32_sub_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   %x01 = sub i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v4i32_sub_i32(<4 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v4i32_sub_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v4i32_sub_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v4i32_sub_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v4i32_sub_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 2 | 
 |   %x1 = extractelement <4 x i32> %x, i32 3 | 
 |   %x01 = sub i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v4i32_sub_i32_commute(<4 x i32> %x) { | 
 | ; SSE3-LABEL: extract_extract01_v4i32_sub_i32_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v4i32_sub_i32_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   %x01 = sub i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v4i32_sub_i32_commute(<4 x i32> %x) { | 
 | ; SSE3-LABEL: extract_extract23_v4i32_sub_i32_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract23_v4i32_sub_i32_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 2 | 
 |   %x1 = extractelement <4 x i32> %x, i32 3 | 
 |   %x01 = sub i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v8i16_sub_i16(<8 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v8i16_sub_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v8i16_sub_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v8i16_sub_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v8i16_sub_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 0 | 
 |   %x1 = extractelement <8 x i16> %x, i32 1 | 
 |   %x01 = sub i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract23_v8i16_sub_i16(<8 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v8i16_sub_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pextrw $2, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pextrw $3, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v8i16_sub_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v8i16_sub_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpextrw $2, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrw $3, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v8i16_sub_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 2 | 
 |   %x1 = extractelement <8 x i16> %x, i32 3 | 
 |   %x01 = sub i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v8i16_sub_i16_commute(<8 x i16> %x) { | 
 | ; SSE3-LABEL: extract_extract01_v8i16_sub_i16_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v8i16_sub_i16_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 0 | 
 |   %x1 = extractelement <8 x i16> %x, i32 1 | 
 |   %x01 = sub i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract23_v8i16_sub_i16_commute(<8 x i16> %x) { | 
 | ; SSE3-LABEL: extract_extract23_v8i16_sub_i16_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    pextrw $2, %xmm0, %ecx | 
 | ; SSE3-NEXT:    pextrw $3, %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract23_v8i16_sub_i16_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vpextrw $2, %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrw $3, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <8 x i16> %x, i32 2 | 
 |   %x1 = extractelement <8 x i16> %x, i32 3 | 
 |   %x01 = sub i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | ; 256-bit vectors, i32/i16, add/sub | 
 |  | 
 | define i32 @extract_extract01_v8i32_add_i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v8i32_add_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v8i32_add_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v8i32_add_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v8i32_add_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 0 | 
 |   %x1 = extractelement <8 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v8i32_add_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 2 | 
 |   %x1 = extractelement <8 x i32> %x, i32 3 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract67_v8i32_add_i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract67_v8i32_add_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 6 | 
 |   %x1 = extractelement <8 x i32> %x, i32 7 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v8i32_add_i32_commute(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v8i32_add_i32_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v8i32_add_i32_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v8i32_add_i32_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v8i32_add_i32_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 0 | 
 |   %x1 = extractelement <8 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v8i32_add_i32_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 2 | 
 |   %x1 = extractelement <8 x i32> %x, i32 3 | 
 |   %x01 = add i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract67_v8i32_add_i32_commute(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32_commute: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 6 | 
 |   %x1 = extractelement <8 x i32> %x, i32 7 | 
 |   %x01 = add i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v16i16_add_i16(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v16i16_add_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v16i16_add_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v16i16_add_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v16i16_add_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 0 | 
 |   %x1 = extractelement <16 x i16> %x, i32 1 | 
 |   %x01 = add i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract23_v16i16_add_i16(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v16i16_add_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pextrw $2, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $3, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v16i16_add_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v16i16_add_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpextrw $2, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $3, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v16i16_add_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 2 | 
 |   %x1 = extractelement <16 x i16> %x, i32 3 | 
 |   %x01 = add i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract89_v16i16_add_i16(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm1, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX1-SLOW-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; AVX1-SLOW:       # %bb.0: | 
 | ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX1-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX1-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX1-SLOW-NEXT:    vzeroupper | 
 | ; AVX1-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-SLOW-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; AVX2-SLOW:       # %bb.0: | 
 | ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX2-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX2-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX2-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX2-SLOW-NEXT:    vzeroupper | 
 | ; AVX2-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX2-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-SLOW-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; AVX512-SLOW:       # %bb.0: | 
 | ; AVX512-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX512-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX512-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX512-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX512-SLOW-NEXT:    vzeroupper | 
 | ; AVX512-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX512-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 8 | 
 |   %x1 = extractelement <16 x i16> %x, i32 9 | 
 |   %x01 = add i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v16i16_add_i16_commute(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v16i16_add_i16_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v16i16_add_i16_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v16i16_add_i16_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v16i16_add_i16_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 0 | 
 |   %x1 = extractelement <16 x i16> %x, i32 1 | 
 |   %x01 = add i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract45_v16i16_add_i16_commute(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract45_v16i16_add_i16_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pextrw $4, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $5, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract45_v16i16_add_i16_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pextrw $2, %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract45_v16i16_add_i16_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $5, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract45_v16i16_add_i16_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrw $2, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 4 | 
 |   %x1 = extractelement <16 x i16> %x, i32 5 | 
 |   %x01 = add i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract89_v16i16_add_i16_commute(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm1, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX1-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; AVX1-SLOW:       # %bb.0: | 
 | ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX1-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX1-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX1-SLOW-NEXT:    vzeroupper | 
 | ; AVX1-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; AVX2-SLOW:       # %bb.0: | 
 | ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX2-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX2-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX2-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX2-SLOW-NEXT:    vzeroupper | 
 | ; AVX2-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX2-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; AVX512-SLOW:       # %bb.0: | 
 | ; AVX512-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX512-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX512-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX512-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX512-SLOW-NEXT:    vzeroupper | 
 | ; AVX512-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16_commute: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX512-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 8 | 
 |   %x1 = extractelement <16 x i16> %x, i32 9 | 
 |   %x01 = add i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v8i32_sub_i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v8i32_sub_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v8i32_sub_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v8i32_sub_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v8i32_sub_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 0 | 
 |   %x1 = extractelement <8 x i32> %x, i32 1 | 
 |   %x01 = sub i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract23_v8i32_sub_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract23_v8i32_sub_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract23_v8i32_sub_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract23_v8i32_sub_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 2 | 
 |   %x1 = extractelement <8 x i32> %x, i32 3 | 
 |   %x01 = sub i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract67_v8i32_sub_i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract67_v8i32_sub_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract67_v8i32_sub_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract67_v8i32_sub_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: extract_extract67_v8i32_sub_i32: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: extract_extract67_v8i32_sub_i32: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: extract_extract67_v8i32_sub_i32: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 6 | 
 |   %x1 = extractelement <8 x i32> %x, i32 7 | 
 |   %x01 = sub i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | ; Negative test...or get hoppy and negate? | 
 |  | 
 | define i32 @extract_extract01_v8i32_sub_i32_commute(<8 x i32> %x) { | 
 | ; SSE3-LABEL: extract_extract01_v8i32_sub_i32_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v8i32_sub_i32_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    vzeroupper | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <8 x i32> %x, i32 0 | 
 |   %x1 = extractelement <8 x i32> %x, i32 1 | 
 |   %x01 = sub i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v16i16_sub_i16(<16 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v16i16_sub_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v16i16_sub_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v16i16_sub_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v16i16_sub_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 0 | 
 |   %x1 = extractelement <16 x i16> %x, i32 1 | 
 |   %x01 = sub i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | ; Negative test...or get hoppy and negate? | 
 |  | 
 | define i16 @extract_extract01_v16i16_sub_i16_commute(<16 x i16> %x) { | 
 | ; SSE3-LABEL: extract_extract01_v16i16_sub_i16_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v16i16_sub_i16_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-NEXT:    vzeroupper | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <16 x i16> %x, i32 0 | 
 |   %x1 = extractelement <16 x i16> %x, i32 1 | 
 |   %x01 = sub i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | ; 512-bit vectors, i32/i16, add/sub | 
 |  | 
 | define i32 @extract_extract01_v16i32_add_i32(<16 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v16i32_add_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v16i32_add_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v16i32_add_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v16i32_add_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i32> %x, i32 0 | 
 |   %x1 = extractelement <16 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v16i32_add_i32_commute(<16 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v16i32_add_i32_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v16i32_add_i32_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v16i32_add_i32_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v16i32_add_i32_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i32> %x, i32 0 | 
 |   %x1 = extractelement <16 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v32i16_add_i16(<32 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v32i16_add_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v32i16_add_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v32i16_add_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v32i16_add_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <32 x i16> %x, i32 0 | 
 |   %x1 = extractelement <32 x i16> %x, i32 1 | 
 |   %x01 = add i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v32i16_add_i16_commute(<32 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v32i16_add_i16_commute: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v32i16_add_i16_commute: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v32i16_add_i16_commute: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v32i16_add_i16_commute: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <32 x i16> %x, i32 0 | 
 |   %x1 = extractelement <32 x i16> %x, i32 1 | 
 |   %x01 = add i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v16i32_sub_i32(<16 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v16i32_sub_i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v16i32_sub_i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v16i32_sub_i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v16i32_sub_i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <16 x i32> %x, i32 0 | 
 |   %x1 = extractelement <16 x i32> %x, i32 1 | 
 |   %x01 = sub i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v16i32_sub_i32_commute(<16 x i32> %x) { | 
 | ; SSE3-LABEL: extract_extract01_v16i32_sub_i32_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v16i32_sub_i32_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    vzeroupper | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <16 x i32> %x, i32 0 | 
 |   %x1 = extractelement <16 x i32> %x, i32 1 | 
 |   %x01 = sub i32 %x1, %x0 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v32i16_sub_i16(<32 x i16> %x) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v32i16_sub_i16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    subl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v32i16_sub_i16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v32i16_sub_i16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    subl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v32i16_sub_i16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <32 x i16> %x, i32 0 | 
 |   %x1 = extractelement <32 x i16> %x, i32 1 | 
 |   %x01 = sub i16 %x0, %x1 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | define i16 @extract_extract01_v32i16_sub_i16_commute(<32 x i16> %x) { | 
 | ; SSE3-LABEL: extract_extract01_v32i16_sub_i16_commute: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    pextrw $1, %xmm0, %eax | 
 | ; SSE3-NEXT:    subl %ecx, %eax | 
 | ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v32i16_sub_i16_commute: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vpextrw $1, %xmm0, %eax | 
 | ; AVX-NEXT:    subl %ecx, %eax | 
 | ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-NEXT:    vzeroupper | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <32 x i16> %x, i32 0 | 
 |   %x1 = extractelement <32 x i16> %x, i32 1 | 
 |   %x01 = sub i16 %x1, %x0 | 
 |   ret i16 %x01 | 
 | } | 
 |  | 
 | ; Check output when one or both extracts have extra uses. | 
 |  | 
 | define i32 @extract_extract01_v4i32_add_i32_uses1(<4 x i32> %x, ptr %p) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses1: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, (%rdi) | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, (%rdi) | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses1: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, (%rdi) | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, (%rdi) | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   store i32 %x0, ptr %p | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v4i32_add_i32_uses2(<4 x i32> %x, ptr %p) { | 
 | ; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses2: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    addl %ecx, %eax | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, (%rdi) | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, (%rdi) | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses2: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    addl %ecx, %eax | 
 | ; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, (%rdi) | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vpextrd $1, %xmm0, (%rdi) | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   store i32 %x1, ptr %p | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, ptr %p1, ptr %p2) { | 
 | ; SSE3-LABEL: extract_extract01_v4i32_add_i32_uses3: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    movd %xmm0, %ecx | 
 | ; SSE3-NEXT:    movd %xmm0, (%rdi) | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    addl %ecx, %eax | 
 | ; SSE3-NEXT:    movd %xmm0, (%rsi) | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: extract_extract01_v4i32_add_i32_uses3: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vmovd %xmm0, %ecx | 
 | ; AVX-NEXT:    vmovd %xmm0, (%rdi) | 
 | ; AVX-NEXT:    vpextrd $1, %xmm0, %eax | 
 | ; AVX-NEXT:    addl %ecx, %eax | 
 | ; AVX-NEXT:    vpextrd $1, %xmm0, (%rsi) | 
 | ; AVX-NEXT:    retq | 
 |   %x0 = extractelement <4 x i32> %x, i32 0 | 
 |   store i32 %x0, ptr %p1 | 
 |   %x1 = extractelement <4 x i32> %x, i32 1 | 
 |   store i32 %x1, ptr %p2 | 
 |   %x01 = add i32 %x0, %x1 | 
 |   ret i32 %x01 | 
 | } | 
 |  | 
 | ; PR33758: https://bugs.llvm.org/show_bug.cgi?id=33758 | 
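 | ; The upper lanes are unused by these partial reductions, so the 256/512-bit | 
 | ; cases should lower to the same 128-bit shuffle+add (or phadd/phsub) sequence. | 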
 |  | 
 | define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: partial_reduction_add_v8i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: partial_reduction_add_v8i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: partial_reduction_add_v8i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: partial_reduction_add_v8i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0213 = add <8 x i32> %x, %x23 | 
 |   %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0123 = add <8 x i32> %x0213, %x13 | 
 |   %r = extractelement <8 x i32> %x0123, i32 0 | 
 |   ret i32 %r | 
 | } | 
 |  | 
 | define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: partial_reduction_add_v16i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: partial_reduction_add_v16i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: partial_reduction_add_v16i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: partial_reduction_add_v16i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0213 = add <16 x i32> %x, %x23 | 
 |   %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0123 = add <16 x i32> %x0213, %x13 | 
 |   %r = extractelement <16 x i32> %x0123, i32 0 | 
 |   ret i32 %r | 
 | } | 
 |  | 
 | define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: partial_reduction_sub_v8i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-FAST-NEXT:    psubd %xmm1, %xmm0 | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: partial_reduction_sub_v8i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: partial_reduction_sub_v8i32: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0213 = sub <8 x i32> %x, %x23 | 
 |   %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0123 = sub <8 x i32> %x0213, %x13 | 
 |   %r = extractelement <8 x i32> %x0123, i32 0 | 
 |   ret i32 %r | 
 | } | 
 |  | 
 | define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { | 
 | ; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: partial_reduction_sub_v16i32: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-FAST-NEXT:    psubd %xmm1, %xmm0 | 
 | ; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: partial_reduction_sub_v16i32: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: partial_reduction_sub_v16i32: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX1-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: partial_reduction_sub_v16i32: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX2-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: partial_reduction_sub_v16i32: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX512-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX512-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0213 = sub <16 x i32> %x, %x23 | 
 |   %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x0123 = sub <16 x i32> %x0213, %x13 | 
 |   %r = extractelement <16 x i32> %x0123, i32 0 | 
 |   ret i32 %r | 
 | } | 
 |  | 
 | ; https://bugs.chromium.org/p/chromium/issues/detail?id=1195353 | 
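 | ; Negative test: this adds the low and high halves of the vector element-wise, | 
 | ; so it must stay a full-width paddw/vpaddw and not become a horizontal add. | 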
 | define <2 x i64> @negative_extract_v16i16_v8i16(<4 x i64> %a0) { | 
 | ; SSE3-LABEL: negative_extract_v16i16_v8i16: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    paddw %xmm1, %xmm0 | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX1-SLOW-LABEL: negative_extract_v16i16_v8i16: | 
 | ; AVX1-SLOW:       # %bb.0: | 
 | ; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1 | 
 | ; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX1-SLOW-NEXT:    vzeroupper | 
 | ; AVX1-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX1-FAST-LABEL: negative_extract_v16i16_v8i16: | 
 | ; AVX1-FAST:       # %bb.0: | 
 | ; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1 | 
 | ; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX1-FAST-NEXT:    vzeroupper | 
 | ; AVX1-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX2-SLOW-LABEL: negative_extract_v16i16_v8i16: | 
 | ; AVX2-SLOW:       # %bb.0: | 
 | ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1 | 
 | ; AVX2-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX2-SLOW-NEXT:    vzeroupper | 
 | ; AVX2-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX2-FAST-LABEL: negative_extract_v16i16_v8i16: | 
 | ; AVX2-FAST:       # %bb.0: | 
 | ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1 | 
 | ; AVX2-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX2-FAST-NEXT:    vzeroupper | 
 | ; AVX2-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX512-SLOW-LABEL: negative_extract_v16i16_v8i16: | 
 | ; AVX512-SLOW:       # %bb.0: | 
 | ; AVX512-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1 | 
 | ; AVX512-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX512-SLOW-NEXT:    vzeroupper | 
 | ; AVX512-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX512-FAST-LABEL: negative_extract_v16i16_v8i16: | 
 | ; AVX512-FAST:       # %bb.0: | 
 | ; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1 | 
 | ; AVX512-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX512-FAST-NEXT:    vzeroupper | 
 | ; AVX512-FAST-NEXT:    retq | 
 |   %s = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> | 
 |   %b = bitcast <4 x i64> %a0 to <16 x i16> | 
 |   %c = bitcast <4 x i64> %s to <16 x i16> | 
 |   %d = add <16 x i16> %b, %c | 
 |   %e = bitcast <16 x i16> %d to <4 x i64> | 
 |   %f = shufflevector <4 x i64> %e, <4 x i64> undef, <2 x i32> <i32 0, i32 1> | 
 |   ret <2 x i64> %f | 
 | } | 
 |  | 
 | ; PR42023 - https://bugs.llvm.org/show_bug.cgi?id=42023 | 
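 | ; Full reductions: phadd/phsub chains should only be formed with fast-hops or | 
 | ; when optimizing for size (see the optsize/pgso variants below). | 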
 |  | 
 | define i16 @hadd16_8(<8 x i16> %x223) { | 
 | ; SSE3-SLOW-LABEL: hadd16_8: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    paddw %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    paddw %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    psrld $16, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    paddw %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: hadd16_8: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: hadd16_8: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1 | 
 | ; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: hadd16_8: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x224 = shufflevector <8 x i16> %x223, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x225 = add <8 x i16> %x223, %x224 | 
 |   %x226 = shufflevector <8 x i16> %x225, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x227 = add <8 x i16> %x225, %x226 | 
 |   %x228 = shufflevector <8 x i16> %x227, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <8 x i16> %x227, %x228 | 
 |   %x230 = extractelement <8 x i16> %x229, i32 0 | 
 |   ret i16 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_4(<4 x i32> %x225) { | 
 | ; SSE3-SLOW-LABEL: hadd32_4: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: hadd32_4: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-FAST-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: hadd32_4: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: hadd32_4: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> | 
 |   %x227 = add <4 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <4 x i32> %x227, %x228 | 
 |   %x230 = extractelement <4 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_8(<8 x i32> %x225) { | 
 | ; SSE3-SLOW-LABEL: hadd32_8: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: hadd32_8: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: hadd32_8: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: hadd32_8: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x227 = add <8 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <8 x i32> %x227, %x228 | 
 |   %x230 = extractelement <8 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_16(<16 x i32> %x225) { | 
 | ; SSE3-SLOW-LABEL: hadd32_16: | 
 | ; SSE3-SLOW:       # %bb.0: | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] | 
 | ; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0 | 
 | ; SSE3-SLOW-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-SLOW-NEXT:    retq | 
 | ; | 
 | ; SSE3-FAST-LABEL: hadd32_16: | 
 | ; SSE3-FAST:       # %bb.0: | 
 | ; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-FAST-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-FAST-NEXT:    retq | 
 | ; | 
 | ; AVX-SLOW-LABEL: hadd32_16: | 
 | ; AVX-SLOW:       # %bb.0: | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] | 
 | ; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 | 
 | ; AVX-SLOW-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-SLOW-NEXT:    vzeroupper | 
 | ; AVX-SLOW-NEXT:    retq | 
 | ; | 
 | ; AVX-FAST-LABEL: hadd32_16: | 
 | ; AVX-FAST:       # %bb.0: | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-FAST-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-FAST-NEXT:    vzeroupper | 
 | ; AVX-FAST-NEXT:    retq | 
 |   %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x227 = add <16 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <16 x i32> %x227, %x228 | 
 |   %x230 = extractelement <16 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { | 
 | ; SSE3-LABEL: hadd16_8_optsize: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    phaddw %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: hadd16_8_optsize: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax | 
 | ; AVX-NEXT:    retq | 
 |   %x224 = shufflevector <8 x i16> %x223, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x225 = add <8 x i16> %x223, %x224 | 
 |   %x226 = shufflevector <8 x i16> %x225, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x227 = add <8 x i16> %x225, %x226 | 
 |   %x228 = shufflevector <8 x i16> %x227, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <8 x i16> %x227, %x228 | 
 |   %x230 = extractelement <8 x i16> %x229, i32 0 | 
 |   ret i16 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize { | 
 | ; SSE3-LABEL: hadd32_4_optsize: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: hadd32_4_optsize: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-NEXT:    retq | 
 |   %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> | 
 |   %x227 = add <4 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <4 x i32> %x227, %x228 | 
 |   %x230 = extractelement <4 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 { | 
 | ; SSE3-LABEL: hadd32_4_pgso: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    phaddd %xmm0, %xmm0 | 
 | ; SSE3-NEXT:    movd %xmm0, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: hadd32_4_pgso: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-NEXT:    retq | 
 |   %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> | 
 |   %x227 = add <4 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <4 x i32> %x227, %x228 | 
 |   %x230 = extractelement <4 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize { | 
 | ; SSE3-LABEL: hadd32_8_optsize: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: hadd32_8_optsize: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-NEXT:    vzeroupper | 
 | ; AVX-NEXT:    retq | 
 |   %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x227 = add <8 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <8 x i32> %x227, %x228 | 
 |   %x230 = extractelement <8 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize { | 
 | ; SSE3-LABEL: hadd32_16_optsize: | 
 | ; SSE3:       # %bb.0: | 
 | ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] | 
 | ; SSE3-NEXT:    paddd %xmm0, %xmm1 | 
 | ; SSE3-NEXT:    phaddd %xmm1, %xmm1 | 
 | ; SSE3-NEXT:    movd %xmm1, %eax | 
 | ; SSE3-NEXT:    retq | 
 | ; | 
 | ; AVX-LABEL: hadd32_16_optsize: | 
 | ; AVX:       # %bb.0: | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0 | 
 | ; AVX-NEXT:    vmovd %xmm0, %eax | 
 | ; AVX-NEXT:    vzeroupper | 
 | ; AVX-NEXT:    retq | 
 |   %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x227 = add <16 x i32> %x225, %x226 | 
 |   %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> | 
 |   %x229 = add <16 x i32> %x227, %x228 | 
 |   %x230 = extractelement <16 x i32> %x229, i32 0 | 
 |   ret i32 %x230 | 
 | } | 
 |  | 
 | !llvm.module.flags = !{!0} | 
 | !0 = !{i32 1, !"ProfileSummary", !1} | 
 | !1 = !{!2, !3, !4, !5, !6, !7, !8, !9} | 
 | !2 = !{!"ProfileFormat", !"InstrProf"} | 
 | !3 = !{!"TotalCount", i64 10000} | 
 | !4 = !{!"MaxCount", i64 10} | 
 | !5 = !{!"MaxInternalCount", i64 1} | 
 | !6 = !{!"MaxFunctionCount", i64 1000} | 
 | !7 = !{!"NumCounts", i64 3} | 
 | !8 = !{!"NumFunctions", i64 3} | 
 | !9 = !{!"DetailedSummary", !10} | 
 | !10 = !{!11, !12, !13} | 
 | !11 = !{i32 10000, i64 100, i32 1} | 
 | !12 = !{i32 999000, i64 100, i32 1} | 
 | !13 = !{i32 999999, i64 1, i32 2} | 
 | !14 = !{!"function_entry_count", i64 0} |