| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s | 
 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s | 
 | ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s | 
 | ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s | 
 |  | 
 | ; Atomic i32 load from %in at syncscope("wavefront") with unordered ordering; | 
 | ; the loaded value is then written to %out with a plain store. | 
 | ; Every check prefix expects a flat_load_dword with no glc modifier; between it | 
 | ; and the flat_store_dword there are only two v_movs and a single s_waitcnt. | 
 | define amdgpu_kernel void @flat_wavefront_unordered_load( | 
 | ; GFX7-LABEL: flat_wavefront_unordered_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_unordered_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_unordered_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at syncscope("wavefront") with monotonic ordering; | 
 | ; the loaded value is then written to %out with a plain store. | 
 | ; Every check prefix expects a flat_load_dword with no glc modifier; between it | 
 | ; and the flat_store_dword there are only two v_movs and a single s_waitcnt. | 
 | define amdgpu_kernel void @flat_wavefront_monotonic_load( | 
 | ; GFX7-LABEL: flat_wavefront_monotonic_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_monotonic_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at syncscope("wavefront") with acquire ordering; | 
 | ; the loaded value is then written to %out with a plain store. | 
 | ; Every check prefix expects a flat_load_dword with no glc modifier; between it | 
 | ; and the flat_store_dword there are only two v_movs and a single s_waitcnt, | 
 | ; i.e. no cache-invalidate instruction is expected for this scope. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_load( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at syncscope("wavefront") with seq_cst ordering; | 
 | ; the loaded value is then written to %out with a plain store. | 
 | ; Every check prefix expects a flat_load_dword with no glc modifier; between it | 
 | ; and the flat_store_dword there are only two v_movs and a single s_waitcnt, | 
 | ; i.e. no cache-invalidate instruction is expected for this scope. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_load( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at syncscope("wavefront") with unordered | 
 | ; ordering. Every check prefix expects an unmodified flat_store_dword that is | 
 | ; preceded only by the s_load instructions (under s_clause in the gfx1010 | 
 | ; runs), one s_waitcnt lgkmcnt(0) and the v_movs that set up its operands. | 
 | define amdgpu_kernel void @flat_wavefront_unordered_store( | 
 | ; GFX7-LABEL: flat_wavefront_unordered_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_unordered_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_unordered_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_unordered_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at syncscope("wavefront") with monotonic | 
 | ; ordering. Every check prefix expects an unmodified flat_store_dword that is | 
 | ; preceded only by the s_load instructions (under s_clause in the gfx1010 | 
 | ; runs), one s_waitcnt lgkmcnt(0) and the v_movs that set up its operands. | 
 | define amdgpu_kernel void @flat_wavefront_monotonic_store( | 
 | ; GFX7-LABEL: flat_wavefront_monotonic_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_monotonic_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at syncscope("wavefront") with release | 
 | ; ordering. Every check prefix expects an unmodified flat_store_dword that is | 
 | ; preceded only by the s_load instructions (under s_clause in the gfx1010 | 
 | ; runs), one s_waitcnt lgkmcnt(0) and the v_movs that set up its operands. | 
 | define amdgpu_kernel void @flat_wavefront_release_store( | 
 | ; GFX7-LABEL: flat_wavefront_release_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_release_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_release_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at syncscope("wavefront") with seq_cst | 
 | ; ordering. Every check prefix expects an unmodified flat_store_dword that is | 
 | ; preceded only by the s_load instructions (under s_clause in the gfx1010 | 
 | ; runs), one s_waitcnt lgkmcnt(0) and the v_movs that set up its operands. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_store( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomicrmw xchg of %in into %out at syncscope("wavefront") with | 
 | ; monotonic ordering. %val is never used afterwards; every check prefix expects | 
 | ; the two-operand flat_atomic_swap with no glc modifier, followed directly by | 
 | ; s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomicrmw xchg of %in into %out at syncscope("wavefront") with | 
 | ; acquire ordering. %val is never used afterwards; every check prefix expects | 
 | ; the two-operand flat_atomic_swap with no glc modifier, followed directly by | 
 | ; s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomicrmw xchg of %in into %out at syncscope("wavefront") with | 
 | ; release ordering. %val is never used afterwards; every check prefix expects | 
 | ; the two-operand flat_atomic_swap with no glc modifier, followed directly by | 
 | ; s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_release_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_release_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomicrmw xchg of %in into %out at syncscope("wavefront") with | 
 | ; acq_rel ordering. %val is never used afterwards; every check prefix expects | 
 | ; the two-operand flat_atomic_swap with no glc modifier, followed directly by | 
 | ; s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomicrmw xchg of %in into %out at syncscope("wavefront") with | 
 | ; seq_cst ordering. %val is never used afterwards; every check prefix expects | 
 | ; the two-operand flat_atomic_swap with no glc modifier, followed directly by | 
 | ; s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning atomicrmw xchg at syncscope("wavefront") with acquire ordering: | 
 | ; %in is swapped into *%out and the value the atomic returns is then stored | 
 | ; back to %out. Every check prefix expects flat_atomic_swap with glc, followed | 
 | ; by s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword that reads v2. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_ret_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire | 
 |   ; Storing %val keeps the atomic's result live; the checks above expect the | 
 |   ; glc form of the swap with v2 as its destination register. | 
 |   store i32 %val, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning atomicrmw xchg at syncscope("wavefront") with acq_rel ordering: | 
 | ; %in is swapped into *%out and the value the atomic returns is then stored | 
 | ; back to %out. Every check prefix expects flat_atomic_swap with glc, followed | 
 | ; by s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword that reads v2. | 
 | define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel | 
 |   ; Storing %val keeps the atomic's result live; the checks above expect the | 
 |   ; glc form of the swap with v2 as its destination register. | 
 |   store i32 %val, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning atomicrmw xchg at syncscope("wavefront") with seq_cst ordering: | 
 | ; %in is swapped into *%out and the value the atomic returns is then stored | 
 | ; back to %out. Every check prefix expects flat_atomic_swap with glc, followed | 
 | ; by s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword that reads v2. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst | 
 |   ; Storing %val keeps the atomic's result live; the checks above expect the | 
 |   ; glc form of the swap with v2 as its destination register. | 
 |   store i32 %val, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with monotonic success and | 
 | ; monotonic failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with acquire success and | 
 | ; monotonic failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with release success and | 
 | ; monotonic failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with acq_rel success and | 
 | ; monotonic failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with seq_cst success and | 
 | ; monotonic failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with acquire success and | 
 | ; acquire failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with release success and | 
 | ; acquire failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; Non-returning cmpxchg at syncscope("wavefront") with acq_rel success and | 
 | ; acquire failure ordering, applied to element 4 of %out; %old is the | 
 | ; compare value and %in the replacement. Every check prefix expects | 
 | ; flat_atomic_cmpswap without glc, immediately followed by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array is 16 bytes past %out; the checks above show this | 
 |   ; as an s_add_u32/s_addc_u32 pair adding 16 to the base pointer. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %val is never used, so no result of the cmpxchg is consumed. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg with success ordering seq_cst and failure ordering acquire; the result is unused. | 
 | ; The atomic targets %out advanced by 4 i32 elements, which the checked assembly forms by adding 16 to the pointer. | 
 | ; Each checked sequence is only argument loads, address arithmetic, register moves, a flat_atomic_cmpswap without glc, and s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg with seq_cst for both the success and the failure ordering; the result is unused. | 
 | ; The atomic targets %out advanced by 4 i32 elements, which the checked assembly forms by adding 16 to the pointer. | 
 | ; Each checked sequence is only argument loads, address arithmetic, register moves, a flat_atomic_cmpswap without glc, and s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg (success ordering acquire, failure ordering monotonic) whose i32 result is stored to %out. | 
 | ; The atomic targets %out advanced by 4 i32 elements; the checked assembly adds 16 into a separate register pair and keeps the original pointer for the store. | 
 | ; Each checked sequence has flat_atomic_cmpswap with glc writing v2, then s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword of v2. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic | 
 |   ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 field is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg (success ordering acq_rel, failure ordering monotonic) whose i32 result is stored to %out. | 
 | ; The atomic targets %out advanced by 4 i32 elements; the checked assembly adds 16 into a separate register pair and keeps the original pointer for the store. | 
 | ; Each checked sequence has flat_atomic_cmpswap with glc writing v2, then s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword of v2. | 
 | define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic | 
 |   ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 field is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg (success ordering seq_cst, failure ordering monotonic) whose i32 result is stored to %out. | 
 | ; The atomic targets %out advanced by 4 i32 elements; the checked assembly adds 16 into a separate register pair and keeps the original pointer for the store. | 
 | ; Each checked sequence has flat_atomic_cmpswap with glc writing v2, then s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword of v2. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic | 
 |   ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 field is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg (acquire for both the success and the failure ordering) whose i32 result is stored to %out. | 
 | ; The atomic targets %out advanced by 4 i32 elements; the checked assembly adds 16 into a separate register pair and keeps the original pointer for the store. | 
 | ; Each checked sequence has flat_atomic_cmpswap with glc writing v2, then s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword of v2. | 
 | define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire | 
 |   ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 field is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg (success ordering release, failure ordering acquire) whose i32 result is stored to %out. | 
 | ; The atomic targets %out advanced by 4 i32 elements; the checked assembly adds 16 into a separate register pair and keeps the original pointer for the store. | 
 | ; Each checked sequence has flat_atomic_cmpswap with glc writing v2, then s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword of v2. | 
 | define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire | 
 |   ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 field is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Wavefront-scope flat cmpxchg (success ordering acq_rel, failure ordering acquire) whose i32 result is stored to %out. | 
 | ; The atomic targets %out advanced by 4 i32 elements; the checked assembly adds 16 into a separate register pair and keeps the original pointer for the store. | 
 | ; Each checked sequence has flat_atomic_cmpswap with glc writing v2, then s_waitcnt vmcnt(0) lgkmcnt(0) before the flat_store_dword of v2. | 
 | define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire | 
 |   ; Only field 0 (the i32) of the { i32, i1 } result is used; the i1 field is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning cmpxchg at wavefront scope with seq_cst success / acquire failure | 
 | ; ordering. The compare-and-swap targets %out + 4 i32 elements and the value | 
 | ; read from memory is stored back to %out. In every checked output the atomic | 
 | ; is followed only by an s_waitcnt before the dependent store; no cache-control | 
 | ; instructions are expected. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; %gep is %out advanced by 4 i32 elements (the +16 in the expected s_add_u32). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire | 
 |   ; Element 0 of the cmpxchg result pair is the value that was read from memory. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning cmpxchg at wavefront scope with seq_cst ordering for both the | 
 | ; success and the failure case. The compare-and-swap targets %out + 4 i32 | 
 | ; elements and the value read from memory is stored back to %out. In every | 
 | ; checked output the atomic is followed only by an s_waitcnt before the | 
 | ; dependent store; no cache-control instructions are expected. | 
 | define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; %gep is %out advanced by 4 i32 elements (the +16 in the expected s_add_u32). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst | 
 |   ; Element 0 of the cmpxchg result pair is the value that was read from memory. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at "wavefront-one-as" scope with unordered ordering; | 
 | ; the loaded value is written to %out with a plain store. Every checked output | 
 | ; is a flat_load_dword without glc, then an s_waitcnt and the flat_store_dword, | 
 | ; with no cache-control instructions. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_unordered_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 | 
 |   ; The store of the loaded value is non-atomic. | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at "wavefront-one-as" scope with monotonic ordering; | 
 | ; the loaded value is written to %out with a plain store. Every checked output | 
 | ; is a flat_load_dword without glc, then an s_waitcnt and the flat_store_dword, | 
 | ; with no cache-control instructions. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 | 
 |   ; The store of the loaded value is non-atomic. | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at "wavefront-one-as" scope with acquire ordering; | 
 | ; the loaded value is written to %out with a plain store. The checked outputs | 
 | ; match the instruction sequence expected for the unordered and monotonic loads | 
 | ; at this scope: flat_load_dword without glc, s_waitcnt, flat_store_dword, and | 
 | ; no cache-control instructions after the load. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 | 
 |   ; The store of the loaded value is non-atomic. | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 load from %in at "wavefront-one-as" scope with seq_cst ordering; | 
 | ; the loaded value is written to %out with a plain store. The checked outputs | 
 | ; contain no wait before the flat_load_dword other than the lgkmcnt(0) after the | 
 | ; scalar argument load, and no cache-control instructions after it. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    flat_load_dword v2, v[0:1] | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_load: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_load_dword v0, v[0:1] | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[2:3], v0 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %in, i32* %out) { | 
 | entry: | 
 |   %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 | 
 |   ; The store of the loaded value is non-atomic. | 
 |   store i32 %val, i32* %out | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at "wavefront-one-as" scope with unordered | 
 | ; ordering. Each checked output is a single flat_store_dword; the only s_waitcnt | 
 | ; is the lgkmcnt(0) that follows the scalar loads of the kernel arguments. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_unordered_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at "wavefront-one-as" scope with monotonic | 
 | ; ordering. Each checked output is a single flat_store_dword; the only s_waitcnt | 
 | ; is the lgkmcnt(0) that follows the scalar loads of the kernel arguments. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at "wavefront-one-as" scope with release | 
 | ; ordering. The checked outputs match the instruction sequence expected for the | 
 | ; unordered and monotonic stores at this scope: one flat_store_dword, with no | 
 | ; wait inserted before it beyond the lgkmcnt(0) after the scalar argument loads. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_release_store( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_release_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Atomic i32 store of %in to %out at "wavefront-one-as" scope with seq_cst | 
 | ; ordering. Each checked output is a single flat_store_dword, with no wait | 
 | ; inserted before it beyond the lgkmcnt(0) after the scalar argument loads and | 
 | ; no cache-control instructions. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_store: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s2, s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32 %in, i32* %out) { | 
 | entry: | 
 |   store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at "wavefront-one-as" scope with | 
 | ; monotonic ordering; the returned value %val is never used. Each checked output | 
 | ; is a flat_atomic_swap without glc, followed directly by s_endpgm. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   ; %val has no uses, so only the exchange's effect on memory is observable. | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at "wavefront-one-as" scope with | 
 | ; acquire ordering; the returned value %val is never used. Each checked output | 
 | ; is a flat_atomic_swap without glc, followed directly by s_endpgm, i.e. no wait | 
 | ; or cache-control instruction is expected after the atomic. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   ; %val has no uses, so only the exchange's effect on memory is observable. | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at syncscope "wavefront-one-as" | 
 | ; with release ordering; the value returned by the exchange is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_swap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at syncscope "wavefront-one-as" | 
 | ; with acq_rel ordering; the value returned by the exchange is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_swap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at syncscope "wavefront-one-as" | 
 | ; with seq_cst ordering; the value returned by the exchange is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_swap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at syncscope "wavefront-one-as" | 
 | ; with acquire ordering, where the value returned by the exchange is then | 
 | ; stored back through %out. | 
 | ; In every checked configuration the expected code is the returning form of | 
 | ; flat_atomic_swap (with glc), then s_waitcnt vmcnt(0) lgkmcnt(0), then the | 
 | ; flat_store_dword of the returned value. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire | 
 |   store i32 %val, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at syncscope "wavefront-one-as" | 
 | ; with acq_rel ordering, where the value returned by the exchange is then | 
 | ; stored back through %out. | 
 | ; In every checked configuration the expected code is the returning form of | 
 | ; flat_atomic_swap (with glc), then s_waitcnt vmcnt(0) lgkmcnt(0), then the | 
 | ; flat_store_dword of the returned value. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel | 
 |   store i32 %val, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile atomic exchange of %in into %out at syncscope "wavefront-one-as" | 
 | ; with seq_cst ordering, where the value returned by the exchange is then | 
 | ; stored back through %out. | 
 | ; In every checked configuration the expected code is the returning form of | 
 | ; flat_atomic_swap (with glc), then s_waitcnt vmcnt(0) lgkmcnt(0), then the | 
 | ; flat_store_dword of the returned value. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dword s2, s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_swap v2, v[0:1], v2 glc | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in) { | 
 | entry: | 
 |   %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst | 
 |   store i32 %val, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile compare-and-exchange on the i32 located four elements past %out | 
 | ; (the expected code adds 16 to the address): compares against %old and | 
 | ; writes %in, at syncscope "wavefront-one-as" with monotonic success ordering | 
 | ; and monotonic failure ordering. The cmpxchg result is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_cmpswap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile compare-and-exchange on the i32 located four elements past %out | 
 | ; (the expected code adds 16 to the address): compares against %old and | 
 | ; writes %in, at syncscope "wavefront-one-as" with acquire success ordering | 
 | ; and monotonic failure ordering. The cmpxchg result is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_cmpswap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile compare-and-exchange on the i32 located four elements past %out | 
 | ; (the expected code adds 16 to the address): compares against %old and | 
 | ; writes %in, at syncscope "wavefront-one-as" with release success ordering | 
 | ; and monotonic failure ordering. The cmpxchg result is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_cmpswap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile compare-and-exchange on the i32 located four elements past %out | 
 | ; (the expected code adds 16 to the address): compares against %old and | 
 | ; writes %in, at syncscope "wavefront-one-as" with acq_rel success ordering | 
 | ; and monotonic failure ordering. The cmpxchg result is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_cmpswap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; Volatile compare-and-exchange on the i32 located four elements past %out | 
 | ; (the expected code adds 16 to the address): compares against %old and | 
 | ; writes %in, at syncscope "wavefront-one-as" with seq_cst success ordering | 
 | ; and monotonic failure ordering. The cmpxchg result is unused. | 
 | ; In every checked configuration the expected code is a no-return | 
 | ; flat_atomic_cmpswap followed directly by s_endpgm. | 
 | ; The assertions below are autogenerated; regenerate them with | 
 | ; utils/update_llc_test_checks.py rather than editing by hand. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with acquire | 
 | ; success / acquire failure ordering; the result is unused. The assertions | 
 | ; for every RUN line expect a bare flat_atomic_cmpswap followed directly by | 
 | ; s_endpgm, with no additional instructions emitted around the atomic. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement; the result pair is dropped. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with release | 
 | ; success / acquire failure ordering; the result is unused. The assertions | 
 | ; for every RUN line expect a bare flat_atomic_cmpswap followed directly by | 
 | ; s_endpgm, with no additional instructions emitted around the atomic. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement; the result pair is dropped. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with acq_rel | 
 | ; success / acquire failure ordering; the result is unused. The assertions | 
 | ; for every RUN line expect a bare flat_atomic_cmpswap followed directly by | 
 | ; s_endpgm, with no additional instructions emitted around the atomic. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement; the result pair is dropped. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with seq_cst | 
 | ; success / acquire failure ordering; the result is unused. The assertions | 
 | ; for every RUN line expect a bare flat_atomic_cmpswap followed directly by | 
 | ; s_endpgm, with no additional instructions emitted around the atomic. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement; the result pair is dropped. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with seq_cst | 
 | ; success / seq_cst failure ordering; the result is unused. The assertions | 
 | ; for every RUN line expect a bare flat_atomic_cmpswap followed directly by | 
 | ; s_endpgm, with no additional instructions emitted around the atomic. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s0, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s2, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s3, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v[0:1], v[2:3] | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement; the result pair is dropped. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with acquire | 
 | ; success / monotonic failure ordering; the loaded value is stored to %out. | 
 | ; The assertions for every RUN line expect flat_atomic_cmpswap with a result | 
 | ; register and glc, then a single s_waitcnt before the flat_store_dword. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic | 
 |   ; Field 0 of the result pair is the value read from memory; the i1 success flag is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   ; The store goes to the un-offset %out, so the original base address is still needed here. | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with acq_rel | 
 | ; success / monotonic failure ordering; the loaded value is stored to %out. | 
 | ; The assertions for every RUN line expect flat_atomic_cmpswap with a result | 
 | ; register and glc, then a single s_waitcnt before the flat_store_dword. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic | 
 |   ; Field 0 of the result pair is the value read from memory; the i1 success flag is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   ; The store goes to the un-offset %out, so the original base address is still needed here. | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with seq_cst | 
 | ; success / monotonic failure ordering; the loaded value is stored to %out. | 
 | ; The assertions for every RUN line expect flat_atomic_cmpswap with a result | 
 | ; register and glc, then a single s_waitcnt before the flat_store_dword. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic | 
 |   ; Field 0 of the result pair is the value read from memory; the i1 success flag is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   ; The store goes to the un-offset %out, so the original base address is still needed here. | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; cmpxchg on a flat pointer at syncscope("wavefront-one-as") with acquire | 
 | ; success / acquire failure ordering; the loaded value is stored to %out. | 
 | ; The assertions for every RUN line expect flat_atomic_cmpswap with a result | 
 | ; register and glc, then a single s_waitcnt before the flat_store_dword. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Element 4 of an i32 array, i.e. %out + 16 bytes (the "s_add_u32 ..., 16" above). | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; %old is the expected value and %in the replacement. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire | 
 |   ; Field 0 of the result pair is the value read from memory; the i1 success flag is ignored. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   ; The store goes to the un-offset %out, so the original base address is still needed here. | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning cmpxchg through a default-address-space (flat) pointer at | 
 | ; syncscope "wavefront-one-as", with release as the success ordering and | 
 | ; acquire as the failure ordering. The i32 produced by the cmpxchg is stored | 
 | ; back through %out. The assertions below expect, for every checked run | 
 | ; configuration, one flat_atomic_cmpswap with glc followed by an s_waitcnt | 
 | ; and the flat_store_dword; none of them lists a cache invalidate or | 
 | ; writeback instruction. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Operate on the i32 four elements (16 bytes) past %out. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; Compare against %old and, on a match, write %in. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire | 
 |   ; Element 0 of the result pair is the i32 value; the i1 flag is unused. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning cmpxchg through a default-address-space (flat) pointer at | 
 | ; syncscope "wavefront-one-as", with acq_rel as the success ordering and | 
 | ; acquire as the failure ordering. The i32 produced by the cmpxchg is stored | 
 | ; back through %out. The assertions below expect, for every checked run | 
 | ; configuration, one flat_atomic_cmpswap with glc followed by an s_waitcnt | 
 | ; and the flat_store_dword; none of them lists a cache invalidate or | 
 | ; writeback instruction. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Operate on the i32 four elements (16 bytes) past %out. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; Compare against %old and, on a match, write %in. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire | 
 |   ; Element 0 of the result pair is the i32 value; the i1 flag is unused. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning cmpxchg through a default-address-space (flat) pointer at | 
 | ; syncscope "wavefront-one-as", with seq_cst as the success ordering and | 
 | ; acquire as the failure ordering. The i32 produced by the cmpxchg is stored | 
 | ; back through %out. The assertions below expect, for every checked run | 
 | ; configuration, one flat_atomic_cmpswap with glc followed by an s_waitcnt | 
 | ; and the flat_store_dword; none of them lists a cache invalidate or | 
 | ; writeback instruction. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Operate on the i32 four elements (16 bytes) past %out. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; Compare against %old and, on a match, write %in. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire | 
 |   ; Element 0 of the result pair is the i32 value; the i1 flag is unused. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  | 
 | ; Returning cmpxchg through a default-address-space (flat) pointer at | 
 | ; syncscope "wavefront-one-as", with seq_cst as both the success and the | 
 | ; failure ordering. The i32 produced by the cmpxchg is stored back through | 
 | ; %out. The assertions below expect, for every checked run configuration, | 
 | ; one flat_atomic_cmpswap with glc followed by an s_waitcnt and the | 
 | ; flat_store_dword; none of them lists a cache invalidate or writeback | 
 | ; instruction. | 
 | define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( | 
 | ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; GFX7:       ; %bb.0: ; %entry | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x2 | 
 | ; GFX7-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX7-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX7-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX7-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX7-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX7-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX7-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; GFX10-WGP:       ; %bb.0: ; %entry | 
 | ; GFX10-WGP-NEXT:    s_clause 0x1 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-WGP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-WGP-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-WGP-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-WGP-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-WGP-NEXT:    s_endpgm | 
 | ; | 
 | ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; GFX10-CU:       ; %bb.0: ; %entry | 
 | ; GFX10-CU-NEXT:    s_clause 0x1 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0 | 
 | ; GFX10-CU-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8 | 
 | ; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    s_add_u32 s4, s0, 16 | 
 | ; GFX10-CU-NEXT:    s_addc_u32 s5, s1, 0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v2, s2 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v3, s3 | 
 | ; GFX10-CU-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s0 | 
 | ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s1 | 
 | ; GFX10-CU-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; GFX10-CU-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; GFX10-CU-NEXT:    s_endpgm | 
 | ; | 
 | ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: | 
 | ; SKIP-CACHE-INV:       ; %bb.0: ; %entry | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9 | 
 | ; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    s_add_u32 s4, s2, 16 | 
 | ; SKIP-CACHE-INV-NEXT:    s_addc_u32 s5, s3, 0 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v3, s1 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s5 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v2, s0 | 
 | ; SKIP-CACHE-INV-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s2 | 
 | ; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s3 | 
 | ; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) | 
 | ; SKIP-CACHE-INV-NEXT:    flat_store_dword v[0:1], v2 | 
 | ; SKIP-CACHE-INV-NEXT:    s_endpgm | 
 |     i32* %out, i32 %in, i32 %old) { | 
 | entry: | 
 |   ; Operate on the i32 four elements (16 bytes) past %out. | 
 |   %gep = getelementptr i32, i32* %out, i32 4 | 
 |   ; Compare against %old and, on a match, write %in. | 
 |   %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst | 
 |   ; Element 0 of the result pair is the i32 value; the i1 flag is unused. | 
 |   %val0 = extractvalue { i32, i1 } %val, 0 | 
 |   store i32 %val0, i32* %out, align 4 | 
 |   ret void | 
 | } | 
 |  |