; NOTE(review): Autogenerated llc/FileCheck test for the llvm.round.* f64
; intrinsics on AMDGPU, comparing SI (tahiti) against CI (hawaii) codegen.
; The CI sequences use v_trunc_f64_e32 directly, while the SI sequences
; expand trunc via exponent extraction (s_bfe_u32 0xb0014) and mantissa
; masking (s_lshr_b64 / s_andn2_b64) before applying the +/-1.0 rounding
; correction selected by v_cmp_ge_f64 |frac| >= 0.5 and v_bfi_b32.
; Do NOT hand-edit the SI-/CI-prefixed CHECK lines below; regenerate them
; with utils/update_llc_test_checks.py as stated in the NOTE line.
; NOTE(review): the physical line layout of this copy is mangled (original
; line numbers are fused into the text); content is preserved byte-for-byte.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s 3; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s 4 5define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 { 6; SI-LABEL: round_f64: 7; SI: ; %bb.0: 8; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9; SI-NEXT: s_mov_b32 s10, -1 10; SI-NEXT: s_mov_b32 s1, 0xfffff 11; SI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 12; SI-NEXT: s_mov_b32 s11, 0xf000 13; SI-NEXT: s_waitcnt lgkmcnt(0) 14; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 15; SI-NEXT: s_mov_b32 s8, s4 16; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 17; SI-NEXT: s_mov_b32 s0, s10 18; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 19; SI-NEXT: s_andn2_b64 s[2:3], s[6:7], s[0:1] 20; SI-NEXT: s_and_b32 s0, s7, 0x80000000 21; SI-NEXT: s_cmp_lt_i32 s4, 0 22; SI-NEXT: v_mov_b32_e32 v0, s3 23; SI-NEXT: v_mov_b32_e32 v1, s0 24; SI-NEXT: s_cselect_b64 vcc, -1, 0 25; SI-NEXT: s_cmp_gt_i32 s4, 51 26; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 27; SI-NEXT: v_mov_b32_e32 v1, s7 28; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 29; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] 30; SI-NEXT: v_mov_b32_e32 v0, s2 31; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 32; SI-NEXT: v_mov_b32_e32 v2, s6 33; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 34; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] 35; SI-NEXT: s_brev_b32 s0, -2 36; SI-NEXT: v_mov_b32_e32 v5, s7 37; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 38; SI-NEXT: v_bfi_b32 v4, s0, v4, v5 39; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 40; SI-NEXT: v_mov_b32_e32 v2, 0 41; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 42; SI-NEXT: s_mov_b32 s9, s5 43; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 44; SI-NEXT: s_endpgm 45; 46; CI-LABEL: round_f64: 47; CI: ; %bb.0: 48; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 49; CI-NEXT: s_brev_b32 s5, -2 50; CI-NEXT: v_mov_b32_e32 v4, 0x3ff00000 51; CI-NEXT: s_mov_b32 s7, 0xf000 52; 
CI-NEXT: s_mov_b32 s6, -1 53; CI-NEXT: s_waitcnt lgkmcnt(0) 54; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] 55; CI-NEXT: v_mov_b32_e32 v5, s3 56; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] 57; CI-NEXT: v_bfi_b32 v4, s5, v4, v5 58; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 59; CI-NEXT: v_mov_b32_e32 v2, 0 60; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 61; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] 62; CI-NEXT: s_mov_b32 s4, s0 63; CI-NEXT: s_mov_b32 s5, s1 64; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 65; CI-NEXT: s_endpgm 66 %result = call double @llvm.round.f64(double %x) #1 67 store double %result, double addrspace(1)* %out 68 ret void 69} 70 71define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { 72; SI-LABEL: v_round_f64: 73; SI: ; %bb.0: 74; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 75; SI-NEXT: s_mov_b32 s7, 0xf000 76; SI-NEXT: s_mov_b32 s6, 0 77; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 78; SI-NEXT: v_mov_b32_e32 v1, 0 79; SI-NEXT: s_waitcnt lgkmcnt(0) 80; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 81; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 82; SI-NEXT: s_movk_i32 s4, 0xfc01 83; SI-NEXT: s_mov_b32 s2, -1 84; SI-NEXT: s_mov_b32 s3, 0xfffff 85; SI-NEXT: s_brev_b32 s5, -2 86; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 87; SI-NEXT: s_waitcnt vmcnt(0) 88; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 89; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 90; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 91; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 92; SI-NEXT: v_not_b32_e32 v5, v5 93; SI-NEXT: v_not_b32_e32 v4, v4 94; SI-NEXT: v_and_b32_e32 v5, v3, v5 95; SI-NEXT: v_and_b32_e32 v4, v2, v4 96; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6 97; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 98; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 99; SI-NEXT: v_cmp_lt_i32_e32 vcc, 51, v6 100; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc 101; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc 102; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] 103; SI-NEXT: v_bfi_b32 v2, s5, 
v8, v3 104; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 105; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 106; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc 107; SI-NEXT: v_mov_b32_e32 v2, 0 108; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] 109; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 110; SI-NEXT: s_endpgm 111; 112; CI-LABEL: v_round_f64: 113; CI: ; %bb.0: 114; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 115; CI-NEXT: s_mov_b32 s7, 0xf000 116; CI-NEXT: s_mov_b32 s6, 0 117; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 118; CI-NEXT: v_mov_b32_e32 v1, 0 119; CI-NEXT: s_waitcnt lgkmcnt(0) 120; CI-NEXT: s_mov_b64 s[4:5], s[2:3] 121; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 122; CI-NEXT: s_brev_b32 s2, -2 123; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 124; CI-NEXT: s_waitcnt vmcnt(0) 125; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] 126; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] 127; CI-NEXT: v_bfi_b32 v2, s2, v8, v3 128; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 129; CI-NEXT: s_mov_b64 s[2:3], s[6:7] 130; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc 131; CI-NEXT: v_mov_b32_e32 v2, 0 132; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] 133; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 134; CI-NEXT: s_endpgm 135 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 136 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid 137 %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid 138 %x = load double, double addrspace(1)* %gep 139 %result = call double @llvm.round.f64(double %x) #1 140 store double %result, double addrspace(1)* %out.gep 141 ret void 142} 143 144define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { 145; SI-LABEL: round_v2f64: 146; SI: ; %bb.0: 147; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 148; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 149; SI-NEXT: s_mov_b32 s6, -1 150; SI-NEXT: s_movk_i32 s7, 0xfc01 151; SI-NEXT: s_mov_b32 s3, 0xfffff 152; SI-NEXT: s_waitcnt 
lgkmcnt(0) 153; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 154; SI-NEXT: s_add_i32 s14, s0, s7 155; SI-NEXT: s_mov_b32 s2, s6 156; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14 157; SI-NEXT: s_brev_b32 s15, 1 158; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] 159; SI-NEXT: s_and_b32 s0, s11, s15 160; SI-NEXT: s_cmp_lt_i32 s14, 0 161; SI-NEXT: v_mov_b32_e32 v0, s13 162; SI-NEXT: v_mov_b32_e32 v1, s0 163; SI-NEXT: s_cselect_b64 vcc, -1, 0 164; SI-NEXT: s_cmp_gt_i32 s14, 51 165; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 166; SI-NEXT: v_mov_b32_e32 v1, s11 167; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 168; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] 169; SI-NEXT: v_mov_b32_e32 v0, s12 170; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 171; SI-NEXT: v_mov_b32_e32 v2, s10 172; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 173; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] 174; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 175; SI-NEXT: s_add_i32 s7, s0, s7 176; SI-NEXT: s_brev_b32 s10, -2 177; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 178; SI-NEXT: v_mov_b32_e32 v4, s11 179; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 180; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 181; SI-NEXT: v_bfi_b32 v4, s10, v6, v4 182; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] 183; SI-NEXT: s_and_b32 s0, s9, s15 184; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 185; SI-NEXT: v_mov_b32_e32 v2, 0 186; SI-NEXT: s_cmp_lt_i32 s7, 0 187; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 188; SI-NEXT: v_mov_b32_e32 v0, s3 189; SI-NEXT: v_mov_b32_e32 v1, s0 190; SI-NEXT: s_cselect_b64 vcc, -1, 0 191; SI-NEXT: s_cmp_gt_i32 s7, 51 192; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 193; SI-NEXT: v_mov_b32_e32 v1, s9 194; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 195; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] 196; SI-NEXT: v_mov_b32_e32 v0, s2 197; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 198; SI-NEXT: v_mov_b32_e32 v4, s8 199; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 200; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] 201; SI-NEXT: v_mov_b32_e32 v7, s9 
202; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 203; SI-NEXT: v_bfi_b32 v6, s10, v6, v7 204; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc 205; SI-NEXT: v_mov_b32_e32 v4, 0 206; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] 207; SI-NEXT: s_mov_b32 s7, 0xf000 208; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 209; SI-NEXT: s_endpgm 210; 211; CI-LABEL: round_v2f64: 212; CI: ; %bb.0: 213; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 214; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 215; CI-NEXT: s_brev_b32 s2, -2 216; CI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 217; CI-NEXT: s_mov_b32 s3, 0xf000 218; CI-NEXT: s_waitcnt lgkmcnt(0) 219; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] 220; CI-NEXT: v_mov_b32_e32 v4, s7 221; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] 222; CI-NEXT: v_bfi_b32 v4, s2, v6, v4 223; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 224; CI-NEXT: v_mov_b32_e32 v2, 0 225; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 226; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] 227; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 228; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5] 229; CI-NEXT: v_mov_b32_e32 v7, s5 230; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 231; CI-NEXT: v_bfi_b32 v6, s2, v6, v7 232; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc 233; CI-NEXT: v_mov_b32_e32 v0, 0 234; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] 235; CI-NEXT: s_mov_b32 s2, -1 236; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 237; CI-NEXT: s_endpgm 238 %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 239 store <2 x double> %result, <2 x double> addrspace(1)* %out 240 ret void 241} 242 243define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { 244; SI-LABEL: round_v4f64: 245; SI: ; %bb.0: 246; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 247; SI-NEXT: s_mov_b32 s14, -1 248; SI-NEXT: s_movk_i32 s18, 0xfc01 249; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 250; SI-NEXT: s_mov_b32 s3, 0xfffff 251; SI-NEXT: s_waitcnt lgkmcnt(0) 252; SI-NEXT: s_bfe_u32 
s0, s7, 0xb0014 253; SI-NEXT: s_add_i32 s19, s0, s18 254; SI-NEXT: s_mov_b32 s2, s14 255; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 256; SI-NEXT: s_brev_b32 s20, 1 257; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] 258; SI-NEXT: s_and_b32 s0, s7, s20 259; SI-NEXT: s_cmp_lt_i32 s19, 0 260; SI-NEXT: v_mov_b32_e32 v0, s17 261; SI-NEXT: v_mov_b32_e32 v1, s0 262; SI-NEXT: s_cselect_b64 vcc, -1, 0 263; SI-NEXT: s_cmp_gt_i32 s19, 51 264; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 265; SI-NEXT: v_mov_b32_e32 v1, s7 266; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 267; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] 268; SI-NEXT: v_mov_b32_e32 v0, s16 269; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 270; SI-NEXT: v_mov_b32_e32 v2, s6 271; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 272; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] 273; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 274; SI-NEXT: s_add_i32 s17, s0, s18 275; SI-NEXT: s_brev_b32 s16, -2 276; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 277; SI-NEXT: v_mov_b32_e32 v4, s7 278; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 279; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 280; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 281; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] 282; SI-NEXT: s_and_b32 s0, s5, s20 283; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 284; SI-NEXT: v_mov_b32_e32 v2, 0 285; SI-NEXT: s_cmp_lt_i32 s17, 0 286; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 287; SI-NEXT: v_mov_b32_e32 v0, s7 288; SI-NEXT: v_mov_b32_e32 v1, s0 289; SI-NEXT: s_cselect_b64 vcc, -1, 0 290; SI-NEXT: s_cmp_gt_i32 s17, 51 291; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 292; SI-NEXT: v_mov_b32_e32 v1, s5 293; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 294; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] 295; SI-NEXT: v_mov_b32_e32 v0, s6 296; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 297; SI-NEXT: v_mov_b32_e32 v4, s4 298; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 299; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 300; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] 301; SI-NEXT: s_add_i32 s6, 
s0, s18 302; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 303; SI-NEXT: v_mov_b32_e32 v6, s5 304; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 305; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] 306; SI-NEXT: s_and_b32 s0, s11, s20 307; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 308; SI-NEXT: s_cmp_lt_i32 s6, 0 309; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc 310; SI-NEXT: v_mov_b32_e32 v4, s5 311; SI-NEXT: v_mov_b32_e32 v5, s0 312; SI-NEXT: s_cselect_b64 vcc, -1, 0 313; SI-NEXT: s_cmp_gt_i32 s6, 51 314; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 315; SI-NEXT: v_mov_b32_e32 v5, s11 316; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 317; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] 318; SI-NEXT: v_mov_b32_e32 v4, s4 319; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 320; SI-NEXT: v_mov_b32_e32 v6, s10 321; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] 322; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] 323; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 324; SI-NEXT: s_add_i32 s4, s0, s18 325; SI-NEXT: v_mov_b32_e32 v10, s11 326; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 327; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 328; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 329; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] 330; SI-NEXT: s_and_b32 s0, s9, s20 331; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc 332; SI-NEXT: v_mov_b32_e32 v6, 0 333; SI-NEXT: s_cmp_lt_i32 s4, 0 334; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] 335; SI-NEXT: v_mov_b32_e32 v4, s3 336; SI-NEXT: v_mov_b32_e32 v5, s0 337; SI-NEXT: s_cselect_b64 vcc, -1, 0 338; SI-NEXT: s_cmp_gt_i32 s4, 51 339; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 340; SI-NEXT: v_mov_b32_e32 v5, s9 341; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 342; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] 343; SI-NEXT: v_mov_b32_e32 v4, s2 344; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 345; SI-NEXT: v_mov_b32_e32 v10, s8 346; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] 347; SI-NEXT: v_add_f64 v[10:11], s[8:9], -v[4:5] 348; SI-NEXT: v_mov_b32_e32 v13, s9 349; SI-NEXT: v_cmp_ge_f64_e64 vcc, 
|v[10:11]|, 0.5 350; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 351; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc 352; SI-NEXT: v_mov_b32_e32 v10, 0 353; SI-NEXT: v_mov_b32_e32 v8, 0 354; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] 355; SI-NEXT: s_mov_b32 s15, 0xf000 356; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] 357; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 358; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 359; SI-NEXT: s_endpgm 360; 361; CI-LABEL: round_v4f64: 362; CI: ; %bb.0: 363; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 364; CI-NEXT: s_brev_b32 s12, -2 365; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 366; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 367; CI-NEXT: s_mov_b32 s3, 0xf000 368; CI-NEXT: s_waitcnt lgkmcnt(0) 369; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] 370; CI-NEXT: v_mov_b32_e32 v4, s7 371; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] 372; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 373; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 374; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] 375; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 376; CI-NEXT: v_mov_b32_e32 v2, 0 377; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 378; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9] 379; CI-NEXT: v_mov_b32_e32 v4, s5 380; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 381; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 382; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 383; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11] 384; CI-NEXT: v_mov_b32_e32 v10, s11 385; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] 386; CI-NEXT: v_bfi_b32 v10, s12, v12, v10 387; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 388; CI-NEXT: v_mov_b32_e32 v6, 0 389; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc 390; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] 391; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] 392; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11] 393; CI-NEXT: v_mov_b32_e32 v13, s9 394; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 395; CI-NEXT: v_bfi_b32 v12, s12, v12, v13 396; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, 
vcc 397; CI-NEXT: v_mov_b32_e32 v4, 0 398; CI-NEXT: v_mov_b32_e32 v0, 0 399; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] 400; CI-NEXT: s_mov_b32 s2, -1 401; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] 402; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 403; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 404; CI-NEXT: s_endpgm 405 %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 406 store <4 x double> %result, <4 x double> addrspace(1)* %out 407 ret void 408} 409 410define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { 411; SI-LABEL: round_v8f64: 412; SI: ; %bb.0: 413; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 414; SI-NEXT: s_mov_b32 s22, -1 415; SI-NEXT: s_movk_i32 s28, 0xfc01 416; SI-NEXT: s_mov_b32 s21, 0xfffff 417; SI-NEXT: s_mov_b32 s20, s22 418; SI-NEXT: s_waitcnt lgkmcnt(0) 419; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 420; SI-NEXT: s_add_i32 s23, s2, s28 421; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s23 422; SI-NEXT: s_brev_b32 s29, 1 423; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] 424; SI-NEXT: s_and_b32 s2, s7, s29 425; SI-NEXT: s_cmp_lt_i32 s23, 0 426; SI-NEXT: v_mov_b32_e32 v0, s25 427; SI-NEXT: v_mov_b32_e32 v1, s2 428; SI-NEXT: s_cselect_b64 vcc, -1, 0 429; SI-NEXT: s_cmp_gt_i32 s23, 51 430; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 431; SI-NEXT: v_mov_b32_e32 v1, s7 432; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 433; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] 434; SI-NEXT: v_mov_b32_e32 v0, s24 435; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 436; SI-NEXT: v_mov_b32_e32 v2, s6 437; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] 438; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] 439; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 440; SI-NEXT: s_add_i32 s24, s2, s28 441; SI-NEXT: s_brev_b32 s23, -2 442; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 443; SI-NEXT: v_mov_b32_e32 v4, s7 444; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 445; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 446; SI-NEXT: v_bfi_b32 
v4, s23, v8, v4 447; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] 448; SI-NEXT: s_and_b32 s2, s5, s29 449; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 450; SI-NEXT: v_mov_b32_e32 v2, 0 451; SI-NEXT: s_cmp_lt_i32 s24, 0 452; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 453; SI-NEXT: v_mov_b32_e32 v0, s7 454; SI-NEXT: v_mov_b32_e32 v1, s2 455; SI-NEXT: s_cselect_b64 vcc, -1, 0 456; SI-NEXT: s_cmp_gt_i32 s24, 51 457; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 458; SI-NEXT: v_mov_b32_e32 v1, s5 459; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 460; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] 461; SI-NEXT: v_mov_b32_e32 v0, s6 462; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 463; SI-NEXT: v_mov_b32_e32 v4, s4 464; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] 465; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] 466; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 467; SI-NEXT: s_add_i32 s6, s2, s28 468; SI-NEXT: v_mov_b32_e32 v6, s5 469; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 470; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 471; SI-NEXT: v_bfi_b32 v6, s23, v8, v6 472; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] 473; SI-NEXT: s_and_b32 s2, s11, s29 474; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc 475; SI-NEXT: v_mov_b32_e32 v4, 0 476; SI-NEXT: s_cmp_lt_i32 s6, 0 477; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] 478; SI-NEXT: v_mov_b32_e32 v4, s5 479; SI-NEXT: v_mov_b32_e32 v5, s2 480; SI-NEXT: s_cselect_b64 vcc, -1, 0 481; SI-NEXT: s_cmp_gt_i32 s6, 51 482; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 483; SI-NEXT: v_mov_b32_e32 v5, s11 484; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 485; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] 486; SI-NEXT: v_mov_b32_e32 v4, s4 487; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 488; SI-NEXT: v_mov_b32_e32 v6, s10 489; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] 490; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] 491; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 492; SI-NEXT: s_add_i32 s6, s2, s28 493; SI-NEXT: v_mov_b32_e32 v9, s11 494; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 
495; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 496; SI-NEXT: v_bfi_b32 v9, s23, v8, v9 497; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] 498; SI-NEXT: s_and_b32 s2, s9, s29 499; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc 500; SI-NEXT: v_mov_b32_e32 v6, 0 501; SI-NEXT: s_cmp_lt_i32 s6, 0 502; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] 503; SI-NEXT: v_mov_b32_e32 v4, s5 504; SI-NEXT: v_mov_b32_e32 v5, s2 505; SI-NEXT: s_cselect_b64 vcc, -1, 0 506; SI-NEXT: s_cmp_gt_i32 s6, 51 507; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 508; SI-NEXT: v_mov_b32_e32 v5, s9 509; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 510; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] 511; SI-NEXT: v_mov_b32_e32 v4, s4 512; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 513; SI-NEXT: v_mov_b32_e32 v9, s8 514; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3] 515; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 516; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5] 517; SI-NEXT: s_add_i32 s4, s2, s28 518; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 519; SI-NEXT: v_mov_b32_e32 v11, s9 520; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5 521; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3] 522; SI-NEXT: s_and_b32 s2, s15, s29 523; SI-NEXT: v_bfi_b32 v11, s23, v8, v11 524; SI-NEXT: s_cmp_lt_i32 s4, 0 525; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc 526; SI-NEXT: v_mov_b32_e32 v9, 0 527; SI-NEXT: s_cselect_b64 vcc, -1, 0 528; SI-NEXT: s_cmp_gt_i32 s4, 51 529; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[9:10] 530; SI-NEXT: v_mov_b32_e32 v10, s2 531; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 532; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 533; SI-NEXT: s_add_i32 s6, s4, s28 534; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6 535; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5] 536; SI-NEXT: s_and_b32 s4, s13, s29 537; SI-NEXT: v_mov_b32_e32 v9, s25 538; SI-NEXT: s_cmp_lt_i32 s6, 0 539; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc 540; SI-NEXT: v_mov_b32_e32 v10, s4 541; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 542; SI-NEXT: s_cmp_gt_i32 s6, 51 543; SI-NEXT: s_cselect_b64 
s[6:7], -1, 0 544; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 545; SI-NEXT: s_add_i32 s25, s8, s28 546; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s25 547; SI-NEXT: s_andn2_b64 s[10:11], s[18:19], s[8:9] 548; SI-NEXT: s_and_b32 s8, s19, s29 549; SI-NEXT: v_mov_b32_e32 v9, s27 550; SI-NEXT: s_cmp_lt_i32 s25, 0 551; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5] 552; SI-NEXT: v_mov_b32_e32 v9, s11 553; SI-NEXT: v_mov_b32_e32 v10, s8 554; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 555; SI-NEXT: s_cmp_gt_i32 s25, 51 556; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] 557; SI-NEXT: v_mov_b32_e32 v10, s19 558; SI-NEXT: v_mov_b32_e32 v11, s10 559; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 560; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11] 561; SI-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[8:9] 562; SI-NEXT: v_mov_b32_e32 v11, s18 563; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014 564; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11] 565; SI-NEXT: s_add_i32 s10, s8, s28 566; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 567; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9] 568; SI-NEXT: s_and_b32 s8, s17, s29 569; SI-NEXT: s_cmp_lt_i32 s10, 0 570; SI-NEXT: v_mov_b32_e32 v11, s21 571; SI-NEXT: v_mov_b32_e32 v12, s8 572; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 573; SI-NEXT: s_cmp_gt_i32 s10, 51 574; SI-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[8:9] 575; SI-NEXT: v_mov_b32_e32 v12, s17 576; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 577; SI-NEXT: v_cndmask_b32_e64 v14, v11, v12, s[10:11] 578; SI-NEXT: v_mov_b32_e32 v11, s20 579; SI-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[8:9] 580; SI-NEXT: v_mov_b32_e32 v12, s16 581; SI-NEXT: v_cndmask_b32_e64 v13, v11, v12, s[10:11] 582; SI-NEXT: v_add_f64 v[11:12], s[16:17], -v[13:14] 583; SI-NEXT: v_mov_b32_e32 v19, s17 584; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5 585; SI-NEXT: v_mov_b32_e32 v11, s19 586; SI-NEXT: v_bfi_b32 v20, s23, v8, v11 587; SI-NEXT: v_add_f64 v[11:12], s[18:19], -v[9:10] 588; SI-NEXT: v_bfi_b32 v19, s23, v8, v19 589; SI-NEXT: v_cmp_ge_f64_e64 
s[10:11], |v[11:12]|, 0.5 590; SI-NEXT: v_mov_b32_e32 v11, 0 591; SI-NEXT: v_cndmask_b32_e64 v12, 0, v20, s[10:11] 592; SI-NEXT: v_add_f64 v[11:12], v[9:10], v[11:12] 593; SI-NEXT: v_cndmask_b32_e64 v10, 0, v19, s[8:9] 594; SI-NEXT: v_mov_b32_e32 v9, 0 595; SI-NEXT: v_mov_b32_e32 v16, s15 596; SI-NEXT: v_add_f64 v[9:10], v[13:14], v[9:10] 597; SI-NEXT: v_mov_b32_e32 v13, s24 598; SI-NEXT: v_cndmask_b32_e64 v14, v15, v16, s[2:3] 599; SI-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc 600; SI-NEXT: v_mov_b32_e32 v15, s14 601; SI-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[2:3] 602; SI-NEXT: v_mov_b32_e32 v15, s15 603; SI-NEXT: v_bfi_b32 v19, s23, v8, v15 604; SI-NEXT: v_mov_b32_e32 v15, s26 605; SI-NEXT: v_mov_b32_e32 v18, s13 606; SI-NEXT: v_cndmask_b32_e64 v15, v15, 0, s[4:5] 607; SI-NEXT: v_mov_b32_e32 v16, s12 608; SI-NEXT: v_cndmask_b32_e64 v18, v17, v18, s[6:7] 609; SI-NEXT: v_cndmask_b32_e64 v17, v15, v16, s[6:7] 610; SI-NEXT: v_mov_b32_e32 v15, s13 611; SI-NEXT: v_bfi_b32 v8, s23, v8, v15 612; SI-NEXT: v_add_f64 v[15:16], s[12:13], -v[17:18] 613; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 614; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5 615; SI-NEXT: v_add_f64 v[15:16], s[14:15], -v[13:14] 616; SI-NEXT: s_mov_b32 s23, 0xf000 617; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5 618; SI-NEXT: v_mov_b32_e32 v15, 0 619; SI-NEXT: v_cndmask_b32_e64 v16, 0, v19, s[0:1] 620; SI-NEXT: v_add_f64 v[15:16], v[13:14], v[15:16] 621; SI-NEXT: v_cndmask_b32_e32 v14, 0, v8, vcc 622; SI-NEXT: v_mov_b32_e32 v13, 0 623; SI-NEXT: v_add_f64 v[13:14], v[17:18], v[13:14] 624; SI-NEXT: s_waitcnt lgkmcnt(0) 625; SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48 626; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32 627; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 628; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 629; SI-NEXT: s_endpgm 630; 631; CI-LABEL: round_v8f64: 632; CI: ; %bb.0: 633; CI-NEXT: s_load_dwordx16 s[4:19], 
s[0:1], 0x19 634; CI-NEXT: s_brev_b32 s2, -2 635; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 636; CI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 637; CI-NEXT: s_mov_b32 s23, 0xf000 638; CI-NEXT: s_waitcnt lgkmcnt(0) 639; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] 640; CI-NEXT: v_mov_b32_e32 v4, s7 641; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] 642; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 643; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 644; CI-NEXT: v_mov_b32_e32 v2, 0 645; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc 646; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5] 647; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] 648; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5] 649; CI-NEXT: v_mov_b32_e32 v6, s5 650; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 651; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 652; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc 653; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11] 654; CI-NEXT: v_mov_b32_e32 v0, 0 655; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] 656; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[6:7] 657; CI-NEXT: v_mov_b32_e32 v8, s11 658; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 659; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 660; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc 661; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] 662; CI-NEXT: v_mov_b32_e32 v4, 0 663; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] 664; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[8:9] 665; CI-NEXT: v_mov_b32_e32 v10, s9 666; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 667; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 668; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc 669; CI-NEXT: v_mov_b32_e32 v4, 0 670; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] 671; CI-NEXT: v_mov_b32_e32 v8, s15 672; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 673; CI-NEXT: v_trunc_f64_e32 v[8:9], s[16:17] 674; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] 675; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[8:9] 676; CI-NEXT: v_mov_b32_e32 v19, s19 677; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 678; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[10:11] 679; CI-NEXT: v_mov_b32_e32 
v17, s17 680; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 681; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 682; CI-NEXT: v_trunc_f64_e32 v[12:13], s[12:13] 683; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 684; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] 685; CI-NEXT: v_mov_b32_e32 v14, 0 686; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] 687; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc 688; CI-NEXT: v_mov_b32_e32 v14, 0 689; CI-NEXT: v_mov_b32_e32 v17, s13 690; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] 691; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[12:13] 692; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 693; CI-NEXT: v_trunc_f64_e32 v[16:17], s[14:15] 694; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 695; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[16:17] 696; CI-NEXT: s_mov_b32 s22, -1 697; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 698; CI-NEXT: v_mov_b32_e32 v14, 0 699; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] 700; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15] 701; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc 702; CI-NEXT: v_mov_b32_e32 v16, 0 703; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] 704; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48 705; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32 706; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 707; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 708; CI-NEXT: s_endpgm 709 %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 710 store <8 x double> %result, <8 x double> addrspace(1)* %out 711 ret void 712} 713 714declare i32 @llvm.amdgcn.workitem.id.x() #1 715 716declare double @llvm.round.f64(double) #1 717declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 718declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 719declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 720 721attributes #0 = { nounwind } 722attributes #1 = { nounwind readnone } 723