; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s

define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
; SI-LABEL: round_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01
; SI-NEXT: s_mov_b32 s0, s10
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
; SI-NEXT: s_andn2_b64 s[2:3], s[6:7], s[0:1]
; SI-NEXT: s_and_b32 s0, s7, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s4, 0
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s4, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: v_bfi_b32 v4, s0, v4, v5
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: round_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_brev_b32 s5, -2
; CI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3]
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1]
; CI-NEXT: v_bfi_b32 v4, s5, v4, v5
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT: s_mov_b32 s4, s0
; CI-NEXT: s_mov_b32 s5, s1
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT: s_endpgm
  %result = call double @llvm.round.f64(double %x) #1
  store double %result, double addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: v_round_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5
; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_and_b32_e32 v5, v3, v5
; SI-NEXT: v_and_b32_e32 v4, v2, v4
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 51, v6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
; SI-NEXT: v_bfi_b32 v2, s5, v8, v3
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; CI-LABEL: v_round_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
; CI-NEXT: v_bfi_b32 v2, s2, v8, v3
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
  %x = load double, double addrspace(1)* %gep
  %result = call double @llvm.round.f64(double %x) #1
  store double %result, double addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
; SI-LABEL: round_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1]
; SI-NEXT: s_and_b32 s0, s11, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01
; SI-NEXT: s_brev_b32 s7, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10
; SI-NEXT: v_bfi_b32 v4, s7, v6, v4
; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT: s_and_b32 s0, s9, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1]
; SI-NEXT: v_mov_b32_e32 v7, s9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: v_bfi_b32 v6, s7, v6, v7
; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: round_v2f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_mov_b32_e32 v6, 0x3ff00000
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v4, s7
; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT: v_bfi_b32 v4, s2, v6, v4
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5]
; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5]
; CI-NEXT: v_mov_b32_e32 v7, s5
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT: v_bfi_b32 v6, s2, v6, v7
; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
  store <2 x double> %result, <2 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
; SI-LABEL: round_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT: s_add_i32 s18, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s18
; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1]
; SI-NEXT: s_and_b32 s0, s7, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s18, 0
; SI-NEXT: v_mov_b32_e32 v0, s17
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s18, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014
; SI-NEXT: s_add_i32 s17, s0, 0xfffffc01
; SI-NEXT: s_brev_b32 s16, -2
; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17
; SI-NEXT: v_bfi_b32 v4, s16, v12, v4
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1]
; SI-NEXT: s_and_b32 s0, s5, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s17, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s17, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_add_i32 s6, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1]
; SI-NEXT: s_and_b32 s0, s11, 0x80000000
; SI-NEXT: v_bfi_b32 v6, s16, v12, v6
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_mov_b32_e32 v5, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s11
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01
; SI-NEXT: v_mov_b32_e32 v10, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4
; SI-NEXT: v_bfi_b32 v10, s16, v12, v10
; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT: s_and_b32 s0, s9, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_cmp_lt_i32 s4, 0
; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s4, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: v_mov_b32_e32 v10, s8
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
; SI-NEXT: v_add_f64 v[10:11], s[8:9], -v[4:5]
; SI-NEXT: v_mov_b32_e32 v13, s9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
; SI-NEXT: v_bfi_b32 v12, s16, v12, v13
; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc
; SI-NEXT: v_mov_b32_e32 v10, 0
; SI-NEXT: v_mov_b32_e32 v8, 0
; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11]
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9]
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: round_v4f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; CI-NEXT: s_brev_b32 s12, -2
; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v4, s7
; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT: v_bfi_b32 v4, s12, v12, v4
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5]
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9]
; CI-NEXT: v_mov_b32_e32 v4, s5
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT: v_bfi_b32 v4, s12, v12, v4
; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc
; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11]
; CI-NEXT: v_mov_b32_e32 v10, s11
; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; CI-NEXT: v_bfi_b32 v10, s12, v12, v10
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT: v_mov_b32_e32 v6, 0
; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9]
; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11]
; CI-NEXT: v_mov_b32_e32 v13, s9
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT: v_bfi_b32 v12, s12, v12, v13
; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc
; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5]
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1]
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT: s_endpgm
  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
  store <4 x double> %result, <4 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
; SI-LABEL: round_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
; SI-NEXT: s_mov_b32 s22, -1
; SI-NEXT: s_mov_b32 s21, 0xfffff
; SI-NEXT: s_mov_b32 s20, s22
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014
; SI-NEXT: s_add_i32 s26, s2, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26
; SI-NEXT: s_and_b32 s23, s7, 0x80000000
; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3]
; SI-NEXT: s_cmp_lt_i32 s26, 0
; SI-NEXT: v_mov_b32_e32 v0, s25
; SI-NEXT: v_mov_b32_e32 v1, s23
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s26, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s24
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014
; SI-NEXT: s_add_i32 s24, s2, 0xfffffc01
; SI-NEXT: s_brev_b32 s23, -2
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24
; SI-NEXT: v_bfi_b32 v4, s23, v8, v4
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3]
; SI-NEXT: s_and_b32 s2, s5, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s24, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s24, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3]
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014
; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: v_bfi_b32 v6, s23, v8, v6
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3]
; SI-NEXT: s_and_b32 s2, s11, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s11
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: v_mov_b32_e32 v6, s10
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3]
; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014
; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01
; SI-NEXT: v_mov_b32_e32 v9, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: v_bfi_b32 v9, s23, v8, v9
; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3]
; SI-NEXT: s_and_b32 s2, s9, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7]
; SI-NEXT: v_mov_b32_e32 v4, s5
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3]
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: v_mov_b32_e32 v9, s8
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3]
; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014
; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5]
; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4
; SI-NEXT: v_mov_b32_e32 v11, s9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3]
; SI-NEXT: s_and_b32 s2, s15, 0x80000000
; SI-NEXT: v_bfi_b32 v11, s23, v8, v11
; SI-NEXT: s_cmp_lt_i32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc
; SI-NEXT: v_mov_b32_e32 v9, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s4, 51
; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[9:10]
; SI-NEXT: v_mov_b32_e32 v10, s2
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014
; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6
; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5]
; SI-NEXT: s_and_b32 s4, s13, 0x80000000
; SI-NEXT: v_mov_b32_e32 v9, s25
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc
; SI-NEXT: v_mov_b32_e32 v10, s4
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014
; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT: s_andn2_b64 s[28:29], s[18:19], s[8:9]
; SI-NEXT: s_and_b32 s8, s19, 0x80000000
; SI-NEXT: v_mov_b32_e32 v9, s27
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5]
; SI-NEXT: v_mov_b32_e32 v9, s29
; SI-NEXT: v_mov_b32_e32 v10, s8
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
; SI-NEXT: v_mov_b32_e32 v10, s19
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11]
; SI-NEXT: v_mov_b32_e32 v9, s28
; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v11, s18
; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014
; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11]
; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9]
; SI-NEXT: s_and_b32 s8, s17, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_mov_b32_e32 v11, s21
; SI-NEXT: v_mov_b32_e32 v12, s8
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[8:9]
; SI-NEXT: v_mov_b32_e32 v12, s17
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v14, v11, v12, s[10:11]
; SI-NEXT: v_mov_b32_e32 v11, s20
; SI-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v12, s16
; SI-NEXT: v_cndmask_b32_e64 v13, v11, v12, s[10:11]
; SI-NEXT: v_add_f64 v[11:12], s[16:17], -v[13:14]
; SI-NEXT: v_mov_b32_e32 v19, s17
; SI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[11:12]|, 0.5
; SI-NEXT: v_mov_b32_e32 v11, s19
; SI-NEXT: v_bfi_b32 v20, s23, v8, v11
; SI-NEXT: v_add_f64 v[11:12], s[18:19], -v[9:10]
; SI-NEXT: v_bfi_b32 v19, s23, v8, v19
; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[11:12]|, 0.5
; SI-NEXT: v_mov_b32_e32 v11, 0
; SI-NEXT: v_cndmask_b32_e64 v12, 0, v20, s[10:11]
; SI-NEXT: v_add_f64 v[11:12], v[9:10], v[11:12]
; SI-NEXT: v_cndmask_b32_e64 v10, 0, v19, s[8:9]
; SI-NEXT: v_mov_b32_e32 v9, 0
; SI-NEXT: v_mov_b32_e32 v16, s15
; SI-NEXT: v_add_f64 v[9:10], v[13:14], v[9:10]
; SI-NEXT: v_mov_b32_e32 v13, s24
; SI-NEXT: v_cndmask_b32_e64 v14, v15, v16, s[2:3]
; SI-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc
; SI-NEXT: v_mov_b32_e32 v15, s14
; SI-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[2:3]
; SI-NEXT: v_mov_b32_e32 v15, s15
; SI-NEXT: v_bfi_b32 v19, s23, v8, v15
; SI-NEXT: v_mov_b32_e32 v15, s26
; SI-NEXT: v_mov_b32_e32 v18, s13
; SI-NEXT: v_cndmask_b32_e64 v15, v15, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v16, s12
; SI-NEXT: v_cndmask_b32_e64 v18, v17, v18, s[6:7]
; SI-NEXT: v_cndmask_b32_e64 v17, v15, v16, s[6:7]
; SI-NEXT: v_mov_b32_e32 v15, s13
; SI-NEXT: v_bfi_b32 v8, s23, v8, v15
; SI-NEXT: v_add_f64 v[15:16], s[12:13], -v[17:18]
; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[15:16]|, 0.5
; SI-NEXT: v_add_f64 v[15:16], s[14:15], -v[13:14]
; SI-NEXT: s_mov_b32 s23, 0xf000
; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[15:16]|, 0.5
; SI-NEXT: v_mov_b32_e32 v15, 0
; SI-NEXT: v_cndmask_b32_e64 v16, 0, v19, s[0:1]
; SI-NEXT: v_add_f64 v[15:16], v[13:14], v[15:16]
; SI-NEXT: v_cndmask_b32_e32 v14, 0, v8, vcc
; SI-NEXT: v_mov_b32_e32 v13, 0
; SI-NEXT: v_add_f64 v[13:14], v[17:18], v[13:14]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[20:23], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[20:23], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-NEXT: s_endpgm
;
; CI-LABEL: round_v8f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; CI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s23, 0xf000
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v4, s7
; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; CI-NEXT: v_bfi_b32 v4, s2, v16, v4
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; CI-NEXT: v_trunc_f64_e32 v[4:5], s[4:5]
; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[4:5]
; CI-NEXT: v_mov_b32_e32 v6, s5
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
; CI-NEXT: v_bfi_b32 v6, s2, v16, v6
; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc
; CI-NEXT: v_trunc_f64_e32 v[6:7], s[10:11]
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1]
; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[6:7]
; CI-NEXT: v_mov_b32_e32 v8, s11
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT: v_bfi_b32 v8, s2, v16, v8
; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc
; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9]
; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5]
; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[8:9]
; CI-NEXT: v_mov_b32_e32 v10, s9
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; CI-NEXT: v_bfi_b32 v10, s2, v16, v10
; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc
; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5]
; CI-NEXT: v_mov_b32_e32 v8, s15
; CI-NEXT: v_bfi_b32 v18, s2, v16, v8
; CI-NEXT: v_trunc_f64_e32 v[8:9], s[16:17]
; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19]
; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[8:9]
; CI-NEXT: v_mov_b32_e32 v19, s19
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[10:11]
; CI-NEXT: v_mov_b32_e32 v17, s17
; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
; CI-NEXT: v_bfi_b32 v19, s2, v16, v19
; CI-NEXT: v_trunc_f64_e32 v[12:13], s[12:13]
; CI-NEXT: v_bfi_b32 v17, s2, v16, v17
; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1]
; CI-NEXT: v_mov_b32_e32 v14, 0
; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15]
; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc
; CI-NEXT: v_mov_b32_e32 v14, 0
; CI-NEXT: v_mov_b32_e32 v17, s13
; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15]
; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[12:13]
; CI-NEXT: v_bfi_b32 v19, s2, v16, v17
; CI-NEXT: v_trunc_f64_e32 v[16:17], s[14:15]
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[16:17]
; CI-NEXT: s_mov_b32 s22, -1
; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
; CI-NEXT: v_mov_b32_e32 v14, 0
; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1]
; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[14:15]
; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc
; CI-NEXT: v_mov_b32_e32 v16, 0
; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17]
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; CI-NEXT: s_endpgm
  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
  store <8 x double> %result, <8 x double> addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare double @llvm.round.f64(double) #1
declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
declare <8 x double> @llvm.round.v8f64(<8 x double>) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }