; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s

define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_saddsat_i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0
; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0
; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i8:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_saddsat_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT: v_add_u16_e32 v1, v0, v1
; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}

define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_saddsat_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}

define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT: v_add_u16_e32 v4, v3, v2
; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3
; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4
; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT: v_add_u16_e32 v1, v0, v1
; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1
; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v2i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v1 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}

define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v3i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v3i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_add_u16_e32 v6, v5, v4
; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT: v_add_u16_e32 v3, v1, v3
; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT: v_add_u16_e32 v2, v0, v2
; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v3i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp
; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}

define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT: v_add_u16_e32 v6, v5, v4
; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6
; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT: v_add_u16_e32 v2, v0, v2
; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2
; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT: v_add_u16_e32 v5, v4, v2
; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4
; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5
; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT: v_add_u16_e32 v3, v1, v3
; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3
; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v4i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp
; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}

define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_saddsat_v2i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3
; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2
; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3
; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_saddsat_v2i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v2 clamp
; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v3 clamp
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}

define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_saddsat_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_saddsat_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_saddsat_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}

declare i8 @llvm.sadd.sat.i8(i8, i8) #0
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.sadd.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare i64 @llvm.sadd.sat.i64(i64, i64) #0