; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s

; Unsigned saturating subtract of two i8 values through the usub.sat intrinsic.
; The tahiti and gfx1010/gfx1100 checks mask both operands with 0xff first;
; the fiji and gfx900 checks select the low byte with SDWA instead.
define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_usubsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

; Unsigned saturating subtract of two i16 values through the usub.sat intrinsic.
; Only the tahiti checks expand it to mask + max + sub; the other targets are
; expected to emit one clamped 16-bit subtract.
define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_usubsat_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}

; Open-coded pattern (x ashr 15) & (x xor 0x8000), written without the
; intrinsic. The fiji and newer checks expect it to become a single clamped
; 16-bit subtract of 0x8000; the tahiti checks keep the shift/xor/and sequence.
define i16 @usubsat_as_bithack_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x8000
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: usubsat_as_bithack_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = xor i16 %x, 32768
  %result = and i16 %signsplat, %flipsign
  ret i16 %result
}

; Same pattern as usubsat_as_bithack_i16, but the sign bit is flipped with
; `add` instead of `xor`.
define i16 @usubsat_as_bithack2_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack2_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack2_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x8000
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack2_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: usubsat_as_bithack2_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = add i16 %x, 32768
  %result = and i16 %signsplat, %flipsign
  ret i16 %result
}

; The `add` form of the pattern with the operands of the final `and` swapped.
define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
; GFX6-LABEL: usubsat_as_bithack_commute_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: usubsat_as_bithack_commute_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x8000
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: usubsat_as_bithack_commute_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: usubsat_as_bithack_commute_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %signsplat = ashr i16 %x, 15
  %flipsign = add i16 %x, 32768
  %result = and i16 %flipsign, %signsplat
  ret i16 %result
}

; Unsigned saturating subtract of two i32 values. The tahiti checks expect
; max + sub; every other target is expected to emit one clamped subtract.
define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_usubsat_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}

; <2 x i16> saturating subtract. The gfx900 and newer checks expect a single
; packed v_pk_sub_u16; the fiji checks handle the two halves separately and OR
; them together; the tahiti checks work on four separate 32-bit registers.
define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}

; <3 x i16> saturating subtract, an odd element count for the packed 16-bit
; forms. The gfx900 and newer checks expect two packed subtracts.
define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v3i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v4
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v6
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_max_u32_e32 v1, v2, v5
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
; GFX6-NEXT:    v_alignbit_b32 v1, v2, v0, 16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v3i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}

; <4 x i16> saturating subtract. The result is bitcast to <2 x float> before
; being returned, so the function's return type is <2 x float>.
define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_usubsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v5
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v7
; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_max_u32_e32 v1, v2, v6
; GFX6-NEXT:    v_max_u32_e32 v2, v3, v8
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
; GFX8-NEXT:    v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v4i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}

; <2 x i32> saturating subtract; the checks expect one scalar-style sequence
; per element.
define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}

; <3 x i32> saturating subtract.
define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v3i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v5
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v3i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v3i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
  ret <3 x i32> %result
}

; <4 x i32> saturating subtract.
define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v5
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v6
; GFX6-NEXT:    v_max_u32_e32 v3, v3, v7
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v4i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v4i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %result
}

; <8 x i32> saturating subtract.
define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v8i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v8
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v10
; GFX6-NEXT:    v_max_u32_e32 v3, v3, v11
; GFX6-NEXT:    v_max_u32_e32 v4, v4, v12
; GFX6-NEXT:    v_max_u32_e32 v5, v5, v13
; GFX6-NEXT:    v_max_u32_e32 v6, v6, v14
; GFX6-NEXT:    v_max_u32_e32 v7, v7, v15
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v11
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v13
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v14
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v15
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v8i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v11 clamp
; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v12 clamp
; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v8i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v8 clamp
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v9 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v10 clamp
; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v11 clamp
; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v12 clamp
; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v13 clamp
; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v14 clamp
; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v15 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_v8i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v8 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v9 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v10 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v11 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, v4, v12 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v5, v5, v13 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v6, v6, v14 clamp
; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v7, v7, v15 clamp
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
  ret <8 x i32> %result
}

; <16 x i32> saturating subtract. In every check block the operand for the last
; element is loaded from memory addressed through s32 and waited on before the
; final subtract. gfx1010 and gfx1100 have separate check blocks here because
; the load instruction differs (buffer_load_dword vs scratch_load_b32).
define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
; GFX6-LABEL: v_usubsat_v16i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v0, v0, v16
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GFX6-NEXT:    v_max_u32_e32 v1, v1, v17
; GFX6-NEXT:    v_max_u32_e32 v2, v2, v18
; GFX6-NEXT:    v_max_u32_e32 v3, v3, v19
; GFX6-NEXT:    v_max_u32_e32 v4, v4, v20
; GFX6-NEXT:    v_max_u32_e32 v5, v5, v21
; GFX6-NEXT:    v_max_u32_e32 v6, v6, v22
; GFX6-NEXT:    v_max_u32_e32 v7, v7, v23
; GFX6-NEXT:    v_max_u32_e32 v8, v8, v24
; GFX6-NEXT:    v_max_u32_e32 v9, v9, v25
; GFX6-NEXT:    v_max_u32_e32 v10, v10, v26
; GFX6-NEXT:    v_max_u32_e32 v11, v11, v27
; GFX6-NEXT:    v_max_u32_e32 v12, v12, v28
; GFX6-NEXT:    v_max_u32_e32 v13, v13, v29
; GFX6-NEXT:    v_max_u32_e32 v14, v14, v30
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v18
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v19
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v20
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v21
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v22
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v23
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v24
; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v25
; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v26
; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v27
; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v28
; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v29
; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v30
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_max_u32_e32 v15, v15, v16
; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v16i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v18 clamp
; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v19 clamp
; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v20 clamp
; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v21 clamp
; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v22 clamp
; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v23 clamp
; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v8, v24 clamp
; GFX8-NEXT:    v_sub_u32_e64 v9, s[4:5], v9, v25 clamp
; GFX8-NEXT:    v_sub_u32_e64 v10, s[4:5], v10, v26 clamp
; GFX8-NEXT:    v_sub_u32_e64 v11, s[4:5], v11, v27 clamp
; GFX8-NEXT:    v_sub_u32_e64 v12, s[4:5], v12, v28 clamp
; GFX8-NEXT:    v_sub_u32_e64 v13, s[4:5], v13, v29 clamp
; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_v16i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v18 clamp
; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v19 clamp
; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v20 clamp
; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v21 clamp
; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v22 clamp
; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v23 clamp
; GFX9-NEXT:    v_sub_u32_e64 v8, v8, v24 clamp
; GFX9-NEXT:    v_sub_u32_e64 v9, v9, v25 clamp
; GFX9-NEXT:    v_sub_u32_e64 v10, v10, v26 clamp
; GFX9-NEXT:    v_sub_u32_e64 v11, v11, v27 clamp
; GFX9-NEXT:    v_sub_u32_e64 v12, v12, v28 clamp
; GFX9-NEXT:    v_sub_u32_e64 v13, v13, v29 clamp
; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_usubsat_v16i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_usubsat_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v31, off, s32
; GFX11-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v2, v2, v18 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v3, v3, v19 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v4, v4, v20 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v5, v5, v21 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v6, v6, v22 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v7, v7, v23 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v8, v8, v24 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v9, v9, v25 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v10, v10, v26 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v11, v11, v27 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v12, v12, v28 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v13, v13, v29 clamp
; GFX11-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
  ret <16 x i32> %result
}


; i64 saturating subtract. No target's checks use a clamp modifier here; all of
; them expect a subtract-with-borrow pair, an unsigned 64-bit compare of the
; difference against lhs, and a select of 0 when that compare is true.
define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_usubsat_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v0, v2
; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v2
; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_usubsat_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_usubsat_i64:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
; GFX10PLUS-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}

; Intrinsic declarations for every type exercised above.
declare i8 @llvm.usub.sat.i8(i8, i8) #0
declare i16 @llvm.usub.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.usub.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
declare <8 x i32> @llvm.usub.sat.v8i32(<8 x i32>, <8 x i32>) #0
declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0
declare i64 @llvm.usub.sat.i64(i64, i64) #0

attributes #0 = { nounwind readnone speculatable willreturn }