1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; FIXME: Manually added checks for metadata nodes at bottom
3; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s
4; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s
5
6target datalayout = "A5"
7
8define amdgpu_kernel void @kern_noargs() {
9; HSA-LABEL: @kern_noargs(
10; HSA-NEXT:    ret void
11;
12; MESA-LABEL: @kern_noargs(
13; MESA-NEXT:    ret void
14;
15  ret void
16}
17
18define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
19; HSA-LABEL: @kern_i8(
20; HSA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
21; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
22; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
23; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
24; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
25; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
26; HSA-NEXT:    ret void
27;
28; MESA-LABEL: @kern_i8(
29; MESA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
30; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
31; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
32; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
33; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
34; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
35; MESA-NEXT:    ret void
36;
37  store i8 %arg, i8 addrspace(1)* undef, align 1
38  ret void
39}
40
41define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
42; HSA-LABEL: @kern_i16(
43; HSA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
44; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
45; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
46; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
47; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
48; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
49; HSA-NEXT:    ret void
50;
51; MESA-LABEL: @kern_i16(
52; MESA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
53; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
54; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
55; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
56; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
57; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
58; MESA-NEXT:    ret void
59;
60  store i16 %arg, i16 addrspace(1)* undef, align 1
61  ret void
62}
63
64define amdgpu_kernel void @kern_f16(half %arg) #0 {
65; HSA-LABEL: @kern_f16(
66; HSA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
67; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
68; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
69; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
70; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
71; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
72; HSA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
73; HSA-NEXT:    ret void
74;
75; MESA-LABEL: @kern_f16(
76; MESA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
77; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
78; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
79; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
80; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
81; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
82; MESA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
83; MESA-NEXT:    ret void
84;
85  store half %arg, half addrspace(1)* undef, align 1
86  ret void
87}
88
89define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
90; HSA-LABEL: @kern_zeroext_i8(
91; HSA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
92; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
93; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
94; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
95; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
96; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
97; HSA-NEXT:    ret void
98;
99; MESA-LABEL: @kern_zeroext_i8(
100; MESA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
101; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
102; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
103; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
104; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
105; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
106; MESA-NEXT:    ret void
107;
108  store i8 %arg, i8 addrspace(1)* undef, align 1
109  ret void
110}
111
112define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
113; HSA-LABEL: @kern_zeroext_i16(
114; HSA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
115; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
116; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
117; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
118; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
119; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
120; HSA-NEXT:    ret void
121;
122; MESA-LABEL: @kern_zeroext_i16(
123; MESA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
124; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
125; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
126; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
127; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
128; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
129; MESA-NEXT:    ret void
130;
131  store i16 %arg, i16 addrspace(1)* undef, align 1
132  ret void
133}
134
135define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
136; HSA-LABEL: @kern_signext_i8(
137; HSA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
138; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
139; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
140; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
141; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
142; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
143; HSA-NEXT:    ret void
144;
145; MESA-LABEL: @kern_signext_i8(
146; MESA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
147; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
148; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
149; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
150; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
151; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
152; MESA-NEXT:    ret void
153;
154  store i8 %arg, i8 addrspace(1)* undef, align 1
155  ret void
156}
157
158define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
159; HSA-LABEL: @kern_signext_i16(
160; HSA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
161; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
162; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
163; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
164; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
165; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
166; HSA-NEXT:    ret void
167;
168; MESA-LABEL: @kern_signext_i16(
169; MESA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
170; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
171; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
172; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
173; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
174; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
175; MESA-NEXT:    ret void
176;
177  store i16 %arg, i16 addrspace(1)* undef, align 1
178  ret void
179}
180
181define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
182; HSA-LABEL: @kern_i8_i8(
183; HSA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
184; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
185; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
186; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
187; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
188; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
189; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
190; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
191; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
192; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
193; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
194; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
195; HSA-NEXT:    ret void
196;
197; MESA-LABEL: @kern_i8_i8(
198; MESA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
199; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
200; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
201; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
202; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
203; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
204; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
205; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
206; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
207; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
208; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
209; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
210; MESA-NEXT:    ret void
211;
212  store volatile i8 %arg0, i8 addrspace(1)* undef, align 1
213  store volatile i8 %arg1, i8 addrspace(1)* undef, align 1
214  ret void
215}
216
217define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
218; HSA-LABEL: @kern_v3i8(
219; HSA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
220; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
221; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
222; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
223; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
224; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
225; HSA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
226; HSA-NEXT:    ret void
227;
228; MESA-LABEL: @kern_v3i8(
229; MESA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
230; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
231; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
232; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
233; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
234; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
235; MESA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
236; MESA-NEXT:    ret void
237;
238  store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4
239  ret void
240}
241
242define amdgpu_kernel void @kern_i24(i24 %arg0) {
243; HSA-LABEL: @kern_i24(
244; HSA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
245; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
246; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
247; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
248; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
249; HSA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
250; HSA-NEXT:    ret void
251;
252; MESA-LABEL: @kern_i24(
253; MESA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
254; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
255; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
256; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
257; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
258; MESA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
259; MESA-NEXT:    ret void
260;
261  store i24 %arg0, i24 addrspace(1)* undef
262  ret void
263}
264
265define amdgpu_kernel void @kern_i32(i32 %arg0) {
266; HSA-LABEL: @kern_i32(
267; HSA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
268; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0
269; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
270; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
271; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
272; HSA-NEXT:    ret void
273;
274; MESA-LABEL: @kern_i32(
275; MESA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
276; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
277; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
278; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
279; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
280; MESA-NEXT:    ret void
281;
282  store i32 %arg0, i32 addrspace(1)* undef
283  ret void
284}
285
286define amdgpu_kernel void @kern_f32(float %arg0) {
287; HSA-LABEL: @kern_f32(
288; HSA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
289; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0
290; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
291; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
292; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
293; HSA-NEXT:    ret void
294;
295; MESA-LABEL: @kern_f32(
296; MESA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
297; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
298; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
299; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
300; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
301; MESA-NEXT:    ret void
302;
303  store float %arg0, float addrspace(1)* undef
304  ret void
305}
306
307define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
308; HSA-LABEL: @kern_v3i32(
309; HSA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
310; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
311; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
312; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
313; HSA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
314; HSA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
315; HSA-NEXT:    ret void
316;
317; MESA-LABEL: @kern_v3i32(
318; MESA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
319; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
320; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
321; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
322; MESA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
323; MESA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
324; MESA-NEXT:    ret void
325;
326  store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4
327  ret void
328}
329
330define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
331; HSA-LABEL: @kern_v8i32(
332; HSA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
333; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
334; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
335; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
336; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
337; HSA-NEXT:    ret void
338;
339; MESA-LABEL: @kern_v8i32(
340; MESA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
341; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
342; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
343; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
344; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
345; MESA-NEXT:    ret void
346;
347  store <8 x i32> %arg, <8 x i32> addrspace(1)* undef
348  ret void
349}
350
351define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
352; HSA-LABEL: @kern_v8i64(
353; HSA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(120) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
354; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
355; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
356; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
357; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
358; HSA-NEXT:    ret void
359;
360; MESA-LABEL: @kern_v8i64(
361; MESA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(120) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
362; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
363; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
364; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
365; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
366; MESA-NEXT:    ret void
367;
368  store <8 x i64> %arg, <8 x i64> addrspace(1)* undef
369  ret void
370}
371
372define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
373; HSA-LABEL: @kern_v16i64(
374; HSA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(184) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
375; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
376; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
377; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
378; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
379; HSA-NEXT:    ret void
380;
381; MESA-LABEL: @kern_v16i64(
382; MESA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(184) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
383; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
384; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
385; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
386; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
387; MESA-NEXT:    ret void
388;
389  store <16 x i64> %arg, <16 x i64> addrspace(1)* undef
390  ret void
391}
392
393define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
394; HSA-LABEL: @kern_i32_v3i32(
395; HSA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
396; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
397; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
398; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
399; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16
400; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
401; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
402; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
403; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
404; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
405; HSA-NEXT:    ret void
406;
407; MESA-LABEL: @kern_i32_v3i32(
408; MESA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
409; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
410; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
411; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
412; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52
413; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
414; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
415; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
416; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
417; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
418; MESA-NEXT:    ret void
419;
420  store i32 %arg0, i32 addrspace(1)* undef
421  store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4
422  ret void
423}
424
425%struct.a = type { i32, i8, [4 x i8] }
426%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
427
428define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
429; HSA-LABEL: @kern_struct_a(
430; HSA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
431; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
432; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
433; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
434; HSA-NEXT:    store [[STRUCT_A]] [[ARG0_LOAD]], [[STRUCT_A]] addrspace(1)* undef, align 4
435; HSA-NEXT:    ret void
436;
437; MESA-LABEL: @kern_struct_a(
438; MESA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
439; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
440; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
441; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
442; MESA-NEXT:    store [[STRUCT_A]] [[ARG0_LOAD]], [[STRUCT_A]] addrspace(1)* undef, align 4
443; MESA-NEXT:    ret void
444;
445  store %struct.a %arg0, %struct.a addrspace(1)* undef
446  ret void
447}
448
449define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
450; HSA-LABEL: @kern_struct_b_packed(
451; HSA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
452; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
453; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
454; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
455; HSA-NEXT:    store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
456; HSA-NEXT:    ret void
457;
458; MESA-LABEL: @kern_struct_b_packed(
459; MESA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(88) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
460; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
461; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
462; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
463; MESA-NEXT:    store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
464; MESA-NEXT:    ret void
465;
466  store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
467  ret void
468}
469
470define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
471; HSA-LABEL: @kern_implicit_arg_num_bytes(
472; HSA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
473; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
474; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
475; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
476; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
477; HSA-NEXT:    ret void
478;
479; MESA-LABEL: @kern_implicit_arg_num_bytes(
480; MESA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
481; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
482; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
483; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
484; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
485; MESA-NEXT:    ret void
486;
487  store i32 %arg0, i32 addrspace(1)* undef
488  ret void
489}
490
491define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 {
492; HSA-LABEL: @kernel_implicitarg_no_struct_align(
493; HSA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
494; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
495; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
496; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
497; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
498; HSA-NEXT:    ret void
499;
500; MESA-LABEL: @kernel_implicitarg_no_struct_align(
501; MESA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
502; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
503; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
504; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
505; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
506; MESA-NEXT:    ret void
507;
508  store i32 %arg1, i32 addrspace(1)* undef
509  ret void
510}
511
512define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
513; HSA-LABEL: @kern_lds_ptr(
514; HSA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
515; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
516; HSA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
517; HSA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
518; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
519; HSA-NEXT:    ret void
520;
521; MESA-LABEL: @kern_lds_ptr(
522; MESA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
523; MESA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
524; MESA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
525; MESA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
526; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
527; MESA-NEXT:    ret void
528;
529  store i32 0, i32 addrspace(3)* %lds, align 4
530  ret void
531}
532
533define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
534; HSA-LABEL: @kern_lds_ptr_si(
535; HSA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
536; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
537; HSA-NEXT:    ret void
538;
539; MESA-LABEL: @kern_lds_ptr_si(
540; MESA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
541; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
542; MESA-NEXT:    ret void
543;
544  store i32 0, i32 addrspace(3)* %lds, align 4
545  ret void
546}
547
548define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
549; HSA-LABEL: @kern_realign_i8_i8(
550; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
551; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
552; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
553; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
554; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
555; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
556; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
557; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
558; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
559; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
560; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
561; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
562; HSA-NEXT:    ret void
563;
564; MESA-LABEL: @kern_realign_i8_i8(
565; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
566; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
567; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
568; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
569; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
570; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
571; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
572; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
573; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
574; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
575; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
576; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
577; MESA-NEXT:    ret void
578;
579  store volatile i8 %arg0, i8 addrspace(1)* undef
580  store volatile i8 %arg1, i8 addrspace(1)* undef
581  ret void
582}
583
584define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
585; HSA-LABEL: @kern_realign_i8_i8_i8(
586; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
587; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
588; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
589; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
590; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
591; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
592; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
593; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
594; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
595; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
596; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
597; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
598; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
599; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
600; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
601; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
602; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
603; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
604; HSA-NEXT:    ret void
605;
606; MESA-LABEL: @kern_realign_i8_i8_i8(
607; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
608; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
609; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
610; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
611; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
612; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
613; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
614; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
615; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
616; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
617; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
618; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
619; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
620; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
621; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
622; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
623; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
624; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
625; MESA-NEXT:    ret void
626;
627  store volatile i8 %arg0, i8 addrspace(1)* undef
628  store volatile i8 %arg1, i8 addrspace(1)* undef
629  store volatile i8 %arg2, i8 addrspace(1)* undef
630  ret void
631}
632
633define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
634; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
635; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
636; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
637; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
638; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
639; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
640; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
641; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
642; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
643; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
644; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
645; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
646; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
647; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
648; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
649; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
650; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
651; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
652; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
653; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
654; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
655; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
656; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
657; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
658; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
659; HSA-NEXT:    ret void
660;
661; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
662; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
663; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
664; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
665; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
666; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
667; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
668; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
669; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
670; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
671; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
672; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
673; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
674; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
675; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
676; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
677; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
678; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
679; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
680; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
681; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
682; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
683; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
684; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
685; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
686; MESA-NEXT:    ret void
687;
688  store volatile i8 %arg0, i8 addrspace(1)* undef
689  store volatile i8 %arg1, i8 addrspace(1)* undef
690  store volatile i8 %arg2, i8 addrspace(1)* undef
691  store volatile i8 %arg3, i8 addrspace(1)* undef
692  ret void
693}
694
695define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
696; HSA-LABEL: @kern_realign_i8_v3i8(
697; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
698; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
699; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
700; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
701; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
702; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
703; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
704; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
705; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
706; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
707; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
708; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
709; HSA-NEXT:    ret void
710;
711; MESA-LABEL: @kern_realign_i8_v3i8(
712; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
713; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
714; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
715; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
716; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
717; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40
718; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
719; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
720; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
721; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
722; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
723; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
724; MESA-NEXT:    ret void
725;
726  store volatile i8 %arg0, i8 addrspace(1)* undef
727  store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef
728  ret void
729}
730
731define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
732; HSA-LABEL: @kern_realign_i8_i16(
733; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
734; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
735; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
736; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
737; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
738; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
739; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
740; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
741; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
742; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
743; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
744; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
745; HSA-NEXT:    ret void
746;
747; MESA-LABEL: @kern_realign_i8_i16(
748; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
749; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
750; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
751; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
752; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
753; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
754; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
755; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
756; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
757; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
758; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
759; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
760; MESA-NEXT:    ret void
761;
762  store volatile i8 %arg0, i8 addrspace(1)* undef
763  store volatile i16 %arg1, i16 addrspace(1)* undef
764  ret void
765}
766
767define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
768; HSA-LABEL: @kern_realign_i1_i1(
769; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
770; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
771; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
772; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
773; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
774; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
775; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
776; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
777; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
778; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
779; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
780; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
781; HSA-NEXT:    ret void
782;
783; MESA-LABEL: @kern_realign_i1_i1(
784; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
785; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
786; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
787; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
788; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
789; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
790; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
791; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
792; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
793; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
794; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
795; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
796; MESA-NEXT:    ret void
797;
798  store volatile i1 %arg0, i1 addrspace(1)* undef
799  store volatile i1 %arg1, i1 addrspace(1)* undef
800  ret void
801}
802
803define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
804; HSA-LABEL: @kern_realign_i1_i1_i1(
805; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
806; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
807; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
808; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
809; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
810; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
811; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
812; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
813; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
814; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
815; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
816; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
817; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
818; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
819; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
820; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
821; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
822; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
823; HSA-NEXT:    ret void
824;
825; MESA-LABEL: @kern_realign_i1_i1_i1(
826; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
827; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
828; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
829; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
830; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
831; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
832; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
833; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
834; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
835; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
836; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
837; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
838; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
839; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
840; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
841; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
842; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
843; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
844; MESA-NEXT:    ret void
845;
846  store volatile i1 %arg0, i1 addrspace(1)* undef
847  store volatile i1 %arg1, i1 addrspace(1)* undef
848  store volatile i1 %arg2, i1 addrspace(1)* undef
849  ret void
850}
851
852define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
853; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
854; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
855; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
856; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
857; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
858; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
859; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
860; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
861; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
862; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
863; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
864; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
865; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
866; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
867; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
868; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
869; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
870; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
871; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
872; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
873; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
874; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
875; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
876; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
877; HSA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
878; HSA-NEXT:    ret void
879;
880; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
881; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
882; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
883; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
884; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
885; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
886; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
887; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
888; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
889; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
890; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
891; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
892; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
893; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
894; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
895; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
896; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
897; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
898; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
899; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
900; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
901; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
902; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
903; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
904; MESA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
905; MESA-NEXT:    ret void
906;
907  store volatile i1 %arg0, i1 addrspace(1)* undef
908  store volatile i1 %arg1, i1 addrspace(1)* undef
909  store volatile i1 %arg2, i1 addrspace(1)* undef
910  store volatile i1 %arg3, i1 addrspace(1)* undef
911  ret void
912}
913
914define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
915; HSA-LABEL: @kern_realign_i1_v3i1(
916; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
917; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
918; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
919; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
920; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
921; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
922; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
923; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
924; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
925; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3
926; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1>
927; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
928; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 1
929; HSA-NEXT:    ret void
930;
931; MESA-LABEL: @kern_realign_i1_v3i1(
932; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
933; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
934; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
935; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
936; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
937; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
938; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
939; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
940; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
941; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3
942; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1>
943; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
944; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 1
945; MESA-NEXT:    ret void
946;
947  store volatile i1 %arg0, i1 addrspace(1)* undef
948  store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef
949  ret void
950}
951
952define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
953; HSA-LABEL: @kern_realign_i1_i16(
954; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
955; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
956; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
957; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
958; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
959; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
960; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
961; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
962; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
963; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
964; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
965; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
966; HSA-NEXT:    ret void
967;
968; MESA-LABEL: @kern_realign_i1_i16(
969; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
970; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
971; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
972; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
973; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
974; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
975; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
976; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
977; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
978; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
979; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
980; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
981; MESA-NEXT:    ret void
982;
983  store volatile i1 %arg0, i1 addrspace(1)* undef
984  store volatile i16 %arg1, i16 addrspace(1)* undef
985  ret void
986}
987
988define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
989; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
990; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
991; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
992; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
993; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
994; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
995; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
996; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
997; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
998; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
999; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
1000; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
1001; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1002; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1003; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1004; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
1005; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
1006; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1007; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1008; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
1009; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
1010; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1011; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1012; HSA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1013; HSA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
1014; HSA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
1015; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1016; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1017; HSA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1018; HSA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
1019; HSA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1020; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1021; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1022; HSA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1023; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
1024; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1025; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
1026; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
1027; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
1028; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
1029; HSA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
1030; HSA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
1031; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
1032; HSA-NEXT:    ret void
1033;
1034; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
1035; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1036; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1037; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1038; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1039; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
1040; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1041; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1042; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1043; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
1044; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
1045; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1046; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1047; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1048; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1049; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
1050; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1051; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1052; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1053; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
1054; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
1055; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1056; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1057; MESA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1058; MESA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
1059; MESA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
1060; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1061; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1062; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1063; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
1064; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1065; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1066; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1067; MESA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1068; MESA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
1069; MESA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1070; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
1071; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
1072; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
1073; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
1074; MESA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
1075; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
1076; MESA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
1077; MESA-NEXT:    ret void
1078;
1079  store volatile i8 %arg0, i8 addrspace(1)* undef
1080  store volatile i8 %arg1, i8 addrspace(1)* undef
1081  store volatile i8 %arg2, i8 addrspace(1)* undef
1082  store volatile i8 %arg3, i8 addrspace(1)* undef
1083  store volatile i8 %arg5, i8 addrspace(1)* undef
1084  store volatile i8 %arg6, i8 addrspace(1)* undef
1085  store volatile i8 %arg7, i8 addrspace(1)* undef
1086  ret void
1087}
1088
1089define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
1090; HSA-LABEL: @kern_realign_f16_f16(
1091; HSA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1092; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1093; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1094; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1095; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
1096; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
1097; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1098; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1099; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1100; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
1101; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1102; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1103; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
1104; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
1105; HSA-NEXT:    ret void
1106;
1107; MESA-LABEL: @kern_realign_f16_f16(
1108; MESA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1109; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1110; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1111; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1112; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
1113; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
1114; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1115; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1116; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1117; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
1118; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1119; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1120; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
1121; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
1122; MESA-NEXT:    ret void
1123;
1124  store volatile half %arg0, half addrspace(1)* undef
1125  store volatile half %arg1, half addrspace(1)* undef
1126  ret void
1127}
1128
1129define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
1130; HSA-LABEL: @kern_global_ptr(
1131; HSA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1132; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1133; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1134; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1135; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1136; HSA-NEXT:    ret void
1137;
1138; MESA-LABEL: @kern_global_ptr(
1139; MESA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1140; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1141; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1142; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1143; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1144; MESA-NEXT:    ret void
1145;
1146  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1147  ret void
1148}
1149
1150define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
1151; HSA-LABEL: @kern_global_ptr_dereferencable(
1152; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1153; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
1154; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1155; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1
1156; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1157; HSA-NEXT:    ret void
1158;
1159; MESA-LABEL: @kern_global_ptr_dereferencable(
1160; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1161; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
1162; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1163; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1
1164; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1165; MESA-NEXT:    ret void
1166;
1167  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1168  ret void
1169}
1170
1171define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
1172; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
1173; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1174; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
1175; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1176; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2
1177; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1178; HSA-NEXT:    ret void
1179;
1180; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
1181; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1182; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
1183; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1184; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2
1185; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1186; MESA-NEXT:    ret void
1187;
1188  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1189  ret void
1190}
1191
1192define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
1193; HSA-LABEL: @kern_nonnull_global_ptr(
1194; HSA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1195; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1196; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1197; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0
1198; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1199; HSA-NEXT:    ret void
1200;
1201; MESA-LABEL: @kern_nonnull_global_ptr(
1202; MESA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1203; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1204; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1205; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0
1206; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1207; MESA-NEXT:    ret void
1208;
1209  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1210  ret void
1211}
1212
1213define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
1214; HSA-LABEL: @kern_align32_global_ptr(
1215; HSA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1216; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1217; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1218; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3
1219; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1220; HSA-NEXT:    ret void
1221;
1222; MESA-LABEL: @kern_align32_global_ptr(
1223; MESA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1224; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1225; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1226; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3
1227; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1228; MESA-NEXT:    ret void
1229;
1230  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1231  ret void
1232}
1233
1234define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
1235; HSA-LABEL: @kern_noalias_global_ptr(
1236; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1237; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1238; HSA-NEXT:    ret void
1239;
1240; MESA-LABEL: @kern_noalias_global_ptr(
1241; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1242; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1243; MESA-NEXT:    ret void
1244;
1245  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1246  ret void
1247}
1248
1249define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
1250; HSA-LABEL: @kern_noalias_global_ptr_x2(
1251; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1252; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1253; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1254; HSA-NEXT:    ret void
1255;
1256; MESA-LABEL: @kern_noalias_global_ptr_x2(
1257; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1258; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1259; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1260; MESA-NEXT:    ret void
1261;
1262  store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
1263  store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
1264  ret void
1265}
1266
1267define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
1268; HSA-LABEL: @struct_i8_i8_arg(
1269; HSA-NEXT:  entry:
1270; HSA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1271; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
1272; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
1273; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1274; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1275; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1276; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1277; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1278; HSA-NEXT:    ret void
1279;
1280; MESA-LABEL: @struct_i8_i8_arg(
1281; MESA-NEXT:  entry:
1282; MESA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1283; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
1284; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
1285; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1286; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1287; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1288; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1289; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1290; MESA-NEXT:    ret void
1291;
1292entry:
1293  %elt0 = extractvalue {i8, i8} %in, 0
1294  %elt1 = extractvalue {i8, i8} %in, 1
1295  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1296  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
1297  ret void
1298}
1299
1300define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
1301; HSA-LABEL: @struct_i8_i16_arg(
1302; HSA-NEXT:  entry:
1303; HSA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1304; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
1305; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
1306; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1307; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1308; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1309; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1310; HSA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
1311; HSA-NEXT:    ret void
1312;
1313; MESA-LABEL: @struct_i8_i16_arg(
1314; MESA-NEXT:  entry:
1315; MESA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1316; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
1317; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
1318; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1319; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1320; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1321; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1322; MESA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
1323; MESA-NEXT:    ret void
1324;
1325entry:
1326  %elt0 = extractvalue {i8, i16} %in, 0
1327  %elt1 = extractvalue {i8, i16} %in, 1
1328  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1329  store volatile i16 %elt1, i16 addrspace(1)* null, align 4
1330  ret void
1331}
1332
1333define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
1334; HSA-LABEL: @array_2xi8_arg(
1335; HSA-NEXT:  entry:
1336; HSA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1337; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
1338; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
1339; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1340; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1341; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1342; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1343; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1344; HSA-NEXT:    ret void
1345;
1346; MESA-LABEL: @array_2xi8_arg(
1347; MESA-NEXT:  entry:
1348; MESA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1349; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
1350; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
1351; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1352; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1353; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1354; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1355; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1356; MESA-NEXT:    ret void
1357;
1358entry:
1359  %elt0 = extractvalue [2 x i8] %in, 0
1360  %elt1 = extractvalue [2 x i8] %in, 1
1361  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1362  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
1363  ret void
1364}
1365
1366define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
1367; HSA-LABEL: @array_2xi1_arg(
1368; HSA-NEXT:  entry:
1369; HSA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1370; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
1371; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
1372; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1373; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1374; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1375; HSA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
1376; HSA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
1377; HSA-NEXT:    ret void
1378;
1379; MESA-LABEL: @array_2xi1_arg(
1380; MESA-NEXT:  entry:
1381; MESA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1382; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
1383; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
1384; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1385; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1386; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1387; MESA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
1388; MESA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
1389; MESA-NEXT:    ret void
1390;
1391entry:
1392  %elt0 = extractvalue [2 x i1] %in, 0
1393  %elt1 = extractvalue [2 x i1] %in, 1
1394  store volatile i1 %elt0, i1 addrspace(1)* null, align 4
1395  store volatile i1 %elt1, i1 addrspace(1)* null, align 4
1396  ret void
1397}
1398
1399define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
1400; HSA-LABEL: @only_empty_struct(
1401; HSA-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1402; HSA-NEXT:    ret void
1403;
1404; MESA-LABEL: @only_empty_struct(
1405; MESA-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1406; MESA-NEXT:    ret void
1407;
1408  ret void
1409}
1410
1411define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
1412; HSA-LABEL: @empty_struct_with_other(
1413; HSA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1414; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
1415; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
1416; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1417; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
1418; HSA-NEXT:    ret void
1419;
1420; MESA-LABEL: @empty_struct_with_other(
1421; MESA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1422; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
1423; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
1424; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1425; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
1426; MESA-NEXT:    ret void
1427;
1428  store i32 %arg1, i32 addrspace(1)* undef
1429  ret void
1430}
1431
1432; Should insert code after the allocas
1433define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
1434; HSA-LABEL: @static_alloca_kern_i32(
1435; HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1436; HSA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1437; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
1438; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
1439; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1440; HSA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
1441; HSA-NEXT:    ret void
1442;
1443; MESA-LABEL: @static_alloca_kern_i32(
1444; MESA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1445; MESA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1446; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
1447; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
1448; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1449; MESA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
1450; MESA-NEXT:    ret void
1451;
1452  %alloca = alloca i32, addrspace(5)
1453  store volatile i32 %arg0, i32 addrspace(5)* %alloca
1454  ret void
1455}
1456
1457; Make sure we don't break the IR if an alloca depends on the
1458; kernargs.
1459define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
1460; HSA-LABEL: @dyn_alloca_kernarg_i32(
1461; HSA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1462; HSA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1463; HSA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
1464; HSA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
1465; HSA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1466; HSA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1467; HSA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
1468; HSA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
1469; HSA-NEXT:    ret void
1470;
1471; MESA-LABEL: @dyn_alloca_kernarg_i32(
1472; MESA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1473; MESA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1474; MESA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
1475; MESA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
1476; MESA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1477; MESA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1478; MESA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
1479; MESA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
1480; MESA-NEXT:    ret void
1481;
1482  %alloca0 = alloca i32, addrspace(5)
1483  %alloca1 = alloca i32, i32 %n, addrspace(5)
1484  store volatile i32 0, i32 addrspace(5)* %alloca0
1485  store volatile i32 1, i32 addrspace(5)* %alloca1
1486  ret void
1487}
1488
1489; Byref pointers should only be treated as offsets from kernarg
1490define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
1491; HSA-LABEL: @byref_constant_i8_arg(
1492; HSA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1493; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0
1494; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1495; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1496; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8
1497; HSA-NEXT:    [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1498; HSA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
1499; HSA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1500; HSA-NEXT:    ret void
1501;
1502; MESA-LABEL: @byref_constant_i8_arg(
1503; MESA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1504; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36
1505; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1506; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1507; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44
1508; MESA-NEXT:    [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1509; MESA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
1510; MESA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1511; MESA-NEXT:    ret void
1512;
1513  %in = load i8, i8 addrspace(4)* %in.byref
1514  %ext = zext i8 %in to i32
1515  store i32 %ext, i32 addrspace(1)* %out, align 4
1516  ret void
1517}
1518
1519define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
1520; HSA-LABEL: @byref_constant_i16_arg(
1521; HSA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1522; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0
1523; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1524; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1525; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8
1526; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)*
1527; HSA-NEXT:    [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2
1528; HSA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
1529; HSA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1530; HSA-NEXT:    ret void
1531;
1532; MESA-LABEL: @byref_constant_i16_arg(
1533; MESA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1534; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36
1535; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1536; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1537; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44
1538; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)*
1539; MESA-NEXT:    [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2
1540; MESA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
1541; MESA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1542; MESA-NEXT:    ret void
1543;
1544  %in = load i16, i16 addrspace(4)* %in.byref
1545  %ext = zext i16 %in to i32
1546  store i32 %ext, i32 addrspace(1)* %out, align 4
1547  ret void
1548}
1549
1550define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) {
1551; HSA-LABEL: @byref_constant_i32_arg(
1552; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1553; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1554; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1555; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1556; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1557; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1558; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1559; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1560; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1561; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1562; HSA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1563; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1564; HSA-NEXT:    ret void
1565;
1566; MESA-LABEL: @byref_constant_i32_arg(
1567; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1568; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1569; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1570; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1571; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1572; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1573; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1574; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1575; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1576; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1577; MESA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1578; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1579; MESA-NEXT:    ret void
1580;
1581  %in = load i32, i32 addrspace(4)* %in.byref
1582  store volatile i32 %in, i32 addrspace(1)* %out, align 4
1583  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1584  ret void
1585}
1586
1587define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) {
1588; HSA-LABEL: @byref_constant_v4i32_arg(
1589; HSA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(96) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1590; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0
1591; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
1592; HSA-NEXT:    [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1593; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16
1594; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
1595; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32
1596; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1597; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1598; HSA-NEXT:    [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16
1599; HSA-NEXT:    store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4
1600; HSA-NEXT:    [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)*
1601; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4
1602; HSA-NEXT:    ret void
1603;
1604; MESA-LABEL: @byref_constant_v4i32_arg(
1605; MESA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(92) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1606; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36
1607; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
1608; MESA-NEXT:    [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1609; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52
1610; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
1611; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68
1612; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1613; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1614; MESA-NEXT:    [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16
1615; MESA-NEXT:    store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4
1616; MESA-NEXT:    [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)*
1617; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4
1618; MESA-NEXT:    ret void
1619;
1620  %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref
1621  store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
1622  %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)*
1623  store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4
1624  ret void
1625}
1626
1627define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
1628; HSA-LABEL: @byref_align_constant_i32_arg(
1629; HSA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(320) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1630; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1631; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1632; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1633; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256
1634; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1635; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260
1636; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1637; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1638; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1639; HSA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1640; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1641; HSA-NEXT:    ret void
1642;
1643; MESA-LABEL: @byref_align_constant_i32_arg(
1644; MESA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(320) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1645; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1646; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1647; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1648; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292
1649; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1650; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296
1651; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1652; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 8, !invariant.load !0
1653; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1654; MESA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1655; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1656; MESA-NEXT:    ret void
1657;
1658  %in = load i32, i32 addrspace(4)* %in.byref
1659  store volatile i32 %in, i32 addrspace(1)* %out, align 4
1660  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1661  ret void
1662}
1663
1664define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
1665; HSA-LABEL: @byref_natural_align_constant_v16i32_arg(
1666; HSA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(192) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1667; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0
1668; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1669; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1670; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64
1671; HSA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)*
1672; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128
1673; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1674; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1675; HSA-NEXT:    [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64
1676; HSA-NEXT:    [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)*
1677; HSA-NEXT:    store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4
1678; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1679; HSA-NEXT:    ret void
1680;
1681; MESA-LABEL: @byref_natural_align_constant_v16i32_arg(
1682; MESA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(188) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1683; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36
1684; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1685; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1686; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100
1687; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)*
1688; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164
1689; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1690; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1691; MESA-NEXT:    [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64
1692; MESA-NEXT:    [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)*
1693; MESA-NEXT:    store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4
1694; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1695; MESA-NEXT:    ret void
1696;
1697  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
1698  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
1699  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
1700  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1701  ret void
1702}
1703
1704; Also accept byref kernel arguments with other global address spaces.
1705define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) {
1706; HSA-LABEL: @byref_global_i32_arg(
1707; HSA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1708; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1709; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1710; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1711; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1712; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)*
1713; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
1714; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1715; HSA-NEXT:    ret void
1716;
1717; MESA-LABEL: @byref_global_i32_arg(
1718; MESA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1719; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1720; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1721; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1722; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1723; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)*
1724; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
1725; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1726; MESA-NEXT:    ret void
1727;
1728  %in = load i32, i32 addrspace(1)* %in.byref
1729  store i32 %in, i32 addrspace(1)* %out, align 4
1730  ret void
1731}
1732
1733define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) {
1734; HSA-LABEL: @byref_flat_i32_arg(
1735; HSA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1736; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0
1737; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1738; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1739; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8
1740; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32*
1741; HSA-NEXT:    [[IN:%.*]] = load i32, i32* [[TMP1]], align 4
1742; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1743; HSA-NEXT:    ret void
1744;
1745; MESA-LABEL: @byref_flat_i32_arg(
1746; MESA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1747; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36
1748; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1749; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1750; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44
1751; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32*
1752; MESA-NEXT:    [[IN:%.*]] = load i32, i32* [[TMP1]], align 4
1753; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1754; MESA-NEXT:    ret void
1755;
1756  %in = load i32, i32* %in.byref
1757  store i32 %in, i32 addrspace(1)* %out, align 4
1758  ret void
1759}
1760
1761define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) {
1762; HSA-LABEL: @byref_constant_32bit_i32_arg(
1763; HSA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1764; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0
1765; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1766; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1767; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8
1768; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)*
1769; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4
1770; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1771; HSA-NEXT:    ret void
1772;
1773; MESA-LABEL: @byref_constant_32bit_i32_arg(
1774; MESA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1775; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36
1776; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1777; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1778; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44
1779; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)*
1780; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4
1781; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1782; MESA-NEXT:    ret void
1783;
1784  %in = load i32, i32 addrspace(6)* %in.byref
1785  store i32 %in, i32 addrspace(1)* %out, align 4
1786  ret void
1787}
1788
1789define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) %in.byref) {
1790; HSA-LABEL: @byref_unknown_as_i32_arg(
1791; HSA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1792; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0
1793; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1794; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1795; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8
1796; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)*
1797; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4
1798; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1799; HSA-NEXT:    ret void
1800;
1801; MESA-LABEL: @byref_unknown_as_i32_arg(
1802; MESA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1803; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36
1804; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1805; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1806; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44
1807; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)*
1808; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4
1809; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1810; MESA-NEXT:    ret void
1811;
1812  %in = load i32, i32 addrspace(999)* %in.byref
1813  store i32 %in, i32 addrspace(1)* %out, align 4
1814  ret void
1815}
1816
1817; Invalid, but should not crash.
1818define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) %in.byref) {
1819; HSA-LABEL: @byref_local_i32_arg(
1820; HSA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1821; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1822; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1823; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1824; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1825; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)*
1826; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4
1827; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1828; HSA-NEXT:    ret void
1829;
1830; MESA-LABEL: @byref_local_i32_arg(
1831; MESA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1832; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1833; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1834; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1835; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1836; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)*
1837; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4
1838; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1839; MESA-NEXT:    ret void
1840;
1841  %in = load i32, i32 addrspace(3)* %in.byref
1842  store i32 %in, i32 addrspace(1)* %out, align 4
1843  ret void
1844}
1845
1846define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) {
1847; HSA-LABEL: @multi_byref_constant_i32_arg(
1848; HSA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1849; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1850; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1851; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1852; HSA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1853; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1854; HSA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1855; HSA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1856; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16
1857; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1858; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1859; HSA-NEXT:    [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1860; HSA-NEXT:    [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4
1861; HSA-NEXT:    store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1862; HSA-NEXT:    store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1863; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1864; HSA-NEXT:    ret void
1865;
1866; MESA-LABEL: @multi_byref_constant_i32_arg(
1867; MESA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(76) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1868; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1869; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1870; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1871; MESA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1872; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1873; MESA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1874; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1875; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52
1876; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1877; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1878; MESA-NEXT:    [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1879; MESA-NEXT:    [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4
1880; MESA-NEXT:    store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1881; MESA-NEXT:    store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1882; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1883; MESA-NEXT:    ret void
1884;
1885  %in0 = load i32, i32 addrspace(4)* %in0.byref
1886  %in1 = load i32, i32 addrspace(4)* %in1.byref
1887  store volatile i32 %in0, i32 addrspace(1)* %out, align 4
1888  store volatile i32 %in1, i32 addrspace(1)* %out, align 4
1889  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1890  ret void
1891}
1892
1893define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) {
1894; HSA-LABEL: @byref_constant_i32_arg_offset0(
1895; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1896; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0
1897; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1898; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1899; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* undef, align 4
1900; HSA-NEXT:    ret void
1901;
1902; MESA-LABEL: @byref_constant_i32_arg_offset0(
1903; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(60) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1904; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36
1905; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1906; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1907; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* undef, align 4
1908; MESA-NEXT:    ret void
1909;
1910  %in = load i32, i32 addrspace(4)* %in.byref
1911  store i32 %in, i32 addrspace(1)* undef, align 4
1912  ret void
1913}
1914
1915attributes #0 = { nounwind "target-cpu"="kaveri" }
1916attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
1917attributes #2 = { nounwind "target-cpu"="tahiti" }
1918
1919; GCN: 0 = !{}
1920; GCN: !1 = !{i64 42}
1921; GCN: !2 = !{i64 128}
1922; GCN: !3 = !{i64 1024}
1923