; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s

; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [32 x i8] }
; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i16, [2 x i8], i16 }
; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i32 addrspace(3)*] }

; SUPER-ALIGN_ON: @lds.unused = addrspace(3) global i32 undef, align 4
; SUPER-ALIGN_OFF: @lds.unused = addrspace(3) global i32 undef, align 2
@lds.unused = addrspace(3) global i32 undef, align 2

; CHECK-NOT: @lds.1
@lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1

; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1

; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8

; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 4

; CHECK-LABEL: @k1
; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
; CHECK: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
; CHECK: store i8 1, i8* %ptr, align 1
define amdgpu_kernel void @k1(i64 %x) {
  %ptr = getelementptr inbounds i8, i8* addrspacecast ([32 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
  store i8 1, i8 addrspace(0)* %ptr, align 1
  ret void
}

@lds.2 = internal unnamed_addr addrspace(3) global i16 undef, align 4
@lds.3 = internal unnamed_addr addrspace(3) global i16 undef, align 4

; Check that alignment is propagated to uses for scalar variables.
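; Since @lds.2 and @lds.3 are both declared align 4, they are laid out in
; %llvm.amdgcn.kernel.k2.lds.t as { i16, [2 x i8], i16 }, and the align 2
; stores in @k2 are checked to be rewritten with align 4.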

; CHECK-LABEL: @k2
; CHECK: store i16 1, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0), align 4
; CHECK: store i16 2, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 2), align 4
define amdgpu_kernel void @k2() {
  store i16 1, i16 addrspace(3)* @lds.2, align 2
  store i16 2, i16 addrspace(3)* @lds.3, align 2
  ret void
}

@lds.4 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
@lds.5 = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4

; Check that alignment is propagated to uses for arrays.

; CHECK-LABEL: @k3
; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
; CHECK: store i32 4, i32 addrspace(3)* %ptr4, align 4
; CHECK: store i32 5, i32 addrspace(3)* %ptr5, align 4
; CHECK: %load1 = load i32, i32 addrspace(3)* %ptr1, align 8
; CHECK: %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
; SUPER-ALIGN_ON: %load3 = load i32, i32 addrspace(3)* %ptr3, align 16
; SUPER-ALIGN_OFF: %load3 = load i32, i32 addrspace(3)* %ptr3, align 8
; CHECK: %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
; CHECK: %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
; CHECK: %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 8
; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 8
; CHECK: %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
; CHECK: %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
; CHECK: %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
; CHECK: %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
; CHECK: store i16 11, i16 addrspace(3)* %ptr1.bc, align 8
; CHECK: store i16 12, i16 addrspace(3)* %ptr2.bc, align 4
; SUPER-ALIGN_ON: store i16 13, i16 addrspace(3)* %ptr3.bc, align 16
; SUPER-ALIGN_OFF: store i16 13, i16 addrspace(3)* %ptr3.bc, align 8
; CHECK: store i16 14, i16 addrspace(3)* %ptr4.bc, align 4
; CHECK: %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
; CHECK: %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
; CHECK: %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
; CHECK: %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
; CHECK: store i32 21, i32* %ptr1.ac, align 8
; CHECK: store i32 22, i32* %ptr2.ac, align 4
; SUPER-ALIGN_ON: store i32 23, i32* %ptr3.ac, align 16
; SUPER-ALIGN_OFF: store i32 23, i32* %ptr3.ac, align 8
; CHECK: store i32 24, i32* %ptr4.ac, align 4
define amdgpu_kernel void @k3(i64 %x) {
  %ptr0 = getelementptr inbounds i64, i64 addrspace(3)* bitcast ([32 x i64] addrspace(3)* @lds.4 to i64 addrspace(3)*), i64 0
  store i64 0, i64 addrspace(3)* %ptr0, align 8

  %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 2
  %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 3
  %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 4
  %ptr4 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 5
  %ptr5 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 %x

  store i32 1, i32 addrspace(3)* %ptr1, align 4
  store i32 2, i32 addrspace(3)* %ptr2, align 4
  store i32 3, i32 addrspace(3)* %ptr3, align 4
  store i32 4, i32 addrspace(3)* %ptr4, align 4
  store i32 5, i32 addrspace(3)* %ptr5, align 4

  %load1 = load i32, i32 addrspace(3)* %ptr1, align 4
  %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
  %load3 = load i32, i32 addrspace(3)* %ptr3, align 4
  %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
  %load5 = load i32, i32 addrspace(3)* %ptr5, align 4

  %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 4
  %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 4

  %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
  %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
  %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
  %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*

  store i16 11, i16 addrspace(3)* %ptr1.bc, align 2
  store i16 12, i16 addrspace(3)* %ptr2.bc, align 2
  store i16 13, i16 addrspace(3)* %ptr3.bc, align 2
  store i16 14, i16 addrspace(3)* %ptr4.bc, align 2

  %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
  %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
  %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
  %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*

  store i32 21, i32* %ptr1.ac, align 4
  store i32 22, i32* %ptr2.ac, align 4
  store i32 23, i32* %ptr3.ac, align 4
  store i32 24, i32* %ptr4.ac, align 4

  ret void
}

@lds.6 = internal unnamed_addr addrspace(3) global [2 x i32 addrspace(3)*] undef, align 4

; Check that alignment is not propagated if the use is not a pointer operand.

; CHECK-LABEL: @k4
; SUPER-ALIGN_ON: store i32 undef, i32 addrspace(3)* %ptr, align 8
; SUPER-ALIGN_OFF: store i32 undef, i32 addrspace(3)* %ptr, align 4
; CHECK: store i32 addrspace(3)* %ptr, i32 addrspace(3)** undef, align 4
; SUPER-ALIGN_ON: %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 8
; SUPER-ALIGN_OFF: %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 4
; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)** undef, i32 addrspace(3)* %ptr, i32 addrspace(3)* undef monotonic monotonic, align 4
define amdgpu_kernel void @k4() {
  %gep = getelementptr inbounds i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* bitcast ([2 x i32 addrspace(3)*] addrspace(3)* @lds.6 to i32 addrspace(3)* addrspace(3)*), i64 1
  %ptr = bitcast i32 addrspace(3)* addrspace(3)* %gep to i32 addrspace(3)*
  store i32 undef, i32 addrspace(3)* %ptr, align 4
  store i32 addrspace(3)* %ptr, i32 addrspace(3)** undef, align 4
  %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 4
  %val2 = cmpxchg volatile i32 addrspace(3)** undef, i32 addrspace(3)* %ptr, i32 addrspace(3)* undef monotonic monotonic, align 4
  ret void
}