; Regression test for private (addrspace 5) [4 x i32] allocas that are indexed
; with constant and dynamic offsets. The llc invocations only verify that code
; generation succeeds and, for R600/redwood, that the expected instruction
; mnemonics appear; the opt invocations verify the IR produced by the
; promote-alloca + sroa + instcombine pipeline (legacy and new pass manager).
;
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s

; "A5" makes address space 5 the alloca address space, matching the
; addrspace(5) allocas used by every function below.
target datalayout = "A5"

; Fill a 4-element array with the constants 0..3, then read the element
; selected by %index. The opt pipeline is expected to reduce this to a single
; extractelement from a constant vector.

; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  store i32 0, i32 addrspace(5)* %x
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; Zero the array, write 1 at the dynamic index %w_index, then read back the
; element at the dynamic index %r_index. Expected IR is an insertelement
; followed by an extractelement.

; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
; OPT: store i32 %1, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  store i32 0, i32 addrspace(5)* %x
  store i32 0, i32 addrspace(5)* %y
  store i32 0, i32 addrspace(5)* %z
  store i32 0, i32 addrspace(5)* %w
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
  store i32 1, i32 addrspace(5)* %tmp1
  %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
  %tmp3 = load i32, i32 addrspace(5)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; This test should be optimized to:
; store i32 0, i32 addrspace(1)* %out

; The store is checked with a plain directive rather than a label directive so
; that the match is confined to the body of @bitcast_gep (label directives are
; reserved for unique block-delimiting identifiers such as function names).

; OPT-LABEL: @bitcast_gep(
; OPT: store i32 0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  store i32 0, i32 addrspace(5)* %x
  store i32 0, i32 addrspace(5)* %y
  store i32 0, i32 addrspace(5)* %z
  store i32 0, i32 addrspace(5)* %w
  ; Address of element 1, reinterpreted as the start of a [4 x i32] array and
  ; then indexed at 0, i.e. still element 1 of the original alloca.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
  %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
  %tmp4 = load i32, i32 addrspace(5)* %tmp3
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

; Element 0 is written through a float-typed bitcast of its address. The
; expected constant 1065353216 is the i32 bit pattern of float 1.0
; (0x3F800000).

; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
  store float 1.0, float addrspace(5)* %bc
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; FIXME: Should be able to promote this. Instcombine should fold the
; cast in the hasOneUse case so it might not matter in practice

; Here the whole alloca is bitcast to [4 x float] before any access. The
; checks below record the current, unpromoted output: the alloca, four float
; stores and the float load all remain.

; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: alloca [4 x float]
; OPT: store float
; OPT: store float
; OPT: store float
; OPT: store float
; OPT: load float
define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
  %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
  %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
  %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
  %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
  store float 0.0, float addrspace(5)* %x
  store float 1.0, float addrspace(5)* %y
  store float 2.0, float addrspace(5)* %z
  store float 4.0, float addrspace(5)* %w
  %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
  %tmp2 = load float, float addrspace(5)* %tmp1
  store float %tmp2, float addrspace(1)* %out
  ret void
}

; The pointer arguments in local address space should not affect promotion to vector.
; The kernel takes an extra, unused addrspace(3) pointer argument (%stopper).
; The array is filled with 0..3 and read at the dynamic index %index; the
; expected IR is one extractelement from a constant vector.

; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
entry:
  ; Four-element scratch array in the alloca address space.
  %array = alloca [4 x i32], addrspace(5)

  ; Addresses of the four constant-indexed elements.
  %elt0.ptr = getelementptr [4 x i32], [4 x i32] addrspace(5)* %array, i32 0, i32 0
  %elt1.ptr = getelementptr [4 x i32], [4 x i32] addrspace(5)* %array, i32 0, i32 1
  %elt2.ptr = getelementptr [4 x i32], [4 x i32] addrspace(5)* %array, i32 0, i32 2
  %elt3.ptr = getelementptr [4 x i32], [4 x i32] addrspace(5)* %array, i32 0, i32 3

  ; Initialize element i with the value i.
  store i32 0, i32 addrspace(5)* %elt0.ptr
  store i32 1, i32 addrspace(5)* %elt1.ptr
  store i32 2, i32 addrspace(5)* %elt2.ptr
  store i32 3, i32 addrspace(5)* %elt3.ptr

  ; Dynamically indexed read, forwarded to the output buffer.
  %sel.ptr = getelementptr [4 x i32], [4 x i32] addrspace(5)* %array, i32 0, i32 %index
  %sel.val = load i32, i32 addrspace(5)* %sel.ptr
  store i32 %sel.val, i32 addrspace(1)* %out
  ret void
}