; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s

; A volatile column-major load must be lowered to one volatile load per column;
; the CHECK-NOT guards against any extra non-volatile loads being emitted.
define <9 x double> @strided_load_3x3_volatile(double* %in, i64 %stride) {
; CHECK-LABEL: @strided_load_3x3_volatile(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* %in, i64 [[VEC_START]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
; CHECK-NEXT:    load volatile <3 x double>, <3 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, double* %in, i64 [[VEC_START1]]
; CHECK-NEXT:    [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <3 x double>*
; CHECK-NEXT:    load volatile <3 x double>, <3 x double>* [[VEC_CAST3]], align 8
; CHECK-NEXT:    [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]]
; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* %in, i64 [[VEC_START5]]
; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <3 x double>*
; CHECK-NEXT:    load volatile <3 x double>, <3 x double>* [[VEC_CAST7]], align 8
; CHECK-NOT:     = load
;
entry:
  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64(double* %in, i64 %stride, i1 true, i32 3, i32 3)
  ret <9 x double> %load
}

declare <9 x double> @llvm.matrix.column.major.load.v9f64(double*, i64, i1, i32, i32)
; A volatile vector load feeding a matrix multiply must be split into volatile
; per-column loads (two <2 x double> loads here), preserving volatility.
define <4 x double> @load_volatile_multiply(<4 x double>* %in) {
; CHECK-LABEL: @load_volatile_multiply(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x double>* [[IN:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
; CHECK-NEXT:    load volatile <2 x double>, <2 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i64 2
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
; CHECK-NEXT:    load volatile <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
; CHECK-NOT:     = load
;
  %in.m = load volatile <4 x double>, <4 x double>* %in, align 8
  %res = call <4 x double> @llvm.matrix.multiply(<4 x double> %in.m, <4 x double> %in.m, i32 2, i32 2, i32 2)
  ret <4 x double> %res
}

declare <4 x double> @llvm.matrix.multiply(<4 x double>, <4 x double>, i32, i32, i32)


; An `align 32` attribute on the loaded pointer applies to the first column;
; subsequent columns fall back to the element alignment (align 8), since the
; stride is a runtime value and cannot be proven to keep 32-byte alignment.
define <9 x double> @strided_load_3x3_align32(double* %in, i64 %stride) {
; CHECK-LABEL: @strided_load_3x3_align32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* %in, i64 [[VEC_START]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
; CHECK-NEXT:    load <3 x double>, <3 x double>* [[VEC_CAST]], align 32
; CHECK-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, double* %in, i64 [[VEC_START1]]
; CHECK-NEXT:    [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <3 x double>*
; CHECK-NEXT:    load <3 x double>, <3 x double>* [[VEC_CAST3]], align 8
; CHECK-NEXT:    [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]]
; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* %in, i64 [[VEC_START5]]
; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <3 x double>*
; CHECK-NEXT:    load <3 x double>, <3 x double>* [[VEC_CAST7]], align 8
; CHECK-NOT:     = load
;
entry:
  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64(double* align 32 %in, i64 %stride, i1 false, i32 3, i32 3)
  ret <9 x double> %load
}
; An under-aligned pointer (`align 2`, below the element's natural alignment)
; must propagate to every lowered column load.
define <9 x double> @strided_load_3x3_align2(double* %in, i64 %stride) {
; CHECK-LABEL: @strided_load_3x3_align2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* %in, i64 [[VEC_START]]
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[VEC_GEP]] to <3 x double>*
; CHECK-NEXT:    load <3 x double>, <3 x double>* [[VEC_CAST]], align 2
; CHECK-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, double* %in, i64 [[VEC_START1]]
; CHECK-NEXT:    [[VEC_CAST3:%.*]] = bitcast double* [[VEC_GEP2]] to <3 x double>*
; CHECK-NEXT:    load <3 x double>, <3 x double>* [[VEC_CAST3]], align 2
; CHECK-NEXT:    [[VEC_START5:%.*]] = mul i64 2, [[STRIDE]]
; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr double, double* %in, i64 [[VEC_START5]]
; CHECK-NEXT:    [[VEC_CAST7:%.*]] = bitcast double* [[VEC_GEP6]] to <3 x double>*
; CHECK-NEXT:    load <3 x double>, <3 x double>* [[VEC_CAST7]], align 2
; CHECK-NOT:     = load
;
entry:
  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64(double* align 2 %in, i64 %stride, i1 false, i32 3, i32 3)
  ret <9 x double> %load
}


; An under-aligned (`align 2`) vector load feeding a multiply is split into
; per-column loads that each keep the original alignment.
define <4 x double> @load_align2_multiply(<4 x double>* %in) {
; CHECK-LABEL: @load_align2_multiply(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x double>* [[IN:%.*]] to double*
; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
; CHECK-NEXT:    load <2 x double>, <2 x double>* [[VEC_CAST]], align 2
; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, double* [[TMP1]], i64 2
; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
; CHECK-NEXT:    load <2 x double>, <2 x double>* [[VEC_CAST1]], align 2
; CHECK-NOT:     = load
;
  %in.m = load <4 x double>, <4 x double>* %in, align 2
  %res = call <4 x double> @llvm.matrix.multiply(<4 x double> %in.m, <4 x double> %in.m, i32 2, i32 2, i32 2)
  ret <4 x double> %res
}
105define <6 x float> @strided_load_2x3_align16_stride2(float* %in) {
106; CHECK-LABEL: @strided_load_2x3_align16_stride2(
107; CHECK-NEXT:  entry:
108; CHECK-NEXT:    [[VEC_CAST:%.*]] = bitcast float* %in to <2 x float>*
109; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST]], align 16
110; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, float* %in, i64 2
111; CHECK-NEXT:    [[VEC_CAST1:%.*]] = bitcast float* [[VEC_GEP]] to <2 x float>*
112; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST1]], align 8
113; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, float* %in, i64 4
114; CHECK-NEXT:    [[VEC_CAST4:%.*]] = bitcast float* [[VEC_GEP3]] to <2 x float>*
115; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, <2 x float>* [[VEC_CAST4]], align 16
116; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
117; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[COL_LOAD5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
118; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
119; CHECK-NEXT:    ret <6 x float> [[TMP3]]
120;
121entry:
122  %load = call <6 x float> @llvm.matrix.column.major.load.v6f32(float* align 16 %in, i64 2, i1 false, i32 2, i32 3)
123  ret <6 x float> %load
124}
125
126declare <6 x float> @llvm.matrix.column.major.load.v6f32(float*, i64, i1, i32, i32)