1// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure
2
3// 2-d parallel loop mapped to block.y and block.x
4
// Test input: a single 2-d scf.parallel whose two dimensions both carry a
// hardware mapping. Lower bounds are (%arg0, %arg1), upper bounds are
// (%arg2, %arg3), and the steps are (%arg4, constant 2).
5func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
6                              %arg3 : index, %arg4 : index,
7                              %buf : memref<?x?xf32>,
8                              %res : memref<?x?xf32>) {
9  %step = constant 2 : index
10  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
11                                          step (%arg4, %step)  {
    // Copy one element from %buf to %res with the two indices swapped.
12    %val = load %buf[%i0, %i1] : memref<?x?xf32>
13    store %val, %res[%i1, %i0] : memref<?x?xf32>
  // One mapping entry per loop dimension, in order: %i0 uses processor 1 and
  // %i1 uses processor 0 (block.y and block.x per the test description above).
  // Both `map` and `bound` are identity maps.
14  } { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}] }
15  return
16}
17
18// CHECK:       #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> ((s0 - s1) ceildiv s2)>
19// CHECK:       #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
20
21// CHECK:       module {
22// CHECK-LABEL:   func @parallel_loop_bidy_bidx(
23// CHECK-SAME:                                  [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref<?x?xf32>, [[VAL_6:%.*]]: memref<?x?xf32>) {
24// CHECK:           [[VAL_7:%.*]] = constant 2 : index
25// CHECK:           [[VAL_8:%.*]] = constant 1 : index
26// CHECK:           [[VAL_9:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_2]], [[VAL_0]], [[VAL_4]]]
27// CHECK:           [[VAL_10:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_3]], [[VAL_1]], [[VAL_7]]]
28// CHECK:           gpu.launch blocks([[VAL_11:%.*]], [[VAL_12:%.*]], [[VAL_13:%.*]]) in ([[VAL_14:%.*]] = [[VAL_10]], [[VAL_15:%.*]] = [[VAL_9]], [[VAL_16:%.*]] = [[VAL_8]]) threads([[VAL_17:%.*]], [[VAL_18:%.*]], [[VAL_19:%.*]]) in ([[VAL_20:%.*]] = [[VAL_8]], [[VAL_21:%.*]] = [[VAL_8]], [[VAL_22:%.*]] = [[VAL_8]]) {
29// CHECK:             [[VAL_23:%.*]] = affine.apply #[[MAP1]]([[VAL_12]]){{\[}}[[VAL_4]], [[VAL_0]]]
30// CHECK:             [[VAL_24:%.*]] = affine.apply #[[MAP1]]([[VAL_11]]){{\[}}[[VAL_7]], [[VAL_1]]]
31// CHECK:             [[VAL_25:%.*]] = load [[VAL_5]]{{\[}}[[VAL_23]], [[VAL_24]]] : memref<?x?xf32>
32// CHECK:             store [[VAL_25]], [[VAL_6]]{{\[}}[[VAL_24]], [[VAL_23]]] : memref<?x?xf32>
33// CHECK:             gpu.terminator
34// CHECK:           }
35// CHECK:           return
36// CHECK:         }
37// CHECK:       }
38
39// -----
40
41// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x.
42
// Test input: two nested 2-d scf.parallel loops. The outer loop walks
// (%arg0, %arg1) to (%arg2, %arg3) in steps of 4; the inner loop walks the
// fixed range 0 to 4 in steps of 1, and the body adds the outer and inner
// induction variables to form the memory indices.
43func @parallel_loop_tiled(%arg0 : index, %arg1 : index, %arg2 : index,
44                        %arg3 : index,
45                        %buf : memref<?x?xf32>,
46                        %res : memref<?x?xf32>) {
47  %zero = constant 0 : index
48  %one = constant 1 : index
49  %four = constant 4 : index
50  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
51                                          step (%four, %four)  {
52    scf.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
53                                            step (%one, %one)  {
      // Combine outer and inner induction variables into the element index.
54      %idx0 = addi %i0, %si0 : index
55      %idx1 = addi %i1, %si1 : index
      // Copy one element from %buf to %res with the two indices swapped.
56      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
57      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
    // Inner loop mapping: %si0 uses processor 4 and %si1 uses processor 3
    // (thread.y and thread.x per the test description above).
58    } { mapping = [
59        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
60        {processor = 3, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
61     ] }
  // Outer loop mapping: %i0 uses processor 1 and %i1 uses processor 0
  // (block.y and block.x per the test description above).
62  } { mapping = [
63      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
64      {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
65    ] }
66  return
67}
68
69// CHECK:       #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> ((s0 - s1) ceildiv s2)>
70// CHECK:       #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
71
72// CHECK:       module {
73// CHECK-LABEL:   func @parallel_loop_tiled(
74// CHECK-SAME:                              [[VAL_26:%.*]]: index, [[VAL_27:%.*]]: index, [[VAL_28:%.*]]: index, [[VAL_29:%.*]]: index, [[VAL_30:%.*]]: memref<?x?xf32>, [[VAL_31:%.*]]: memref<?x?xf32>) {
75// CHECK:           [[VAL_32:%.*]] = constant 0 : index
76// CHECK:           [[VAL_33:%.*]] = constant 1 : index
77// CHECK:           [[VAL_34:%.*]] = constant 4 : index
78// CHECK:           [[VAL_35:%.*]] = constant 1 : index
79// CHECK:           [[VAL_36:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_28]], [[VAL_26]], [[VAL_34]]]
80// CHECK:           [[VAL_37:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_29]], [[VAL_27]], [[VAL_34]]]
81// CHECK:           [[VAL_38:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_34]], [[VAL_32]], [[VAL_33]]]
82// CHECK:           [[VAL_39:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_34]], [[VAL_32]], [[VAL_33]]]
83// CHECK:           gpu.launch blocks([[VAL_40:%.*]], [[VAL_41:%.*]], [[VAL_42:%.*]]) in ([[VAL_43:%.*]] = [[VAL_37]], [[VAL_44:%.*]] = [[VAL_36]], [[VAL_45:%.*]] = [[VAL_35]]) threads([[VAL_46:%.*]], [[VAL_47:%.*]], [[VAL_48:%.*]]) in ([[VAL_49:%.*]] = [[VAL_39]], [[VAL_50:%.*]] = [[VAL_38]], [[VAL_51:%.*]] = [[VAL_35]]) {
84// CHECK:             [[VAL_52:%.*]] = affine.apply #[[MAP1]]([[VAL_41]]){{\[}}[[VAL_34]], [[VAL_26]]]
85// CHECK:             [[VAL_53:%.*]] = affine.apply #[[MAP1]]([[VAL_40]]){{\[}}[[VAL_34]], [[VAL_27]]]
86// CHECK:             [[VAL_54:%.*]] = affine.apply #[[MAP1]]([[VAL_47]]){{\[}}[[VAL_33]], [[VAL_32]]]
87// CHECK:             [[VAL_55:%.*]] = affine.apply #[[MAP1]]([[VAL_46]]){{\[}}[[VAL_33]], [[VAL_32]]]
88// CHECK:             [[VAL_56:%.*]] = addi [[VAL_52]], [[VAL_54]] : index
89// CHECK:             [[VAL_57:%.*]] = addi [[VAL_53]], [[VAL_55]] : index
90// CHECK:             [[VAL_58:%.*]] = load [[VAL_30]]{{\[}}[[VAL_56]], [[VAL_57]]] : memref<?x?xf32>
91// CHECK:             store [[VAL_58]], [[VAL_31]]{{\[}}[[VAL_57]], [[VAL_56]]] : memref<?x?xf32>
92// CHECK:             gpu.terminator
93// CHECK:           }
94// CHECK:           return
95// CHECK:         }
96// CHECK:       }
97
98// -----
99
100// 2-d parallel loop mapped to block.y and sequential
101
// Test input: a single 2-d scf.parallel in which only the first dimension is
// mapped to a hardware id; the second dimension is marked sequential. The
// FileCheck expectations that follow this function match an scf.for over
// (%arg1, %arg3, step 2) inside the launch for the second dimension.
102func @parallel_loop_bidy_seq(%arg0 : index, %arg1 : index, %arg2 : index,
103                             %arg3 : index, %arg4 : index,
104                             %buf : memref<?x?xf32>,
105                             %res : memref<?x?xf32>) {
106  %step = constant 2 : index
107  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
108                                          step (%arg4, %step)  {
    // Copy one element from %buf to %res with the two indices swapped.
109    %val = load %buf[%i0, %i1] : memref<?x?xf32>
110    store %val, %res[%i1, %i0] : memref<?x?xf32>
  // %i0 uses processor 1 (block.y) and %i1 uses processor 6 (sequential),
  // following the test description above.
111  } { mapping = [
112      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
113      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
114    ] }
115  return
116}
117
118// CHECK:       #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> ((s0 - s1) ceildiv s2)>
119// CHECK:       #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
120
121// CHECK:       module {
122// CHECK-LABEL:   func @parallel_loop_bidy_seq(
123// CHECK-SAME:                                 [[VAL_59:%.*]]: index, [[VAL_60:%.*]]: index, [[VAL_61:%.*]]: index, [[VAL_62:%.*]]: index, [[VAL_63:%.*]]: index, [[VAL_64:%.*]]: memref<?x?xf32>, [[VAL_65:%.*]]: memref<?x?xf32>) {
124// CHECK:           [[VAL_66:%.*]] = constant 2 : index
125// CHECK:           [[VAL_67:%.*]] = constant 1 : index
126// CHECK:           [[VAL_68:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_61]], [[VAL_59]], [[VAL_63]]]
127// CHECK:           gpu.launch blocks([[VAL_69:%.*]], [[VAL_70:%.*]], [[VAL_71:%.*]]) in ([[VAL_72:%.*]] = [[VAL_67]], [[VAL_73:%.*]] = [[VAL_68]], [[VAL_74:%.*]] = [[VAL_67]]) threads([[VAL_75:%.*]], [[VAL_76:%.*]], [[VAL_77:%.*]]) in ([[VAL_78:%.*]] = [[VAL_67]], [[VAL_79:%.*]] = [[VAL_67]], [[VAL_80:%.*]] = [[VAL_67]]) {
128// CHECK:             [[VAL_81:%.*]] = affine.apply #[[MAP1]]([[VAL_70]]){{\[}}[[VAL_63]], [[VAL_59]]]
129// CHECK:             scf.for [[VAL_82:%.*]] = [[VAL_60]] to [[VAL_62]] step [[VAL_66]] {
130// CHECK:               [[VAL_83:%.*]] = load [[VAL_64]]{{\[}}[[VAL_81]], [[VAL_82]]] : memref<?x?xf32>
131// CHECK:               store [[VAL_83]], [[VAL_65]]{{\[}}[[VAL_82]], [[VAL_81]]] : memref<?x?xf32>
132// CHECK:             }
133// CHECK:             gpu.terminator
134// CHECK:           }
135// CHECK:           return
136// CHECK:         }
137// CHECK:       }
138
139// -----
140
141// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq.
142
// Test input: two nested 2-d scf.parallel loops where, at each nesting level,
// the first dimension is mapped to a hardware id and the second dimension is
// marked sequential. Loop ranges match @parallel_loop_tiled: outer step 4,
// inner range 0 to 4 with step 1.
143func @parallel_loop_tiled_seq(%arg0 : index, %arg1 : index, %arg2 : index,
144                              %arg3 : index,
145                              %buf : memref<?x?xf32>,
146                              %res : memref<?x?xf32>) {
147  %zero = constant 0 : index
148  %one = constant 1 : index
149  %four = constant 4 : index
150  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
151                                          step (%four, %four)  {
152    scf.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four)
153                                            step (%one, %one)  {
      // Combine outer and inner induction variables into the element index.
154      %idx0 = addi %i0, %si0 : index
155      %idx1 = addi %i1, %si1 : index
      // Copy one element from %buf to %res with the two indices swapped.
156      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
157      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
    // Inner loop mapping: %si0 uses processor 4 (thread.y) and %si1 uses
    // processor 6 (sequential), following the test description above.
158    } { mapping = [
159        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
160        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
161      ] }
  // Outer loop mapping: %i0 uses processor 1 (block.y) and %i1 uses
  // processor 6 (sequential), following the test description above.
162  } { mapping = [
163      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
164      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
165    ] }
166  return
167}
168
169// CHECK:       #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> ((s0 - s1) ceildiv s2)>
170// CHECK:       #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
171
172// CHECK:       module {
173// CHECK-LABEL:   func @parallel_loop_tiled_seq(
174// CHECK-SAME:                                  [[VAL_84:%.*]]: index, [[VAL_85:%.*]]: index, [[VAL_86:%.*]]: index, [[VAL_87:%.*]]: index, [[VAL_88:%.*]]: memref<?x?xf32>, [[VAL_89:%.*]]: memref<?x?xf32>) {
175// CHECK:           [[VAL_90:%.*]] = constant 0 : index
176// CHECK:           [[VAL_91:%.*]] = constant 1 : index
177// CHECK:           [[VAL_92:%.*]] = constant 4 : index
178// CHECK:           [[VAL_93:%.*]] = constant 1 : index
179// CHECK:           [[VAL_94:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_86]], [[VAL_84]], [[VAL_92]]]
180// CHECK:           [[VAL_95:%.*]] = affine.apply #[[MAP0]](){{\[}}[[VAL_92]], [[VAL_90]], [[VAL_91]]]
181// CHECK:           gpu.launch blocks([[VAL_96:%.*]], [[VAL_97:%.*]], [[VAL_98:%.*]]) in ([[VAL_99:%.*]] = [[VAL_93]], [[VAL_100:%.*]] = [[VAL_94]], [[VAL_101:%.*]] = [[VAL_93]]) threads([[VAL_102:%.*]], [[VAL_103:%.*]], [[VAL_104:%.*]]) in ([[VAL_105:%.*]] = [[VAL_93]], [[VAL_106:%.*]] = [[VAL_95]], [[VAL_107:%.*]] = [[VAL_93]]) {
182// CHECK:             [[VAL_108:%.*]] = affine.apply #[[MAP1]]([[VAL_97]]){{\[}}[[VAL_92]], [[VAL_84]]]
183// CHECK:             scf.for [[VAL_109:%.*]] = [[VAL_85]] to [[VAL_87]] step [[VAL_92]] {
184// CHECK:               [[VAL_110:%.*]] = affine.apply #[[MAP1]]([[VAL_103]]){{\[}}[[VAL_91]], [[VAL_90]]]
185// CHECK:               scf.for [[VAL_111:%.*]] = [[VAL_90]] to [[VAL_92]] step [[VAL_91]] {
186// CHECK:                 [[VAL_112:%.*]] = addi [[VAL_108]], [[VAL_110]] : index
187// CHECK:                 [[VAL_113:%.*]] = addi [[VAL_109]], [[VAL_111]] : index
188// CHECK:                 [[VAL_114:%.*]] = load [[VAL_88]]{{\[}}[[VAL_112]], [[VAL_113]]] : memref<?x?xf32>
189// CHECK:                 store [[VAL_114]], [[VAL_89]]{{\[}}[[VAL_113]], [[VAL_112]]] : memref<?x?xf32>
190// CHECK:               }
191// CHECK:             }
192// CHECK:             gpu.terminator
193// CHECK:           }
194// CHECK:           return
195// CHECK:         }
196// CHECK:       }
197
198// -----
199
// Affine maps shared by the @sum test case:
//   #map0 - layout map on the three memref arguments.
//   #map1 - operand of affine.min for the first dimension: results 2 and s0 - d0.
//   #map2 - operand of affine.min for the second dimension: results 3 and s0 - d0.
//   #map3 - layout map on the memrefs produced by std.subview.
200#map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
201#map1 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
202#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
203#map3 = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
204
// Test input: nested scf.parallel loops whose inner upper bounds are values
// computed inside the outer loop body (affine.min results and a product of
// one), rather than constants or function arguments.
205module {
206  func @sum(%arg0: memref<?x?xf32, #map0>, %arg1: memref<?x?xf32, #map0>, %arg2: memref<?x?xf32, #map0>) {
207    %c1 = constant 1 : index
208    %c0 = constant 0 : index
209    %c3 = constant 3 : index
210    %c2 = constant 2 : index
211    %0 = dim %arg0, 0 : memref<?x?xf32, #map0>
212    %1 = dim %arg0, 1 : memref<?x?xf32, #map0>
    // Outer loop: from (0, 0) to the dimensions of %arg0 with steps (2, 3).
213    scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%0, %1) step (%c2, %c3) {
214      %2 = dim %arg0, 0 : memref<?x?xf32, #map0>
215      %3 = affine.min #map1(%arg3)[%2]
      // Product of the affine.min result with itself. It is used below as the
      // first subview size for %arg0 and as the first upper bound of the inner
      // loop. The expectations after this module use a constant 4 as the
      // launch bound for this dimension.
      // NOTE(review): presumably 4 is derived from the constant 2 in #map1
      // (2 * 2) - confirm against the pass implementation.
216      %squared_min = muli %3, %3 : index
217      %4 = dim %arg0, 1 : memref<?x?xf32, #map0>
218      %5 = affine.min #map2(%arg4)[%4]
219      %6 = std.subview %arg0[%arg3, %arg4][%squared_min, %5][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
      // Subviews of %arg1 and %arg2 at the same offsets; their sizes come from
      // affine.min results on their own dimensions.
220      %7 = dim %arg1, 0 : memref<?x?xf32, #map0>
221      %8 = affine.min #map1(%arg3)[%7]
222      %9 = dim %arg1, 1 : memref<?x?xf32, #map0>
223      %10 = affine.min #map2(%arg4)[%9]
224      %11 = std.subview %arg1[%arg3, %arg4][%8, %10][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
225      %12 = dim %arg2, 0 : memref<?x?xf32, #map0>
226      %13 = affine.min #map1(%arg3)[%12]
227      %14 = dim %arg2, 1 : memref<?x?xf32, #map0>
228      %15 = affine.min #map2(%arg4)[%14]
229      %16 = std.subview %arg2[%arg3, %arg4][%13, %15][%c1, %c1] : memref<?x?xf32, #map0> to memref<?x?xf32, #map3>
      // Inner loop: from (0, 0) to (%squared_min, %5) with unit steps.
230      scf.parallel (%arg5, %arg6) = (%c0, %c0) to (%squared_min, %5) step (%c1, %c1) {
231        %17 = load %6[%arg5, %arg6] : memref<?x?xf32, #map3>
232        %18 = load %11[%arg5, %arg6] : memref<?x?xf32, #map3>
        // %19 is loaded but not used by the addf below; the expected output
        // still contains a matching load.
233        %19 = load %16[%arg5, %arg6] : memref<?x?xf32, #map3>
234        %20 = addf %17, %18 : f32
235        store %20, %16[%arg5, %arg6] : memref<?x?xf32, #map3>
236        scf.yield
      // Inner loop mapping: %arg5 uses processor 3 and %arg6 uses processor 4.
      // Going by the descriptions of the other tests in this file, these are
      // thread.x and thread.y.
237      } {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 3 : i64}, {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 4 : i64}]}
238      scf.yield
    // Outer loop mapping: %arg3 uses processor 0 and %arg4 uses processor 1.
    // Going by the descriptions of the other tests in this file, these are
    // block.x and block.y.
239    } {mapping = [{bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 0 : i64}, {bound = affine_map<(d0) -> (d0)>, map = affine_map<(d0) -> (d0)>, processor = 1 : i64}]}
240    return
241  }
242}
243
244// CHECK:       #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
245// CHECK:       #[[MAP1:.*]] = affine_map<()[s0, s1, s2] -> ((s0 - s1) ceildiv s2)>
246// CHECK:       #[[MAP2:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
247// CHECK:       #[[MAP3:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)>
248// CHECK:       #[[MAP4:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)>
249// CHECK:       #[[MAP5:.*]] = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
250
251// CHECK:       module {
252// CHECK-LABEL:   func @sum(
253// CHECK-SAME:              [[VAL_0:%.*]]: memref<?x?xf32, #[[MAP0]]>, [[VAL_1:%.*]]: memref<?x?xf32, #[[MAP0]]>, [[VAL_2:%.*]]: memref<?x?xf32, #[[MAP0]]>) {
254// CHECK:           [[VAL_3:%.*]] = constant 1 : index
255// CHECK:           [[VAL_4:%.*]] = constant 0 : index
256// CHECK:           [[VAL_5:%.*]] = constant 3 : index
257// CHECK:           [[VAL_6:%.*]] = constant 2 : index
258// CHECK:           [[VAL_7:%.*]] = dim [[VAL_0]], 0 : memref<?x?xf32, #[[MAP0]]>
259// CHECK:           [[VAL_8:%.*]] = dim [[VAL_0]], 1 : memref<?x?xf32, #[[MAP0]]>
260// CHECK:           [[VAL_9:%.*]] = constant 1 : index
261// CHECK:           [[VAL_10:%.*]] = affine.apply #[[MAP1]](){{\[}}[[VAL_7]], [[VAL_4]], [[VAL_6]]]
262// CHECK:           [[VAL_11:%.*]] = affine.apply #[[MAP1]](){{\[}}[[VAL_8]], [[VAL_4]], [[VAL_5]]]
263// CHECK:           [[VAL_12:%.*]] = constant 4 : index
264// CHECK:           [[VAL_13:%.*]] = affine.apply #[[MAP1]](){{\[}}[[VAL_12]], [[VAL_4]], [[VAL_3]]]
265// CHECK:           [[VAL_14:%.*]] = constant 3 : index
266// CHECK:           [[VAL_15:%.*]] = affine.apply #[[MAP1]](){{\[}}[[VAL_14]], [[VAL_4]], [[VAL_3]]]
267// CHECK:           gpu.launch blocks([[VAL_16:%.*]], [[VAL_17:%.*]], [[VAL_18:%.*]]) in ([[VAL_19:%.*]] = [[VAL_10]], [[VAL_20:%.*]] = [[VAL_11]], [[VAL_21:%.*]] = [[VAL_9]]) threads([[VAL_22:%.*]], [[VAL_23:%.*]], [[VAL_24:%.*]]) in ([[VAL_25:%.*]] = [[VAL_13]], [[VAL_26:%.*]] = [[VAL_15]], [[VAL_27:%.*]] = [[VAL_9]]) {
268// CHECK:             [[VAL_28:%.*]] = affine.apply #[[MAP2]]([[VAL_16]]){{\[}}[[VAL_6]], [[VAL_4]]]
269// CHECK:             [[VAL_29:%.*]] = affine.apply #[[MAP2]]([[VAL_17]]){{\[}}[[VAL_5]], [[VAL_4]]]
270// CHECK:             [[VAL_30:%.*]] = dim [[VAL_0]], 0 : memref<?x?xf32, #[[MAP0]]>
271// CHECK:             [[VAL_31:%.*]] = affine.min #[[MAP3]]([[VAL_28]]){{\[}}[[VAL_30]]]
272// CHECK:             [[VAL_31_SQUARED:%.*]] = muli [[VAL_31]], [[VAL_31]] : index
273// CHECK:             [[VAL_32:%.*]] = dim [[VAL_0]], 1 : memref<?x?xf32, #[[MAP0]]>
274// CHECK:             [[VAL_33:%.*]] = affine.min #[[MAP4]]([[VAL_29]]){{\[}}[[VAL_32]]]
275// CHECK:             [[VAL_34:%.*]] = subview [[VAL_0]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_31_SQUARED]], [[VAL_33]]] {{\[}}[[VAL_3]], [[VAL_3]]] : memref<?x?xf32, #[[MAP0]]> to memref<?x?xf32, #[[MAP5]]>
276// CHECK:             [[VAL_35:%.*]] = dim [[VAL_1]], 0 : memref<?x?xf32, #[[MAP0]]>
277// CHECK:             [[VAL_36:%.*]] = affine.min #[[MAP3]]([[VAL_28]]){{\[}}[[VAL_35]]]
278// CHECK:             [[VAL_37:%.*]] = dim [[VAL_1]], 1 : memref<?x?xf32, #[[MAP0]]>
279// CHECK:             [[VAL_38:%.*]] = affine.min #[[MAP4]]([[VAL_29]]){{\[}}[[VAL_37]]]
280// CHECK:             [[VAL_39:%.*]] = subview [[VAL_1]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_36]], [[VAL_38]]] {{\[}}[[VAL_3]], [[VAL_3]]] : memref<?x?xf32, #[[MAP0]]> to memref<?x?xf32, #[[MAP5]]>
281// CHECK:             [[VAL_40:%.*]] = dim [[VAL_2]], 0 : memref<?x?xf32, #[[MAP0]]>
282// CHECK:             [[VAL_41:%.*]] = affine.min #[[MAP3]]([[VAL_28]]){{\[}}[[VAL_40]]]
283// CHECK:             [[VAL_42:%.*]] = dim [[VAL_2]], 1 : memref<?x?xf32, #[[MAP0]]>
284// CHECK:             [[VAL_43:%.*]] = affine.min #[[MAP4]]([[VAL_29]]){{\[}}[[VAL_42]]]
285// CHECK:             [[VAL_44:%.*]] = subview [[VAL_2]]{{\[}}[[VAL_28]], [[VAL_29]]] {{\[}}[[VAL_41]], [[VAL_43]]] {{\[}}[[VAL_3]], [[VAL_3]]] : memref<?x?xf32, #[[MAP0]]> to memref<?x?xf32, #[[MAP5]]>
286// CHECK:             [[VAL_45:%.*]] = affine.apply #[[MAP2]]([[VAL_22]]){{\[}}[[VAL_3]], [[VAL_4]]]
287// CHECK:             [[VAL_46:%.*]] = cmpi "slt", [[VAL_45]], [[VAL_31_SQUARED]] : index
288// CHECK:             scf.if [[VAL_46]] {
289// CHECK:               [[VAL_47:%.*]] = affine.apply #[[MAP2]]([[VAL_23]]){{\[}}[[VAL_3]], [[VAL_4]]]
290// CHECK:               [[VAL_48:%.*]] = cmpi "slt", [[VAL_47]], [[VAL_33]] : index
291// CHECK:               scf.if [[VAL_48]] {
292// CHECK:                 [[VAL_49:%.*]] = load [[VAL_34]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, #[[MAP5]]>
293// CHECK:                 [[VAL_50:%.*]] = load [[VAL_39]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, #[[MAP5]]>
294// CHECK:                 [[VAL_51:%.*]] = load [[VAL_44]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, #[[MAP5]]>
295// CHECK:                 [[VAL_52:%.*]] = addf [[VAL_49]], [[VAL_50]] : f32
296// CHECK:                 store [[VAL_52]], [[VAL_44]]{{\[}}[[VAL_45]], [[VAL_47]]] : memref<?x?xf32, #[[MAP5]]>
297// CHECK:               }
298// CHECK:             }
299// CHECK:             gpu.terminator
300// CHECK:           }
301// CHECK:           return
302// CHECK:         }
303// CHECK:       }
304
305// -----
306
307// Mapping to the same processor twice.
308
// Negative test: both dimensions of the loop name processor 1 in their
// mapping entries, and the diagnostic annotations inside the function require
// the conversion to report an error for it. The loop body is empty.
309func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
310                          %arg3 : index,
311                          %buf : memref<?x?xf32>,
312                          %res : memref<?x?xf32>) {
313  %four = constant 4 : index
  // The two diagnostic annotations below use relative line offsets (+2 and +1)
  // that both target the scf.parallel line; do not insert lines between them
  // and the loop.
314  // expected-error@+2 {{cannot redefine the bound for processor 1}}
315  // expected-error@+1 {{failed to legalize operation 'scf.parallel'}}
316  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
317                                          step (%four, %four)  {
318  } { mapping = [
      // Same processor id on both entries: this is the condition under test.
319      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
320      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
321    ] }
322  return
323}
324
325// -----
326
327// Loop with loop-variant upper bound.
328
// Negative test: the inner loop's upper bounds are the outer loop's induction
// variables (%i0, %i1), and the diagnostic annotations inside the function
// require the conversion to report errors on both loops.
329func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
330                                       %arg3 : index,
331                                       %buf : memref<?x?xf32>,
332                                       %res : memref<?x?xf32>) {
333  %zero = constant 0 : index
334  %one = constant 1 : index
335  %four = constant 4 : index
  // Each diagnostic annotation in this function uses a +1 line offset and
  // must stay on the line directly above the loop it refers to.
336  // expected-error@+1 {{failed to legalize operation 'scf.parallel'}}
337  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
338                                          step (%four, %four)  {
    // Upper bounds (%i0, %i1) change with every outer iteration: this is the
    // condition under test.
339    // expected-error@+1 {{cannot derive loop-invariant upper bound}}
340    scf.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
341                                            step (%one, %one)  {
342      %idx0 = addi %i0, %si0 : index
343      %idx1 = addi %i1, %si1 : index
344      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
345      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
    // Inner loop mapping: processor 4 for %si0 and processor 6 for %si1.
346    } { mapping = [
347        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
348        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
349      ] }
  // Outer loop mapping: processor 1 for %i0 and processor 6 for %i1.
350  } { mapping = [
351      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
352      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
353    ] }
354  return
355}
356