// RUN: %libomptarget-compilexx-run-and-check-generic

// Currently hangs on amdgpu
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-oldDriver
// UNSUPPORTED: amdgcn-amd-amdhsa-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-oldDriver
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
10ad0f6e1dSJon Chesterfield 
#include <cassert>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <memory>
#include <vector>
17e5da63d5SShilei Tian 
/// A float matrix stored as a grid of separately allocated tiles.
///
/// The grid is addressed as tile (i, j) with i in [0, nBlocksPerCol) and j in
/// [0, nBlocksPerRow). Every tile owns rowsPerBlock * colsPerBlock floats.
/// Element (ii, jj) of tile (i, j) sits at tile offset ii + jj * colsPerBlock
/// and corresponds to flat-matrix index
///   (i * colsPerBlock + ii) + (j * rowsPerBlock + jj) * nCols.
/// Block counts are computed with integer division, so rows/columns beyond the
/// last complete tile are not represented.
class BlockMatrix {
private:
  const int rowsPerBlock;
  const int colsPerBlock;
  const long nRows;
  const long nCols;
  const int nBlocksPerRow;
  const int nBlocksPerCol;
  std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;

  /// Number of complete tiles of size \p blockExtent that fit in \p extent.
  /// Rejects a non-positive block extent, which would otherwise divide by zero.
  static int CountBlocks(long extent, int blockExtent) {
    assert(blockExtent > 0 && "Block extent must be positive");
    assert(extent >= 0 && "Matrix extent must be non-negative");
    return static_cast<int>(extent / blockExtent);
  }

  /// Flat-matrix index of element (ii, jj) of tile (i, j). Computed in long so
  /// the products cannot overflow int before nCols is applied.
  long FlatIndex(int i, int j, int ii, int jj) const {
    const long curri = static_cast<long>(i) * colsPerBlock + ii;
    const long currj = static_cast<long>(j) * rowsPerBlock + jj;
    return curri + currj * nCols;
  }

  /// True when \p matrix holds every element Initialize/Compare will touch.
  bool Covers(const std::vector<float> &matrix) const {
    if (nBlocksPerCol <= 0 || nBlocksPerRow <= 0)
      return true; // No tiles, so nothing is ever indexed.
    const long last = FlatIndex(nBlocksPerCol - 1, nBlocksPerRow - 1,
                                colsPerBlock - 1, rowsPerBlock - 1);
    return static_cast<std::size_t>(last) < matrix.size();
  }

public:
  /// Allocates the tile grid for an \p _nRows x \p _nCols matrix cut into
  /// tiles of \p _rowsPerBlock x \p _colsPerBlock elements. Every tile starts
  /// out zero-filled.
  BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
              const long _nRows, const long _nCols)
      : rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
        nCols(_nCols), nBlocksPerRow(CountBlocks(_nRows, _rowsPerBlock)),
        nBlocksPerCol(CountBlocks(_nCols, _colsPerBlock)),
        Blocks(nBlocksPerCol) {
    const std::size_t blockSize = static_cast<std::size_t>(rowsPerBlock) *
                                  static_cast<std::size_t>(colsPerBlock);
    for (auto &blockRow : Blocks) {
      blockRow.reserve(static_cast<std::size_t>(nBlocksPerRow));
      for (int j = 0; j < nBlocksPerRow; j++) {
        // make_unique value-initializes the array and never exposes a raw
        // owning pointer that could leak if the vector had to grow.
        blockRow.push_back(std::make_unique<float[]>(blockSize));
      }
    }
  }

  /// Copies the contents of the flat vector \p matrix into the tiles.
  /// \p matrix must contain every index the tile grid maps to.
  void Initialize(const std::vector<float> &matrix) {
    assert(Covers(matrix) && "Flat matrix is smaller than the block grid");
    for (int i = 0; i < nBlocksPerCol; i++)
      for (int j = 0; j < nBlocksPerRow; j++) {
        float *CurrBlock = GetBlock(i, j);
        for (int ii = 0; ii < colsPerBlock; ++ii)
          for (int jj = 0; jj < rowsPerBlock; ++jj)
            CurrBlock[ii + jj * colsPerBlock] =
                matrix[static_cast<std::size_t>(FlatIndex(i, j, ii, jj))];
      }
  }

  /// Asserts that every tile element differs from the matching element of the
  /// flat vector \p matrix by less than float epsilon. The checks are plain
  /// asserts, so they vanish when NDEBUG is defined.
  void Compare(const std::vector<float> &matrix) const {
    assert(Covers(matrix) && "Flat matrix is smaller than the block grid");
    for (int i = 0; i < nBlocksPerCol; i++)
      for (int j = 0; j < nBlocksPerRow; j++) {
        const float *CurrBlock = GetBlock(i, j);
        for (int ii = 0; ii < colsPerBlock; ++ii)
          for (int jj = 0; jj < rowsPerBlock; ++jj) {
            const float m_value =
                matrix[static_cast<std::size_t>(FlatIndex(i, j, ii, jj))];
            const float bm_value = CurrBlock[ii + jj * colsPerBlock];
            assert(std::fabs(bm_value - m_value) <
                   std::numeric_limits<float>::epsilon());
          }
      }
  }

  /// Returns a non-owning pointer to the storage of tile (\p i, \p j).
  /// Both indices must lie inside the tile grid.
  float *GetBlock(int i, int j) const {
    assert(i >= 0 && i < nBlocksPerCol && j >= 0 && j < nBlocksPerRow &&
           "Accessing outside block");
    return Blocks[i][j].get();
  }
};
76e5da63d5SShilei Tian 
/// Edge length, in elements, of one square tile.
constexpr int BS = 16;
/// Edge length, in elements, of the square matrices used by this test.
constexpr int N = 256;
// The tile loops run N / BS times, so a remainder would leave elements out.
static_assert(N % BS == 0, "N must be a multiple of BS");
79e5da63d5SShilei Tian 
BlockMatMul_TargetNowait(BlockMatrix & A,BlockMatrix & B,BlockMatrix & C)80e5da63d5SShilei Tian int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
81e5da63d5SShilei Tian #pragma omp parallel
82e5da63d5SShilei Tian #pragma omp master
83e5da63d5SShilei Tian   for (int i = 0; i < N / BS; ++i)
84e5da63d5SShilei Tian     for (int j = 0; j < N / BS; ++j) {
85e5da63d5SShilei Tian       float *BlockC = C.GetBlock(i, j);
86e5da63d5SShilei Tian       for (int k = 0; k < N / BS; ++k) {
87e5da63d5SShilei Tian         float *BlockA = A.GetBlock(i, k);
88e5da63d5SShilei Tian         float *BlockB = B.GetBlock(k, j);
89e5da63d5SShilei Tian // clang-format off
90e5da63d5SShilei Tian #pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0])   \
91e5da63d5SShilei Tian             map(to: BlockA[:BS * BS], BlockB[:BS * BS])                        \
92e5da63d5SShilei Tian             map(tofrom: BlockC[:BS * BS]) nowait
93e5da63d5SShilei Tian // clang-format on
94e5da63d5SShilei Tian #pragma omp parallel for
95e5da63d5SShilei Tian         for (int ii = 0; ii < BS; ii++)
96e5da63d5SShilei Tian           for (int jj = 0; jj < BS; jj++) {
97e5da63d5SShilei Tian             for (int kk = 0; kk < BS; ++kk)
98e5da63d5SShilei Tian               BlockC[ii + jj * BS] +=
99e5da63d5SShilei Tian                   BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
100e5da63d5SShilei Tian           }
101e5da63d5SShilei Tian       }
102e5da63d5SShilei Tian     }
103e5da63d5SShilei Tian   return 0;
104e5da63d5SShilei Tian }
105e5da63d5SShilei Tian 
Matmul(const std::vector<float> & a,const std::vector<float> & b,std::vector<float> & c)106e5da63d5SShilei Tian void Matmul(const std::vector<float> &a, const std::vector<float> &b,
107e5da63d5SShilei Tian             std::vector<float> &c) {
108e5da63d5SShilei Tian   for (int i = 0; i < N; ++i) {
109e5da63d5SShilei Tian     for (int j = 0; j < N; ++j) {
110e5da63d5SShilei Tian       float sum = 0.0;
111e5da63d5SShilei Tian       for (int k = 0; k < N; ++k) {
112e5da63d5SShilei Tian         sum = sum + a[i * N + k] * b[k * N + j];
113e5da63d5SShilei Tian       }
114e5da63d5SShilei Tian       c[i * N + j] = sum;
115e5da63d5SShilei Tian     }
116e5da63d5SShilei Tian   }
117e5da63d5SShilei Tian }
118e5da63d5SShilei Tian 
main(int argc,char * argv[])119e5da63d5SShilei Tian int main(int argc, char *argv[]) {
120e5da63d5SShilei Tian   std::vector<float> a(N * N);
121e5da63d5SShilei Tian   std::vector<float> b(N * N);
122e5da63d5SShilei Tian   std::vector<float> c(N * N, 0.0);
123e5da63d5SShilei Tian 
124e5da63d5SShilei Tian   for (int i = 0; i < N; ++i) {
125e5da63d5SShilei Tian     for (int j = 0; j < N; ++j) {
126e5da63d5SShilei Tian       a[i * N + j] = b[i * N + j] = i + j % 100;
127e5da63d5SShilei Tian     }
128e5da63d5SShilei Tian   }
129e5da63d5SShilei Tian 
130e5da63d5SShilei Tian   auto BlockedA = BlockMatrix(BS, BS, N, N);
131e5da63d5SShilei Tian   auto BlockedB = BlockMatrix(BS, BS, N, N);
132092a5bb7SShilei Tian   auto BlockedC = BlockMatrix(BS, BS, N, N);
133092a5bb7SShilei Tian   BlockedA.Initialize(a);
134e5da63d5SShilei Tian   BlockedB.Initialize(b);
135092a5bb7SShilei Tian   BlockedC.Initialize(c);
136092a5bb7SShilei Tian   BlockedA.Compare(a);
137e5da63d5SShilei Tian   BlockedB.Compare(b);
138092a5bb7SShilei Tian   BlockedC.Compare(c);
139e5da63d5SShilei Tian 
140e5da63d5SShilei Tian   Matmul(a, b, c);
141e5da63d5SShilei Tian   BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
142e5da63d5SShilei Tian 
143092a5bb7SShilei Tian   BlockedC.Compare(c);
144e5da63d5SShilei Tian 
145e5da63d5SShilei Tian   std::cout << "PASS\n";
146e5da63d5SShilei Tian 
147e5da63d5SShilei Tian   return 0;
148e5da63d5SShilei Tian }
149e5da63d5SShilei Tian 
// CHECK: PASS