1 // RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
2 // RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
3 // RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
4 // RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
5 // RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
6 
7 #include <cassert>
8 #include <iostream>
9 #include <memory>
10 #include <vector>
11 
12 class BlockMatrix {
13 private:
14   const int rowsPerBlock;
15   const int colsPerBlock;
16   const long nRows;
17   const long nCols;
18   const int nBlocksPerRow;
19   const int nBlocksPerCol;
20   std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;
21 
22 public:
23   BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
24               const long _nRows, const long _nCols)
25       : rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
26         nCols(_nCols), nBlocksPerRow(_nRows / _rowsPerBlock),
27         nBlocksPerCol(_nCols / _colsPerBlock), Blocks(nBlocksPerCol) {
28     for (int i = 0; i < nBlocksPerCol; i++) {
29       for (int j = 0; j < nBlocksPerRow; j++) {
30         Blocks[i].emplace_back(new float[_rowsPerBlock * _colsPerBlock]);
31       }
32     }
33   };
34 
35   // Initialize the BlockMatrix from 2D arrays
36   void Initialize(const std::vector<float> &matrix) {
37     for (int i = 0; i < nBlocksPerCol; i++)
38       for (int j = 0; j < nBlocksPerRow; j++) {
39         float *CurrBlock = GetBlock(i, j);
40         for (int ii = 0; ii < colsPerBlock; ++ii)
41           for (int jj = 0; jj < rowsPerBlock; ++jj) {
42             int curri = i * colsPerBlock + ii;
43             int currj = j * rowsPerBlock + jj;
44             CurrBlock[ii + jj * colsPerBlock] = matrix[curri + currj * nCols];
45           }
46       }
47   }
48 
49   long Compare(const std::vector<float> &matrix) const {
50     long fail = 0;
51     for (int i = 0; i < nBlocksPerCol; i++)
52       for (int j = 0; j < nBlocksPerRow; j++) {
53         float *CurrBlock = GetBlock(i, j);
54         for (int ii = 0; ii < colsPerBlock; ++ii)
55           for (int jj = 0; jj < rowsPerBlock; ++jj) {
56             int curri = i * colsPerBlock + ii;
57             int currj = j * rowsPerBlock + jj;
58             float m_value = matrix[curri + currj * nCols];
59             float bm_value = CurrBlock[ii + jj * colsPerBlock];
60             if (bm_value != m_value) {
61               fail++;
62             }
63           }
64       }
65     return fail;
66   }
67 
68   float *GetBlock(int i, int j) const {
69     assert(i < nBlocksPerCol && j < nBlocksPerRow && "Accessing outside block");
70     return Blocks[i][j].get();
71   }
72 };
73 
74 constexpr const int BS = 256;
75 constexpr const int N = 1024;
76 
77 int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
78 #pragma omp parallel
79 #pragma omp master
80   for (int i = 0; i < N / BS; ++i)
81     for (int j = 0; j < N / BS; ++j) {
82       float *BlockC = C.GetBlock(i, j);
83       for (int k = 0; k < N / BS; ++k) {
84         float *BlockA = A.GetBlock(i, k);
85         float *BlockB = B.GetBlock(k, j);
86 // clang-format off
87 #pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0])   \
88             map(to: BlockA[:BS * BS], BlockB[:BS * BS])                        \
89             map(tofrom: BlockC[:BS * BS]) nowait
90 // clang-format on
91 #pragma omp parallel for
92         for (int ii = 0; ii < BS; ii++)
93           for (int jj = 0; jj < BS; jj++) {
94             for (int kk = 0; kk < BS; ++kk)
95               BlockC[ii + jj * BS] +=
96                   BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
97           }
98       }
99     }
100   return 0;
101 }
102 
103 void Matmul(const std::vector<float> &a, const std::vector<float> &b,
104             std::vector<float> &c) {
105   for (int i = 0; i < N; ++i) {
106     for (int j = 0; j < N; ++j) {
107       float sum = 0.0;
108       for (int k = 0; k < N; ++k) {
109         sum = sum + a[i * N + k] * b[k * N + j];
110       }
111       c[i * N + j] = sum;
112     }
113   }
114 }
115 
116 int main(int argc, char *argv[]) {
117   std::vector<float> a(N * N);
118   std::vector<float> b(N * N);
119   std::vector<float> c(N * N, 0.0);
120 
121   for (int i = 0; i < N; ++i) {
122     for (int j = 0; j < N; ++j) {
123       a[i * N + j] = b[i * N + j] = i + j % 100;
124     }
125   }
126 
127   auto BlockedA = BlockMatrix(BS, BS, N, N);
128   BlockedA.Initialize(a);
129   BlockedA.Compare(a);
130   auto BlockedB = BlockMatrix(BS, BS, N, N);
131   BlockedB.Initialize(b);
132   BlockedB.Compare(b);
133 
134   Matmul(a, b, c);
135 
136   auto BlockedC = BlockMatrix(BS, BS, N, N);
137   BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
138 
139   if (BlockedC.Compare(c) > 0) {
140     return 1;
141   }
142 
143   std::cout << "PASS\n";
144 
145   return 0;
146 }
147 
148 // CHECK: PASS
149