// RUN: %libomptarget-compilexx-run-and-check-generic

// Currently hangs on amdgpu
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-oldDriver
// UNSUPPORTED: amdgcn-amd-amdhsa-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-oldDriver
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO

#include <cassert>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <memory>
#include <vector>

// A float matrix stored as a grid of separately heap-allocated tiles.
//
// The flat input matrix is addressed as matrix[x + y * nCols], with
// 0 <= x < nCols and 0 <= y < nRows. Tile (i, j) holds the elements with
// x in [i * colsPerBlock, (i + 1) * colsPerBlock) and
// y in [j * rowsPerBlock, (j + 1) * rowsPerBlock), stored inside the tile at
// offset xx + yy * colsPerBlock.
//
// Extents that are not a multiple of the block size are truncated: the
// trailing partial tiles are neither stored nor compared.
class BlockMatrix {
private:
  const int rowsPerBlock;
  const int colsPerBlock;
  const long nRows;
  const long nCols;
  // Tile count along the nRows dimension (nRows / rowsPerBlock).
  const int nBlocksPerRow;
  // Tile count along the nCols dimension (nCols / colsPerBlock).
  const int nBlocksPerCol;
  // Indexed as Blocks[i][j] with i < nBlocksPerCol and j < nBlocksPerRow.
  std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;

  // Number of whole tiles of size perBlock that fit into extent. Rejects a
  // non-positive tile size up front, because the division below would
  // otherwise be undefined behaviour inside the member initializer list.
  static int CountBlocks(long extent, int perBlock) {
    assert(extent >= 0 && perBlock > 0 && "Invalid matrix or block extent");
    return static_cast<int>(extent / perBlock);
  }

  // Smallest flat-matrix size Initialize/Compare are willing to index into.
  std::size_t RequiredElements() const {
    return static_cast<std::size_t>(nRows) * static_cast<std::size_t>(nCols);
  }

public:
  // Allocates every tile of an _nRows x _nCols matrix split into tiles of
  // _rowsPerBlock x _colsPerBlock elements. Tiles start out zero-filled.
  BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
              const long _nRows, const long _nCols)
      : rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
        nCols(_nCols), nBlocksPerRow(CountBlocks(_nRows, _rowsPerBlock)),
        nBlocksPerCol(CountBlocks(_nCols, _colsPerBlock)),
        Blocks(nBlocksPerCol) {
    // Widen before multiplying so large tiles cannot overflow int.
    const std::size_t blockElems = static_cast<std::size_t>(rowsPerBlock) *
                                   static_cast<std::size_t>(colsPerBlock);
    for (auto &column : Blocks) {
      column.reserve(static_cast<std::size_t>(nBlocksPerRow));
      for (int j = 0; j < nBlocksPerRow; j++)
        column.push_back(std::make_unique<float[]>(blockElems));
    }
  }

  // Initialize the BlockMatrix from a flat array holding at least
  // nRows * nCols elements.
  void Initialize(const std::vector<float> &matrix) {
    assert(matrix.size() >= RequiredElements() &&
           "Flat matrix is smaller than nRows * nCols");
    for (int i = 0; i < nBlocksPerCol; i++)
      for (int j = 0; j < nBlocksPerRow; j++) {
        float *CurrBlock = GetBlock(i, j);
        for (int ii = 0; ii < colsPerBlock; ++ii)
          for (int jj = 0; jj < rowsPerBlock; ++jj) {
            int curri = i * colsPerBlock + ii;
            int currj = j * rowsPerBlock + jj;
            CurrBlock[ii + jj * colsPerBlock] = matrix[curri + currj * nCols];
          }
      }
  }

  // Checks that every stored element matches the corresponding element of
  // the flat array. Mismatches are reported through assert, so the check is
  // compiled out when NDEBUG is defined. The tolerance is an absolute
  // float epsilon, which for values of magnitude >= 1 only accepts equality.
  void Compare(const std::vector<float> &matrix) const {
    assert(matrix.size() >= RequiredElements() &&
           "Flat matrix is smaller than nRows * nCols");
    for (int i = 0; i < nBlocksPerCol; i++)
      for (int j = 0; j < nBlocksPerRow; j++) {
        float *CurrBlock = GetBlock(i, j);
        for (int ii = 0; ii < colsPerBlock; ++ii)
          for (int jj = 0; jj < rowsPerBlock; ++jj) {
            int curri = i * colsPerBlock + ii;
            int currj = j * rowsPerBlock + jj;
            float m_value = matrix[curri + currj * nCols];
            float bm_value = CurrBlock[ii + jj * colsPerBlock];
            assert(std::fabs(bm_value - m_value) <
                   std::numeric_limits<float>::epsilon());
          }
      }
  }

  // Returns the storage of tile (i, j); the BlockMatrix keeps ownership.
  float *GetBlock(int i, int j) const {
    assert(i >= 0 && j >= 0 && i < nBlocksPerCol && j < nBlocksPerRow &&
           "Accessing outside block");
    return Blocks[i][j].get();
  }
};

// Edge length, in elements, of one square tile.
constexpr int BS = 16;
// Edge length, in elements, of the square test matrices.
constexpr int N = 256;
// The tile loops iterate N / BS times, so N has to be a whole number of tiles.
static_assert(N % BS == 0, "N must be a multiple of BS");

BlockMatMul_TargetNowait(BlockMatrix & A,BlockMatrix & B,BlockMatrix & C)80e5da63d5SShilei Tian int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
81e5da63d5SShilei Tian #pragma omp parallel
82e5da63d5SShilei Tian #pragma omp master
83e5da63d5SShilei Tian for (int i = 0; i < N / BS; ++i)
84e5da63d5SShilei Tian for (int j = 0; j < N / BS; ++j) {
85e5da63d5SShilei Tian float *BlockC = C.GetBlock(i, j);
86e5da63d5SShilei Tian for (int k = 0; k < N / BS; ++k) {
87e5da63d5SShilei Tian float *BlockA = A.GetBlock(i, k);
88e5da63d5SShilei Tian float *BlockB = B.GetBlock(k, j);
89e5da63d5SShilei Tian // clang-format off
90e5da63d5SShilei Tian #pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0]) \
91e5da63d5SShilei Tian map(to: BlockA[:BS * BS], BlockB[:BS * BS]) \
92e5da63d5SShilei Tian map(tofrom: BlockC[:BS * BS]) nowait
93e5da63d5SShilei Tian // clang-format on
94e5da63d5SShilei Tian #pragma omp parallel for
95e5da63d5SShilei Tian for (int ii = 0; ii < BS; ii++)
96e5da63d5SShilei Tian for (int jj = 0; jj < BS; jj++) {
97e5da63d5SShilei Tian for (int kk = 0; kk < BS; ++kk)
98e5da63d5SShilei Tian BlockC[ii + jj * BS] +=
99e5da63d5SShilei Tian BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
100e5da63d5SShilei Tian }
101e5da63d5SShilei Tian }
102e5da63d5SShilei Tian }
103e5da63d5SShilei Tian return 0;
104e5da63d5SShilei Tian }
105e5da63d5SShilei Tian
Matmul(const std::vector<float> & a,const std::vector<float> & b,std::vector<float> & c)106e5da63d5SShilei Tian void Matmul(const std::vector<float> &a, const std::vector<float> &b,
107e5da63d5SShilei Tian std::vector<float> &c) {
108e5da63d5SShilei Tian for (int i = 0; i < N; ++i) {
109e5da63d5SShilei Tian for (int j = 0; j < N; ++j) {
110e5da63d5SShilei Tian float sum = 0.0;
111e5da63d5SShilei Tian for (int k = 0; k < N; ++k) {
112e5da63d5SShilei Tian sum = sum + a[i * N + k] * b[k * N + j];
113e5da63d5SShilei Tian }
114e5da63d5SShilei Tian c[i * N + j] = sum;
115e5da63d5SShilei Tian }
116e5da63d5SShilei Tian }
117e5da63d5SShilei Tian }
118e5da63d5SShilei Tian
main(int argc,char * argv[])119e5da63d5SShilei Tian int main(int argc, char *argv[]) {
120e5da63d5SShilei Tian std::vector<float> a(N * N);
121e5da63d5SShilei Tian std::vector<float> b(N * N);
122e5da63d5SShilei Tian std::vector<float> c(N * N, 0.0);
123e5da63d5SShilei Tian
124e5da63d5SShilei Tian for (int i = 0; i < N; ++i) {
125e5da63d5SShilei Tian for (int j = 0; j < N; ++j) {
126e5da63d5SShilei Tian a[i * N + j] = b[i * N + j] = i + j % 100;
127e5da63d5SShilei Tian }
128e5da63d5SShilei Tian }
129e5da63d5SShilei Tian
130e5da63d5SShilei Tian auto BlockedA = BlockMatrix(BS, BS, N, N);
131e5da63d5SShilei Tian auto BlockedB = BlockMatrix(BS, BS, N, N);
132092a5bb7SShilei Tian auto BlockedC = BlockMatrix(BS, BS, N, N);
133092a5bb7SShilei Tian BlockedA.Initialize(a);
134e5da63d5SShilei Tian BlockedB.Initialize(b);
135092a5bb7SShilei Tian BlockedC.Initialize(c);
136092a5bb7SShilei Tian BlockedA.Compare(a);
137e5da63d5SShilei Tian BlockedB.Compare(b);
138092a5bb7SShilei Tian BlockedC.Compare(c);
139e5da63d5SShilei Tian
140e5da63d5SShilei Tian Matmul(a, b, c);
141e5da63d5SShilei Tian BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
142e5da63d5SShilei Tian
143092a5bb7SShilei Tian BlockedC.Compare(c);
144e5da63d5SShilei Tian
145e5da63d5SShilei Tian std::cout << "PASS\n";
146e5da63d5SShilei Tian
147e5da63d5SShilei Tian return 0;
148e5da63d5SShilei Tian }
149e5da63d5SShilei Tian
// CHECK: PASS