// RUN: %libomp-cxx-compile-and-run
// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
// XFAIL: gcc-4
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <omp.h>

// Total number of loop iterations; should be even for this test, because
// every task below handles two consecutive iterations (l and l + 1)
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects
#ifndef FLG
#define FLG 0
#endif

/*
  // initial user's code that corresponds to pseudo code of the test
  #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
  {
    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
      {
        i += l;
        if( l%2 )
          x *= 1.0 / (l + 1);
        else
          x *= (l + 1);
      }
    }

    #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
    {
      for( int l = 0; l < N; ++l ) {
        #pragma omp task firstprivate(l) in_reduction(+:j,y) \
            in_reduction(*:x) in_reduction(-:k)
        {
          j += l;
          k -= l;
          y += (double)l;
          if( l%2 )
            x *= 1.0 / (l + 1);
          else
            x *= (l + 1);
        }
        #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
        {
          i -= l;
          k -= l;
          y += (double)l;
        }
        #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
        {
          j += l;
          if( l%2 )
            x *= 1.0 / (l + 1);
          else
            x *= (l + 1);
        }
      }
    } // inner reduction

    for( int l = 0; l < N; ++l ) {
      #pragma omp task firstprivate(l) in_reduction(+:j)
        j += l;
    }
  } // outer reduction
*/

//------------------------------------------------
// OpenMP runtime library routines
#ifdef __cplusplus
extern "C" {
#endif
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);
extern int __kmpc_global_thread_num(void*);
#ifdef __cplusplus
}
#endif

//------------------------------------------------ 84 // Compiler-generated code 85 86 typedef struct _task_red_item { 87 void *shar; // shared reduction item 88 size_t size; // size of data item 89 void *f_init; // data initialization routine 90 void *f_fini; // data finalization routine 91 void *f_comb; // data combiner routine 92 unsigned flags; 93 } _task_red_item_t; 94 95 // int:+ no need in init/fini callbacks, valid for subtraction 96 void __red_int_add_comb(void *lhs, void *rhs) // combiner 97 { *(int*)lhs += *(int*)rhs; } 98 99 // long long:+ no need in init/fini callbacks, valid for subtraction 100 void __red_llong_add_comb(void *lhs, void *rhs) // combiner 101 { *(long long*)lhs += *(long long*)rhs; } 102 103 // double:* no need in fini callback 104 void __red_dbl_mul_init(void *data) // initializer 105 { *(double*)data = 1.0; } 106 void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner 107 { *(double*)lhs *= *(double*)rhs; } 108 109 // double:+ no need in init/fini callbacks 110 void __red_dbl_add_comb(void *lhs, void *rhs) // combiner 111 { *(double*)lhs += *(double*)rhs; } 112 113 // ============================== 114 115 void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py) 116 { 117 for( int l = 0; l < N; ++l ) { 118 *pi += l; 119 if( l%2 ) 120 *px *= 1.0 / (l + 1); 121 else 122 *px *= (l + 1); 123 } 124 for( int l = 0; l < N; ++l ) { 125 *pj += l; 126 *pk -= l; 127 *py += (double)l; 128 if( l%2 ) 129 *px *= 1.0 / (l + 1); 130 else 131 *px *= (l + 1); 132 133 *pi -= l; 134 *pk -= l; 135 *py += (double)l; 136 137 *pj += l; 138 if( l%2 ) 139 *px *= 1.0 / (l + 1); 140 else 141 *px *= (l + 1); 142 } 143 for( int l = 0; l < N; ++l ) { 144 *pj += l; 145 } 146 } 147 148 //------------------------------------------------ 149 // Test case 150 int main() 151 { 152 int nthreads = omp_get_max_threads(); 153 int err = 0; 154 void** ptrs = (void**)malloc(nthreads*sizeof(void*)); 155 156 // user's code 
====================================== 157 // variables for serial calculations: 158 int is = 3; 159 long long js = -9999999; 160 double xs = 99999.0; 161 long long ks = 99999999; 162 double ys = -99999999.0; 163 // variables for parallel calculations: 164 int ip = 3; 165 long long jp = -9999999; 166 double xp = 99999.0; 167 long long kp = 99999999; 168 double yp = -99999999.0; 169 170 calc_serial(&is, &js, &xs, &ks, &ys); 171 // ================================================== 172 for (int i = 0; i < nthreads; ++i) 173 ptrs[i] = NULL; 174 #pragma omp parallel 175 { 176 #pragma omp single nowait 177 { 178 // outer taskgroup reduces (i,j,x) 179 #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x) 180 { 181 _task_red_item_t red_data[3]; 182 red_data[0].shar = &ip; 183 red_data[0].size = sizeof(ip); 184 red_data[0].f_init = NULL; // RTL will zero thread-specific objects 185 red_data[0].f_fini = NULL; // no destructors needed 186 red_data[0].f_comb = (void*)&__red_int_add_comb; 187 red_data[0].flags = FLG; 188 red_data[1].shar = &jp; 189 red_data[1].size = sizeof(jp); 190 red_data[1].f_init = NULL; // RTL will zero thread-specific objects 191 red_data[1].f_fini = NULL; // no destructors needed 192 red_data[1].f_comb = (void*)&__red_llong_add_comb; 193 red_data[1].flags = FLG; 194 red_data[2].shar = &xp; 195 red_data[2].size = sizeof(xp); 196 red_data[2].f_init = (void*)&__red_dbl_mul_init; 197 red_data[2].f_fini = NULL; // no destructors needed 198 red_data[2].f_comb = (void*)&__red_dbl_mul_comb; 199 red_data[2].flags = FLG; 200 int gtid = __kmpc_global_thread_num(NULL); 201 void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data); 202 203 for( int l = 0; l < N; l += 2 ) { 204 // 2 iterations per task to get correct x value; actually any even 205 // number of iters per task will work, otherwise x looses precision 206 #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x) 207 { 208 int gtid = __kmpc_global_thread_num(NULL); 209 int *p_ip 
= (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip); 210 double *p_xp = (double*)__kmpc_task_reduction_get_th_data( 211 gtid, tg1, &xp); 212 if (!ptrs[gtid]) ptrs[gtid] = p_xp; 213 214 // user's pseudo-code ============================== 215 *p_ip += l; 216 *p_xp *= (l + 1); 217 218 *p_ip += l + 1; 219 *p_xp *= 1.0 / (l + 2); 220 // ================================================== 221 } 222 } 223 // inner taskgroup reduces (i,k,y), i is same object as in outer one 224 #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y) 225 { 226 _task_red_item_t red_data[3]; 227 red_data[0].shar = &ip; 228 red_data[0].size = sizeof(ip); 229 red_data[0].f_init = NULL; // RTL will zero thread-specific objects 230 red_data[0].f_fini = NULL; // no destructors needed 231 red_data[0].f_comb = (void*)&__red_int_add_comb; 232 red_data[0].flags = FLG; 233 red_data[1].shar = &kp; 234 red_data[1].size = sizeof(kp); 235 red_data[1].f_init = NULL; // RTL will zero thread-specific objects 236 red_data[1].f_fini = NULL; // no destructors needed 237 red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and - 238 red_data[1].flags = FLG; 239 red_data[2].shar = &yp; 240 red_data[2].size = sizeof(yp); 241 red_data[2].f_init = NULL; // RTL will zero thread-specific objects 242 red_data[2].f_fini = NULL; // no destructors needed 243 red_data[2].f_comb = (void*)&__red_dbl_add_comb; 244 red_data[2].flags = FLG; 245 int gtid = __kmpc_global_thread_num(NULL); 246 void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data); 247 248 for( int l = 0; l < N; l += 2 ) { 249 #pragma omp task firstprivate(l) 250 // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k) 251 { 252 int gtid = __kmpc_global_thread_num(NULL); 253 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( 254 gtid, tg1, &jp); 255 long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data( 256 gtid, tg2, &kp); 257 double *p_xp = (double*)__kmpc_task_reduction_get_th_data( 258 gtid, tg1, &xp); 
259 double *p_yp = (double*)__kmpc_task_reduction_get_th_data( 260 gtid, tg2, &yp); 261 // user's pseudo-code ============================== 262 *p_jp += l; 263 *p_kp -= l; 264 *p_yp += (double)l; 265 *p_xp *= (l + 1); 266 267 *p_jp += l + 1; 268 *p_kp -= l + 1; 269 *p_yp += (double)(l + 1); 270 *p_xp *= 1.0 / (l + 2); 271 // ================================================= 272 { 273 // the following code is here just to check __kmpc_task_reduction_get_th_data: 274 int tid = omp_get_thread_num(); 275 void *addr1; 276 void *addr2; 277 addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared 278 addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private 279 if (addr1 != addr2) { 280 #pragma omp atomic 281 ++err; 282 printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2); 283 } 284 // from neighbour w/o taskgroup (should start lookup from current tg2) 285 if (tid > 0) { 286 if (ptrs[tid-1]) { 287 addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]); 288 if (addr1 != addr2) { 289 #pragma omp atomic 290 ++err; 291 printf("Wrong thread-specific addresses %d s:%p n:%p\n", 292 tid, addr1, addr2); 293 } 294 } 295 } else { 296 if (ptrs[nthreads-1]) { 297 addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]); 298 if (addr1 != addr2) { 299 #pragma omp atomic 300 ++err; 301 printf("Wrong thread-specific addresses %d s:%p n:%p\n", 302 tid, addr1, addr2); 303 } 304 } 305 } 306 // ---------------------------------------------- 307 } 308 } 309 #pragma omp task firstprivate(l) 310 // in_reduction(+:y) in_reduction(-:i,k) 311 { 312 int gtid = __kmpc_global_thread_num(NULL); 313 int *p_ip = (int*)__kmpc_task_reduction_get_th_data( 314 gtid, tg2, &ip); 315 long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data( 316 gtid, tg2, &kp); 317 double *p_yp = (double*)__kmpc_task_reduction_get_th_data( 318 gtid, tg2, &yp); 319 320 // user's pseudo-code ============================== 321 *p_ip -= l; 
322 *p_kp -= l; 323 *p_yp += (double)l; 324 325 *p_ip -= l + 1; 326 *p_kp -= l + 1; 327 *p_yp += (double)(l + 1); 328 // ================================================= 329 } 330 #pragma omp task firstprivate(l) 331 // in_reduction(+:j) in_reduction(*:x) 332 { 333 int gtid = __kmpc_global_thread_num(NULL); 334 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( 335 gtid, tg1, &jp); 336 double *p_xp = (double*)__kmpc_task_reduction_get_th_data( 337 gtid, tg1, &xp); 338 // user's pseudo-code ============================== 339 *p_jp += l; 340 *p_xp *= (l + 1); 341 342 *p_jp += l + 1; 343 *p_xp *= 1.0 / (l + 2); 344 // ================================================= 345 } 346 } 347 } // inner reduction 348 349 for( int l = 0; l < N; l += 2 ) { 350 #pragma omp task firstprivate(l) // in_reduction(+:j) 351 { 352 int gtid = __kmpc_global_thread_num(NULL); 353 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data( 354 gtid, tg1, &jp); 355 // user's pseudo-code ============================== 356 *p_jp += l; 357 *p_jp += l + 1; 358 // ================================================= 359 } 360 } 361 } // outer reduction 362 } // end single 363 } // end parallel 364 // check results 365 #if _DEBUG 366 printf("reduction flags = %u\n", FLG); 367 #endif 368 if (ip == is && jp == js && ks == kp && 369 fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01) 370 printf("passed\n"); 371 else 372 printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n", 373 is, js, xs, ks, ys, 374 ip, jp, xp, kp, yp); 375 return 0; 376 } 377