1 /* 2 Copyright (c) 2005-2021 Intel Corporation 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 #include "Evolution.hpp" 18 19 #ifdef USE_SSE 20 /* Update states with SSE */ 21 22 #include <xmmintrin.h> 23 #include <emmintrin.h> 24 25 inline void create_record(char* src, unsigned* dst, unsigned width) { 26 dst[0] |= src[width - 1]; 27 for (unsigned i = 0; i < 31u; ++i) 28 dst[0] |= src[i] << (i + 1); 29 unsigned col; 30 for (unsigned col = 31u; col < width; ++col) 31 dst[(col + 1) / 32u] |= src[col] << ((col + 1) % 32u); 32 dst[(col + 1) / 32u] |= src[0] << ((col + 1) % 32u); 33 } 34 35 inline void sum_offset(__m128i* X, 36 __m128i* A, 37 __m128i* B, 38 __m128i* C, 39 unsigned size_sse_ar, 40 unsigned shift) { 41 for (unsigned i = 0; i < size_sse_ar; ++i) { 42 __m128i tmp = _mm_and_si128(A[i], X[shift + i]); 43 A[i] = _mm_xor_si128(A[i], X[shift + i]); 44 C[i] = _mm_or_si128(C[i], _mm_and_si128(B[i], tmp)); 45 B[i] = _mm_xor_si128(B[i], tmp); 46 } 47 } 48 49 inline void shift_left2D(__m128i* X, unsigned height, unsigned size_sse_row) { 50 for (unsigned row = 0; row < height; ++row) { 51 unsigned ind = row * size_sse_row; 52 unsigned x0 = X[ind].m128i_u32[0] & 1; 53 54 X[ind] = 55 _mm_or_si128(_mm_srli_epi16(X[ind], 1), _mm_slli_epi16(_mm_srli_si128(X[ind], 2), 15)); 56 57 unsigned x1 = X[ind + 1].m128i_u32[0] & 1; 58 X[ind + 1] = _mm_or_si128(_mm_srli_epi16(X[ind + 1], 1), 59 _mm_slli_epi16(_mm_srli_si128(X[ind + 1], 2), 15)); 60 X[ind].m128i_u32[3] |= x1 << 31; 61 62 unsigned x2 = X[ind + 2].m128i_u32[0] & 1; 63 X[ind + 2] = _mm_or_si128(_mm_srli_epi16(X[ind + 2], 1), 64 _mm_slli_epi16(_mm_srli_si128(X[ind + 2], 2), 15)); 65 X[ind + 1].m128i_u32[3] |= x2 << 31; 66 67 unsigned* dst = (unsigned*)&X[ind]; 68 dst[301 / 32u] |= x0 << (301 % 32u); 69 } 70 } 71 72 inline void shift_right2D(__m128i* X, unsigned height, unsigned size_sse_row) { 73 for (unsigned row = 0; row < height; ++row) { 74 unsigned ind = row * size_sse_row; 75 76 unsigned x0 = X[ind].m128i_u32[3]; 77 x0 >>= 31; 78 X[ind] = 79 _mm_or_si128(_mm_slli_epi16(X[ind], 1), _mm_srli_epi16(_mm_slli_si128(X[ind], 2), 15)); 80 81 unsigned x1 = X[ind + 1].m128i_u32[3]; 82 x1 >>= 31; 83 X[ind + 1] = _mm_or_si128(_mm_slli_epi16(X[ind + 1], 1), 84 _mm_srli_epi16(_mm_slli_si128(X[ind + 1], 2), 15)); 85 X[ind + 1].m128i_u32[0] |= x0; 86 87 unsigned* dst = (unsigned*)&X[ind]; 88 unsigned x2 = dst[301 / 32u] & (1 << (301 % 32u)); 89 x2 >>= (301 % 32u); 90 X[ind + 2] = _mm_or_si128(_mm_slli_epi16(X[ind + 2], 1), 91 _mm_srli_epi16(_mm_slli_si128(X[ind + 2], 2), 15)); 92 X[ind + 2].m128i_u32[0] |= x1; 93 X[ind].m128i_u32[0] |= x2; 94 } 95 } 96 97 void UpdateState(Matrix* m_matrix, char* dest, int begin, int end) { 98 //300/128 + 1 =3, 3*300=900 99 unsigned size_sse_row = m_matrix->width / 128 + 1; //3 100 unsigned size_sse_ar = size_sse_row * (end - begin); 101 __m128i X[906], A[900], B[900], C[900]; 102 char* mas = m_matrix->data; 103 104 for (unsigned i = 0; i < size_sse_ar; ++i) { 105 A[i].m128i_u32[0] = 0; 106 A[i].m128i_u32[1] = 0; 107 A[i].m128i_u32[2] = 0; 108 A[i].m128i_u32[3] = 0; 109 B[i].m128i_u32[0] = 0; 110 B[i].m128i_u32[1] = 0; 111 B[i].m128i_u32[2] = 0; 112 B[i].m128i_u32[3] = 0; 113 C[i].m128i_u32[0] = 0; 114 C[i].m128i_u32[1] = 0; 115 C[i].m128i_u32[2] = 0; 116 C[i].m128i_u32[3] = 0; 117 } 118 119 for (unsigned i = 0; i < size_sse_ar + 6; ++i) { 120 X[i].m128i_u32[0] = 0; 121 X[i].m128i_u32[1] = 0; 122 X[i].m128i_u32[2] = 0; 123 X[i].m128i_u32[3] = 0; 124 } 125 126 // create X[] with bounds 127 unsigned height = end - begin; 128 unsigned width = m_matrix->width; 129 for (unsigned row = 0; row < height; ++row) { 130 char* src = &mas[(row + begin) * width]; 131 unsigned* dst = (unsigned*)&X[(row + 1) * size_sse_row]; 132 create_record(src, dst, width); 133 } 134 // create high row in X[] 135 char* src; 136 if (begin == 0) { 137 src = &mas[(m_matrix->height - 1) * width]; 138 } 139 else { 140 src = &mas[(begin - 1) * width]; 141 } 142 unsigned* dst = (unsigned*)X; 143 create_record(src, dst, width); 144 145 //create lower row in X[] 146 if (end == m_matrix->height) { 147 src = mas; 148 } 149 else { 150 src = &mas[end * width]; 151 } 152 dst = (unsigned*)&X[(height + 1) * size_sse_row]; 153 create_record(src, dst, width); 154 155 //sum( C, B, A, X+offset_for_upwards ); high-left friend 156 sum_offset(X, A, B, C, size_sse_ar, 0); 157 158 //sum( C, B, A, X+offset_for_no_vertical_shift ); 159 sum_offset(X, A, B, C, size_sse_ar, size_sse_row); 160 161 //sum( C, B, A, X+offset_for_downwards ); 162 sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row); 163 164 //shift_left( X ); (when view 2D) in our logic it is in right 165 height = end - begin + 2; 166 shift_left2D(X, height, size_sse_row); 167 168 //sum( C, B, A, X+offset_for_upwards ); high-left friend 169 sum_offset(X, A, B, C, size_sse_ar, 0); 170 171 //sum( C, B, A, X+offset_for_downwards ); 172 sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row); 173 174 //shift_left( X ); (view in 2D) in our logic it is right shift 175 height = end - begin + 2; 176 shift_left2D(X, height, size_sse_row); 177 178 //sum( C, B, A, X+offset_for_upwards ); high-right friend 179 sum_offset(X, A, B, C, size_sse_ar, 0); 180 181 //sum( C, B, A, X+offset_for_no_vertical_shift ); right friend 182 sum_offset(X, A, B, C, size_sse_ar, size_sse_row); 183 184 //sum( C, B, A, X+offset_for_downwards ); right down friend 185 sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row); 186 187 //shift_right( X ); (when view in 2D) in our case it left shift. 188 height = end - begin + 2; 189 shift_right2D(X, height, size_sse_row); 190 191 //X = (X|A)&B&~C (done bitwise over the arrays) 192 unsigned shift = size_sse_row; 193 for (unsigned i = 0; i < size_sse_ar; ++i) { 194 C[i].m128i_u32[0] = ~C[i].m128i_u32[0]; 195 C[i].m128i_u32[1] = ~C[i].m128i_u32[1]; 196 C[i].m128i_u32[2] = ~C[i].m128i_u32[2]; 197 C[i].m128i_u32[3] = ~C[i].m128i_u32[3]; 198 X[shift + i] = _mm_and_si128(_mm_and_si128(_mm_or_si128(X[shift + i], A[i]), B[i]), C[i]); 199 } 200 201 height = end - begin; 202 width = m_matrix->width; 203 for (unsigned row = 0; row < height; ++row) { 204 char* dst = &dest[(row + begin) * width]; 205 unsigned* src = (unsigned*)&X[(row + 1) * size_sse_row]; 206 for (unsigned col = 0; col < width; ++col) { 207 unsigned c = src[col / 32u] & 1 << (col % 32u); 208 dst[col] = c >> (col % 32u); 209 } 210 } 211 } 212 #else 213 /* end SSE block */ 214 215 // ---------------------------------------------------------------------- 216 // GetAdjacentCellState() - returns the state (value) of the specified 217 // adjacent cell of the current cell "cellNumber" 218 char GetAdjacentCellState(char* source, // pointer to source data block 219 int x, // logical width of field 220 int y, // logical height of field 221 int cellNumber, // number of cell position to examine 222 int cp // which adjacent position 223 ) { 224 /* 225 cp 226 *-- cp=1 ... --- cp=8 (summary: -1-2-3- 227 -x- -x- -4-x-5- 228 --- --* -6-7-8- ) 229 */ 230 char cellState = 0; // return value 231 232 // set up boundary flags to trigger field-wrap logic 233 bool onTopRow = false; 234 bool onBottomRow = false; 235 bool onLeftColumn = false; 236 bool onRightColumn = false; 237 238 // check to see if cell is on top row 239 if (cellNumber < x) { 240 onTopRow = true; 241 } 242 // check to see if cell is on bottom row 243 if ((x * y) - cellNumber <= x) { 244 onBottomRow = true; 245 } 246 // check to see if cell is on left column 247 if (cellNumber % x == 0) { 248 onLeftColumn = true; 249 } 250 // check to see if cell is on right column 251 if ((cellNumber + 1) % x == 0) { 252 onRightColumn = true; 253 } 254 255 switch (cp) { 256 case 1: 257 if (onTopRow && onLeftColumn) { 258 return *(source + ((x * y) - 1)); 259 } 260 if (onTopRow && !onLeftColumn) { 261 return *(source + (((x * y) - x) + (cellNumber - 1))); 262 } 263 if (onLeftColumn && !onTopRow) { 264 return *(source + (cellNumber - 1)); 265 } 266 return *((source + cellNumber) - (x + 1)); 267 268 case 2: 269 if (onTopRow) { 270 return *(source + (((x * y) - x) + cellNumber)); 271 } 272 return *((source + cellNumber) - x); 273 274 case 3: 275 if (onTopRow && onRightColumn) { 276 return *(source + ((x * y) - x)); 277 } 278 if (onTopRow && !onRightColumn) { 279 return *(source + (((x * y) - x) + (cellNumber + 1))); 280 } 281 if (onRightColumn && !onTopRow) { 282 return *(source + ((cellNumber - (x * 2)) + 1)); 283 } 284 return *(source + (cellNumber - (x - 1))); 285 286 case 4: 287 if (onRightColumn) { 288 return *(source + (cellNumber - (x - 1))); 289 } 290 return *(source + (cellNumber + 1)); 291 292 case 5: 293 if (onBottomRow && onRightColumn) { 294 return *source; 295 } 296 if (onBottomRow && !onRightColumn) { 297 return *(source + ((cellNumber - ((x * y) - x)) + 1)); 298 } 299 if (onRightColumn && !onBottomRow) { 300 return *(source + (cellNumber + 1)); 301 } 302 return *(source + (((cellNumber + x)) + 1)); 303 304 case 6: 305 if (onBottomRow) { 306 return *(source + (cellNumber - ((x * y) - x))); 307 } 308 return *(source + (cellNumber + x)); 309 310 case 7: 311 if (onBottomRow && onLeftColumn) { 312 return *(source + (x - 1)); 313 } 314 if (onBottomRow && !onLeftColumn) { 315 return *(source + (cellNumber - ((x * y) - x) - 1)); 316 } 317 if (onLeftColumn && !onBottomRow) { 318 return *(source + (cellNumber + ((x * 2) - 1))); 319 } 320 return *(source + (cellNumber + (x - 1))); 321 322 case 8: 323 if (onLeftColumn) { 324 return *(source + (cellNumber + (x - 1))); 325 } 326 return *(source + (cellNumber - 1)); 327 } 328 return cellState; 329 } 330 331 char CheckCell(Matrix* m_matrix, int cellNumber) { 332 char total = 0; 333 char* source = m_matrix->data; 334 //look around to find cell's with status "alive" 335 for (int i = 1; i < 9; i++) { 336 total += GetAdjacentCellState(source, m_matrix->width, m_matrix->height, cellNumber, i); 337 } 338 // if the number of adjacent live cells is < 2 or > 3, the result is a dead 339 // cell regardless of its current state. (A live cell dies of loneliness if it 340 // has less than 2 neighbors, and of overcrowding if it has more than 3; a new 341 // cell is born in an empty spot only if it has exactly 3 neighbors. 342 if (total < 2 || total > 3) { 343 return 0; 344 } 345 346 // if we get here and the cell position holds a living cell, it stays alive 347 if (*(source + cellNumber)) { 348 return 1; 349 } 350 351 // we have an empty position. If there are only 2 neighbors, the position stays 352 // empty. 353 if (total == 2) { 354 return 0; 355 } 356 357 // we have an empty position and exactly 3 neighbors. A cell is born. 358 return 1; 359 } 360 361 void UpdateState(Matrix* m_matrix, char* dest, int begin, int end) { 362 for (int i = begin; i <= end; i++) { 363 *(dest + i) = CheckCell(m_matrix, i); 364 } 365 } 366 367 #endif 368 /* end non-SSE block */ 369