1 /*
2 Copyright (c) 2005-2021 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 #include "Evolution.hpp"
18
19 #ifdef USE_SSE
20 /* Update states with SSE */
21
22 #include <xmmintrin.h>
23 #include <emmintrin.h>
24
create_record(char * src,unsigned * dst,unsigned width)25 inline void create_record(char* src, unsigned* dst, unsigned width) {
26 dst[0] |= src[width - 1];
27 for (unsigned i = 0; i < 31u; ++i)
28 dst[0] |= src[i] << (i + 1);
29 unsigned col;
30 for (unsigned col = 31u; col < width; ++col)
31 dst[(col + 1) / 32u] |= src[col] << ((col + 1) % 32u);
32 dst[(col + 1) / 32u] |= src[0] << ((col + 1) % 32u);
33 }
34
sum_offset(__m128i * X,__m128i * A,__m128i * B,__m128i * C,unsigned size_sse_ar,unsigned shift)35 inline void sum_offset(__m128i* X,
36 __m128i* A,
37 __m128i* B,
38 __m128i* C,
39 unsigned size_sse_ar,
40 unsigned shift) {
41 for (unsigned i = 0; i < size_sse_ar; ++i) {
42 __m128i tmp = _mm_and_si128(A[i], X[shift + i]);
43 A[i] = _mm_xor_si128(A[i], X[shift + i]);
44 C[i] = _mm_or_si128(C[i], _mm_and_si128(B[i], tmp));
45 B[i] = _mm_xor_si128(B[i], tmp);
46 }
47 }
48
shift_left2D(__m128i * X,unsigned height,unsigned size_sse_row)49 inline void shift_left2D(__m128i* X, unsigned height, unsigned size_sse_row) {
50 for (unsigned row = 0; row < height; ++row) {
51 unsigned ind = row * size_sse_row;
52 unsigned x0 = X[ind].m128i_u32[0] & 1;
53
54 X[ind] =
55 _mm_or_si128(_mm_srli_epi16(X[ind], 1), _mm_slli_epi16(_mm_srli_si128(X[ind], 2), 15));
56
57 unsigned x1 = X[ind + 1].m128i_u32[0] & 1;
58 X[ind + 1] = _mm_or_si128(_mm_srli_epi16(X[ind + 1], 1),
59 _mm_slli_epi16(_mm_srli_si128(X[ind + 1], 2), 15));
60 X[ind].m128i_u32[3] |= x1 << 31;
61
62 unsigned x2 = X[ind + 2].m128i_u32[0] & 1;
63 X[ind + 2] = _mm_or_si128(_mm_srli_epi16(X[ind + 2], 1),
64 _mm_slli_epi16(_mm_srli_si128(X[ind + 2], 2), 15));
65 X[ind + 1].m128i_u32[3] |= x2 << 31;
66
67 unsigned* dst = (unsigned*)&X[ind];
68 dst[301 / 32u] |= x0 << (301 % 32u);
69 }
70 }
71
shift_right2D(__m128i * X,unsigned height,unsigned size_sse_row)72 inline void shift_right2D(__m128i* X, unsigned height, unsigned size_sse_row) {
73 for (unsigned row = 0; row < height; ++row) {
74 unsigned ind = row * size_sse_row;
75
76 unsigned x0 = X[ind].m128i_u32[3];
77 x0 >>= 31;
78 X[ind] =
79 _mm_or_si128(_mm_slli_epi16(X[ind], 1), _mm_srli_epi16(_mm_slli_si128(X[ind], 2), 15));
80
81 unsigned x1 = X[ind + 1].m128i_u32[3];
82 x1 >>= 31;
83 X[ind + 1] = _mm_or_si128(_mm_slli_epi16(X[ind + 1], 1),
84 _mm_srli_epi16(_mm_slli_si128(X[ind + 1], 2), 15));
85 X[ind + 1].m128i_u32[0] |= x0;
86
87 unsigned* dst = (unsigned*)&X[ind];
88 unsigned x2 = dst[301 / 32u] & (1 << (301 % 32u));
89 x2 >>= (301 % 32u);
90 X[ind + 2] = _mm_or_si128(_mm_slli_epi16(X[ind + 2], 1),
91 _mm_srli_epi16(_mm_slli_si128(X[ind + 2], 2), 15));
92 X[ind + 2].m128i_u32[0] |= x1;
93 X[ind].m128i_u32[0] |= x2;
94 }
95 }
96
UpdateState(Matrix * m_matrix,char * dest,int begin,int end)97 void UpdateState(Matrix* m_matrix, char* dest, int begin, int end) {
98 //300/128 + 1 =3, 3*300=900
99 unsigned size_sse_row = m_matrix->width / 128 + 1; //3
100 unsigned size_sse_ar = size_sse_row * (end - begin);
101 __m128i X[906], A[900], B[900], C[900];
102 char* mas = m_matrix->data;
103
104 for (unsigned i = 0; i < size_sse_ar; ++i) {
105 A[i].m128i_u32[0] = 0;
106 A[i].m128i_u32[1] = 0;
107 A[i].m128i_u32[2] = 0;
108 A[i].m128i_u32[3] = 0;
109 B[i].m128i_u32[0] = 0;
110 B[i].m128i_u32[1] = 0;
111 B[i].m128i_u32[2] = 0;
112 B[i].m128i_u32[3] = 0;
113 C[i].m128i_u32[0] = 0;
114 C[i].m128i_u32[1] = 0;
115 C[i].m128i_u32[2] = 0;
116 C[i].m128i_u32[3] = 0;
117 }
118
119 for (unsigned i = 0; i < size_sse_ar + 6; ++i) {
120 X[i].m128i_u32[0] = 0;
121 X[i].m128i_u32[1] = 0;
122 X[i].m128i_u32[2] = 0;
123 X[i].m128i_u32[3] = 0;
124 }
125
126 // create X[] with bounds
127 unsigned height = end - begin;
128 unsigned width = m_matrix->width;
129 for (unsigned row = 0; row < height; ++row) {
130 char* src = &mas[(row + begin) * width];
131 unsigned* dst = (unsigned*)&X[(row + 1) * size_sse_row];
132 create_record(src, dst, width);
133 }
134 // create high row in X[]
135 char* src;
136 if (begin == 0) {
137 src = &mas[(m_matrix->height - 1) * width];
138 }
139 else {
140 src = &mas[(begin - 1) * width];
141 }
142 unsigned* dst = (unsigned*)X;
143 create_record(src, dst, width);
144
145 //create lower row in X[]
146 if (end == m_matrix->height) {
147 src = mas;
148 }
149 else {
150 src = &mas[end * width];
151 }
152 dst = (unsigned*)&X[(height + 1) * size_sse_row];
153 create_record(src, dst, width);
154
155 //sum( C, B, A, X+offset_for_upwards ); high-left friend
156 sum_offset(X, A, B, C, size_sse_ar, 0);
157
158 //sum( C, B, A, X+offset_for_no_vertical_shift );
159 sum_offset(X, A, B, C, size_sse_ar, size_sse_row);
160
161 //sum( C, B, A, X+offset_for_downwards );
162 sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row);
163
164 //shift_left( X ); (when view 2D) in our logic it is in right
165 height = end - begin + 2;
166 shift_left2D(X, height, size_sse_row);
167
168 //sum( C, B, A, X+offset_for_upwards ); high-left friend
169 sum_offset(X, A, B, C, size_sse_ar, 0);
170
171 //sum( C, B, A, X+offset_for_downwards );
172 sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row);
173
174 //shift_left( X ); (view in 2D) in our logic it is right shift
175 height = end - begin + 2;
176 shift_left2D(X, height, size_sse_row);
177
178 //sum( C, B, A, X+offset_for_upwards ); high-right friend
179 sum_offset(X, A, B, C, size_sse_ar, 0);
180
181 //sum( C, B, A, X+offset_for_no_vertical_shift ); right friend
182 sum_offset(X, A, B, C, size_sse_ar, size_sse_row);
183
184 //sum( C, B, A, X+offset_for_downwards ); right down friend
185 sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row);
186
187 //shift_right( X ); (when view in 2D) in our case it left shift.
188 height = end - begin + 2;
189 shift_right2D(X, height, size_sse_row);
190
191 //X = (X|A)&B&~C (done bitwise over the arrays)
192 unsigned shift = size_sse_row;
193 for (unsigned i = 0; i < size_sse_ar; ++i) {
194 C[i].m128i_u32[0] = ~C[i].m128i_u32[0];
195 C[i].m128i_u32[1] = ~C[i].m128i_u32[1];
196 C[i].m128i_u32[2] = ~C[i].m128i_u32[2];
197 C[i].m128i_u32[3] = ~C[i].m128i_u32[3];
198 X[shift + i] = _mm_and_si128(_mm_and_si128(_mm_or_si128(X[shift + i], A[i]), B[i]), C[i]);
199 }
200
201 height = end - begin;
202 width = m_matrix->width;
203 for (unsigned row = 0; row < height; ++row) {
204 char* dst = &dest[(row + begin) * width];
205 unsigned* src = (unsigned*)&X[(row + 1) * size_sse_row];
206 for (unsigned col = 0; col < width; ++col) {
207 unsigned c = src[col / 32u] & 1 << (col % 32u);
208 dst[col] = c >> (col % 32u);
209 }
210 }
211 }
212 #else
213 /* end SSE block */
214
215 // ----------------------------------------------------------------------
216 // GetAdjacentCellState() - returns the state (value) of the specified
217 // adjacent cell of the current cell "cellNumber"
GetAdjacentCellState(char * source,int x,int y,int cellNumber,int cp)218 char GetAdjacentCellState(char* source, // pointer to source data block
219 int x, // logical width of field
220 int y, // logical height of field
221 int cellNumber, // number of cell position to examine
222 int cp // which adjacent position
223 ) {
224 /*
225 cp
226 *-- cp=1 ... --- cp=8 (summary: -1-2-3-
227 -x- -x- -4-x-5-
228 --- --* -6-7-8- )
229 */
230 char cellState = 0; // return value
231
232 // set up boundary flags to trigger field-wrap logic
233 bool onTopRow = false;
234 bool onBottomRow = false;
235 bool onLeftColumn = false;
236 bool onRightColumn = false;
237
238 // check to see if cell is on top row
239 if (cellNumber < x) {
240 onTopRow = true;
241 }
242 // check to see if cell is on bottom row
243 if ((x * y) - cellNumber <= x) {
244 onBottomRow = true;
245 }
246 // check to see if cell is on left column
247 if (cellNumber % x == 0) {
248 onLeftColumn = true;
249 }
250 // check to see if cell is on right column
251 if ((cellNumber + 1) % x == 0) {
252 onRightColumn = true;
253 }
254
255 switch (cp) {
256 case 1:
257 if (onTopRow && onLeftColumn) {
258 return *(source + ((x * y) - 1));
259 }
260 if (onTopRow && !onLeftColumn) {
261 return *(source + (((x * y) - x) + (cellNumber - 1)));
262 }
263 if (onLeftColumn && !onTopRow) {
264 return *(source + (cellNumber - 1));
265 }
266 return *((source + cellNumber) - (x + 1));
267
268 case 2:
269 if (onTopRow) {
270 return *(source + (((x * y) - x) + cellNumber));
271 }
272 return *((source + cellNumber) - x);
273
274 case 3:
275 if (onTopRow && onRightColumn) {
276 return *(source + ((x * y) - x));
277 }
278 if (onTopRow && !onRightColumn) {
279 return *(source + (((x * y) - x) + (cellNumber + 1)));
280 }
281 if (onRightColumn && !onTopRow) {
282 return *(source + ((cellNumber - (x * 2)) + 1));
283 }
284 return *(source + (cellNumber - (x - 1)));
285
286 case 4:
287 if (onRightColumn) {
288 return *(source + (cellNumber - (x - 1)));
289 }
290 return *(source + (cellNumber + 1));
291
292 case 5:
293 if (onBottomRow && onRightColumn) {
294 return *source;
295 }
296 if (onBottomRow && !onRightColumn) {
297 return *(source + ((cellNumber - ((x * y) - x)) + 1));
298 }
299 if (onRightColumn && !onBottomRow) {
300 return *(source + (cellNumber + 1));
301 }
302 return *(source + (((cellNumber + x)) + 1));
303
304 case 6:
305 if (onBottomRow) {
306 return *(source + (cellNumber - ((x * y) - x)));
307 }
308 return *(source + (cellNumber + x));
309
310 case 7:
311 if (onBottomRow && onLeftColumn) {
312 return *(source + (x - 1));
313 }
314 if (onBottomRow && !onLeftColumn) {
315 return *(source + (cellNumber - ((x * y) - x) - 1));
316 }
317 if (onLeftColumn && !onBottomRow) {
318 return *(source + (cellNumber + ((x * 2) - 1)));
319 }
320 return *(source + (cellNumber + (x - 1)));
321
322 case 8:
323 if (onLeftColumn) {
324 return *(source + (cellNumber + (x - 1)));
325 }
326 return *(source + (cellNumber - 1));
327 }
328 return cellState;
329 }
330
CheckCell(Matrix * m_matrix,int cellNumber)331 char CheckCell(Matrix* m_matrix, int cellNumber) {
332 char total = 0;
333 char* source = m_matrix->data;
334 //look around to find cell's with status "alive"
335 for (int i = 1; i < 9; i++) {
336 total += GetAdjacentCellState(source, m_matrix->width, m_matrix->height, cellNumber, i);
337 }
338 // if the number of adjacent live cells is < 2 or > 3, the result is a dead
339 // cell regardless of its current state. (A live cell dies of loneliness if it
340 // has less than 2 neighbors, and of overcrowding if it has more than 3; a new
341 // cell is born in an empty spot only if it has exactly 3 neighbors.
342 if (total < 2 || total > 3) {
343 return 0;
344 }
345
346 // if we get here and the cell position holds a living cell, it stays alive
347 if (*(source + cellNumber)) {
348 return 1;
349 }
350
351 // we have an empty position. If there are only 2 neighbors, the position stays
352 // empty.
353 if (total == 2) {
354 return 0;
355 }
356
357 // we have an empty position and exactly 3 neighbors. A cell is born.
358 return 1;
359 }
360
UpdateState(Matrix * m_matrix,char * dest,int begin,int end)361 void UpdateState(Matrix* m_matrix, char* dest, int begin, int end) {
362 for (int i = begin; i <= end; i++) {
363 *(dest + i) = CheckCell(m_matrix, i);
364 }
365 }
366
367 #endif
368 /* end non-SSE block */
369