1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "Evolution.hpp"
18 
19 #ifdef USE_SSE
20 /* Update states with SSE */
21 
22 #include <xmmintrin.h>
23 #include <emmintrin.h>
24 
25 inline void create_record(char* src, unsigned* dst, unsigned width) {
26     dst[0] |= src[width - 1];
27     for (unsigned i = 0; i < 31u; ++i)
28         dst[0] |= src[i] << (i + 1);
29     unsigned col;
30     for (unsigned col = 31u; col < width; ++col)
31         dst[(col + 1) / 32u] |= src[col] << ((col + 1) % 32u);
32     dst[(col + 1) / 32u] |= src[0] << ((col + 1) % 32u);
33 }
34 
35 inline void sum_offset(__m128i* X,
36                        __m128i* A,
37                        __m128i* B,
38                        __m128i* C,
39                        unsigned size_sse_ar,
40                        unsigned shift) {
41     for (unsigned i = 0; i < size_sse_ar; ++i) {
42         __m128i tmp = _mm_and_si128(A[i], X[shift + i]);
43         A[i] = _mm_xor_si128(A[i], X[shift + i]);
44         C[i] = _mm_or_si128(C[i], _mm_and_si128(B[i], tmp));
45         B[i] = _mm_xor_si128(B[i], tmp);
46     }
47 }
48 
49 inline void shift_left2D(__m128i* X, unsigned height, unsigned size_sse_row) {
50     for (unsigned row = 0; row < height; ++row) {
51         unsigned ind = row * size_sse_row;
52         unsigned x0 = X[ind].m128i_u32[0] & 1;
53 
54         X[ind] =
55             _mm_or_si128(_mm_srli_epi16(X[ind], 1), _mm_slli_epi16(_mm_srli_si128(X[ind], 2), 15));
56 
57         unsigned x1 = X[ind + 1].m128i_u32[0] & 1;
58         X[ind + 1] = _mm_or_si128(_mm_srli_epi16(X[ind + 1], 1),
59                                   _mm_slli_epi16(_mm_srli_si128(X[ind + 1], 2), 15));
60         X[ind].m128i_u32[3] |= x1 << 31;
61 
62         unsigned x2 = X[ind + 2].m128i_u32[0] & 1;
63         X[ind + 2] = _mm_or_si128(_mm_srli_epi16(X[ind + 2], 1),
64                                   _mm_slli_epi16(_mm_srli_si128(X[ind + 2], 2), 15));
65         X[ind + 1].m128i_u32[3] |= x2 << 31;
66 
67         unsigned* dst = (unsigned*)&X[ind];
68         dst[301 / 32u] |= x0 << (301 % 32u);
69     }
70 }
71 
72 inline void shift_right2D(__m128i* X, unsigned height, unsigned size_sse_row) {
73     for (unsigned row = 0; row < height; ++row) {
74         unsigned ind = row * size_sse_row;
75 
76         unsigned x0 = X[ind].m128i_u32[3];
77         x0 >>= 31;
78         X[ind] =
79             _mm_or_si128(_mm_slli_epi16(X[ind], 1), _mm_srli_epi16(_mm_slli_si128(X[ind], 2), 15));
80 
81         unsigned x1 = X[ind + 1].m128i_u32[3];
82         x1 >>= 31;
83         X[ind + 1] = _mm_or_si128(_mm_slli_epi16(X[ind + 1], 1),
84                                   _mm_srli_epi16(_mm_slli_si128(X[ind + 1], 2), 15));
85         X[ind + 1].m128i_u32[0] |= x0;
86 
87         unsigned* dst = (unsigned*)&X[ind];
88         unsigned x2 = dst[301 / 32u] & (1 << (301 % 32u));
89         x2 >>= (301 % 32u);
90         X[ind + 2] = _mm_or_si128(_mm_slli_epi16(X[ind + 2], 1),
91                                   _mm_srli_epi16(_mm_slli_si128(X[ind + 2], 2), 15));
92         X[ind + 2].m128i_u32[0] |= x1;
93         X[ind].m128i_u32[0] |= x2;
94     }
95 }
96 
97 void UpdateState(Matrix* m_matrix, char* dest, int begin, int end) {
98     //300/128 + 1 =3, 3*300=900
99     unsigned size_sse_row = m_matrix->width / 128 + 1; //3
100     unsigned size_sse_ar = size_sse_row * (end - begin);
101     __m128i X[906], A[900], B[900], C[900];
102     char* mas = m_matrix->data;
103 
104     for (unsigned i = 0; i < size_sse_ar; ++i) {
105         A[i].m128i_u32[0] = 0;
106         A[i].m128i_u32[1] = 0;
107         A[i].m128i_u32[2] = 0;
108         A[i].m128i_u32[3] = 0;
109         B[i].m128i_u32[0] = 0;
110         B[i].m128i_u32[1] = 0;
111         B[i].m128i_u32[2] = 0;
112         B[i].m128i_u32[3] = 0;
113         C[i].m128i_u32[0] = 0;
114         C[i].m128i_u32[1] = 0;
115         C[i].m128i_u32[2] = 0;
116         C[i].m128i_u32[3] = 0;
117     }
118 
119     for (unsigned i = 0; i < size_sse_ar + 6; ++i) {
120         X[i].m128i_u32[0] = 0;
121         X[i].m128i_u32[1] = 0;
122         X[i].m128i_u32[2] = 0;
123         X[i].m128i_u32[3] = 0;
124     }
125 
126     // create X[] with bounds
127     unsigned height = end - begin;
128     unsigned width = m_matrix->width;
129     for (unsigned row = 0; row < height; ++row) {
130         char* src = &mas[(row + begin) * width];
131         unsigned* dst = (unsigned*)&X[(row + 1) * size_sse_row];
132         create_record(src, dst, width);
133     }
134     // create high row in X[]
135     char* src;
136     if (begin == 0) {
137         src = &mas[(m_matrix->height - 1) * width];
138     }
139     else {
140         src = &mas[(begin - 1) * width];
141     }
142     unsigned* dst = (unsigned*)X;
143     create_record(src, dst, width);
144 
145     //create lower row in X[]
146     if (end == m_matrix->height) {
147         src = mas;
148     }
149     else {
150         src = &mas[end * width];
151     }
152     dst = (unsigned*)&X[(height + 1) * size_sse_row];
153     create_record(src, dst, width);
154 
155     //sum( C, B, A, X+offset_for_upwards ); high-left friend
156     sum_offset(X, A, B, C, size_sse_ar, 0);
157 
158     //sum( C, B, A, X+offset_for_no_vertical_shift );
159     sum_offset(X, A, B, C, size_sse_ar, size_sse_row);
160 
161     //sum( C, B, A, X+offset_for_downwards );
162     sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row);
163 
164     //shift_left( X ); (when view 2D) in our logic it is in right
165     height = end - begin + 2;
166     shift_left2D(X, height, size_sse_row);
167 
168     //sum( C, B, A, X+offset_for_upwards ); high-left friend
169     sum_offset(X, A, B, C, size_sse_ar, 0);
170 
171     //sum( C, B, A, X+offset_for_downwards );
172     sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row);
173 
174     //shift_left( X ); (view in 2D) in our logic it is right shift
175     height = end - begin + 2;
176     shift_left2D(X, height, size_sse_row);
177 
178     //sum( C, B, A, X+offset_for_upwards ); high-right friend
179     sum_offset(X, A, B, C, size_sse_ar, 0);
180 
181     //sum( C, B, A, X+offset_for_no_vertical_shift ); right friend
182     sum_offset(X, A, B, C, size_sse_ar, size_sse_row);
183 
184     //sum( C, B, A, X+offset_for_downwards ); right down friend
185     sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row);
186 
187     //shift_right( X ); (when view in 2D) in our case it left shift.
188     height = end - begin + 2;
189     shift_right2D(X, height, size_sse_row);
190 
191     //X = (X|A)&B&~C (done bitwise over the arrays)
192     unsigned shift = size_sse_row;
193     for (unsigned i = 0; i < size_sse_ar; ++i) {
194         C[i].m128i_u32[0] = ~C[i].m128i_u32[0];
195         C[i].m128i_u32[1] = ~C[i].m128i_u32[1];
196         C[i].m128i_u32[2] = ~C[i].m128i_u32[2];
197         C[i].m128i_u32[3] = ~C[i].m128i_u32[3];
198         X[shift + i] = _mm_and_si128(_mm_and_si128(_mm_or_si128(X[shift + i], A[i]), B[i]), C[i]);
199     }
200 
201     height = end - begin;
202     width = m_matrix->width;
203     for (unsigned row = 0; row < height; ++row) {
204         char* dst = &dest[(row + begin) * width];
205         unsigned* src = (unsigned*)&X[(row + 1) * size_sse_row];
206         for (unsigned col = 0; col < width; ++col) {
207             unsigned c = src[col / 32u] & 1 << (col % 32u);
208             dst[col] = c >> (col % 32u);
209         }
210     }
211 }
212 #else
213 /* end SSE block */
214 
215 // ----------------------------------------------------------------------
216 // GetAdjacentCellState() - returns the state (value) of the specified
217 // adjacent cell of the current cell "cellNumber"
218 char GetAdjacentCellState(char* source, // pointer to source data block
219                           int x, // logical width of field
220                           int y, // logical height of field
221                           int cellNumber, // number of cell position to examine
222                           int cp // which adjacent position
223 ) {
224     /*
225 cp
226 *-- cp=1 ... --- cp=8 (summary: -1-2-3-
227 -x-          -x-                -4-x-5-
228 ---          --*                -6-7-8- )
229 */
230     char cellState = 0; // return value
231 
232     // set up boundary flags to trigger field-wrap logic
233     bool onTopRow = false;
234     bool onBottomRow = false;
235     bool onLeftColumn = false;
236     bool onRightColumn = false;
237 
238     // check to see if cell is on top row
239     if (cellNumber < x) {
240         onTopRow = true;
241     }
242     // check to see if cell is on bottom row
243     if ((x * y) - cellNumber <= x) {
244         onBottomRow = true;
245     }
246     // check to see if cell is on left column
247     if (cellNumber % x == 0) {
248         onLeftColumn = true;
249     }
250     // check to see if cell is on right column
251     if ((cellNumber + 1) % x == 0) {
252         onRightColumn = true;
253     }
254 
255     switch (cp) {
256         case 1:
257             if (onTopRow && onLeftColumn) {
258                 return *(source + ((x * y) - 1));
259             }
260             if (onTopRow && !onLeftColumn) {
261                 return *(source + (((x * y) - x) + (cellNumber - 1)));
262             }
263             if (onLeftColumn && !onTopRow) {
264                 return *(source + (cellNumber - 1));
265             }
266             return *((source + cellNumber) - (x + 1));
267 
268         case 2:
269             if (onTopRow) {
270                 return *(source + (((x * y) - x) + cellNumber));
271             }
272             return *((source + cellNumber) - x);
273 
274         case 3:
275             if (onTopRow && onRightColumn) {
276                 return *(source + ((x * y) - x));
277             }
278             if (onTopRow && !onRightColumn) {
279                 return *(source + (((x * y) - x) + (cellNumber + 1)));
280             }
281             if (onRightColumn && !onTopRow) {
282                 return *(source + ((cellNumber - (x * 2)) + 1));
283             }
284             return *(source + (cellNumber - (x - 1)));
285 
286         case 4:
287             if (onRightColumn) {
288                 return *(source + (cellNumber - (x - 1)));
289             }
290             return *(source + (cellNumber + 1));
291 
292         case 5:
293             if (onBottomRow && onRightColumn) {
294                 return *source;
295             }
296             if (onBottomRow && !onRightColumn) {
297                 return *(source + ((cellNumber - ((x * y) - x)) + 1));
298             }
299             if (onRightColumn && !onBottomRow) {
300                 return *(source + (cellNumber + 1));
301             }
302             return *(source + (((cellNumber + x)) + 1));
303 
304         case 6:
305             if (onBottomRow) {
306                 return *(source + (cellNumber - ((x * y) - x)));
307             }
308             return *(source + (cellNumber + x));
309 
310         case 7:
311             if (onBottomRow && onLeftColumn) {
312                 return *(source + (x - 1));
313             }
314             if (onBottomRow && !onLeftColumn) {
315                 return *(source + (cellNumber - ((x * y) - x) - 1));
316             }
317             if (onLeftColumn && !onBottomRow) {
318                 return *(source + (cellNumber + ((x * 2) - 1)));
319             }
320             return *(source + (cellNumber + (x - 1)));
321 
322         case 8:
323             if (onLeftColumn) {
324                 return *(source + (cellNumber + (x - 1)));
325             }
326             return *(source + (cellNumber - 1));
327     }
328     return cellState;
329 }
330 
331 char CheckCell(Matrix* m_matrix, int cellNumber) {
332     char total = 0;
333     char* source = m_matrix->data;
334     //look around to find cell's with status "alive"
335     for (int i = 1; i < 9; i++) {
336         total += GetAdjacentCellState(source, m_matrix->width, m_matrix->height, cellNumber, i);
337     }
338     // if the number of adjacent live cells is < 2 or > 3, the result is a dead
339     // cell regardless of its current state. (A live cell dies of loneliness if it
340     // has less than 2 neighbors, and of overcrowding if it has more than 3; a new
341     // cell is born in an empty spot only if it has exactly 3 neighbors.
342     if (total < 2 || total > 3) {
343         return 0;
344     }
345 
346     // if we get here and the cell position holds a living cell, it stays alive
347     if (*(source + cellNumber)) {
348         return 1;
349     }
350 
351     // we have an empty position. If there are only 2 neighbors, the position stays
352     // empty.
353     if (total == 2) {
354         return 0;
355     }
356 
357     // we have an empty position and exactly 3 neighbors. A cell is born.
358     return 1;
359 }
360 
361 void UpdateState(Matrix* m_matrix, char* dest, int begin, int end) {
362     for (int i = begin; i <= end; i++) {
363         *(dest + i) = CheckCell(m_matrix, i);
364     }
365 }
366 
367 #endif
368 /* end non-SSE block */
369