1//  z_Linux_asm.S:  - microtasking routines specifically
2//                    written for Intel platforms running Linux* OS
3
4//
5////===----------------------------------------------------------------------===//
6////
7////                     The LLVM Compiler Infrastructure
8////
9//// This file is dual licensed under the MIT and the University of Illinois Open
10//// Source Licenses. See LICENSE.txt for details.
11////
12////===----------------------------------------------------------------------===//
13//
14
15// -----------------------------------------------------------------------
16// macros
17// -----------------------------------------------------------------------
18
#include "kmp_config.h"

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

# if KMP_MIC
// the 'delay r16/r32/r64' should be used instead of the 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
//.macro pause_op
//	mov    $100, %rax
//	delay  %rax
//.endm
# else
#  define pause_op   .byte 0xf3,0x90  // raw encoding of the 'pause' instruction
# endif // KMP_MIC

# if KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
#  define KMP_LABEL(x) L_##x             // form the name of label
// The KMP_CFI_* helpers are empty stubs on OS X*: no .cfi_* unwind
// directives are emitted on this path.
.macro KMP_CFI_DEF_OFFSET
.endmacro
.macro KMP_CFI_OFFSET
.endmacro
.macro KMP_CFI_REGISTER
.endmacro
.macro KMP_CFI_DEF
.endmacro
// ALIGN n: Mach-O .align treats its operand as a power-of-two exponent,
// matching the ".align 1<<n" form used for Linux* OS below.
.macro ALIGN
	.align $0
.endmacro
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro
// PROC name: align the entry point and export the underscore-prefixed symbol.
.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
// MIC assembler doesn't accept .L syntax, the L works fine there (as well as
// on OS X*)
# if KMP_MIC
#  define KMP_LABEL(x) L_##x          // local label
# else
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces
# endif // KMP_MIC
// ALIGN size: align to 2^size bytes.
.macro ALIGN size
	.align 1<<(\size)
.endm
// DEBUG_INFO proc: closes the CFI region opened by PROC and emits ELF
// symbol metadata; must follow every PROC-defined routine.
.macro DEBUG_INFO proc
	.cfi_endproc
// Not sure why we need .type and .size for the functions
	.align 16
	.type  \proc,@function
        .size  \proc,.-\proc
.endm
// PROC proc: align the entry point, export the symbol, and open a CFI
// region (closed by the matching DEBUG_INFO).
.macro PROC proc
	ALIGN  4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
	.cfi_startproc
.endm
.macro KMP_CFI_DEF_OFFSET sz
	.cfi_def_cfa_offset	\sz
.endm
.macro KMP_CFI_OFFSET reg, sz
	.cfi_offset	\reg,\sz
.endm
.macro KMP_CFI_REGISTER reg
	.cfi_def_cfa_register	\reg
.endm
.macro KMP_CFI_DEF reg, sz
	.cfi_def_cfa	\reg,\sz
.endm
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
111
#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64

# if KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
#  define KMP_LABEL(x) L_##x             // form the name of label

.macro ALIGN
	.align $0
.endmacro

.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro

// PROC name: align the entry point and export the underscore-prefixed symbol.
.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // KMP_OS_DARWIN
#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
// Format labels so that they don't override function names in gdb's backtraces
#  define KMP_LABEL(x) .L_##x         // local label hidden from backtraces

// ALIGN size: align to 2^size bytes.
.macro ALIGN size
	.align 1<<(\size)
.endm

// DEBUG_INFO proc: closes the CFI region opened by PROC and emits ELF
// symbol metadata. ALIGN 2 = 4-byte alignment (AArch64 instruction size).
.macro DEBUG_INFO proc
	.cfi_endproc
// Not sure why we need .type and .size for the functions
	ALIGN 2
	.type  \proc,@function
	.size  \proc,.-\proc
.endm

// PROC proc: align the entry point, export the symbol, open a CFI region.
.macro PROC proc
	ALIGN 2
	.globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
	.cfi_startproc
.endm
# endif // KMP_OS_DARWIN

#endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
159
// -----------------------------------------------------------------------
// data
// -----------------------------------------------------------------------

#ifdef KMP_GOMP_COMPAT

// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//
// A 32-byte common block is reserved for GOMP's unnamed critical section,
// and its address is exported through the pointer-sized variable
// __kmp_unnamed_critical_addr so C code can reach it without spelling a
// symbol name that contains a '.'.

# if KMP_ARCH_X86
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
	ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
#  if KMP_OS_DARWIN
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
	ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#  endif /* KMP_OS_DARWIN */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */
214
215
216#if KMP_ARCH_X86 && !KMP_ARCH_PPC64
217
// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture
// running Linux* OS
// -----------------------------------------------------------------------

	.ident "Intel Corporation"
	.data
	ALIGN 4
// void
// __kmp_x86_pause( void );
//
// Executes a single 'pause' (spin-loop hint), emitted as raw bytes by the
// pause_op macro. No arguments, no return value.

        .text
	PROC  __kmp_x86_pause

        pause_op		// .byte 0xf3,0x90 == pause
        ret

	DEBUG_INFO __kmp_x86_pause
236
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// Executes CPUID with EAX=mode, ECX=mode2 and stores the resulting
// EAX, EBX, ECX, EDX (in that order) into the 16-byte cpuid_buffer.
// parameters (cdecl): mode=8(%ebp), mode2=12(%ebp), cpuid_buffer=16(%ebp)

	PROC  __kmp_x86_cpuid

	pushl %ebp
	movl  %esp,%ebp
        pushl %edi
        pushl %ebx
        pushl %ecx
        pushl %edx

	movl  8(%ebp), %eax	// "mode" -> leaf selector
	movl  12(%ebp), %ecx	// "mode2" -> subleaf selector
	cpuid		// Query the CPUID for the current processor

	movl  16(%ebp), %edi	// "cpuid_buffer"
	movl  %eax, 0(%edi)
	movl  %ebx, 4(%edi)
	movl  %ecx, 8(%edi)
	movl  %edx, 12(%edi)

        popl  %edx
        popl  %ecx
        popl  %ebx
        popl  %edi
        movl  %ebp, %esp
        popl  %ebp
	ret

	DEBUG_INFO __kmp_x86_cpuid
268
269
270# if !KMP_ASM_INTRINS
271
//------------------------------------------------------------------------
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically { old = *p; *p += d; return old; } via lock xadd.
// parameters (cdecl): p=4(%esp), d=8(%esp); return: %eax

        PROC      __kmp_test_then_add32

        movl      4(%esp), %ecx		// "p"
        movl      8(%esp), %eax		// "d"
        lock
        xaddl     %eax,(%ecx)		// %eax = old *p; *p += d
        ret

	DEBUG_INFO __kmp_test_then_add32
285
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%al
        PROC  __kmp_xchg_fixed8

        movl      4(%esp), %ecx    // "p"
        movb      8(%esp), %al	// "d"

        lock
        xchgb     %al,(%ecx)	// %al = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed8
307
308
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
// return:     %ax
        PROC  __kmp_xchg_fixed16

        movl      4(%esp), %ecx    // "p"
        movw      8(%esp), %ax	// "d"

        lock
        xchgw     %ax,(%ecx)	// %ax = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed16
329
330
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax
        PROC  __kmp_xchg_fixed32

        movl      4(%esp), %ecx    // "p"
        movl      8(%esp), %eax	// "d"

        lock
        xchgl     %eax,(%ecx)	// %eax = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed32
352
353
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
        PROC  __kmp_compare_and_store8

        movl      4(%esp), %ecx		// "p"
        movb      8(%esp), %al		// "cv" (expected value)
        movb      12(%esp), %dl		// "sv" (new value)
        lock
        cmpxchgb  %dl,(%ecx)
        sete      %al           // if %al == (%ecx) set %al = 1 else set %al = 0
        and       $1, %eax      // zero the upper bits of %eax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store8
368
// kmp_int16
// __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
        PROC  __kmp_compare_and_store16

        movl      4(%esp), %ecx		// "p"
        movw      8(%esp), %ax		// "cv" (expected value)
        movw      12(%esp), %dx		// "sv" (new value)
        lock
        cmpxchgw  %dx,(%ecx)
        sete      %al           // if %ax == (%ecx) set %al = 1 else set %al = 0
        and       $1, %eax      // zero the upper bits of %eax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store16
383
// kmp_int32
// __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
        PROC  __kmp_compare_and_store32

        movl      4(%esp), %ecx		// "p"
        movl      8(%esp), %eax		// "cv" (expected value)
        movl      12(%esp), %edx	// "sv" (new value)
        lock
        cmpxchgl  %edx,(%ecx)
        sete      %al          // if %eax == (%ecx) set %al = 1 else set %al = 0
        and       $1, %eax     // zero the upper bits of %eax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store32
398
// kmp_int32
// __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
//
// 64-bit CAS on IA-32 via cmpxchg8b: compares %edx:%eax against *p and,
// if equal, stores %ecx:%ebx. Returns 1 on success, 0 on failure.
        PROC  __kmp_compare_and_store64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx			// %ebx and %edi are callee-saved
        pushl     %edi
        movl      8(%ebp), %edi		// "p"
        movl      12(%ebp), %eax        // "cv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      24(%ebp), %ecx        // "sv" high order word
        lock
        cmpxchg8b (%edi)
        sete      %al      // if %edx:eax == (%edi) set %al = 1 else set %al = 0
        and       $1, %eax // zero the upper bits of %eax (sete wrote only %al)
        popl      %edi
        popl      %ebx
        movl      %ebp, %esp
        popl      %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64
423
// kmp_int8
// __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
//
// CAS that returns the value *p held before the operation (in %al)
// rather than a 0/1 success flag.
        PROC  __kmp_compare_and_store_ret8

        movl      4(%esp), %ecx		// "p"
        movb      8(%esp), %al		// "cv"
        movb      12(%esp), %dl		// "sv"
        lock
        cmpxchgb  %dl,(%ecx)		// %al = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
436
// kmp_int16
// __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
//                               kmp_int16 sv);
//
// CAS that returns the value *p held before the operation (in %ax).
        PROC  __kmp_compare_and_store_ret16

        movl      4(%esp), %ecx		// "p"
        movw      8(%esp), %ax		// "cv"
        movw      12(%esp), %dx		// "sv"
        lock
        cmpxchgw  %dx,(%ecx)		// %ax = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
450
// kmp_int32
// __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
//                               kmp_int32 sv);
//
// CAS that returns the value *p held before the operation (in %eax).
        PROC  __kmp_compare_and_store_ret32

        movl      4(%esp), %ecx		// "p"
        movl      8(%esp), %eax		// "cv"
        movl      12(%esp), %edx	// "sv"
        lock
        cmpxchgl  %edx,(%ecx)		// %eax = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
464
// kmp_int64
// __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
//                               kmp_int64 sv);
//
// 64-bit CAS via cmpxchg8b that returns the previous value of *p in
// %edx:%eax (the IA-32 64-bit return-value registers).
        PROC  __kmp_compare_and_store_ret64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx			// %ebx and %edi are callee-saved
        pushl     %edi
        movl      8(%ebp), %edi		// "p"
        movl      12(%ebp), %eax        // "cv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      24(%ebp), %ecx        // "sv" high order word
        lock
        cmpxchg8b (%edi)		// %edx:%eax = value of *p before the operation
        popl      %edi
        popl      %ebx
        movl      %ebp, %esp
        popl      %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
488
489
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores "data" into *addr and returns the previous value.
//
// parameters:
// 	addr:	4(%esp)  = 8(%ebp)  after the prologue
// 	data:	8(%esp)  = 12(%ebp) after the prologue
//
// return:	%st(0)   (IA-32 cdecl returns floats on the x87 stack)
        PROC  __kmp_xchg_real32

        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp                // one 4-byte local at -4(%ebp)
        pushl   %esi                    // callee-saved

        movl    8(%ebp), %esi           // "addr"; 4(%ebp) would be the return
                                        // address, not the first argument
        flds    (%esi)
                        // load old value of *addr
        fsts    -4(%ebp)
                        // spill old value to the local

        movl    12(%ebp), %eax          // "data" as a raw 32-bit pattern

        lock
        xchgl   %eax, (%esi)            // atomically publish the new value

        flds    -4(%ebp)
                        // return old value in %st(0)

        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32
528
529# endif /* !KMP_ASM_INTRINS */
530
531
//------------------------------------------------------------------------
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// Loads the x87 FPU control word from *p (fldcw).
//
// parameters:
// 	p:	4(%esp)
        PROC  __kmp_load_x87_fpu_control_word

        movl  4(%esp), %eax	// "p"
        fldcw (%eax)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word
547
548
//------------------------------------------------------------------------
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// Stores the current x87 FPU control word to *p (fstcw).
//
// parameters:
// 	p:	4(%esp)
        PROC  __kmp_store_x87_fpu_control_word

        movl  4(%esp), %eax	// "p"
        fstcw (%eax)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word
564
565
//------------------------------------------------------------------------
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
// Clears the pending x87 exception flags (fnclex).
        PROC  __kmp_clear_x87_fpu_status_word

        fnclex
        ret

        DEBUG_INFO __kmp_clear_x87_fpu_status_word
577
578
//------------------------------------------------------------------------
// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//    (*pkfn)( & gtid, & tid, argv[0], ... );
//    return 1;
// }
//
// Arguments (cdecl): pkfn=8(%ebp), gtid=12(%ebp), tid=16(%ebp),
// argc=20(%ebp), p_argv=24(%ebp) (and, under OMPT, exit_frame=28(%ebp)).
// The stack is re-aligned so that %esp is on a 128-byte boundary at the
// call to pkfn; p_argv[] entries are pushed last-to-first, followed by
// &tid and &gtid.

// -- Begin __kmp_invoke_microtask
// mark_begin;
	PROC  __kmp_invoke_microtask

	pushl %ebp
	KMP_CFI_DEF_OFFSET 8
	KMP_CFI_OFFSET ebp,-8
	movl %esp,%ebp		// establish the base pointer for this routine.
	KMP_CFI_REGISTER ebp
	subl $8,%esp		// allocate space for two local variables.
				// These variables are:
				//	argv: -4(%ebp)
				//	temp: -8(%ebp)
				//
	pushl %ebx		// save %ebx to use during this routine
				// (lands at -12(%ebp))
#if OMPT_SUPPORT
	movl 28(%ebp),%ebx	// get exit_frame address
	movl %ebp,(%ebx)	// save exit_frame
#endif

	movl 20(%ebp),%ebx	// Stack alignment - # args
	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
	movl %esp,%eax		//
	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
	movl %eax,%ebx		// Save to %ebx
	andl $0xFFFFFF80,%eax	// mask off low 7 bits (round down to 128 bytes)
	subl %eax,%ebx		// Amount to subtract from %esp
	subl %ebx,%esp		// Prepare the stack ptr --
				//   now it will be aligned on 128-byte boundary at the call

	movl 24(%ebp),%eax	// copy from p_argv[]
	movl %eax,-4(%ebp)	// into the local variable *argv.

	movl 20(%ebp),%ebx	// argc is 20(%ebp)
	shll $2,%ebx		// %ebx = argc*4 = byte offset past the last arg

KMP_LABEL(invoke_2):
	cmpl $0,%ebx		// while (%ebx > 0) push the next argument
	jg  KMP_LABEL(invoke_4)
	jmp KMP_LABEL(invoke_3)
	ALIGN 2
KMP_LABEL(invoke_4):
	movl -4(%ebp),%eax
	subl $4,%ebx			// decrement argc.
	addl %ebx,%eax			// index into argv.
	movl (%eax),%edx
	pushl %edx			// push p_argv[i], last to first

	jmp KMP_LABEL(invoke_2)
	ALIGN 2
KMP_LABEL(invoke_3):
	leal 16(%ebp),%eax		// push & tid
	pushl %eax

	leal 12(%ebp),%eax		// push & gtid
	pushl %eax

	movl 8(%ebp),%ebx
	call *%ebx			// call (*pkfn)();

	movl $1,%eax			// return 1;

	movl -12(%ebp),%ebx		// restore %ebx (saved just below the locals)
	leave
	KMP_CFI_DEF esp,4
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask
660
661
// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Returns the 64-bit time-stamp counter; rdtsc leaves it in %edx:%eax,
// which is exactly the IA-32 64-bit return-value convention.
	PROC  __kmp_hardware_timestamp
	rdtsc
	ret

	DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp
670
#endif /* KMP_ARCH_X86 && !KMP_ARCH_PPC64 */
672
673
674#if KMP_ARCH_X86_64
675
676// -----------------------------------------------------------------------
677// microtasking routines specifically written for IA-32 architecture and
678// Intel(R) 64 running Linux* OS
679// -----------------------------------------------------------------------
680
// -- Machine type P
// mark_description "Intel Corporation";
	.ident "Intel Corporation"
// --	.file "z_Linux_asm.S"
	.data
	ALIGN 4

// To prevent getting our code into .data section .text added to every routine
// definition for x86_64.
//------------------------------------------------------------------------
// FUNCTION __kmp_x86_cpuid
//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// Executes CPUID with EAX=mode, ECX=mode2 and stores the resulting
// EAX, EBX, ECX, EDX (in that order) into the 16-byte cpuid_buffer.
//
// parameters:
// 	mode:		%edi
// 	mode2:		%esi
// 	cpuid_buffer:	%rdx
        .text
	PROC  __kmp_x86_cpuid

	pushq  %rbp
	movq   %rsp,%rbp
        pushq  %rbx			// callee-save register

	movl   %esi, %ecx		// "mode2"
	movl   %edi, %eax		// "mode"
        movq   %rdx, %rsi               // cpuid_buffer (cpuid overwrites %rdx)
	cpuid				// Query the CPUID for the current processor

	movl   %eax, 0(%rsi)		// store results into buffer
	movl   %ebx, 4(%rsi)
	movl   %ecx, 8(%rsi)
	movl   %edx, 12(%rsi)

        popq   %rbx			// callee-save register
        movq   %rbp, %rsp
        popq   %rbp
	ret

        DEBUG_INFO __kmp_x86_cpuid
723
724
725
726# if !KMP_ASM_INTRINS
727
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically { old = *p; *p += d; return old; } via lock xadd.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax
        .text
        PROC  __kmp_test_then_add32

        movl      %esi, %eax	// "d"
        lock
        xaddl     %eax,(%rdi)	// %eax = old *p; *p += d
        ret

        DEBUG_INFO __kmp_test_then_add32
748
749
//------------------------------------------------------------------------
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically { old = *p; *p += d; return old; } via lock xadd.
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
//	return:	%rax
        .text
        PROC  __kmp_test_then_add64

        movq      %rsi, %rax	// "d"
        lock
        xaddq     %rax,(%rdi)	// %rax = old *p; *p += d
        ret

        DEBUG_INFO __kmp_test_then_add64
769
770
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	%rdi
// 	d:	%sil
//
// return:	%al
        .text
        PROC  __kmp_xchg_fixed8

        movb      %sil, %al	// "d"

        lock
        xchgb     %al,(%rdi)	// %al = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed8
792
793
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	%rdi
// 	d:	%si
// return:     %ax
        .text
        PROC  __kmp_xchg_fixed16

        movw      %si, %ax	// "d"

        lock
        xchgw     %ax,(%rdi)	// %ax = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed16
814
815
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax
        .text
        PROC  __kmp_xchg_fixed32

        movl      %esi, %eax	// "d"

        lock
        xchgl     %eax,(%rdi)	// %eax = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed32
837
838
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically stores d into *p and returns the previous value of *p.
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
// return:	%rax
        .text
        PROC  __kmp_xchg_fixed64

        movq      %rsi, %rax	// "d"

        lock
        xchgq     %rax,(%rdi)	// %rax = old *p; *p = d
        ret

        DEBUG_INFO __kmp_xchg_fixed64
859
860
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store8

        movb      %sil, %al	// "cv"
        lock
        cmpxchgb  %dl,(%rdi)
        sete      %al           // if %al == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // zero the upper bits of %rax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store8
884
885
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
//	sv:	%dx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store16

        movw      %si, %ax	// "cv"
        lock
        cmpxchgw  %dx,(%rdi)
        sete      %al           // if %ax == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // zero the upper bits of %rax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store16
909
910
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store32

        movl      %esi, %eax	// "cv"
        lock
        cmpxchgl  %edx,(%rdi)
        sete      %al           // if %eax == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // zero the upper bits of %rax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store32
934
935
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomically: if (*p == cv) { *p = sv; return 1; } else return 0;
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
//	sv:	%rdx
//	return:	%eax
        .text
        PROC  __kmp_compare_and_store64

        movq      %rsi, %rax    // "cv"
        lock
        cmpxchgq  %rdx,(%rdi)
        sete      %al           // if %rax == (%rdi) set %al = 1 else set %al = 0
        andq      $1, %rax      // zero the upper bits of %rax (sete wrote only %al)
        ret

        DEBUG_INFO __kmp_compare_and_store64
958
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// CAS that returns the value *p held before the operation (in %al)
// rather than a 0/1 success flag.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret8

        movb      %sil, %al	// "cv"
        lock
        cmpxchgb  %dl,(%rdi)	// %al = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8
980
981
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// CAS that returns the value *p held before the operation (in %ax).
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
//	sv:	%dx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret16

        movw      %si, %ax	// "cv"
        lock
        cmpxchgw  %dx,(%rdi)	// %ax = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16
1003
1004
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// CAS that returns the value *p held before the operation (in %eax).
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax
        .text
        PROC  __kmp_compare_and_store_ret32

        movl      %esi, %eax	// "cv"
        lock
        cmpxchgl  %edx,(%rdi)	// %eax = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32
1026
1027
//------------------------------------------------------------------------
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// CAS that returns the value *p held before the operation (in %rax).
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
//	sv:	%rdx
//	return:	%rax
        .text
        PROC  __kmp_compare_and_store_ret64

        movq      %rsi, %rax    // "cv"
        lock
        cmpxchgq  %rdx,(%rdi)	// %rax = value of *p before the operation
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
1048
1049# endif /* !KMP_ASM_INTRINS */
1050
1051
1052# if !KMP_MIC
1053
1054# if !KMP_ASM_INTRINS
1055
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores "data" into *addr and returns the previous value.
//
// parameters:
// 	addr:	%rdi
// 	data:	%xmm0 (lower 4 bytes)
//
// return:	%xmm0 (lower 4 bytes)
        .text
        PROC  __kmp_xchg_real32

	movd	%xmm0, %eax	// load "data" to eax

         lock
         xchgl %eax, (%rdi)	// atomically swap; %eax = old bits of *addr

	movd	%eax, %xmm0	// load old value into return register

        ret

        DEBUG_INFO __kmp_xchg_real32
1080
1081
//------------------------------------------------------------------------
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Atomically stores "data" into *addr and returns the previous value.
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 8 bytes)
//      return: %xmm0 (lower 8 bytes)
        .text
        PROC  __kmp_xchg_real64

	movd	%xmm0, %rax	// load "data" to rax

         lock
	xchgq  %rax, (%rdi)	// atomically swap; %rax = old bits of *addr

	movd	%rax, %xmm0	// load old value into return register
        ret

        DEBUG_INFO __kmp_xchg_real64
1104
1105
# endif /* !KMP_ASM_INTRINS */

# endif /* !KMP_MIC */
1109
1110
//------------------------------------------------------------------------
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// Loads the x87 FPU control word from *p (fldcw).
//
// parameters:
// 	p:	%rdi
        .text
        PROC  __kmp_load_x87_fpu_control_word

        fldcw (%rdi)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word
1126
1127
//------------------------------------------------------------------------
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// Stores the current x87 FPU control word to *p (fstcw).
//
// parameters:
// 	p:	%rdi
        .text
        PROC  __kmp_store_x87_fpu_control_word

        fstcw (%rdi)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word
1143
1144
//------------------------------------------------------------------------
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
// Clears the pending x87 exception flags. On KMP_MIC this is done by
// rewriting the FP environment because of a problem with fnclex there.
        .text
        PROC  __kmp_clear_x87_fpu_status_word

#if KMP_MIC
// TODO: remove the workaround for problem with fnclex instruction (no CQ known)
        fstenv  -32(%rsp)              // store FP env below %rsp (leaf; red zone)
        andw    $~0x80ff, 4-32(%rsp)   // clear 0-7,15 bits of FP SW
        fldenv  -32(%rsp)              // load FP env back
        ret
#else
        fnclex
        ret
#endif

        DEBUG_INFO __kmp_clear_x87_fpu_status_word
1165
1166
1167//------------------------------------------------------------------------
1168// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
1169//
1170// int
1171// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1172//		           int gtid, int tid,
1173//                         int argc, void *p_argv[] ) {
1174//    (*pkfn)( & gtid, & tid, argv[0], ... );
1175//    return 1;
1176// }
1177//
1178// note: at call to pkfn must have %rsp 128-byte aligned for compiler
1179//
1180// parameters:
1181//      %rdi:  	pkfn
1182//	%esi:	gtid
1183//	%edx:	tid
1184//	%ecx:	argc
1185//	%r8:	p_argv
1186//	%r9:	&exit_frame
1187//
1188// locals:
1189//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1190//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1191//
1192// reg temps:
1193//	%rax:	used all over the place
1194//	%rdx:	used in stack pointer alignment calculation
1195//	%r11:	used to traverse p_argv array
1196//	%rsi:	used as temporary for stack parameters
1197//		used as temporary for number of pkfn parms to push
1198//	%rbx:	used to hold pkfn address, and zero constant, callee-save
1199//
1200// return:	%eax 	(always 1/TRUE)
1201__gtid = -16
1202__tid = -24
1203
1204// -- Begin __kmp_invoke_microtask
1205// mark_begin;
1206        .text
1207	PROC  __kmp_invoke_microtask
1208
1209	pushq 	%rbp		// save base pointer
1210	KMP_CFI_DEF_OFFSET 16
1211	KMP_CFI_OFFSET rbp,-16
1212	movq 	%rsp,%rbp	// establish the base pointer for this routine.
1213	KMP_CFI_REGISTER rbp
1214
1215#if OMPT_SUPPORT
1216	movq	%rbp, (%r9)	// save exit_frame
1217#endif
1218
1219	pushq 	%rbx		// %rbx is callee-saved register
1220	pushq	%rsi		// Put gtid on stack so can pass &tgid to pkfn
1221	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn
1222
1223	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
1224	movq	$0, %rbx	// constant for cmovs later
1225	subq	$4, %rax	// subtract four args passed in registers to pkfn
1226#if KMP_MIC
1227	js	KMP_LABEL(kmp_0)	// jump to movq
1228	jmp	KMP_LABEL(kmp_0_exit)	// jump ahead
1229KMP_LABEL(kmp_0):
1230	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1231KMP_LABEL(kmp_0_exit):
1232#else
1233	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
1234#endif // KMP_MIC
1235
1236	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
1237	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8
1238
1239	movq 	%rsp, %rdx	//
1240	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
1241				// without align, stack ptr would be this
1242	movq 	%rdx, %rax	// Save to %rax
1243
1244	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
1245	subq 	%rax, %rdx	// Amount to subtract from %rsp
1246	subq 	%rdx, %rsp	// Prepare the stack ptr --
1247				// now %rsp will align to 128-byte boundary at call site
1248
1249				// setup pkfn parameter reg and stack
1250	movq	%rcx, %rax	// argc -> %rax
1251	cmpq	$0, %rsi
1252	je	KMP_LABEL(kmp_invoke_pass_parms)	// jump ahead if no parms to push
1253	shlq	$3, %rcx	// argc*8 -> %rcx
1254	movq 	%r8, %rdx	// p_argv -> %rdx
1255	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx
1256
1257	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx
1258
1259KMP_LABEL(kmp_invoke_push_parms):
1260	// push nth - 7th parms to pkfn on stack
1261	subq	$8, %rdx	// decrement p_argv pointer to previous parm
1262	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
1263	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
1264	subl	$1, %ecx
1265
1266// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
1267//		if the name of the label that is an operand of this jecxz starts with a dot (".");
1268//	   Apple's linker does not support 1-byte length relocation;
1269//         Resolution: replace all .labelX entries with L_labelX.
1270
1271	jecxz   KMP_LABEL(kmp_invoke_pass_parms)  // stop when four p_argv[] parms left
1272	jmp	KMP_LABEL(kmp_invoke_push_parms)
1273	ALIGN 3
1274KMP_LABEL(kmp_invoke_pass_parms):	// put 1st - 6th parms to pkfn in registers.
1275				// order here is important to avoid trashing
1276				// registers used for both input and output parms!
1277	movq	%rdi, %rbx	// pkfn -> %rbx
1278	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1279	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)
1280
1281	movq	%r8, %r11	// p_argv -> %r11
1282
1283#if KMP_MIC
1284	cmpq	$4, %rax	// argc >= 4?
1285	jns	KMP_LABEL(kmp_4)	// jump to movq
1286	jmp	KMP_LABEL(kmp_4_exit)	// jump ahead
1287KMP_LABEL(kmp_4):
1288	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1289KMP_LABEL(kmp_4_exit):
1290
1291	cmpq	$3, %rax	// argc >= 3?
1292	jns	KMP_LABEL(kmp_3)	// jump to movq
1293	jmp	KMP_LABEL(kmp_3_exit)	// jump ahead
1294KMP_LABEL(kmp_3):
1295	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1296KMP_LABEL(kmp_3_exit):
1297
1298	cmpq	$2, %rax	// argc >= 2?
1299	jns	KMP_LABEL(kmp_2)	// jump to movq
1300	jmp	KMP_LABEL(kmp_2_exit)	// jump ahead
1301KMP_LABEL(kmp_2):
1302	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1303KMP_LABEL(kmp_2_exit):
1304
1305	cmpq	$1, %rax	// argc >= 1?
1306	jns	KMP_LABEL(kmp_1)	// jump to movq
1307	jmp	KMP_LABEL(kmp_1_exit)	// jump ahead
1308KMP_LABEL(kmp_1):
1309	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1310KMP_LABEL(kmp_1_exit):
1311#else
1312	cmpq	$4, %rax	// argc >= 4?
1313	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
1314
1315	cmpq	$3, %rax	// argc >= 3?
1316	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
1317
1318	cmpq	$2, %rax	// argc >= 2?
1319	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
1320
1321	cmpq	$1, %rax	// argc >= 1?
1322	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
1323#endif // KMP_MIC
1324
1325	call	*%rbx		// call (*pkfn)();
1326	movq	$1, %rax	// move 1 into return register;
1327
1328	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
1329	movq 	%rbp, %rsp	// restore stack pointer
1330	popq 	%rbp		// restore frame pointer
1331	KMP_CFI_DEF rsp,8
1332	ret
1333
1334	DEBUG_INFO __kmp_invoke_microtask
1335// -- End  __kmp_invoke_microtask
1336
// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Return the 64-bit time-stamp counter.  rdtsc leaves the high half in
// %edx and the low half in %eax; combine them into a single value in %rax.
        .text
	PROC  __kmp_hardware_timestamp
	rdtsc			// EDX:EAX = time-stamp counter
	shlq    $32, %rdx	// move high half into bits 63:32
	orq     %rdx, %rax	// %rax = full 64-bit timestamp
	ret

	DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp
1348
1349//------------------------------------------------------------------------
1350// FUNCTION __kmp_bsr32
1351//
1352// int
1353// __kmp_bsr32( int );
1354        .text
1355        PROC  __kmp_bsr32
1356
1357        bsr    %edi,%eax
1358        ret
1359
1360        DEBUG_INFO __kmp_bsr32
1361
1362
1363// -----------------------------------------------------------------------
1364#endif /* KMP_ARCH_X86_64 */
1365
1366// '
1367#if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64
1368
1369//------------------------------------------------------------------------
1370//
1371// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
1372//
1373// int
1374// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1375//		           int gtid, int tid,
1376//                         int argc, void *p_argv[] ) {
1377//    (*pkfn)( & gtid, & tid, argv[0], ... );
1378//    return 1;
1379// }
1380//
1381// parameters:
1382//	x0:	pkfn
1383//	w1:	gtid
1384//	w2:	tid
1385//	w3:	argc
1386//	x4:	p_argv
1387//	x5:	&exit_frame
1388//
1389// locals:
1390//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
1391//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
1392//
1393// reg temps:
1394//	 x8:	used to hold pkfn address
1395//	 w9:	used as temporary for number of pkfn parms
1396//	x10:	used to traverse p_argv array
1397//	x11:	used as temporary for stack placement calculation
1398//	x12:	used as temporary for stack parameters
1399//	x19:	used to preserve exit_frame_ptr, callee-save
1400//
1401// return:	w0	(always 1/TRUE)
1402//
1403
1404__gtid = 4
1405__tid = 8
1406
1407// -- Begin __kmp_invoke_microtask
1408// mark_begin;
1409	.text
1410	PROC __kmp_invoke_microtask
1411
1412	stp	x29, x30, [sp, #-16]!
1413# if OMPT_SUPPORT
1414	stp	x19, x20, [sp, #-16]!
1415# endif
1416	mov	x29, sp
1417
1418	orr	w9, wzr, #1
1419	add	w9, w9, w3, lsr #1
1420	sub	sp, sp, w9, lsl #4
1421	mov	x11, sp
1422
1423	mov	x8, x0
1424	str	w1, [x29, #-__gtid]
1425	str	w2, [x29, #-__tid]
1426	mov	w9, w3
1427	mov	x10, x4
1428# if OMPT_SUPPORT
1429	mov	x19, x5
1430	str	x29, [x19]
1431# endif
1432
1433	sub	x0, x29, #__gtid
1434	sub	x1, x29, #__tid
1435
1436	cbz	w9, KMP_LABEL(kmp_1)
1437	ldr	x2, [x10]
1438
1439	sub	w9, w9, #1
1440	cbz	w9, KMP_LABEL(kmp_1)
1441	ldr	x3, [x10, #8]!
1442
1443	sub	w9, w9, #1
1444	cbz	w9, KMP_LABEL(kmp_1)
1445	ldr	x4, [x10, #8]!
1446
1447	sub	w9, w9, #1
1448	cbz	w9, KMP_LABEL(kmp_1)
1449	ldr	x5, [x10, #8]!
1450
1451	sub	w9, w9, #1
1452	cbz	w9, KMP_LABEL(kmp_1)
1453	ldr	x6, [x10, #8]!
1454
1455	sub	w9, w9, #1
1456	cbz	w9, KMP_LABEL(kmp_1)
1457	ldr	x7, [x10, #8]!
1458
1459KMP_LABEL(kmp_0):
1460	sub	w9, w9, #1
1461	cbz	w9, KMP_LABEL(kmp_1)
1462	ldr	x12, [x10, #8]!
1463	str	x12, [x11], #8
1464	b	KMP_LABEL(kmp_0)
1465KMP_LABEL(kmp_1):
1466	blr	x8
1467	orr	w0, wzr, #1
1468	mov	sp, x29
1469# if OMPT_SUPPORT
1470	str	xzr, [x19]
1471	ldp	x19, x20, [sp], #16
1472# endif
1473	ldp	x29, x30, [sp], #16
1474	ret
1475
1476	DEBUG_INFO __kmp_invoke_microtask
1477// -- End  __kmp_invoke_microtask
1478
1479#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */
1480
1481#if KMP_ARCH_PPC64
1482
1483//------------------------------------------------------------------------
1484//
1485// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
1486//
1487// int
1488// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1489//		           int gtid, int tid,
1490//                         int argc, void *p_argv[] ) {
1491//    (*pkfn)( & gtid, & tid, argv[0], ... );
1492//    return 1;
1493// }
1494//
1495// parameters:
1496//	r3:	pkfn
1497//	r4:	gtid
1498//	r5:	tid
1499//	r6:	argc
1500//	r7:	p_argv
1501//	r8:	&exit_frame
1502//
1503// return:	r3	(always 1/TRUE)
1504//
1505	.text
1506# if KMP_ARCH_PPC64_LE
1507	.abiversion 2
1508# endif
1509	.globl	__kmp_invoke_microtask
1510
1511# if KMP_ARCH_PPC64_LE
1512	.p2align	4
1513# else
1514	.p2align	2
1515# endif
1516
1517	.type	__kmp_invoke_microtask,@function
1518
1519# if KMP_ARCH_PPC64_LE
1520__kmp_invoke_microtask:
1521.Lfunc_begin0:
1522.Lfunc_gep0:
1523	addis 2, 12, .TOC.-.Lfunc_gep0@ha
1524	addi 2, 2, .TOC.-.Lfunc_gep0@l
1525.Lfunc_lep0:
1526	.localentry	__kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1527# else
1528	.section	.opd,"aw",@progbits
1529__kmp_invoke_microtask:
1530	.p2align	3
1531	.quad	.Lfunc_begin0
1532	.quad	.TOC.@tocbase
1533	.quad	0
1534	.text
1535.Lfunc_begin0:
1536# endif
1537
1538// -- Begin __kmp_invoke_microtask
1539// mark_begin;
1540
1541// We need to allocate a stack frame large enough to hold all of the parameters
1542// on the stack for the microtask plus what this function needs. That's 48
1543// bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1544// parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1545// and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1546// to save r30 to hold a copy of r8.
1547
1548	.cfi_startproc
1549	mflr 0
1550	std 31, -8(1)
1551	std 0, 16(1)
1552
1553// This is unusual because normally we'd set r31 equal to r1 after the stack
1554// frame is established. In this case, however, we need to dynamically compute
1555// the stack frame size, and so we keep a direct copy of r1 to access our
1556// register save areas and restore the r1 value before returning.
1557	mr 31, 1
1558	.cfi_def_cfa_register r31
1559	.cfi_offset r31, -8
1560	.cfi_offset lr, 16
1561
1562// Compute the size necessary for the local stack frame.
1563# if KMP_ARCH_PPC64_LE
1564	li 12, 72
1565# else
1566	li 12, 88
1567# endif
1568	sldi 0, 6, 3
1569	add 12, 0, 12
1570	neg 12, 12
1571
1572// We need to make sure that the stack frame stays aligned (to 16 bytes, except
1573// under the BG/Q CNK, where it must be to 32 bytes).
1574# if KMP_OS_CNK
1575	li 0, -32
1576# else
1577	li 0, -16
1578# endif
1579	and 12, 0, 12
1580
1581// Establish the local stack frame.
1582	stdux 1, 1, 12
1583
1584# if OMPT_SUPPORT
1585	.cfi_offset r30, -16
1586	std 30, -16(31)
1587	std 1, 0(8)
1588	mr 30, 8
1589# endif
1590
1591// Store gtid and tid to the stack because they're passed by reference to the microtask.
1592	stw 4, -20(31)
1593	stw 5, -24(31)
1594
1595	mr 12, 6
1596	mr 4, 7
1597
1598	cmpwi 0, 12, 1
1599	blt	 0, .Lcall
1600
1601	ld 5, 0(4)
1602
1603	cmpwi 0, 12, 2
1604	blt	 0, .Lcall
1605
1606	ld 6, 8(4)
1607
1608	cmpwi 0, 12, 3
1609	blt	 0, .Lcall
1610
1611	ld 7, 16(4)
1612
1613	cmpwi 0, 12, 4
1614	blt	 0, .Lcall
1615
1616	ld 8, 24(4)
1617
1618	cmpwi 0, 12, 5
1619	blt	 0, .Lcall
1620
1621	ld 9, 32(4)
1622
1623	cmpwi 0, 12, 6
1624	blt	 0, .Lcall
1625
1626	ld 10, 40(4)
1627
1628	cmpwi 0, 12, 7
1629	blt	 0, .Lcall
1630
1631// There are more than 6 microtask parameters, so we need to store the
1632// remainder to the stack.
1633	addi 12, 12, -6
1634	mtctr 12
1635
1636// These are set to 8 bytes before the first desired store address (we're using
1637// pre-increment loads and stores in the loop below). The parameter save area
1638// for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1639// 32 + 8*8 == 96 bytes above r1 for ELFv2.
1640	addi 4, 4, 40
1641# if KMP_ARCH_PPC64_LE
1642	addi 12, 1, 88
1643# else
1644	addi 12, 1, 104
1645# endif
1646
1647.Lnext:
1648	ldu 0, 8(4)
1649	stdu 0, 8(12)
1650	bdnz .Lnext
1651
1652.Lcall:
1653# if KMP_ARCH_PPC64_LE
1654	std 2, 24(1)
1655	mr 12, 3
1656#else
1657	std 2, 40(1)
1658// For ELFv1, we need to load the actual function address from the function descriptor.
1659	ld 12, 0(3)
1660	ld 2, 8(3)
1661	ld 11, 16(3)
1662#endif
1663
1664	addi 3, 31, -20
1665	addi 4, 31, -24
1666
1667	mtctr 12
1668	bctrl
1669# if KMP_ARCH_PPC64_LE
1670	ld 2, 24(1)
1671# else
1672	ld 2, 40(1)
1673# endif
1674
1675# if OMPT_SUPPORT
1676	li 3, 0
1677	std 3, 0(30)
1678# endif
1679
1680	li 3, 1
1681
1682# if OMPT_SUPPORT
1683	ld 30, -16(31)
1684# endif
1685
1686	mr 1, 31
1687	ld 0, 16(1)
1688	ld 31, -8(1)
1689	mtlr 0
1690	blr
1691
1692	.long	0
1693	.quad	0
1694.Lfunc_end0:
1695	.size	__kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1696	.cfi_endproc
1697
1698// -- End  __kmp_invoke_microtask
1699
1700#endif /* KMP_ARCH_PPC64 */
1701
#if KMP_ARCH_ARM || KMP_ARCH_MIPS
// 32-bit targets: __kmp_unnamed_critical_addr points at the 32-byte
// .gomp_critical_user_ common block — presumably the lock storage shared by
// unnamed (GOMP-compatible) critical sections; confirm against the runtime.
    .data
    .comm .gomp_critical_user_,32,8
    .data
    .align 4
    .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
    .4byte .gomp_critical_user_
    .size __kmp_unnamed_critical_addr,4
#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
1712
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
// 64-bit targets: same as above but with a pointer-sized (8-byte) slot.
    .data
    .comm .gomp_critical_user_,32,8
    .data
    .align 8
    .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
    .8byte .gomp_critical_user_
    .size __kmp_unnamed_critical_addr,8
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 */
1723
#if KMP_OS_LINUX
// Emit an empty .note.GNU-stack section (no "x" flag) so the linker marks
// the stack non-executable.  ARM uses %progbits because '@' begins a
// comment in ARM GNU assembler syntax.
# if KMP_ARCH_ARM
.section .note.GNU-stack,"",%progbits
# else
.section .note.GNU-stack,"",@progbits
# endif
#endif
1731