1 /*
2 * kmp_csupport.cpp -- kfront linkage support for OpenMP.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #define __KMP_IMP
14 #include "omp.h" /* extern "C" declarations of user-visible routines */
15 #include "kmp.h"
16 #include "kmp_error.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_lock.h"
20 #include "kmp_stats.h"
21 #include "ompt-specific.h"
22
23 #define MAX_MESSAGE 512
24
25 // flags will be used in future, e.g. to implement openmp_strict library
26 // restrictions
27
28 /*!
29 * @ingroup STARTUP_SHUTDOWN
30 * @param loc in source location information
31 * @param flags in for future use (currently ignored)
32 *
33 * Initialize the runtime library. This call is optional; if it is not made then
34 * it will be implicitly called by attempts to use other library functions.
35 */
__kmpc_begin(ident_t * loc,kmp_int32 flags)36 void __kmpc_begin(ident_t *loc, kmp_int32 flags) {
37 // By default __kmpc_begin() is no-op.
38 char *env;
39 if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL &&
40 __kmp_str_match_true(env)) {
41 __kmp_middle_initialize();
42 KC_TRACE(10, ("__kmpc_begin: middle initialization called\n"));
43 } else if (__kmp_ignore_mppbeg() == FALSE) {
44 // By default __kmp_ignore_mppbeg() returns TRUE.
45 __kmp_internal_begin();
46 KC_TRACE(10, ("__kmpc_begin: called\n"));
47 }
48 }
49
50 /*!
51 * @ingroup STARTUP_SHUTDOWN
52 * @param loc source location information
53 *
54 * Shutdown the runtime library. This is also optional, and even if called will
55 * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to
56 * zero.
57 */
__kmpc_end(ident_t * loc)58 void __kmpc_end(ident_t *loc) {
59 // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end()
60 // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND
61 // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend()
62 // returns FALSE and __kmpc_end() will unregister this root (it can cause
63 // library shut down).
64 if (__kmp_ignore_mppend() == FALSE) {
65 KC_TRACE(10, ("__kmpc_end: called\n"));
66 KA_TRACE(30, ("__kmpc_end\n"));
67
68 __kmp_internal_end_thread(-1);
69 }
70 #if KMP_OS_WINDOWS && OMPT_SUPPORT
71 // Normal exit process on Windows does not allow worker threads of the final
72 // parallel region to finish reporting their events, so shutting down the
73 // library here fixes the issue at least for the cases where __kmpc_end() is
74 // placed properly.
75 if (ompt_enabled.enabled)
76 __kmp_internal_end_library(__kmp_gtid_get_specific());
77 #endif
78 }
79
80 /*!
81 @ingroup THREAD_STATES
82 @param loc Source location information.
83 @return The global thread index of the active thread.
84
85 This function can be called in any context.
86
If the runtime has only been entered at the outermost level from a
88 single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is
89 that which would be returned by omp_get_thread_num() in the outermost
90 active parallel construct. (Or zero if there is no active parallel
91 construct, since the master thread is necessarily thread zero).
92
93 If multiple non-OpenMP threads all enter an OpenMP construct then this
94 will be a unique thread identifier among all the threads created by
the OpenMP runtime (but the value cannot be defined in terms of
96 OpenMP thread ids returned by omp_get_thread_num()).
97 */
__kmpc_global_thread_num(ident_t * loc)98 kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
99 kmp_int32 gtid = __kmp_entry_gtid();
100
101 KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", gtid));
102
103 return gtid;
104 }
105
106 /*!
107 @ingroup THREAD_STATES
108 @param loc Source location information.
109 @return The number of threads under control of the OpenMP<sup>*</sup> runtime
110
111 This function can be called in any context.
112 It returns the total number of threads under the control of the OpenMP runtime.
113 That is not a number that can be determined by any OpenMP standard calls, since
114 the library may be called from more than one non-OpenMP thread, and this
115 reflects the total over all such calls. Similarly the runtime maintains
116 underlying threads even when they are not active (since the cost of creating
117 and destroying OS threads is high), this call counts all such threads even if
118 they are not waiting for work.
119 */
__kmpc_global_num_threads(ident_t * loc)120 kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
121 KC_TRACE(10,
122 ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
123
124 return TCR_4(__kmp_all_nth);
125 }
126
127 /*!
128 @ingroup THREAD_STATES
129 @param loc Source location information.
130 @return The thread number of the calling thread in the innermost active parallel
131 construct.
132 */
__kmpc_bound_thread_num(ident_t * loc)133 kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
134 KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
135 return __kmp_tid_from_gtid(__kmp_entry_gtid());
136 }
137
138 /*!
139 @ingroup THREAD_STATES
140 @param loc Source location information.
141 @return The number of threads in the innermost active parallel construct.
142 */
__kmpc_bound_num_threads(ident_t * loc)143 kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
144 KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
145
146 return __kmp_entry_thread()->th.th_team->t.t_nproc;
147 }
148
149 /*!
150 * @ingroup DEPRECATED
151 * @param loc location description
152 *
153 * This function need not be called. It always returns TRUE.
154 */
__kmpc_ok_to_fork(ident_t * loc)155 kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
156 #ifndef KMP_DEBUG
157
158 return TRUE;
159
160 #else
161
162 const char *semi2;
163 const char *semi3;
164 int line_no;
165
166 if (__kmp_par_range == 0) {
167 return TRUE;
168 }
169 semi2 = loc->psource;
170 if (semi2 == NULL) {
171 return TRUE;
172 }
173 semi2 = strchr(semi2, ';');
174 if (semi2 == NULL) {
175 return TRUE;
176 }
177 semi2 = strchr(semi2 + 1, ';');
178 if (semi2 == NULL) {
179 return TRUE;
180 }
181 if (__kmp_par_range_filename[0]) {
182 const char *name = semi2 - 1;
183 while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
184 name--;
185 }
186 if ((*name == '/') || (*name == ';')) {
187 name++;
188 }
189 if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
190 return __kmp_par_range < 0;
191 }
192 }
193 semi3 = strchr(semi2 + 1, ';');
194 if (__kmp_par_range_routine[0]) {
195 if ((semi3 != NULL) && (semi3 > semi2) &&
196 (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
197 return __kmp_par_range < 0;
198 }
199 }
200 if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
201 if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
202 return __kmp_par_range > 0;
203 }
204 return __kmp_par_range < 0;
205 }
206 return TRUE;
207
208 #endif /* KMP_DEBUG */
209 }
210
211 /*!
212 @ingroup THREAD_STATES
213 @param loc Source location information.
214 @return 1 if this thread is executing inside an active parallel region, zero if
215 not.
216 */
__kmpc_in_parallel(ident_t * loc)217 kmp_int32 __kmpc_in_parallel(ident_t *loc) {
218 return __kmp_entry_thread()->th.th_root->r.r_active;
219 }
220
221 /*!
222 @ingroup PARALLEL
223 @param loc source location information
224 @param global_tid global thread number
225 @param num_threads number of threads requested for this parallel construct
226
227 Set the number of threads to be used by the next fork spawned by this thread.
228 This call is only required if the parallel construct has a `num_threads` clause.
229 */
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  // Exported entry point: trace the request, then forward all three arguments
  // unchanged to the internal __kmp_push_num_threads().
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  __kmp_push_num_threads(loc, global_tid, num_threads);
}
237
// Counterpart of __kmpc_push_num_threads(). It only emits a trace message;
// neither loc nor global_tid is used, because no explicit pop is needed.
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}
243
// Exported entry point that traces the request and forwards it to the
// internal __kmp_push_proc_bind().
//   loc        source location information
//   global_tid global thread number
//   proc_bind  binding policy as an integer; it is cast to kmp_proc_bind_t
//              without any range check in this function
void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}
251
252 /*!
253 @ingroup PARALLEL
254 @param loc source location information
255 @param argc total number of arguments in the ellipsis
256 @param microtask pointer to callback routine consisting of outlined parallel
257 construct
258 @param ... pointers to shared variables that aren't global
259
260 Do the actual fork and call the microtask in the relevant number of threads.
261 */
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  // Entry point for a parallel construct: forks via __kmp_fork_call(), then
  // joins via __kmp_join_call(). The variadic arguments are not inspected
  // here; they are passed on through the va_list.
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  // Count nested and outermost parallel regions separately.
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      // Pick the frame record to update: the serialized-team info of the
      // parent team if there is one, otherwise the implicit task of this
      // thread in the parent team.
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      // Record this function's frame address and the return address for the
      // tool before forking.
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
                    kmp_va_addr_of(ap));
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    // The fork context argument is only part of the join call's parameter
    // list when OMPT support is compiled in.
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Undo the timer change made on entry: switch back to the serial timer if
  // we came from a serial region, otherwise pop the timer pushed above.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
333
334 /*!
335 @ingroup PARALLEL
336 @param loc source location information
337 @param global_tid global thread number
338 @param num_teams number of teams requested for the teams construct
339 @param num_threads number of threads per team requested for the teams construct
340
341 Set the number of teams to be used by the teams construct.
342 This call is only required if the teams construct has a `num_teams` clause
343 or a `thread_limit` clause (or both).
344 */
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  // Exported entry point: trace the request, then forward all four arguments
  // unchanged to the internal __kmp_push_num_teams().
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}
353
354 /*!
355 @ingroup PARALLEL
356 @param loc source location information
357 @param argc total number of arguments in the ellipsis
358 @param microtask pointer to callback routine consisting of outlined teams
359 construct
360 @param ... pointers to shared variables that aren't global
361
362 Do the actual fork and call the microtask in the relevant number of threads.
363 */
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  // Entry point for a teams construct: records the teams microtask on the
  // calling thread, forks with __kmp_teams_master as the wrapped task, joins,
  // and then undoes the per-thread teams bookkeeping.
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

#if KMP_STATS_ENABLED
  KMP_COUNT_BLOCK(OMP_TEAMS);
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
  }
#endif

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    // Record this function's frame address in the implicit task of the
    // calling thread.
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  __kmp_fork_call(
      loc, gtid, fork_context_intel, argc,
      VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
      VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, kmp_va_addr_of(ap));
  // The fork context argument is only part of the join call's parameter list
  // when OMPT support is compiled in.
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
  // i holds the count before the decrement, so i == 1 means this thread was
  // the last one referencing the node.
  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
    __kmp_free(tmp);
  }
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  // Clear the teams bookkeeping recorded on entry.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  // NOTE(review): th_teams_size is zeroed through a kmp_int64 pointer, which
  // presumes the structure is exactly 64 bits wide (nteams + nth) - confirm
  // against its declaration.
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
#if KMP_STATS_ENABLED
  // Undo the timer change made on entry.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}
445
446 // I don't think this function should ever have been exported.
447 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
448 // openmp code ever called it, but it's been exported from the RTL for so
449 // long that I'm afraid to remove the definition.
__kmpc_invoke_task_func(int gtid)450 int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }
451
452 /*!
453 @ingroup PARALLEL
454 @param loc source location information
455 @param global_tid global thread number
456
457 Enter a serialized parallel construct. This interface is used to handle a
458 conditional parallel region, like this,
459 @code
460 #pragma omp parallel if (condition)
461 @endcode
462 when the condition is false.
463 */
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// The implementation is now in kmp_runtime.cpp so that it can share static
// functions with kmp_fork_call since the tasks to be done are similar in
// each case.
#if OMPT_SUPPORT
  // Store the return address for the tool before delegating.
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}
473
474 /*!
475 @ingroup PARALLEL
476 @param loc source location information
477 @param global_tid global thread number
478
479 Leave a serialized parallel construct.
480 */
void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // Leaves one nesting level of the calling thread's serial team. When the
  // outermost serialized level is left (t_serialized reaches 0), the thread is
  // switched back to the parent team and its cached per-team values are
  // restored.
  kmp_internal_control_t *top;
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10,
           ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid));

  /* skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  // Not autopar code
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  kmp_task_team_t *task_team = this_thr->th.th_task_team;
  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks)
    __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL));

  KMP_MB();
  // Sanity checks: the thread must currently be running on its own serial
  // team, which must be serialized and must not be the root team.
  KMP_DEBUG_ASSERT(serial_team);
  KMP_ASSERT(serial_team->t.t_serialized);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
  KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);

#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    // Report the end of the implicit task, then the end of the parallel
    // region, and finally unlink the lightweight task team.
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none;
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1,
          OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit);
    }

    // reset clear the task id only after unlinking the task
    ompt_data_t *parent_task_data;
    __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL);

    if (ompt_enabled.ompt_callback_parallel_end) {
      ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
          &(serial_team->t.ompt_team_info.parallel_data), parent_task_data,
          ompt_parallel_invoker_program | ompt_parallel_team,
          OMPT_LOAD_RETURN_ADDRESS(global_tid));
    }
    __ompt_lw_taskteam_unlink(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif

  /* If necessary, pop the internal control stack values and replace the team
   * values */
  top = serial_team->t.t_control_stack_top;
  // Only pop an entry that was recorded at the current nesting level.
  if (top && top->serial_nesting_level == serial_team->t.t_serialized) {
    copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top);
    serial_team->t.t_control_stack_top = top->next;
    __kmp_free(top);
  }

  // if( serial_team -> t.t_serialized > 1 )
  serial_team->t.t_level--;

  /* pop dispatch buffers stack */
  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
  {
    // Unlink the head buffer before freeing it.
    dispatch_private_info_t *disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer;
    serial_team->t.t_dispatch->th_disp_buffer =
        serial_team->t.t_dispatch->th_disp_buffer->next;
    __kmp_free(disp_buffer);
  }
  this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore

  --serial_team->t.t_serialized;
  if (serial_team->t.t_serialized == 0) {

/* return to the parallel section */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    // Reload the floating-point control state saved in the team, if any.
    if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word);
      __kmp_load_mxcsr(&serial_team->t.t_mxcsr);
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    // Switch the thread back to the parent of the serial team.
    this_thr->th.th_team = serial_team->t.t_parent;
    this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid;

    /* restore values cached in the thread */
    this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /*  JPH */
    this_thr->th.th_team_master =
        serial_team->t.t_parent->t.t_threads[0]; /* JPH */
    this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized;

    /* TODO the below shouldn't need to be adjusted for serialized teams */
    this_thr->th.th_dispatch =
        &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid];

    __kmp_pop_current_task_from_thread(this_thr);

    // The task that becomes current again must not already be marked as
    // executing; mark it so now.
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0);
    this_thr->th.th_current_task->td_flags.executing = 1;

    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // Copy the task team from the new child / old parent team to the thread.
      this_thr->th.th_task_team =
          this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
      KA_TRACE(20,
               ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / "
                "team %p\n",
                global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    }
  } else {
    // Still nested inside the serial team: only trace the new depth.
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting "
                    "depth of serial team %p to %d\n",
                    global_tid, serial_team, serial_team->t.t_serialized));
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  // Set the tool-visible thread state according to whether the thread's
  // current team is serialized.
  if (ompt_enabled.enabled)
    this_thr->th.ompt_thread_info.state =
        ((this_thr->th.th_team_serialized) ? ompt_state_work_serial
                                           : ompt_state_work_parallel);
#endif
}
621
622 /*!
623 @ingroup SYNCHRONIZATION
624 @param loc source location information.
625
626 Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
627 depending on the memory ordering convention obeyed by the compiler
628 even that may not be necessary).
629 */
void __kmpc_flush(ident_t *loc) {
  // Issues KMP_MB(), then whatever additional architecture-specific fence is
  // selected by the preprocessor below, and finally notifies an OMPT tool if
  // a flush callback is registered. loc is not used.
  KC_TRACE(10, ("__kmpc_flush: called\n"));

  /* need explicit __mf() here since use volatile instead in library */
  KMP_MB(); /* Flush all pending memory write invalidates.  */

#if (KMP_ARCH_X86 || KMP_ARCH_X86_64)
#if KMP_MIC
// fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
// We shouldn't need it, though, since the ABI rules require that
// * If the compiler generates NGO stores it also generates the fence
// * If users hand-code NGO stores they should insert the fence
// therefore no incomplete unordered stores should be visible.
#else
  // C74404
  // This is to address non-temporal store instructions (sfence needed).
  // The clflush instruction is addressed as well (mfence needed).
  // Probably the non-temporal load movntdqa instruction should also be
  // addressed.
  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
  if (!__kmp_cpuinfo.initialized) {
    __kmp_query_cpuid(&__kmp_cpuinfo);
  }
  if (!__kmp_cpuinfo.sse2) {
    // CPU cannot execute SSE2 instructions.
  } else {
    // Use the full-fence primitive that matches the compiler in use.
#if KMP_COMPILER_ICC
    _mm_mfence();
#elif KMP_COMPILER_MSVC
    MemoryBarrier();
#else
    __sync_synchronize();
#endif // KMP_COMPILER_ICC
  }
#endif // KMP_MIC
#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \
       KMP_ARCH_RISCV64)
// Nothing to see here move along
#elif KMP_ARCH_PPC64
// Nothing needed here (we have a real MB above).
#if KMP_OS_CNK
  // The flushing thread needs to yield here; this prevents a
  // busy-waiting thread from saturating the pipeline. flush is
  // often used in loops like this:
  // while (!flag) {
  //   #pragma omp flush(flag)
  // }
  // and adding the yield here is good for at least a 10x speedup
  // when running >2 threads per core (on the NAS LU benchmark).
  __kmp_yield();
#endif
#else
#error Unknown or unsupported architecture
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Report the flush to the tool, passing this call's return address.
  if (ompt_enabled.ompt_callback_flush) {
    ompt_callbacks.ompt_callback(ompt_callback_flush)(
        __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}
692
693 /* -------------------------------------------------------------------------- */
694 /*!
695 @ingroup SYNCHRONIZATION
696 @param loc source location information
697 @param global_tid thread id.
698
699 Execute a barrier.
700 */
void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) {
  // Runs a plain barrier for the calling thread via __kmp_barrier(), after
  // making sure the parallel runtime is initialized and, when consistency
  // checking is on, validating the construct.
  KMP_COUNT_BLOCK(OMP_BARRIER);
  KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    // Fetch the current task's frame record and, if no enter frame is set
    // yet, record this function's frame address.
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
  // Remember the source location of this barrier on the thread.
  __kmp_threads[global_tid]->th.th_ident = loc;
  // TODO: explicit barrier_wait_id:
  //   this function is called when 'barrier' directive is present or
  //   implicit barrier at the end of a worksharing construct.
  // 1) better to add a per-thread barrier counter to a thread data structure
  // 2) set to 0 when a new team is created
  // 4) no sync is required

  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Clear the enter frame again once the barrier has completed. ompt_frame is
  // only assigned above when ompt_enabled.enabled was set at that point.
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
741
742 /* The BARRIER for a MASTER section is always explicit */
743 /*!
744 @ingroup WORK_SHARING
745 @param loc source location information.
746 @param global_tid global thread number .
747 @return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
748 */
__kmpc_master(ident_t * loc,kmp_int32 global_tid)749 kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
750 int status = 0;
751
752 KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));
753
754 if (!TCR_4(__kmp_init_parallel))
755 __kmp_parallel_initialize();
756
757 __kmp_resume_if_soft_paused();
758
759 if (KMP_MASTER_GTID(global_tid)) {
760 KMP_COUNT_BLOCK(OMP_MASTER);
761 KMP_PUSH_PARTITIONED_TIMER(OMP_master);
762 status = 1;
763 }
764
765 #if OMPT_SUPPORT && OMPT_OPTIONAL
766 if (status) {
767 if (ompt_enabled.ompt_callback_master) {
768 kmp_info_t *this_thr = __kmp_threads[global_tid];
769 kmp_team_t *team = this_thr->th.th_team;
770
771 int tid = __kmp_tid_from_gtid(global_tid);
772 ompt_callbacks.ompt_callback(ompt_callback_master)(
773 ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
774 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
775 OMPT_GET_RETURN_ADDRESS(0));
776 }
777 }
778 #endif
779
780 if (__kmp_env_consistency_check) {
781 #if KMP_USE_DYNAMIC_LOCK
782 if (status)
783 __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
784 else
785 __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
786 #else
787 if (status)
788 __kmp_push_sync(global_tid, ct_master, loc, NULL);
789 else
790 __kmp_check_sync(global_tid, ct_master, loc, NULL);
791 #endif
792 }
793
794 return status;
795 }
796
797 /*!
798 @ingroup WORK_SHARING
799 @param loc source location information.
800 @param global_tid global thread number .
801
802 Mark the end of a <tt>master</tt> region. This should only be called by the
803 thread that executes the <tt>master</tt> region.
804 */
__kmpc_end_master(ident_t * loc,kmp_int32 global_tid)805 void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) {
806 KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid));
807
808 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid));
809 KMP_POP_PARTITIONED_TIMER();
810
811 #if OMPT_SUPPORT && OMPT_OPTIONAL
812 kmp_info_t *this_thr = __kmp_threads[global_tid];
813 kmp_team_t *team = this_thr->th.th_team;
814 if (ompt_enabled.ompt_callback_master) {
815 int tid = __kmp_tid_from_gtid(global_tid);
816 ompt_callbacks.ompt_callback(ompt_callback_master)(
817 ompt_scope_end, &(team->t.ompt_team_info.parallel_data),
818 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
819 OMPT_GET_RETURN_ADDRESS(0));
820 }
821 #endif
822
823 if (__kmp_env_consistency_check) {
824 if (global_tid < 0)
825 KMP_WARNING(ThreadIdentInvalid);
826
827 if (KMP_MASTER_GTID(global_tid))
828 __kmp_pop_sync(global_tid, ct_master, loc);
829 }
830 }
831
832 /*!
833 @ingroup WORK_SHARING
834 @param loc source location information.
835 @param gtid global thread number.
836
837 Start execution of an <tt>ordered</tt> construct.
838 */
__kmpc_ordered(ident_t * loc,kmp_int32 gtid)839 void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) {
840 int cid = 0;
841 kmp_info_t *th;
842 KMP_DEBUG_ASSERT(__kmp_init_serial);
843
844 KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid));
845
846 if (!TCR_4(__kmp_init_parallel))
847 __kmp_parallel_initialize();
848
849 __kmp_resume_if_soft_paused();
850
851 #if USE_ITT_BUILD
852 __kmp_itt_ordered_prep(gtid);
853 // TODO: ordered_wait_id
854 #endif /* USE_ITT_BUILD */
855
856 th = __kmp_threads[gtid];
857
858 #if OMPT_SUPPORT && OMPT_OPTIONAL
859 kmp_team_t *team;
860 ompt_wait_id_t lck;
861 void *codeptr_ra;
862 if (ompt_enabled.enabled) {
863 OMPT_STORE_RETURN_ADDRESS(gtid);
864 team = __kmp_team_from_gtid(gtid);
865 lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value;
866 /* OMPT state update */
867 th->th.ompt_thread_info.wait_id = lck;
868 th->th.ompt_thread_info.state = ompt_state_wait_ordered;
869
870 /* OMPT event callback */
871 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
872 if (ompt_enabled.ompt_callback_mutex_acquire) {
873 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
874 ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck,
875 codeptr_ra);
876 }
877 }
878 #endif
879
880 if (th->th.th_dispatch->th_deo_fcn != 0)
881 (*th->th.th_dispatch->th_deo_fcn)(>id, &cid, loc);
882 else
883 __kmp_parallel_deo(>id, &cid, loc);
884
885 #if OMPT_SUPPORT && OMPT_OPTIONAL
886 if (ompt_enabled.enabled) {
887 /* OMPT state update */
888 th->th.ompt_thread_info.state = ompt_state_work_parallel;
889 th->th.ompt_thread_info.wait_id = 0;
890
891 /* OMPT event callback */
892 if (ompt_enabled.ompt_callback_mutex_acquired) {
893 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
894 ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
895 }
896 }
897 #endif
898
899 #if USE_ITT_BUILD
900 __kmp_itt_ordered_start(gtid);
901 #endif /* USE_ITT_BUILD */
902 }
903
904 /*!
905 @ingroup WORK_SHARING
906 @param loc source location information.
907 @param gtid global thread number.
908
909 End execution of an <tt>ordered</tt> construct.
910 */
__kmpc_end_ordered(ident_t * loc,kmp_int32 gtid)911 void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) {
912 int cid = 0;
913 kmp_info_t *th;
914
915 KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid));
916
917 #if USE_ITT_BUILD
918 __kmp_itt_ordered_end(gtid);
919 // TODO: ordered_wait_id
920 #endif /* USE_ITT_BUILD */
921
922 th = __kmp_threads[gtid];
923
924 if (th->th.th_dispatch->th_dxo_fcn != 0)
925 (*th->th.th_dispatch->th_dxo_fcn)(>id, &cid, loc);
926 else
927 __kmp_parallel_dxo(>id, &cid, loc);
928
929 #if OMPT_SUPPORT && OMPT_OPTIONAL
930 OMPT_STORE_RETURN_ADDRESS(gtid);
931 if (ompt_enabled.ompt_callback_mutex_released) {
932 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
933 ompt_mutex_ordered,
934 (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid)
935 ->t.t_ordered.dt.t_value,
936 OMPT_LOAD_RETURN_ADDRESS(gtid));
937 }
938 #endif
939 }
940
941 #if KMP_USE_DYNAMIC_LOCK
942
943 static __forceinline void
__kmp_init_indirect_csptr(kmp_critical_name * crit,ident_t const * loc,kmp_int32 gtid,kmp_indirect_locktag_t tag)944 __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc,
945 kmp_int32 gtid, kmp_indirect_locktag_t tag) {
946 // Pointer to the allocated indirect lock is written to crit, while indexing
947 // is ignored.
948 void *idx;
949 kmp_indirect_lock_t **lck;
950 lck = (kmp_indirect_lock_t **)crit;
951 kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
952 KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
953 KMP_SET_I_LOCK_LOCATION(ilk, loc);
954 KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
955 KA_TRACE(20,
956 ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
957 #if USE_ITT_BUILD
958 __kmp_itt_critical_creating(ilk->lock, loc);
959 #endif
960 int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk);
961 if (status == 0) {
962 #if USE_ITT_BUILD
963 __kmp_itt_critical_destroyed(ilk->lock);
964 #endif
965 // We don't really need to destroy the unclaimed lock here since it will be
966 // cleaned up at program exit.
967 // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
968 }
969 KMP_DEBUG_ASSERT(*lck != NULL);
970 }
971
// Fast-path acquire tas lock
//
// Expands to a brace-enclosed block that treats `lock` as a kmp_tas_lock_t and
// blocks until its poll word has been switched from the free value to a busy
// value derived from `gtid + 1`.
//  - Each attempt is a relaxed load followed by an acquire compare-and-store.
//  - If the first attempt fails the block spins: it yields unconditionally when
//    __kmp_nth exceeds the processor count (__kmp_avail_proc, or __kmp_xproc
//    when that is zero), uses KMP_YIELD_SPIN otherwise, and calls
//    __kmp_spin_backoff() before retrying.
//  - The block declares locals (l, tas_free, tas_busy, spins, backoff), so the
//    arguments should not be expressions that use those names.
#define KMP_ACQUIRE_TAS_LOCK(lock, gtid)                                       \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                          \
        !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) {    \
      kmp_uint32 spins;                                                        \
      KMP_FSYNC_PREPARE(l);                                                    \
      KMP_INIT_YIELD(spins);                                                   \
      kmp_backoff_t backoff = __kmp_spin_backoff_params;                       \
      do {                                                                     \
        if (TCR_4(__kmp_nth) >                                                 \
            (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {             \
          KMP_YIELD(TRUE);                                                     \
        } else {                                                               \
          KMP_YIELD_SPIN(spins);                                               \
        }                                                                      \
        __kmp_spin_backoff(&backoff);                                          \
      } while (                                                                \
          KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free ||                        \
          !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy));   \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(l);                                                     \
  }
998
// Fast-path test tas lock
//
// Single, non-blocking acquisition attempt on a kmp_tas_lock_t. `rc` receives
// the result of the attempt: true only when the poll word was observed free
// and the acquire compare-and-store to the busy value (derived from
// `gtid + 1`) succeeded. `rc` must be an assignable lvalue.
#define KMP_TEST_TAS_LOCK(lock, gtid, rc)                                      \
  {                                                                            \
    kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock;                                \
    kmp_int32 tas_free = KMP_LOCK_FREE(tas);                                   \
    kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas);                         \
    rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free &&                         \
         __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy);      \
  }
1008
// Fast-path release tas lock
//
// Stores the free value into the poll word of a kmp_tas_lock_t with release
// ordering. The `gtid` argument is accepted for symmetry with the acquire
// macro but is not used by the expansion.
#define KMP_RELEASE_TAS_LOCK(lock, gtid)                                       \
  { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); }
1012
1013 #if KMP_USE_FUTEX
1014
1015 #include <sys/syscall.h>
1016 #include <unistd.h>
1017 #ifndef FUTEX_WAIT
1018 #define FUTEX_WAIT 0
1019 #endif
1020 #ifndef FUTEX_WAKE
1021 #define FUTEX_WAKE 1
1022 #endif
1023
// Fast-path acquire futex lock
//
// Expands to a brace-enclosed block that blocks until the poll word of a
// kmp_futex_lock_t has been switched from the free value to a busy value built
// from `gtid_code`, initially (gtid + 1) << 1.
//
// While the compare-and-store from "free" keeps failing, the loop:
//  1. looks at the low bit of the stripped poll value; when it is clear, tries
//     to set it in the poll word (the release macro issues FUTEX_WAKE only when
//     that bit is set, so it presumably marks "waiters present" -- confirm);
//  2. sleeps in the futex syscall (FUTEX_WAIT) on the poll word, expecting
//     `poll_val`; a non-zero syscall result simply restarts the loop;
//  3. after a wait that returned zero, sets the low bit in `gtid_code` so the
//     next acquisition attempt stores a value with that bit set.
//
// NOTE(review): the inner `if (!KMP_COMPARE_AND_STORE_RET32(...))` treats the
// result as a success flag, while the outer loop compares the result of the
// same macro against a lock value. Confirm which of the two the macro returns.
#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    kmp_int32 gtid_code = (gtid + 1) << 1;                                     \
    KMP_MB();                                                                  \
    KMP_FSYNC_PREPARE(ftx);                                                    \
    kmp_int32 poll_val;                                                        \
    while ((poll_val = KMP_COMPARE_AND_STORE_RET32(                            \
                &(ftx->lk.poll), KMP_LOCK_FREE(futex),                         \
                KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {   \
      kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;                           \
      if (!cond) {                                                             \
        if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val,            \
                                         poll_val |                            \
                                             KMP_LOCK_BUSY(1, futex))) {       \
          continue;                                                            \
        }                                                                      \
        poll_val |= KMP_LOCK_BUSY(1, futex);                                   \
      }                                                                        \
      kmp_int32 rc;                                                            \
      if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val,     \
                        NULL, NULL, 0)) != 0) {                                \
        continue;                                                              \
      }                                                                        \
      gtid_code |= 1;                                                          \
    }                                                                          \
    KMP_FSYNC_ACQUIRED(ftx);                                                   \
  }
1053
// Fast-path test futex lock
//
// Single, non-blocking acquisition attempt on a kmp_futex_lock_t. The poll
// word is switched from the free value to a busy value built from
// (gtid + 1) << 1 with an acquire compare-and-store; `rc` is set to TRUE when
// that succeeds and to FALSE otherwise. `rc` must be an assignable lvalue.
//
// The `lock` and `gtid` arguments are parenthesized in the expansion so that
// expression arguments (e.g. a conditional expression) keep their meaning and
// the shift no longer relies on implicit `+` / `<<` precedence.
#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc)                                    \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)(lock);                        \
    if (KMP_COMPARE_AND_STORE_ACQ32(                                           \
            &(ftx->lk.poll), KMP_LOCK_FREE(futex),                             \
            KMP_LOCK_BUSY(((gtid) + 1) << 1, futex))) {                        \
      KMP_FSYNC_ACQUIRED(ftx);                                                 \
      rc = TRUE;                                                               \
    } else {                                                                   \
      rc = FALSE;                                                              \
    }                                                                          \
  }
1066
// Fast-path release futex lock
//
// Atomically exchanges the poll word of a kmp_futex_lock_t with the free value.
// If the low bit of the stripped previous value was set, a FUTEX_WAKE syscall
// is issued on the poll word. Memory barriers (KMP_MB) surround the sequence
// and KMP_YIELD_OVERSUB() is invoked at the end. The `gtid` argument is not
// used by the expansion.
#define KMP_RELEASE_FUTEX_LOCK(lock, gtid)                                     \
  {                                                                            \
    kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock;                          \
    KMP_MB();                                                                  \
    KMP_FSYNC_RELEASING(ftx);                                                  \
    kmp_int32 poll_val =                                                       \
        KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex));               \
    if (KMP_LOCK_STRIP(poll_val) & 1) {                                        \
      syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE,                         \
              KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);                         \
    }                                                                          \
    KMP_MB();                                                                  \
    KMP_YIELD_OVERSUB();                                                       \
  }
1082
1083 #endif // KMP_USE_FUTEX
1084
1085 #else // KMP_USE_DYNAMIC_LOCK
1086
__kmp_get_critical_section_ptr(kmp_critical_name * crit,ident_t const * loc,kmp_int32 gtid)1087 static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit,
1088 ident_t const *loc,
1089 kmp_int32 gtid) {
1090 kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1091
1092 // Because of the double-check, the following load doesn't need to be volatile
1093 kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1094
1095 if (lck == NULL) {
1096 void *idx;
1097
1098 // Allocate & initialize the lock.
1099 // Remember alloc'ed locks in table in order to free them in __kmp_cleanup()
1100 lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section);
1101 __kmp_init_user_lock_with_checks(lck);
1102 __kmp_set_user_lock_location(lck, loc);
1103 #if USE_ITT_BUILD
1104 __kmp_itt_critical_creating(lck);
1105 // __kmp_itt_critical_creating() should be called *before* the first usage
1106 // of underlying lock. It is the only place where we can guarantee it. There
1107 // are chances the lock will destroyed with no usage, but it is not a
1108 // problem, because this is not real event seen by user but rather setting
1109 // name for object (lock). See more details in kmp_itt.h.
1110 #endif /* USE_ITT_BUILD */
1111
1112 // Use a cmpxchg instruction to slam the start of the critical section with
1113 // the lock pointer. If another thread beat us to it, deallocate the lock,
1114 // and use the lock that the other thread allocated.
1115 int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck);
1116
1117 if (status == 0) {
1118 // Deallocate the lock and reload the value.
1119 #if USE_ITT_BUILD
1120 __kmp_itt_critical_destroyed(lck);
1121 // Let ITT know the lock is destroyed and the same memory location may be reused
1122 // for another purpose.
1123 #endif /* USE_ITT_BUILD */
1124 __kmp_destroy_user_lock_with_checks(lck);
1125 __kmp_user_lock_free(&idx, gtid, lck);
1126 lck = (kmp_user_lock_p)TCR_PTR(*lck_pp);
1127 KMP_DEBUG_ASSERT(lck != NULL);
1128 }
1129 }
1130 return lck;
1131 }
1132
1133 #endif // KMP_USE_DYNAMIC_LOCK
1134
1135 /*!
1136 @ingroup WORK_SHARING
1137 @param loc source location information.
1138 @param global_tid global thread number .
1139 @param crit identity of the critical section. This could be a pointer to a lock
1140 associated with the critical section, or some other suitably unique value.
1141
1142 Enter code protected by a `critical` construct.
1143 This function blocks until the executing thread can enter the critical section.
1144 */
__kmpc_critical(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)1145 void __kmpc_critical(ident_t *loc, kmp_int32 global_tid,
1146 kmp_critical_name *crit) {
1147 #if KMP_USE_DYNAMIC_LOCK
1148 #if OMPT_SUPPORT && OMPT_OPTIONAL
1149 OMPT_STORE_RETURN_ADDRESS(global_tid);
1150 #endif // OMPT_SUPPORT
1151 __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1152 #else
1153 KMP_COUNT_BLOCK(OMP_CRITICAL);
1154 #if OMPT_SUPPORT && OMPT_OPTIONAL
1155 ompt_state_t prev_state = ompt_state_undefined;
1156 ompt_thread_info_t ti;
1157 #endif
1158 kmp_user_lock_p lck;
1159
1160 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1161
1162 // TODO: add THR_OVHD_STATE
1163
1164 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1165 KMP_CHECK_USER_LOCK_INIT();
1166
1167 if ((__kmp_user_lock_kind == lk_tas) &&
1168 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1169 lck = (kmp_user_lock_p)crit;
1170 }
1171 #if KMP_USE_FUTEX
1172 else if ((__kmp_user_lock_kind == lk_futex) &&
1173 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1174 lck = (kmp_user_lock_p)crit;
1175 }
1176 #endif
1177 else { // ticket, queuing or drdpa
1178 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
1179 }
1180
1181 if (__kmp_env_consistency_check)
1182 __kmp_push_sync(global_tid, ct_critical, loc, lck);
1183
1184 // since the critical directive binds to all threads, not just the current
1185 // team we have to check this even if we are in a serialized team.
1186 // also, even if we are the uber thread, we still have to conduct the lock,
1187 // as we have to contend with sibling threads.
1188
1189 #if USE_ITT_BUILD
1190 __kmp_itt_critical_acquiring(lck);
1191 #endif /* USE_ITT_BUILD */
1192 #if OMPT_SUPPORT && OMPT_OPTIONAL
1193 OMPT_STORE_RETURN_ADDRESS(gtid);
1194 void *codeptr_ra = NULL;
1195 if (ompt_enabled.enabled) {
1196 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1197 /* OMPT state update */
1198 prev_state = ti.state;
1199 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1200 ti.state = ompt_state_wait_critical;
1201
1202 /* OMPT event callback */
1203 codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
1204 if (ompt_enabled.ompt_callback_mutex_acquire) {
1205 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1206 ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
1207 (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1208 }
1209 }
1210 #endif
1211 // Value of 'crit' should be good for using as a critical_id of the critical
1212 // section directive.
1213 __kmp_acquire_user_lock_with_checks(lck, global_tid);
1214
1215 #if USE_ITT_BUILD
1216 __kmp_itt_critical_acquired(lck);
1217 #endif /* USE_ITT_BUILD */
1218 #if OMPT_SUPPORT && OMPT_OPTIONAL
1219 if (ompt_enabled.enabled) {
1220 /* OMPT state update */
1221 ti.state = prev_state;
1222 ti.wait_id = 0;
1223
1224 /* OMPT event callback */
1225 if (ompt_enabled.ompt_callback_mutex_acquired) {
1226 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1227 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra);
1228 }
1229 }
1230 #endif
1231 KMP_POP_PARTITIONED_TIMER();
1232
1233 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1234 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1235 #endif // KMP_USE_DYNAMIC_LOCK
1236 }
1237
1238 #if KMP_USE_DYNAMIC_LOCK
1239
1240 // Converts the given hint to an internal lock implementation
__kmp_map_hint_to_lock(uintptr_t hint)1241 static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) {
1242 #if KMP_USE_TSX
1243 #define KMP_TSX_LOCK(seq) lockseq_##seq
1244 #else
1245 #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1246 #endif
1247
1248 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1249 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1250 #else
1251 #define KMP_CPUINFO_RTM 0
1252 #endif
1253
1254 // Hints that do not require further logic
1255 if (hint & kmp_lock_hint_hle)
1256 return KMP_TSX_LOCK(hle);
1257 if (hint & kmp_lock_hint_rtm)
1258 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
1259 if (hint & kmp_lock_hint_adaptive)
1260 return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;
1261
1262 // Rule out conflicting hints first by returning the default lock
1263 if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1264 return __kmp_user_lock_seq;
1265 if ((hint & omp_lock_hint_speculative) &&
1266 (hint & omp_lock_hint_nonspeculative))
1267 return __kmp_user_lock_seq;
1268
1269 // Do not even consider speculation when it appears to be contended
1270 if (hint & omp_lock_hint_contended)
1271 return lockseq_queuing;
1272
1273 // Uncontended lock without speculation
1274 if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1275 return lockseq_tas;
1276
1277 // HLE lock for speculation
1278 if (hint & omp_lock_hint_speculative)
1279 return KMP_TSX_LOCK(hle);
1280
1281 return __kmp_user_lock_seq;
1282 }
1283
1284 #if OMPT_SUPPORT && OMPT_OPTIONAL
1285 #if KMP_USE_DYNAMIC_LOCK
// Classifies a dynamic lock for OMPT reporting.
//
// user_lock: lock word to inspect, or NULL/0 when the caller already has the
//            indirect lock object.
// ilock:     indirect lock object; only consulted when user_lock is NULL or
//            when user_lock carries a zero direct tag (in which case it is
//            replaced by the result of KMP_LOOKUP_I_LOCK(user_lock)).
//
// Returns the kmp_mutex_impl_t matching the direct tag or the indirect lock
// type, and kmp_mutex_impl_none for anything not listed below.
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      // Zero tag: not a direct lock, fall through to the indirect lookup.
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  // From here on an indirect lock object is required.
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
1330 #else
// For locks without dynamic binding
//
// Maps the globally configured lock kind (__kmp_user_lock_kind) to the
// kmp_mutex_impl_t reported to OMPT; unknown kinds yield kmp_mutex_impl_none.
//
// NOTE(review): this #else branch is nested inside an enclosing
// `#if KMP_USE_DYNAMIC_LOCK` region, so as the file is structured it does not
// appear to be compiled when dynamic locks are disabled -- confirm.
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch (__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
1353 #endif // KMP_USE_DYNAMIC_LOCK
1354 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1355
1356 /*!
1357 @ingroup WORK_SHARING
1358 @param loc source location information.
1359 @param global_tid global thread number.
1360 @param crit identity of the critical section. This could be a pointer to a lock
1361 associated with the critical section, or some other suitably unique value.
1362 @param hint the lock hint.
1363
1364 Enter code protected by a `critical` construct with a hint. The hint value is
1365 used to suggest a lock implementation. This function blocks until the executing
1366 thread can enter the critical section unless the hint suggests use of
1367 speculative execution and the hardware supports it.
1368 */
__kmpc_critical_with_hint(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit,uint32_t hint)1369 void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
1370 kmp_critical_name *crit, uint32_t hint) {
1371 KMP_COUNT_BLOCK(OMP_CRITICAL);
1372 kmp_user_lock_p lck;
1373 #if OMPT_SUPPORT && OMPT_OPTIONAL
1374 ompt_state_t prev_state = ompt_state_undefined;
1375 ompt_thread_info_t ti;
1376 // This is the case, if called from __kmpc_critical:
1377 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1378 if (!codeptr)
1379 codeptr = OMPT_GET_RETURN_ADDRESS(0);
1380 #endif
1381
1382 KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));
1383
1384 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1385 // Check if it is initialized.
1386 KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
1387 if (*lk == 0) {
1388 kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1389 if (KMP_IS_D_LOCK(lckseq)) {
1390 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
1391 KMP_GET_D_TAG(lckseq));
1392 } else {
1393 __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1394 }
1395 }
1396 // Branch for accessing the actual lock object and set operation. This
1397 // branching is inevitable since this lock initialization does not follow the
1398 // normal dispatch path (lock table is not used).
1399 if (KMP_EXTRACT_D_TAG(lk) != 0) {
1400 lck = (kmp_user_lock_p)lk;
1401 if (__kmp_env_consistency_check) {
1402 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1403 __kmp_map_hint_to_lock(hint));
1404 }
1405 #if USE_ITT_BUILD
1406 __kmp_itt_critical_acquiring(lck);
1407 #endif
1408 #if OMPT_SUPPORT && OMPT_OPTIONAL
1409 if (ompt_enabled.enabled) {
1410 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1411 /* OMPT state update */
1412 prev_state = ti.state;
1413 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1414 ti.state = ompt_state_wait_critical;
1415
1416 /* OMPT event callback */
1417 if (ompt_enabled.ompt_callback_mutex_acquire) {
1418 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1419 ompt_mutex_critical, (unsigned int)hint,
1420 __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
1421 codeptr);
1422 }
1423 }
1424 #endif
1425 #if KMP_USE_INLINED_TAS
1426 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1427 KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1428 } else
1429 #elif KMP_USE_INLINED_FUTEX
1430 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1431 KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1432 } else
1433 #endif
1434 {
1435 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1436 }
1437 } else {
1438 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1439 lck = ilk->lock;
1440 if (__kmp_env_consistency_check) {
1441 __kmp_push_sync(global_tid, ct_critical, loc, lck,
1442 __kmp_map_hint_to_lock(hint));
1443 }
1444 #if USE_ITT_BUILD
1445 __kmp_itt_critical_acquiring(lck);
1446 #endif
1447 #if OMPT_SUPPORT && OMPT_OPTIONAL
1448 if (ompt_enabled.enabled) {
1449 ti = __kmp_threads[global_tid]->th.ompt_thread_info;
1450 /* OMPT state update */
1451 prev_state = ti.state;
1452 ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
1453 ti.state = ompt_state_wait_critical;
1454
1455 /* OMPT event callback */
1456 if (ompt_enabled.ompt_callback_mutex_acquire) {
1457 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
1458 ompt_mutex_critical, (unsigned int)hint,
1459 __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
1460 codeptr);
1461 }
1462 }
1463 #endif
1464 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1465 }
1466 KMP_POP_PARTITIONED_TIMER();
1467
1468 #if USE_ITT_BUILD
1469 __kmp_itt_critical_acquired(lck);
1470 #endif /* USE_ITT_BUILD */
1471 #if OMPT_SUPPORT && OMPT_OPTIONAL
1472 if (ompt_enabled.enabled) {
1473 /* OMPT state update */
1474 ti.state = prev_state;
1475 ti.wait_id = 0;
1476
1477 /* OMPT event callback */
1478 if (ompt_enabled.ompt_callback_mutex_acquired) {
1479 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
1480 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
1481 }
1482 }
1483 #endif
1484
1485 KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1486 KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
1487 } // __kmpc_critical_with_hint
1488
1489 #endif // KMP_USE_DYNAMIC_LOCK
1490
1491 /*!
1492 @ingroup WORK_SHARING
1493 @param loc source location information.
1494 @param global_tid global thread number .
1495 @param crit identity of the critical section. This could be a pointer to a lock
1496 associated with the critical section, or some other suitably unique value.
1497
1498 Leave a critical section, releasing any lock that was held during its execution.
1499 */
__kmpc_end_critical(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)1500 void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
1501 kmp_critical_name *crit) {
1502 kmp_user_lock_p lck;
1503
1504 KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));
1505
1506 #if KMP_USE_DYNAMIC_LOCK
1507 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1508 lck = (kmp_user_lock_p)crit;
1509 KMP_ASSERT(lck != NULL);
1510 if (__kmp_env_consistency_check) {
1511 __kmp_pop_sync(global_tid, ct_critical, loc);
1512 }
1513 #if USE_ITT_BUILD
1514 __kmp_itt_critical_releasing(lck);
1515 #endif
1516 #if KMP_USE_INLINED_TAS
1517 if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1518 KMP_RELEASE_TAS_LOCK(lck, global_tid);
1519 } else
1520 #elif KMP_USE_INLINED_FUTEX
1521 if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1522 KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1523 } else
1524 #endif
1525 {
1526 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1527 }
1528 } else {
1529 kmp_indirect_lock_t *ilk =
1530 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1531 KMP_ASSERT(ilk != NULL);
1532 lck = ilk->lock;
1533 if (__kmp_env_consistency_check) {
1534 __kmp_pop_sync(global_tid, ct_critical, loc);
1535 }
1536 #if USE_ITT_BUILD
1537 __kmp_itt_critical_releasing(lck);
1538 #endif
1539 KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1540 }
1541
1542 #else // KMP_USE_DYNAMIC_LOCK
1543
1544 if ((__kmp_user_lock_kind == lk_tas) &&
1545 (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
1546 lck = (kmp_user_lock_p)crit;
1547 }
1548 #if KMP_USE_FUTEX
1549 else if ((__kmp_user_lock_kind == lk_futex) &&
1550 (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
1551 lck = (kmp_user_lock_p)crit;
1552 }
1553 #endif
1554 else { // ticket, queuing or drdpa
1555 lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
1556 }
1557
1558 KMP_ASSERT(lck != NULL);
1559
1560 if (__kmp_env_consistency_check)
1561 __kmp_pop_sync(global_tid, ct_critical, loc);
1562
1563 #if USE_ITT_BUILD
1564 __kmp_itt_critical_releasing(lck);
1565 #endif /* USE_ITT_BUILD */
1566 // Value of 'crit' should be good for using as a critical_id of the critical
1567 // section directive.
1568 __kmp_release_user_lock_with_checks(lck, global_tid);
1569
1570 #endif // KMP_USE_DYNAMIC_LOCK
1571
1572 #if OMPT_SUPPORT && OMPT_OPTIONAL
1573 /* OMPT release event triggers after lock is released; place here to trigger
1574 * for all #if branches */
1575 OMPT_STORE_RETURN_ADDRESS(global_tid);
1576 if (ompt_enabled.ompt_callback_mutex_released) {
1577 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
1578 ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
1579 OMPT_LOAD_RETURN_ADDRESS(0));
1580 }
1581 #endif
1582
1583 KMP_POP_PARTITIONED_TIMER();
1584 KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
1585 }
1586
1587 /*!
1588 @ingroup SYNCHRONIZATION
1589 @param loc source location information
1590 @param global_tid thread id.
1591 @return one if the thread should execute the master block, zero otherwise
1592
1593 Start execution of a combined barrier and master. The barrier is executed inside
1594 this function.
1595 */
__kmpc_barrier_master(ident_t * loc,kmp_int32 global_tid)1596 kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
1597 int status;
1598
1599 KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));
1600
1601 if (!TCR_4(__kmp_init_parallel))
1602 __kmp_parallel_initialize();
1603
1604 __kmp_resume_if_soft_paused();
1605
1606 if (__kmp_env_consistency_check)
1607 __kmp_check_barrier(global_tid, ct_barrier, loc);
1608
1609 #if OMPT_SUPPORT
1610 ompt_frame_t *ompt_frame;
1611 if (ompt_enabled.enabled) {
1612 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1613 if (ompt_frame->enter_frame.ptr == NULL)
1614 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1615 OMPT_STORE_RETURN_ADDRESS(global_tid);
1616 }
1617 #endif
1618 #if USE_ITT_NOTIFY
1619 __kmp_threads[global_tid]->th.th_ident = loc;
1620 #endif
1621 status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
1622 #if OMPT_SUPPORT && OMPT_OPTIONAL
1623 if (ompt_enabled.enabled) {
1624 ompt_frame->enter_frame = ompt_data_none;
1625 }
1626 #endif
1627
1628 return (status != 0) ? 0 : 1;
1629 }
1630
/*!
@ingroup SYNCHRONIZATION
@param loc source location information (not used by this function)
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the <tt>master</tt> code. Other threads will
still be waiting at the barrier and this call releases them.

The work is delegated to __kmp_end_split_barrier() for the plain barrier.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));

  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}
1645
1646 /*!
1647 @ingroup SYNCHRONIZATION
1648 @param loc source location information
1649 @param global_tid thread id.
1650 @return one if the thread should execute the master block, zero otherwise
1651
1652 Start execution of a combined barrier and master(nowait) construct.
1653 The barrier is executed inside this function.
1654 There is no equivalent "end" function, since the
1655 */
__kmpc_barrier_master_nowait(ident_t * loc,kmp_int32 global_tid)1656 kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
1657 kmp_int32 ret;
1658
1659 KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));
1660
1661 if (!TCR_4(__kmp_init_parallel))
1662 __kmp_parallel_initialize();
1663
1664 __kmp_resume_if_soft_paused();
1665
1666 if (__kmp_env_consistency_check) {
1667 if (loc == 0) {
1668 KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
1669 }
1670 __kmp_check_barrier(global_tid, ct_barrier, loc);
1671 }
1672
1673 #if OMPT_SUPPORT
1674 ompt_frame_t *ompt_frame;
1675 if (ompt_enabled.enabled) {
1676 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1677 if (ompt_frame->enter_frame.ptr == NULL)
1678 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1679 OMPT_STORE_RETURN_ADDRESS(global_tid);
1680 }
1681 #endif
1682 #if USE_ITT_NOTIFY
1683 __kmp_threads[global_tid]->th.th_ident = loc;
1684 #endif
1685 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
1686 #if OMPT_SUPPORT && OMPT_OPTIONAL
1687 if (ompt_enabled.enabled) {
1688 ompt_frame->enter_frame = ompt_data_none;
1689 }
1690 #endif
1691
1692 ret = __kmpc_master(loc, global_tid);
1693
1694 if (__kmp_env_consistency_check) {
1695 /* there's no __kmpc_end_master called; so the (stats) */
1696 /* actions of __kmpc_end_master are done here */
1697
1698 if (global_tid < 0) {
1699 KMP_WARNING(ThreadIdentInvalid);
1700 }
1701 if (ret) {
1702 /* only one thread should do the pop since only */
1703 /* one did the push (see __kmpc_master()) */
1704
1705 __kmp_pop_sync(global_tid, ct_master, loc);
1706 }
1707 }
1708
1709 return (ret);
1710 }
1711
1712 /* The BARRIER for a SINGLE process section is always explicit */
1713 /*!
1714 @ingroup WORK_SHARING
1715 @param loc source location information
1716 @param global_tid global thread number
1717 @return One if this thread should execute the single construct, zero otherwise.
1718
1719 Test whether to execute a <tt>single</tt> construct.
1720 There are no implicit barriers in the two "single" calls, rather the compiler
1721 should introduce an explicit barrier if it is required.
1722 */
1723
__kmpc_single(ident_t * loc,kmp_int32 global_tid)1724 kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
1725 kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);
1726
1727 if (rc) {
1728 // We are going to execute the single statement, so we should count it.
1729 KMP_COUNT_BLOCK(OMP_SINGLE);
1730 KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1731 }
1732
1733 #if OMPT_SUPPORT && OMPT_OPTIONAL
1734 kmp_info_t *this_thr = __kmp_threads[global_tid];
1735 kmp_team_t *team = this_thr->th.th_team;
1736 int tid = __kmp_tid_from_gtid(global_tid);
1737
1738 if (ompt_enabled.enabled) {
1739 if (rc) {
1740 if (ompt_enabled.ompt_callback_work) {
1741 ompt_callbacks.ompt_callback(ompt_callback_work)(
1742 ompt_work_single_executor, ompt_scope_begin,
1743 &(team->t.ompt_team_info.parallel_data),
1744 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1745 1, OMPT_GET_RETURN_ADDRESS(0));
1746 }
1747 } else {
1748 if (ompt_enabled.ompt_callback_work) {
1749 ompt_callbacks.ompt_callback(ompt_callback_work)(
1750 ompt_work_single_other, ompt_scope_begin,
1751 &(team->t.ompt_team_info.parallel_data),
1752 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1753 1, OMPT_GET_RETURN_ADDRESS(0));
1754 ompt_callbacks.ompt_callback(ompt_callback_work)(
1755 ompt_work_single_other, ompt_scope_end,
1756 &(team->t.ompt_team_info.parallel_data),
1757 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
1758 1, OMPT_GET_RETURN_ADDRESS(0));
1759 }
1760 }
1761 }
1762 #endif
1763
1764 return rc;
1765 }
1766
1767 /*!
1768 @ingroup WORK_SHARING
1769 @param loc source location information
1770 @param global_tid global thread number
1771
1772 Mark the end of a <tt>single</tt> construct. This function should
1773 only be called by the thread that executed the block of code protected
1774 by the `single` construct.
1775 */
__kmpc_end_single(ident_t * loc,kmp_int32 global_tid)1776 void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
1777 __kmp_exit_single(global_tid);
1778 KMP_POP_PARTITIONED_TIMER();
1779
1780 #if OMPT_SUPPORT && OMPT_OPTIONAL
1781 kmp_info_t *this_thr = __kmp_threads[global_tid];
1782 kmp_team_t *team = this_thr->th.th_team;
1783 int tid = __kmp_tid_from_gtid(global_tid);
1784
1785 if (ompt_enabled.ompt_callback_work) {
1786 ompt_callbacks.ompt_callback(ompt_callback_work)(
1787 ompt_work_single_executor, ompt_scope_end,
1788 &(team->t.ompt_team_info.parallel_data),
1789 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
1790 OMPT_GET_RETURN_ADDRESS(0));
1791 }
1792 #endif
1793 }
1794
1795 /*!
1796 @ingroup WORK_SHARING
1797 @param loc Source location
1798 @param global_tid Global thread id
1799
1800 Mark the end of a statically scheduled loop.
1801 */
__kmpc_for_static_fini(ident_t * loc,kmp_int32 global_tid)1802 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
1803 KMP_POP_PARTITIONED_TIMER();
1804 KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1805
1806 #if OMPT_SUPPORT && OMPT_OPTIONAL
1807 if (ompt_enabled.ompt_callback_work) {
1808 ompt_work_t ompt_work_type = ompt_work_loop;
1809 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1810 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1811 // Determine workshare type
1812 if (loc != NULL) {
1813 if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
1814 ompt_work_type = ompt_work_loop;
1815 } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
1816 ompt_work_type = ompt_work_sections;
1817 } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
1818 ompt_work_type = ompt_work_distribute;
1819 } else {
1820 // use default set above.
1821 // a warning about this case is provided in __kmpc_for_static_init
1822 }
1823 KMP_DEBUG_ASSERT(ompt_work_type);
1824 }
1825 ompt_callbacks.ompt_callback(ompt_callback_work)(
1826 ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
1827 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
1828 }
1829 #endif
1830 if (__kmp_env_consistency_check)
1831 __kmp_pop_workshare(global_tid, ct_pdo, loc);
1832 }
1833
1834 // User routines which take C-style arguments (call by value)
1835 // different from the Fortran equivalent routines
1836
ompc_set_num_threads(int arg)1837 void ompc_set_num_threads(int arg) {
1838 // !!!!! TODO: check the per-task binding
1839 __kmp_set_num_threads(arg, __kmp_entry_gtid());
1840 }
1841
ompc_set_dynamic(int flag)1842 void ompc_set_dynamic(int flag) {
1843 kmp_info_t *thread;
1844
1845 /* For the thread-private implementation of the internal controls */
1846 thread = __kmp_entry_thread();
1847
1848 __kmp_save_internal_controls(thread);
1849
1850 set__dynamic(thread, flag ? TRUE : FALSE);
1851 }
1852
ompc_set_nested(int flag)1853 void ompc_set_nested(int flag) {
1854 kmp_info_t *thread;
1855
1856 /* For the thread-private internal controls implementation */
1857 thread = __kmp_entry_thread();
1858
1859 __kmp_save_internal_controls(thread);
1860
1861 set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
1862 }
1863
ompc_set_max_active_levels(int max_active_levels)1864 void ompc_set_max_active_levels(int max_active_levels) {
1865 /* TO DO */
1866 /* we want per-task implementation of this internal control */
1867
1868 /* For the per-thread internal controls implementation */
1869 __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
1870 }
1871
ompc_set_schedule(omp_sched_t kind,int modifier)1872 void ompc_set_schedule(omp_sched_t kind, int modifier) {
1873 // !!!!! TODO: check the per-task binding
1874 __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
1875 }
1876
ompc_get_ancestor_thread_num(int level)1877 int ompc_get_ancestor_thread_num(int level) {
1878 return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
1879 }
1880
ompc_get_team_size(int level)1881 int ompc_get_team_size(int level) {
1882 return __kmp_get_team_size(__kmp_entry_gtid(), level);
1883 }
1884
1885 /* OpenMP 5.0 Affinity Format API */
1886
ompc_set_affinity_format(char const * format)1887 void ompc_set_affinity_format(char const *format) {
1888 if (!__kmp_init_serial) {
1889 __kmp_serial_initialize();
1890 }
1891 __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
1892 format, KMP_STRLEN(format) + 1);
1893 }
1894
ompc_get_affinity_format(char * buffer,size_t size)1895 size_t ompc_get_affinity_format(char *buffer, size_t size) {
1896 size_t format_size;
1897 if (!__kmp_init_serial) {
1898 __kmp_serial_initialize();
1899 }
1900 format_size = KMP_STRLEN(__kmp_affinity_format);
1901 if (buffer && size) {
1902 __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
1903 format_size + 1);
1904 }
1905 return format_size;
1906 }
1907
ompc_display_affinity(char const * format)1908 void ompc_display_affinity(char const *format) {
1909 int gtid;
1910 if (!TCR_4(__kmp_init_middle)) {
1911 __kmp_middle_initialize();
1912 }
1913 gtid = __kmp_get_gtid();
1914 __kmp_aux_display_affinity(gtid, format);
1915 }
1916
ompc_capture_affinity(char * buffer,size_t buf_size,char const * format)1917 size_t ompc_capture_affinity(char *buffer, size_t buf_size,
1918 char const *format) {
1919 int gtid;
1920 size_t num_required;
1921 kmp_str_buf_t capture_buf;
1922 if (!TCR_4(__kmp_init_middle)) {
1923 __kmp_middle_initialize();
1924 }
1925 gtid = __kmp_get_gtid();
1926 __kmp_str_buf_init(&capture_buf);
1927 num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
1928 if (buffer && buf_size) {
1929 __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
1930 capture_buf.used + 1);
1931 }
1932 __kmp_str_buf_free(&capture_buf);
1933 return num_required;
1934 }
1935
kmpc_set_stacksize(int arg)1936 void kmpc_set_stacksize(int arg) {
1937 // __kmp_aux_set_stacksize initializes the library if needed
1938 __kmp_aux_set_stacksize(arg);
1939 }
1940
kmpc_set_stacksize_s(size_t arg)1941 void kmpc_set_stacksize_s(size_t arg) {
1942 // __kmp_aux_set_stacksize initializes the library if needed
1943 __kmp_aux_set_stacksize(arg);
1944 }
1945
kmpc_set_blocktime(int arg)1946 void kmpc_set_blocktime(int arg) {
1947 int gtid, tid;
1948 kmp_info_t *thread;
1949
1950 gtid = __kmp_entry_gtid();
1951 tid = __kmp_tid_from_gtid(gtid);
1952 thread = __kmp_thread_from_gtid(gtid);
1953
1954 __kmp_aux_set_blocktime(arg, thread, tid);
1955 }
1956
kmpc_set_library(int arg)1957 void kmpc_set_library(int arg) {
1958 // __kmp_user_set_library initializes the library if needed
1959 __kmp_user_set_library((enum library_type)arg);
1960 }
1961
kmpc_set_defaults(char const * str)1962 void kmpc_set_defaults(char const *str) {
1963 // __kmp_aux_set_defaults initializes the library if needed
1964 __kmp_aux_set_defaults(str, KMP_STRLEN(str));
1965 }
1966
kmpc_set_disp_num_buffers(int arg)1967 void kmpc_set_disp_num_buffers(int arg) {
1968 // ignore after initialization because some teams have already
1969 // allocated dispatch buffers
1970 if (__kmp_init_serial == 0 && arg > 0)
1971 __kmp_dispatch_num_buffers = arg;
1972 }
1973
/* Forward to __kmp_aux_set_affinity_mask_proc after making sure middle
   initialization has run. Returns -1 in stub builds or when affinity is not
   supported. */
int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
1984
/* Forward to __kmp_aux_unset_affinity_mask_proc after making sure middle
   initialization has run. Returns -1 in stub builds or when affinity is not
   supported. */
int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
1995
/* Forward to __kmp_aux_get_affinity_mask_proc after making sure middle
   initialization has run. Returns -1 in stub builds or when affinity is not
   supported. */
int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if !defined(KMP_STUB) && KMP_AFFINITY_SUPPORTED
  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#else
  return -1;
#endif
}
2006
2007 /* -------------------------------------------------------------------------- */
/*!
@ingroup THREADPRIVATE
@param loc       source location information
@param gtid      global thread number
@param cpy_size  size of the cpy_data buffer (not referenced by this
                 implementation)
@param cpy_data  pointer to data to be copied
@param cpy_func  helper function to call for copying data
@param didit     flag variable: 1=single thread; 0=not single thread

__kmpc_copyprivate implements the interface for the private data broadcast
needed for the copyprivate clause associated with a single region in an
OpenMP<sup>*</sup> program (both C and Fortran).
All threads participating in the parallel region call this routine.
One of the threads (called the single thread) should have the <tt>didit</tt>
variable set to 1 and all other threads should have that variable set to 0.
All threads pass a pointer to a data buffer (cpy_data) that they have built.

The OpenMP specification forbids the use of nowait on the single region when a
copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
barrier internally to avoid race conditions, so the code generation for the
single region should avoid generating a barrier after the call to @ref
__kmpc_copyprivate.

The <tt>gtid</tt> parameter is the global thread id for the current thread.
The <tt>loc</tt> parameter is a pointer to source location information.

Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which carries out the
copy by copying the data using the cpy_data buffer.

The cpy_func routine used for the copy and the contents of the data area defined
by cpy_data and cpy_size may be built in any fashion that will allow the copy
to be done. For instance, the cpy_data buffer can hold the actual data to be
copied or it may hold a list of pointers to the data. The cpy_func routine must
interpret the cpy_data buffer appropriately.

The interface to cpy_func is as follows:
@code
void cpy_func( void *destination, void *source )
@endcode
where void *destination is the cpy_data pointer for the thread being copied to
and void *source is the cpy_data pointer for the thread being copied from.
*/
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  // Team-level slot through which the single thread publishes its cpy_data.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  // A missing loc only produces a warning, and only under consistency
  // checking; execution continues.
  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  // Step 1: the single thread publishes its buffer pointer.
  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  // Record the enter frame (only if not already set) and the return address
  // before entering the barrier.
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // Step 2: first barrier -- after it, every thread can read *data_ptr.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Step 3: every non-single thread copies from the published buffer into
  // its own cpy_data (destination first, source second).
  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

  // Consider next barrier a user-visible barrier for barrier region boundaries
  // Nesting checks are already handled by the single construct checks

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location)
#endif
  // Step 4: second barrier -- no thread leaves until all copies are done.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // NOTE(review): the enter frame is set under OMPT_SUPPORT above but is
  // cleared only when OMPT_OPTIONAL is also enabled -- confirm this is
  // intended.
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}
2111
2112 /* -------------------------------------------------------------------------- */
2113
// Short aliases for the checked ("_with_checks") user-lock routines. They are
// used by the non-dynamic-lock code paths of the lock entry points below.
#define INIT_LOCK __kmp_init_user_lock_with_checks
#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
#define ACQUIRE_NESTED_LOCK_TIMED                                              \
  __kmp_acquire_nested_user_lock_with_checks_timed
#define RELEASE_LOCK __kmp_release_user_lock_with_checks
#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
#define TEST_LOCK __kmp_test_user_lock_with_checks
#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
2127
2128 // TODO: Make check abort messages use location info & pass it into
2129 // with_checks routines
2130
2131 #if KMP_USE_DYNAMIC_LOCK
2132
2133 // internal lock initializer
__kmp_init_lock_with_hint(ident_t * loc,void ** lock,kmp_dyna_lockseq_t seq)2134 static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock,
2135 kmp_dyna_lockseq_t seq) {
2136 if (KMP_IS_D_LOCK(seq)) {
2137 KMP_INIT_D_LOCK(lock, seq);
2138 #if USE_ITT_BUILD
2139 __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
2140 #endif
2141 } else {
2142 KMP_INIT_I_LOCK(lock, seq);
2143 #if USE_ITT_BUILD
2144 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2145 __kmp_itt_lock_creating(ilk->lock, loc);
2146 #endif
2147 }
2148 }
2149
2150 // internal nest lock initializer
2151 static __forceinline void
__kmp_init_nest_lock_with_hint(ident_t * loc,void ** lock,kmp_dyna_lockseq_t seq)2152 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock,
2153 kmp_dyna_lockseq_t seq) {
2154 #if KMP_USE_TSX
2155 // Don't have nested lock implementation for speculative locks
2156 if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
2157 seq = __kmp_user_lock_seq;
2158 #endif
2159 switch (seq) {
2160 case lockseq_tas:
2161 seq = lockseq_nested_tas;
2162 break;
2163 #if KMP_USE_FUTEX
2164 case lockseq_futex:
2165 seq = lockseq_nested_futex;
2166 break;
2167 #endif
2168 case lockseq_ticket:
2169 seq = lockseq_nested_ticket;
2170 break;
2171 case lockseq_queuing:
2172 seq = lockseq_nested_queuing;
2173 break;
2174 case lockseq_drdpa:
2175 seq = lockseq_nested_drdpa;
2176 break;
2177 default:
2178 seq = lockseq_nested_queuing;
2179 }
2180 KMP_INIT_I_LOCK(lock, seq);
2181 #if USE_ITT_BUILD
2182 kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
2183 __kmp_itt_lock_creating(ilk->lock, loc);
2184 #endif
2185 }
2186
2187 /* initialize the lock with a hint */
__kmpc_init_lock_with_hint(ident_t * loc,kmp_int32 gtid,void ** user_lock,uintptr_t hint)2188 void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock,
2189 uintptr_t hint) {
2190 KMP_DEBUG_ASSERT(__kmp_init_serial);
2191 if (__kmp_env_consistency_check && user_lock == NULL) {
2192 KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
2193 }
2194
2195 __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2196
2197 #if OMPT_SUPPORT && OMPT_OPTIONAL
2198 // This is the case, if called from omp_init_lock_with_hint:
2199 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2200 if (!codeptr)
2201 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2202 if (ompt_enabled.ompt_callback_lock_init) {
2203 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2204 ompt_mutex_lock, (omp_lock_hint_t)hint,
2205 __ompt_get_mutex_impl_type(user_lock),
2206 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2207 }
2208 #endif
2209 }
2210
2211 /* initialize the lock with a hint */
__kmpc_init_nest_lock_with_hint(ident_t * loc,kmp_int32 gtid,void ** user_lock,uintptr_t hint)2212 void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
2213 void **user_lock, uintptr_t hint) {
2214 KMP_DEBUG_ASSERT(__kmp_init_serial);
2215 if (__kmp_env_consistency_check && user_lock == NULL) {
2216 KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
2217 }
2218
2219 __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
2220
2221 #if OMPT_SUPPORT && OMPT_OPTIONAL
2222 // This is the case, if called from omp_init_lock_with_hint:
2223 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2224 if (!codeptr)
2225 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2226 if (ompt_enabled.ompt_callback_lock_init) {
2227 ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
2228 ompt_mutex_nest_lock, (omp_lock_hint_t)hint,
2229 __ompt_get_mutex_impl_type(user_lock),
2230 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2231 }
2232 #endif
2233 }
2234
2235 #endif // KMP_USE_DYNAMIC_LOCK
2236
/* Initialize a simple (non-nestable) user lock.

   loc       source location information
   gtid      global thread id of the caller
   user_lock address of the user's lock variable

   Two implementations are selected at compile time: the dynamic-lock path
   initializes the lock with the default sequence __kmp_user_lock_seq; the
   other path either uses the user's lock storage directly (TAS/futex locks
   whose poll word fits in OMP_LOCK_T_SIZE) or allocates a separate lock
   object. Both paths notify an OMPT lock_init callback when one is set. */
void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // A NULL lock address is fatal, but only under consistency checking.
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_lock");
  }
  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // A NULL lock address is fatal, but only under consistency checking.
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // Small TAS/futex locks use the user's lock storage directly; every other
  // case gets a separately allocated lock object.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }
  INIT_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_lock
2308
/* Initialize a nestable user lock.

   loc       source location information
   gtid      global thread id of the caller
   user_lock address of the user's nest-lock variable

   Mirrors __kmpc_init_lock. The in-place test in the non-dynamic path also
   accounts for the depth_locked field and compares against
   OMP_NEST_LOCK_T_SIZE. */
void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

  KMP_DEBUG_ASSERT(__kmp_init_serial);
  // A NULL lock address is fatal, but only under consistency checking.
  if (__kmp_env_consistency_check && user_lock == NULL) {
    KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
  }
  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  static char const *const func = "omp_init_nest_lock";
  kmp_user_lock_p lck;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  // A NULL lock address is fatal, but only under consistency checking.
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  KMP_CHECK_USER_LOCK_INIT();

  // TAS/futex locks whose poll word plus nesting depth fit in the user's
  // storage are used in place; every other case gets an allocated lock object.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_user_lock_allocate(user_lock, gtid, 0);
  }

  INIT_NESTED_LOCK(lck);
  __kmp_set_user_lock_location(lck, loc);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_init) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_init)(
        ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_creating(lck);
#endif /* USE_ITT_BUILD */

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_init_nest_lock
2383
__kmpc_destroy_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2384 void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2385 #if KMP_USE_DYNAMIC_LOCK
2386
2387 #if USE_ITT_BUILD
2388 kmp_user_lock_p lck;
2389 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2390 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2391 } else {
2392 lck = (kmp_user_lock_p)user_lock;
2393 }
2394 __kmp_itt_lock_destroyed(lck);
2395 #endif
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397 // This is the case, if called from omp_init_lock_with_hint:
2398 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2399 if (!codeptr)
2400 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2401 if (ompt_enabled.ompt_callback_lock_destroy) {
2402 kmp_user_lock_p lck;
2403 if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2404 lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2405 } else {
2406 lck = (kmp_user_lock_p)user_lock;
2407 }
2408 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2409 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2410 }
2411 #endif
2412 KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2413 #else
2414 kmp_user_lock_p lck;
2415
2416 if ((__kmp_user_lock_kind == lk_tas) &&
2417 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2418 lck = (kmp_user_lock_p)user_lock;
2419 }
2420 #if KMP_USE_FUTEX
2421 else if ((__kmp_user_lock_kind == lk_futex) &&
2422 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2423 lck = (kmp_user_lock_p)user_lock;
2424 }
2425 #endif
2426 else {
2427 lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock");
2428 }
2429
2430 #if OMPT_SUPPORT && OMPT_OPTIONAL
2431 // This is the case, if called from omp_init_lock_with_hint:
2432 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2433 if (!codeptr)
2434 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2435 if (ompt_enabled.ompt_callback_lock_destroy) {
2436 ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
2437 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2438 }
2439 #endif
2440
2441 #if USE_ITT_BUILD
2442 __kmp_itt_lock_destroyed(lck);
2443 #endif /* USE_ITT_BUILD */
2444 DESTROY_LOCK(lck);
2445
2446 if ((__kmp_user_lock_kind == lk_tas) &&
2447 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2448 ;
2449 }
2450 #if KMP_USE_FUTEX
2451 else if ((__kmp_user_lock_kind == lk_futex) &&
2452 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2453 ;
2454 }
2455 #endif
2456 else {
2457 __kmp_user_lock_free(user_lock, gtid, lck);
2458 }
2459 #endif // KMP_USE_DYNAMIC_LOCK
2460 } // __kmpc_destroy_lock
2461
/* Destroy a nestable user lock.

   loc       source location information (not referenced here)
   gtid      global thread id of the caller
   user_lock address of the user's nest-lock variable

   ITT and OMPT (lock_destroy callback) are notified before the lock itself is
   destroyed. In the non-dynamic path, a lock object that was looked up rather
   than stored in the user's variable is also freed. */
void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  // The lock is looked up as an indirect lock here, with no tag check.
  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
  __kmp_itt_lock_destroyed(ilk->lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // TAS/futex locks whose poll word plus nesting depth fit in the user's
  // storage live in place; anything else is looked up.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock");
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_lock_destroy) {
    ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)(
        ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#if USE_ITT_BUILD
  __kmp_itt_lock_destroyed(lck);
#endif /* USE_ITT_BUILD */

  DESTROY_NESTED_LOCK(lck);

  // Only looked-up lock objects are freed; in-place TAS/futex locks occupy
  // the user's own storage, so those branches are intentionally empty.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    ;
  }
#endif
  else {
    __kmp_user_lock_free(user_lock, gtid, lck);
  }
#endif // KMP_USE_DYNAMIC_LOCK
} // __kmpc_destroy_nest_lock
2536
/* Acquire a simple (non-nestable) user lock.

   loc       source location information (not referenced here)
   gtid      global thread id of the caller
   user_lock address of the user's lock variable

   ITT is notified before and after the acquisition. OMPT callbacks
   mutex_acquire (before) and mutex_acquired (after) are invoked when set. */
void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
  KMP_COUNT_BLOCK(OMP_set_lock);
#if KMP_USE_DYNAMIC_LOCK
  // The tag selects the entry of __kmp_direct_set used below.
  int tag = KMP_EXTRACT_D_TAG(user_lock);
#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(
      (kmp_user_lock_p)
          user_lock); // itt function will get to the right lock object.
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none,
        __ompt_get_mutex_impl_type(user_lock),
        (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif
  // Inlined fast paths are taken only when consistency checking is off. The
  // trailing `else` of whichever fast path is compiled in binds to the braced
  // block after #endif; with neither enabled that block runs unconditionally.
#if KMP_USE_INLINED_TAS
  if (tag == locktag_tas && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
  } else
#elif KMP_USE_INLINED_FUTEX
  if (tag == locktag_futex && !__kmp_env_consistency_check) {
    KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
  } else
#endif
  {
    __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
  }
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK

  kmp_user_lock_p lck;

  // Small TAS/futex locks live in the user's lock storage; anything else is
  // looked up from the user's lock variable.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  // NOTE(review): this path passes lck as the OMPT wait id, whereas the
  // init/destroy entry points pass user_lock -- confirm this is intended.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.ompt_callback_mutex_acquire) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
        ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
        (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

  ACQUIRE_LOCK(lck, gtid);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_mutex_acquired) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
        ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2628
/* Acquire a nestable user lock.

   loc       source location information (not referenced here)
   gtid      global thread id of the caller
   user_lock address of the user's nest-lock variable

   After the acquisition, the OMPT notification depends on the acquire status:
   KMP_LOCK_ACQUIRED_FIRST triggers the mutex_acquired callback, any other
   status triggers the nest_lock callback with ompt_scope_begin. */
void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
#if KMP_USE_DYNAMIC_LOCK

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(user_lock),
          (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
    }
  }
#endif
  int acquire_status =
      KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
  // The status is consumed only by the OMPT block below; the cast keeps the
  // variable "used" when that block is compiled out.
  (void) acquire_status;
#if USE_ITT_BUILD
  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
            codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
      }
    }
  }
#endif

#else // KMP_USE_DYNAMIC_LOCK
  int acquire_status;
  kmp_user_lock_p lck;

  // TAS/futex locks whose poll word plus nesting depth fit in the user's
  // storage live in place; anything else is looked up.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
       OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
            OMP_NEST_LOCK_T_SIZE)) {
    lck = (kmp_user_lock_p)user_lock;
  }
#endif
  else {
    lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock");
  }

#if USE_ITT_BUILD
  __kmp_itt_lock_acquiring(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Prefer a return address recorded for this thread; if none is available,
  // use our own return address.
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
  if (ompt_enabled.enabled) {
    if (ompt_enabled.ompt_callback_mutex_acquire) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
          ompt_mutex_nest_lock, omp_lock_hint_none,
          __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
          codeptr);
    }
  }
#endif

  // The acquire status is returned through the out-parameter.
  ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status);

#if USE_ITT_BUILD
  __kmp_itt_lock_acquired(lck);
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
      if (ompt_enabled.ompt_callback_mutex_acquired) {
        // lock_first
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
            ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
      }
    } else {
      if (ompt_enabled.ompt_callback_nest_lock) {
        // lock_next
        ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
            ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
      }
    }
  }
#endif

#endif // KMP_USE_DYNAMIC_LOCK
}
2739
__kmpc_unset_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2740 void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2741 #if KMP_USE_DYNAMIC_LOCK
2742
2743 int tag = KMP_EXTRACT_D_TAG(user_lock);
2744 #if USE_ITT_BUILD
2745 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2746 #endif
2747 #if KMP_USE_INLINED_TAS
2748 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2749 KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2750 } else
2751 #elif KMP_USE_INLINED_FUTEX
2752 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2753 KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2754 } else
2755 #endif
2756 {
2757 __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2758 }
2759
2760 #if OMPT_SUPPORT && OMPT_OPTIONAL
2761 // This is the case, if called from omp_init_lock_with_hint:
2762 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2763 if (!codeptr)
2764 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2765 if (ompt_enabled.ompt_callback_mutex_released) {
2766 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2767 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2768 }
2769 #endif
2770
2771 #else // KMP_USE_DYNAMIC_LOCK
2772
2773 kmp_user_lock_p lck;
2774
2775 /* Can't use serial interval since not block structured */
2776 /* release the lock */
2777
2778 if ((__kmp_user_lock_kind == lk_tas) &&
2779 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
2780 #if KMP_OS_LINUX && \
2781 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2782 // "fast" path implemented to fix customer performance issue
2783 #if USE_ITT_BUILD
2784 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2785 #endif /* USE_ITT_BUILD */
2786 TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2787 KMP_MB();
2788
2789 #if OMPT_SUPPORT && OMPT_OPTIONAL
2790 // This is the case, if called from omp_init_lock_with_hint:
2791 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2792 if (!codeptr)
2793 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2794 if (ompt_enabled.ompt_callback_mutex_released) {
2795 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2796 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2797 }
2798 #endif
2799
2800 return;
2801 #else
2802 lck = (kmp_user_lock_p)user_lock;
2803 #endif
2804 }
2805 #if KMP_USE_FUTEX
2806 else if ((__kmp_user_lock_kind == lk_futex) &&
2807 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
2808 lck = (kmp_user_lock_p)user_lock;
2809 }
2810 #endif
2811 else {
2812 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock");
2813 }
2814
2815 #if USE_ITT_BUILD
2816 __kmp_itt_lock_releasing(lck);
2817 #endif /* USE_ITT_BUILD */
2818
2819 RELEASE_LOCK(lck, gtid);
2820
2821 #if OMPT_SUPPORT && OMPT_OPTIONAL
2822 // This is the case, if called from omp_init_lock_with_hint:
2823 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2824 if (!codeptr)
2825 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2826 if (ompt_enabled.ompt_callback_mutex_released) {
2827 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2828 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2829 }
2830 #endif
2831
2832 #endif // KMP_USE_DYNAMIC_LOCK
2833 }
2834
2835 /* release the lock */
__kmpc_unset_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2836 void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2837 #if KMP_USE_DYNAMIC_LOCK
2838
2839 #if USE_ITT_BUILD
2840 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2841 #endif
2842 int release_status =
2843 KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2844 (void) release_status;
2845
2846 #if OMPT_SUPPORT && OMPT_OPTIONAL
2847 // This is the case, if called from omp_init_lock_with_hint:
2848 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2849 if (!codeptr)
2850 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2851 if (ompt_enabled.enabled) {
2852 if (release_status == KMP_LOCK_RELEASED) {
2853 if (ompt_enabled.ompt_callback_mutex_released) {
2854 // release_lock_last
2855 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2856 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
2857 codeptr);
2858 }
2859 } else if (ompt_enabled.ompt_callback_nest_lock) {
2860 // release_lock_prev
2861 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2862 ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2863 }
2864 }
2865 #endif
2866
2867 #else // KMP_USE_DYNAMIC_LOCK
2868
2869 kmp_user_lock_p lck;
2870
2871 /* Can't use serial interval since not block structured */
2872
2873 if ((__kmp_user_lock_kind == lk_tas) &&
2874 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
2875 OMP_NEST_LOCK_T_SIZE)) {
2876 #if KMP_OS_LINUX && \
2877 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2878 // "fast" path implemented to fix customer performance issue
2879 kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock;
2880 #if USE_ITT_BUILD
2881 __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2882 #endif /* USE_ITT_BUILD */
2883
2884 #if OMPT_SUPPORT && OMPT_OPTIONAL
2885 int release_status = KMP_LOCK_STILL_HELD;
2886 #endif
2887
2888 if (--(tl->lk.depth_locked) == 0) {
2889 TCW_4(tl->lk.poll, 0);
2890 #if OMPT_SUPPORT && OMPT_OPTIONAL
2891 release_status = KMP_LOCK_RELEASED;
2892 #endif
2893 }
2894 KMP_MB();
2895
2896 #if OMPT_SUPPORT && OMPT_OPTIONAL
2897 // This is the case, if called from omp_init_lock_with_hint:
2898 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2899 if (!codeptr)
2900 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2901 if (ompt_enabled.enabled) {
2902 if (release_status == KMP_LOCK_RELEASED) {
2903 if (ompt_enabled.ompt_callback_mutex_released) {
2904 // release_lock_last
2905 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2906 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2907 }
2908 } else if (ompt_enabled.ompt_callback_nest_lock) {
2909 // release_lock_previous
2910 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2911 ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2912 }
2913 }
2914 #endif
2915
2916 return;
2917 #else
2918 lck = (kmp_user_lock_p)user_lock;
2919 #endif
2920 }
2921 #if KMP_USE_FUTEX
2922 else if ((__kmp_user_lock_kind == lk_futex) &&
2923 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
2924 OMP_NEST_LOCK_T_SIZE)) {
2925 lck = (kmp_user_lock_p)user_lock;
2926 }
2927 #endif
2928 else {
2929 lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock");
2930 }
2931
2932 #if USE_ITT_BUILD
2933 __kmp_itt_lock_releasing(lck);
2934 #endif /* USE_ITT_BUILD */
2935
2936 int release_status;
2937 release_status = RELEASE_NESTED_LOCK(lck, gtid);
2938 #if OMPT_SUPPORT && OMPT_OPTIONAL
2939 // This is the case, if called from omp_init_lock_with_hint:
2940 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2941 if (!codeptr)
2942 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2943 if (ompt_enabled.enabled) {
2944 if (release_status == KMP_LOCK_RELEASED) {
2945 if (ompt_enabled.ompt_callback_mutex_released) {
2946 // release_lock_last
2947 ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
2948 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2949 }
2950 } else if (ompt_enabled.ompt_callback_nest_lock) {
2951 // release_lock_previous
2952 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
2953 ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
2954 }
2955 }
2956 #endif
2957
2958 #endif // KMP_USE_DYNAMIC_LOCK
2959 }
2960
2961 /* try to acquire the lock */
__kmpc_test_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)2962 int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
2963 KMP_COUNT_BLOCK(OMP_test_lock);
2964
2965 #if KMP_USE_DYNAMIC_LOCK
2966 int rc;
2967 int tag = KMP_EXTRACT_D_TAG(user_lock);
2968 #if USE_ITT_BUILD
2969 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2970 #endif
2971 #if OMPT_SUPPORT && OMPT_OPTIONAL
2972 // This is the case, if called from omp_init_lock_with_hint:
2973 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2974 if (!codeptr)
2975 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2976 if (ompt_enabled.ompt_callback_mutex_acquire) {
2977 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
2978 ompt_mutex_lock, omp_lock_hint_none,
2979 __ompt_get_mutex_impl_type(user_lock),
2980 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
2981 }
2982 #endif
2983 #if KMP_USE_INLINED_TAS
2984 if (tag == locktag_tas && !__kmp_env_consistency_check) {
2985 KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2986 } else
2987 #elif KMP_USE_INLINED_FUTEX
2988 if (tag == locktag_futex && !__kmp_env_consistency_check) {
2989 KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2990 } else
2991 #endif
2992 {
2993 rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2994 }
2995 if (rc) {
2996 #if USE_ITT_BUILD
2997 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2998 #endif
2999 #if OMPT_SUPPORT && OMPT_OPTIONAL
3000 if (ompt_enabled.ompt_callback_mutex_acquired) {
3001 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3002 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3003 }
3004 #endif
3005 return FTN_TRUE;
3006 } else {
3007 #if USE_ITT_BUILD
3008 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3009 #endif
3010 return FTN_FALSE;
3011 }
3012
3013 #else // KMP_USE_DYNAMIC_LOCK
3014
3015 kmp_user_lock_p lck;
3016 int rc;
3017
3018 if ((__kmp_user_lock_kind == lk_tas) &&
3019 (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) {
3020 lck = (kmp_user_lock_p)user_lock;
3021 }
3022 #if KMP_USE_FUTEX
3023 else if ((__kmp_user_lock_kind == lk_futex) &&
3024 (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) {
3025 lck = (kmp_user_lock_p)user_lock;
3026 }
3027 #endif
3028 else {
3029 lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock");
3030 }
3031
3032 #if USE_ITT_BUILD
3033 __kmp_itt_lock_acquiring(lck);
3034 #endif /* USE_ITT_BUILD */
3035 #if OMPT_SUPPORT && OMPT_OPTIONAL
3036 // This is the case, if called from omp_init_lock_with_hint:
3037 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3038 if (!codeptr)
3039 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3040 if (ompt_enabled.ompt_callback_mutex_acquire) {
3041 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3042 ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(),
3043 (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3044 }
3045 #endif
3046
3047 rc = TEST_LOCK(lck, gtid);
3048 #if USE_ITT_BUILD
3049 if (rc) {
3050 __kmp_itt_lock_acquired(lck);
3051 } else {
3052 __kmp_itt_lock_cancelled(lck);
3053 }
3054 #endif /* USE_ITT_BUILD */
3055 #if OMPT_SUPPORT && OMPT_OPTIONAL
3056 if (rc && ompt_enabled.ompt_callback_mutex_acquired) {
3057 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3058 ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3059 }
3060 #endif
3061
3062 return (rc ? FTN_TRUE : FTN_FALSE);
3063
3064 /* Can't use serial interval since not block structured */
3065
3066 #endif // KMP_USE_DYNAMIC_LOCK
3067 }
3068
3069 /* try to acquire the lock */
__kmpc_test_nest_lock(ident_t * loc,kmp_int32 gtid,void ** user_lock)3070 int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) {
3071 #if KMP_USE_DYNAMIC_LOCK
3072 int rc;
3073 #if USE_ITT_BUILD
3074 __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
3075 #endif
3076 #if OMPT_SUPPORT && OMPT_OPTIONAL
3077 // This is the case, if called from omp_init_lock_with_hint:
3078 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3079 if (!codeptr)
3080 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3081 if (ompt_enabled.ompt_callback_mutex_acquire) {
3082 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3083 ompt_mutex_nest_lock, omp_lock_hint_none,
3084 __ompt_get_mutex_impl_type(user_lock),
3085 (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3086 }
3087 #endif
3088 rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
3089 #if USE_ITT_BUILD
3090 if (rc) {
3091 __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
3092 } else {
3093 __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
3094 }
3095 #endif
3096 #if OMPT_SUPPORT && OMPT_OPTIONAL
3097 if (ompt_enabled.enabled && rc) {
3098 if (rc == 1) {
3099 if (ompt_enabled.ompt_callback_mutex_acquired) {
3100 // lock_first
3101 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3102 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock,
3103 codeptr);
3104 }
3105 } else {
3106 if (ompt_enabled.ompt_callback_nest_lock) {
3107 // lock_next
3108 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3109 ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr);
3110 }
3111 }
3112 }
3113 #endif
3114 return rc;
3115
3116 #else // KMP_USE_DYNAMIC_LOCK
3117
3118 kmp_user_lock_p lck;
3119 int rc;
3120
3121 if ((__kmp_user_lock_kind == lk_tas) &&
3122 (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <=
3123 OMP_NEST_LOCK_T_SIZE)) {
3124 lck = (kmp_user_lock_p)user_lock;
3125 }
3126 #if KMP_USE_FUTEX
3127 else if ((__kmp_user_lock_kind == lk_futex) &&
3128 (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <=
3129 OMP_NEST_LOCK_T_SIZE)) {
3130 lck = (kmp_user_lock_p)user_lock;
3131 }
3132 #endif
3133 else {
3134 lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock");
3135 }
3136
3137 #if USE_ITT_BUILD
3138 __kmp_itt_lock_acquiring(lck);
3139 #endif /* USE_ITT_BUILD */
3140
3141 #if OMPT_SUPPORT && OMPT_OPTIONAL
3142 // This is the case, if called from omp_init_lock_with_hint:
3143 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
3144 if (!codeptr)
3145 codeptr = OMPT_GET_RETURN_ADDRESS(0);
3146 if (ompt_enabled.enabled) &&
3147 ompt_enabled.ompt_callback_mutex_acquire) {
3148 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
3149 ompt_mutex_nest_lock, omp_lock_hint_none,
3150 __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck,
3151 codeptr);
3152 }
3153 #endif
3154
3155 rc = TEST_NESTED_LOCK(lck, gtid);
3156 #if USE_ITT_BUILD
3157 if (rc) {
3158 __kmp_itt_lock_acquired(lck);
3159 } else {
3160 __kmp_itt_lock_cancelled(lck);
3161 }
3162 #endif /* USE_ITT_BUILD */
3163 #if OMPT_SUPPORT && OMPT_OPTIONAL
3164 if (ompt_enabled.enabled && rc) {
3165 if (rc == 1) {
3166 if (ompt_enabled.ompt_callback_mutex_acquired) {
3167 // lock_first
3168 ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
3169 ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3170 }
3171 } else {
3172 if (ompt_enabled.ompt_callback_nest_lock) {
3173 // lock_next
3174 ompt_callbacks.ompt_callback(ompt_callback_nest_lock)(
3175 ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
3176 }
3177 }
3178 }
3179 #endif
3180 return rc;
3181
3182 /* Can't use serial interval since not block structured */
3183
3184 #endif // KMP_USE_DYNAMIC_LOCK
3185 }
3186
// Interface to the fast scalable reduction routines

// The reduction method selected on entry to a reduce block is kept in a
// thread-local structure so that it can be used across functions: it is read
// back in the __kmpc_end_reduce* functions.
// Another solution would be to re-determine the method one more time in the
// __kmpc_end_reduce* functions (a new prototype would be required then).
// AT: which solution is better?

// Store rmethod in the packed_reduction_method field of thread gtid.
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod)                              \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

// Read the packed_reduction_method field of thread gtid.
#define __KMP_GET_REDUCTION_METHOD(gtid)                                       \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// For a description of the packed_reduction_method variable, see the macros
// in kmp.h.
3202
3203 // used in a critical section reduce block
3204 static __forceinline void
__kmp_enter_critical_section_reduce_block(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)3205 __kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3206 kmp_critical_name *crit) {
3207
3208 // this lock was visible to a customer and to the threading profile tool as a
3209 // serial overhead span (although it's used for an internal purpose only)
3210 // why was it visible in previous implementation?
3211 // should we keep it visible in new reduce block?
3212 kmp_user_lock_p lck;
3213
3214 #if KMP_USE_DYNAMIC_LOCK
3215
3216 kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
3217 // Check if it is initialized.
3218 if (*lk == 0) {
3219 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3220 KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
3221 KMP_GET_D_TAG(__kmp_user_lock_seq));
3222 } else {
3223 __kmp_init_indirect_csptr(crit, loc, global_tid,
3224 KMP_GET_I_TAG(__kmp_user_lock_seq));
3225 }
3226 }
3227 // Branch for accessing the actual lock object and set operation. This
3228 // branching is inevitable since this lock initialization does not follow the
3229 // normal dispatch path (lock table is not used).
3230 if (KMP_EXTRACT_D_TAG(lk) != 0) {
3231 lck = (kmp_user_lock_p)lk;
3232 KMP_DEBUG_ASSERT(lck != NULL);
3233 if (__kmp_env_consistency_check) {
3234 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3235 }
3236 KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
3237 } else {
3238 kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
3239 lck = ilk->lock;
3240 KMP_DEBUG_ASSERT(lck != NULL);
3241 if (__kmp_env_consistency_check) {
3242 __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
3243 }
3244 KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
3245 }
3246
3247 #else // KMP_USE_DYNAMIC_LOCK
3248
3249 // We know that the fast reduction code is only emitted by Intel compilers
3250 // with 32 byte critical sections. If there isn't enough space, then we
3251 // have to use a pointer.
3252 if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
3253 lck = (kmp_user_lock_p)crit;
3254 } else {
3255 lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
3256 }
3257 KMP_DEBUG_ASSERT(lck != NULL);
3258
3259 if (__kmp_env_consistency_check)
3260 __kmp_push_sync(global_tid, ct_critical, loc, lck);
3261
3262 __kmp_acquire_user_lock_with_checks(lck, global_tid);
3263
3264 #endif // KMP_USE_DYNAMIC_LOCK
3265 }
3266
3267 // used in a critical section reduce block
3268 static __forceinline void
__kmp_end_critical_section_reduce_block(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * crit)3269 __kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
3270 kmp_critical_name *crit) {
3271
3272 kmp_user_lock_p lck;
3273
3274 #if KMP_USE_DYNAMIC_LOCK
3275
3276 if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
3277 lck = (kmp_user_lock_p)crit;
3278 if (__kmp_env_consistency_check)
3279 __kmp_pop_sync(global_tid, ct_critical, loc);
3280 KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
3281 } else {
3282 kmp_indirect_lock_t *ilk =
3283 (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
3284 if (__kmp_env_consistency_check)
3285 __kmp_pop_sync(global_tid, ct_critical, loc);
3286 KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
3287 }
3288
3289 #else // KMP_USE_DYNAMIC_LOCK
3290
3291 // We know that the fast reduction code is only emitted by Intel compilers
3292 // with 32 byte critical sections. If there isn't enough space, then we have
3293 // to use a pointer.
3294 if (__kmp_base_user_lock_size > 32) {
3295 lck = *((kmp_user_lock_p *)crit);
3296 KMP_ASSERT(lck != NULL);
3297 } else {
3298 lck = (kmp_user_lock_p)crit;
3299 }
3300
3301 if (__kmp_env_consistency_check)
3302 __kmp_pop_sync(global_tid, ct_critical, loc);
3303
3304 __kmp_release_user_lock_with_checks(lck, global_tid);
3305
3306 #endif // KMP_USE_DYNAMIC_LOCK
3307 } // __kmp_end_critical_section_reduce_block
3308
3309 static __forceinline int
__kmp_swap_teams_for_teams_reduction(kmp_info_t * th,kmp_team_t ** team_p,int * task_state)3310 __kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
3311 int *task_state) {
3312 kmp_team_t *team;
3313
3314 // Check if we are inside the teams construct?
3315 if (th->th.th_teams_microtask) {
3316 *team_p = team = th->th.th_team;
3317 if (team->t.t_level == th->th.th_teams_level) {
3318 // This is reduction at teams construct.
3319 KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
3320 // Let's swap teams temporarily for the reduction.
3321 th->th.th_info.ds.ds_tid = team->t.t_master_tid;
3322 th->th.th_team = team->t.t_parent;
3323 th->th.th_team_nproc = th->th.th_team->t.t_nproc;
3324 th->th.th_task_team = th->th.th_team->t.t_task_team[0];
3325 *task_state = th->th.th_task_state;
3326 th->th.th_task_state = 0;
3327
3328 return 1;
3329 }
3330 }
3331 return 0;
3332 }
3333
// Undo __kmp_swap_teams_for_teams_reduction: point the thread descriptor back
// at 'team' (the team saved through team_p by the swap) and reinstate the
// saved 'task_state'.
static __forceinline void
__kmp_restore_swapped_teams(kmp_info_t *th, kmp_team_t *team, int task_state) {
  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
  // The swap only happens when ds_tid was 0 (it asserts that), so 0 is the
  // value to put back.
  th->th.th_info.ds.ds_tid = 0;
  th->th.th_team = team;
  th->th.th_team_nproc = team->t.t_nproc;
  // The task team is selected by the saved task state.
  th->th.th_task_team = team->t.t_task_team[task_state];
  th->th.th_task_state = task_state;
}
3343
/* 2.a.i. Reduce Block without a terminating barrier */
/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for every thread when the critical-section or empty method is
selected; 2 for every thread when the atomic method is selected; for the tree
method, 1 for the thread for which the reduction barrier returns 0 (the
master thread) and 0 for all other team threads

The nowait version is used for a reduce clause with the nowait argument.
*/
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  // 'team' and 'task_state' are only written by the teams swap below and only
  // read back when teams_swapped is non-zero.
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team specific property
  //     (a thread can reach the next reduce block on the next construct, reduce
  //     method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  //     (if both construct-specific and team-specific variables were shared,
  //     then unnecessary extra syncs would be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  // a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unnecessary extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  OMPT_REDUCTION_DECL(th, global_tid);
  if (packed_reduction_method == critical_reduce_block) {

    OMPT_REDUCTION_BEGIN;

    // the lock taken here is released in __kmpc_end_reduce_nowait()
    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    OMPT_REDUCTION_BEGIN;

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    //     (it's not quite good, because the checking block has been closed by
    //     this 'pop',
    //      but atomic operation has not been executed yet, will be executed
    //      slightly later, literally on next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

// AT: performance issue: a real barrier here
// AT:     (if master goes slow, other threads are blocked here waiting for the
// master to come and release them)
// AT:     (it's not what a customer might expect specifying NOWAIT clause)
// AT:     (specifying NOWAIT won't result in improvement of performance, it'll
// be confusing to a customer)
// AT: another implementation of *barrier_gather*nowait() (or some other design)
// might go faster and be more in line with sense of NOWAIT
// AT: TO DO: do epcc test and compare times

// this barrier should be invisible to a customer and to the threading profile
// tool (it's neither a terminating barrier nor customer's code, it's
// used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially lead to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    //         so we set-up the necessary frame/return addresses.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    // invert the barrier result: barrier value 0 -> return 1, non-zero -> 0
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    // NOTE(review): enter_frame is set under OMPT_SUPPORT above but only
    // cleared here when OMPT_OPTIONAL is also enabled -- confirm this is
    // intended.
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  // undo the temporary team swap done for a teams-level reduction
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}
3511
3512 /*!
3513 @ingroup SYNCHRONIZATION
3514 @param loc source location information
3515 @param global_tid global thread id.
3516 @param lck pointer to the unique lock data structure
3517
3518 Finish the execution of a reduce nowait.
3519 */
__kmpc_end_reduce_nowait(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * lck)3520 void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid,
3521 kmp_critical_name *lck) {
3522
3523 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3524
3525 KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid));
3526
3527 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3528
3529 OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid);
3530
3531 if (packed_reduction_method == critical_reduce_block) {
3532
3533 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3534 OMPT_REDUCTION_END;
3535
3536 } else if (packed_reduction_method == empty_reduce_block) {
3537
3538 // usage: if team size == 1, no synchronization is required ( on Intel
3539 // platforms only )
3540
3541 OMPT_REDUCTION_END;
3542
3543 } else if (packed_reduction_method == atomic_reduce_block) {
3544
3545 // neither master nor other workers should get here
3546 // (code gen does not generate this call in case 2: atomic reduce block)
3547 // actually it's better to remove this elseif at all;
3548 // after removal this value will checked by the 'else' and will assert
3549
3550 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3551 tree_reduce_block)) {
3552
3553 // only master gets here
3554 // OMPT: tree reduction is annotated in the barrier code
3555
3556 } else {
3557
3558 // should never reach this block
3559 KMP_ASSERT(0); // "unexpected method"
3560 }
3561
3562 if (__kmp_env_consistency_check)
3563 __kmp_pop_sync(global_tid, ct_reduce, loc);
3564
3565 KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n",
3566 global_tid, packed_reduction_method));
3567
3568 return;
3569 }
3570
3571 /* 2.a.ii. Reduce Block with a terminating barrier */
3572
3573 /*!
3574 @ingroup SYNCHRONIZATION
3575 @param loc source location information
3576 @param global_tid global thread number
3577 @param num_vars number of items (variables) to be reduced
3578 @param reduce_size size of data in bytes to be reduced
3579 @param reduce_data pointer to data to be reduced
3580 @param reduce_func callback function providing reduction operation on two
3581 operands and returning result of reduction in lhs_data
3582 @param lck pointer to the unique lock data structure
3583 @result 1 for the master thread, 0 for all other team threads, 2 for all team
3584 threads if atomic reduction needed
3585
3586 A blocking reduce that includes an implicit barrier.
3587 */
__kmpc_reduce(ident_t * loc,kmp_int32 global_tid,kmp_int32 num_vars,size_t reduce_size,void * reduce_data,void (* reduce_func)(void * lhs_data,void * rhs_data),kmp_critical_name * lck)3588 kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
3589 size_t reduce_size, void *reduce_data,
3590 void (*reduce_func)(void *lhs_data, void *rhs_data),
3591 kmp_critical_name *lck) {
3592 KMP_COUNT_BLOCK(REDUCE_wait);
3593 int retval = 0;
3594 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3595 kmp_info_t *th;
3596 kmp_team_t *team;
3597 int teams_swapped = 0, task_state;
3598
3599 KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));
3600
3601 // why do we need this initialization here at all?
3602 // Reduction clause can not be a stand-alone directive.
3603
3604 // do not call __kmp_serial_initialize(), it will be called by
3605 // __kmp_parallel_initialize() if needed
3606 // possible detection of false-positive race by the threadchecker ???
3607 if (!TCR_4(__kmp_init_parallel))
3608 __kmp_parallel_initialize();
3609
3610 __kmp_resume_if_soft_paused();
3611
3612 // check correctness of reduce block nesting
3613 #if KMP_USE_DYNAMIC_LOCK
3614 if (__kmp_env_consistency_check)
3615 __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
3616 #else
3617 if (__kmp_env_consistency_check)
3618 __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
3619 #endif
3620
3621 th = __kmp_thread_from_gtid(global_tid);
3622 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3623
3624 packed_reduction_method = __kmp_determine_reduction_method(
3625 loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
3626 __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
3627
3628 OMPT_REDUCTION_DECL(th, global_tid);
3629
3630 if (packed_reduction_method == critical_reduce_block) {
3631
3632 OMPT_REDUCTION_BEGIN;
3633 __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
3634 retval = 1;
3635
3636 } else if (packed_reduction_method == empty_reduce_block) {
3637
3638 OMPT_REDUCTION_BEGIN;
3639 // usage: if team size == 1, no synchronization is required ( Intel
3640 // platforms only )
3641 retval = 1;
3642
3643 } else if (packed_reduction_method == atomic_reduce_block) {
3644
3645 retval = 2;
3646
3647 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3648 tree_reduce_block)) {
3649
3650 // case tree_reduce_block:
3651 // this barrier should be visible to a customer and to the threading profile
3652 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3653 #if OMPT_SUPPORT
3654 ompt_frame_t *ompt_frame;
3655 if (ompt_enabled.enabled) {
3656 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3657 if (ompt_frame->enter_frame.ptr == NULL)
3658 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3659 OMPT_STORE_RETURN_ADDRESS(global_tid);
3660 }
3661 #endif
3662 #if USE_ITT_NOTIFY
3663 __kmp_threads[global_tid]->th.th_ident =
3664 loc; // needed for correct notification of frames
3665 #endif
3666 retval =
3667 __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3668 global_tid, TRUE, reduce_size, reduce_data, reduce_func);
3669 retval = (retval != 0) ? (0) : (1);
3670 #if OMPT_SUPPORT && OMPT_OPTIONAL
3671 if (ompt_enabled.enabled) {
3672 ompt_frame->enter_frame = ompt_data_none;
3673 }
3674 #endif
3675
3676 // all other workers except master should do this pop here
3677 // ( none of other workers except master will enter __kmpc_end_reduce() )
3678 if (__kmp_env_consistency_check) {
3679 if (retval == 0) { // 0: all other workers; 1: master
3680 __kmp_pop_sync(global_tid, ct_reduce, loc);
3681 }
3682 }
3683
3684 } else {
3685
3686 // should never reach this block
3687 KMP_ASSERT(0); // "unexpected method"
3688 }
3689 if (teams_swapped) {
3690 __kmp_restore_swapped_teams(th, team, task_state);
3691 }
3692
3693 KA_TRACE(10,
3694 ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
3695 global_tid, packed_reduction_method, retval));
3696 return retval;
3697 }
3698
3699 /*!
3700 @ingroup SYNCHRONIZATION
3701 @param loc source location information
3702 @param global_tid global thread id.
3703 @param lck pointer to the unique lock data structure
3704
3705 Finish the execution of a blocking reduce.
3706 The <tt>lck</tt> pointer must be the same as that used in the corresponding
3707 start function.
3708 */
__kmpc_end_reduce(ident_t * loc,kmp_int32 global_tid,kmp_critical_name * lck)3709 void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
3710 kmp_critical_name *lck) {
3711
3712 PACKED_REDUCTION_METHOD_T packed_reduction_method;
3713 kmp_info_t *th;
3714 kmp_team_t *team;
3715 int teams_swapped = 0, task_state;
3716
3717 KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));
3718
3719 th = __kmp_thread_from_gtid(global_tid);
3720 teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);
3721
3722 packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);
3723
3724 // this barrier should be visible to a customer and to the threading profile
3725 // tool (it's a terminating barrier on constructs if NOWAIT not specified)
3726 OMPT_REDUCTION_DECL(th, global_tid);
3727
3728 if (packed_reduction_method == critical_reduce_block) {
3729 __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
3730
3731 OMPT_REDUCTION_END;
3732
3733 // TODO: implicit barrier: should be exposed
3734 #if OMPT_SUPPORT
3735 ompt_frame_t *ompt_frame;
3736 if (ompt_enabled.enabled) {
3737 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3738 if (ompt_frame->enter_frame.ptr == NULL)
3739 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3740 OMPT_STORE_RETURN_ADDRESS(global_tid);
3741 }
3742 #endif
3743 #if USE_ITT_NOTIFY
3744 __kmp_threads[global_tid]->th.th_ident = loc;
3745 #endif
3746 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3747 #if OMPT_SUPPORT && OMPT_OPTIONAL
3748 if (ompt_enabled.enabled) {
3749 ompt_frame->enter_frame = ompt_data_none;
3750 }
3751 #endif
3752
3753 } else if (packed_reduction_method == empty_reduce_block) {
3754
3755 OMPT_REDUCTION_END;
3756
3757 // usage: if team size==1, no synchronization is required (Intel platforms only)
3758
3759 // TODO: implicit barrier: should be exposed
3760 #if OMPT_SUPPORT
3761 ompt_frame_t *ompt_frame;
3762 if (ompt_enabled.enabled) {
3763 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3764 if (ompt_frame->enter_frame.ptr == NULL)
3765 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3766 OMPT_STORE_RETURN_ADDRESS(global_tid);
3767 }
3768 #endif
3769 #if USE_ITT_NOTIFY
3770 __kmp_threads[global_tid]->th.th_ident = loc;
3771 #endif
3772 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3773 #if OMPT_SUPPORT && OMPT_OPTIONAL
3774 if (ompt_enabled.enabled) {
3775 ompt_frame->enter_frame = ompt_data_none;
3776 }
3777 #endif
3778
3779 } else if (packed_reduction_method == atomic_reduce_block) {
3780
3781 #if OMPT_SUPPORT
3782 ompt_frame_t *ompt_frame;
3783 if (ompt_enabled.enabled) {
3784 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
3785 if (ompt_frame->enter_frame.ptr == NULL)
3786 ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
3787 OMPT_STORE_RETURN_ADDRESS(global_tid);
3788 }
3789 #endif
3790 // TODO: implicit barrier: should be exposed
3791 #if USE_ITT_NOTIFY
3792 __kmp_threads[global_tid]->th.th_ident = loc;
3793 #endif
3794 __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
3795 #if OMPT_SUPPORT && OMPT_OPTIONAL
3796 if (ompt_enabled.enabled) {
3797 ompt_frame->enter_frame = ompt_data_none;
3798 }
3799 #endif
3800
3801 } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
3802 tree_reduce_block)) {
3803
3804 // only master executes here (master releases all other workers)
3805 __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
3806 global_tid);
3807
3808 } else {
3809
3810 // should never reach this block
3811 KMP_ASSERT(0); // "unexpected method"
3812 }
3813 if (teams_swapped) {
3814 __kmp_restore_swapped_teams(th, team, task_state);
3815 }
3816
3817 if (__kmp_env_consistency_check)
3818 __kmp_pop_sync(global_tid, ct_reduce, loc);
3819
3820 KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
3821 global_tid, packed_reduction_method));
3822
3823 return;
3824 }
3825
3826 #undef __KMP_GET_REDUCTION_METHOD
3827 #undef __KMP_SET_REDUCTION_METHOD
3828
3829 /* end of interface to fast scalable reduce routines */
3830
__kmpc_get_taskid()3831 kmp_uint64 __kmpc_get_taskid() {
3832
3833 kmp_int32 gtid;
3834 kmp_info_t *thread;
3835
3836 gtid = __kmp_get_gtid();
3837 if (gtid < 0) {
3838 return 0;
3839 }
3840 thread = __kmp_thread_from_gtid(gtid);
3841 return thread->th.th_current_task->td_task_id;
3842
3843 } // __kmpc_get_taskid
3844
__kmpc_get_parent_taskid()3845 kmp_uint64 __kmpc_get_parent_taskid() {
3846
3847 kmp_int32 gtid;
3848 kmp_info_t *thread;
3849 kmp_taskdata_t *parent_task;
3850
3851 gtid = __kmp_get_gtid();
3852 if (gtid < 0) {
3853 return 0;
3854 }
3855 thread = __kmp_thread_from_gtid(gtid);
3856 parent_task = thread->th.th_current_task->td_parent;
3857 return (parent_task == NULL ? 0 : parent_task->td_task_id);
3858
3859 } // __kmpc_get_parent_taskid
3860
3861 /*!
3862 @ingroup WORK_SHARING
3863 @param loc source location information.
3864 @param gtid global thread number.
3865 @param num_dims number of associated doacross loops.
3866 @param dims info on loops bounds.
3867
3868 Initialize doacross loop information.
3869 Expect compiler send us inclusive bounds,
3870 e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
3871 */
__kmpc_doacross_init(ident_t * loc,int gtid,int num_dims,const struct kmp_dim * dims)3872 void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
3873 const struct kmp_dim *dims) {
3874 int j, idx;
3875 kmp_int64 last, trace_count;
3876 kmp_info_t *th = __kmp_threads[gtid];
3877 kmp_team_t *team = th->th.th_team;
3878 kmp_uint32 *flags;
3879 kmp_disp_t *pr_buf = th->th.th_dispatch;
3880 dispatch_shared_info_t *sh_buf;
3881
3882 KA_TRACE(
3883 20,
3884 ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3885 gtid, num_dims, !team->t.t_serialized));
3886 KMP_DEBUG_ASSERT(dims != NULL);
3887 KMP_DEBUG_ASSERT(num_dims > 0);
3888
3889 if (team->t.t_serialized) {
3890 KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
3891 return; // no dependencies if team is serialized
3892 }
3893 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3894 idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
3895 // the next loop
3896 sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3897
3898 // Save bounds info into allocated private buffer
3899 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3900 pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
3901 th, sizeof(kmp_int64) * (4 * num_dims + 1));
3902 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3903 pr_buf->th_doacross_info[0] =
3904 (kmp_int64)num_dims; // first element is number of dimensions
3905 // Save also address of num_done in order to access it later without knowing
3906 // the buffer index
3907 pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3908 pr_buf->th_doacross_info[2] = dims[0].lo;
3909 pr_buf->th_doacross_info[3] = dims[0].up;
3910 pr_buf->th_doacross_info[4] = dims[0].st;
3911 last = 5;
3912 for (j = 1; j < num_dims; ++j) {
3913 kmp_int64
3914 range_length; // To keep ranges of all dimensions but the first dims[0]
3915 if (dims[j].st == 1) { // most common case
3916 // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3917 range_length = dims[j].up - dims[j].lo + 1;
3918 } else {
3919 if (dims[j].st > 0) {
3920 KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3921 range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3922 } else { // negative increment
3923 KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3924 range_length =
3925 (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3926 }
3927 }
3928 pr_buf->th_doacross_info[last++] = range_length;
3929 pr_buf->th_doacross_info[last++] = dims[j].lo;
3930 pr_buf->th_doacross_info[last++] = dims[j].up;
3931 pr_buf->th_doacross_info[last++] = dims[j].st;
3932 }
3933
3934 // Compute total trip count.
3935 // Start with range of dims[0] which we don't need to keep in the buffer.
3936 if (dims[0].st == 1) { // most common case
3937 trace_count = dims[0].up - dims[0].lo + 1;
3938 } else if (dims[0].st > 0) {
3939 KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3940 trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3941 } else { // negative increment
3942 KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3943 trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3944 }
3945 for (j = 1; j < num_dims; ++j) {
3946 trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3947 }
3948 KMP_DEBUG_ASSERT(trace_count > 0);
3949
3950 // Check if shared buffer is not occupied by other loop (idx -
3951 // __kmp_dispatch_num_buffers)
3952 if (idx != sh_buf->doacross_buf_idx) {
3953 // Shared buffer is occupied, wait for it to be free
3954 __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
3955 __kmp_eq_4, NULL);
3956 }
3957 #if KMP_32_BIT_ARCH
3958 // Check if we are the first thread. After the CAS the first thread gets 0,
3959 // others get 1 if initialization is in progress, allocated pointer otherwise.
3960 // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
3961 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
3962 (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
3963 #else
3964 flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
3965 (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
3966 #endif
3967 if (flags == NULL) {
3968 // we are the first thread, allocate the array of flags
3969 size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3970 flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
3971 KMP_MB();
3972 sh_buf->doacross_flags = flags;
3973 } else if (flags == (kmp_uint32 *)1) {
3974 #if KMP_32_BIT_ARCH
3975 // initialization is still in progress, need to wait
3976 while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
3977 #else
3978 while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
3979 #endif
3980 KMP_YIELD(TRUE);
3981 KMP_MB();
3982 } else {
3983 KMP_MB();
3984 }
3985 KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
3986 pr_buf->th_doacross_flags =
3987 sh_buf->doacross_flags; // save private copy in order to not
3988 // touch shared buffer on each iteration
3989 KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid));
3990 }
3991
__kmpc_doacross_wait(ident_t * loc,int gtid,const kmp_int64 * vec)3992 void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
3993 kmp_int32 shft, num_dims, i;
3994 kmp_uint32 flag;
3995 kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3996 kmp_info_t *th = __kmp_threads[gtid];
3997 kmp_team_t *team = th->th.th_team;
3998 kmp_disp_t *pr_buf;
3999 kmp_int64 lo, up, st;
4000
4001 KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
4002 if (team->t.t_serialized) {
4003 KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
4004 return; // no dependencies if team is serialized
4005 }
4006
4007 // calculate sequential iteration number and check out-of-bounds condition
4008 pr_buf = th->th.th_dispatch;
4009 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4010 num_dims = pr_buf->th_doacross_info[0];
4011 lo = pr_buf->th_doacross_info[2];
4012 up = pr_buf->th_doacross_info[3];
4013 st = pr_buf->th_doacross_info[4];
4014 if (st == 1) { // most common case
4015 if (vec[0] < lo || vec[0] > up) {
4016 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4017 "bounds [%lld,%lld]\n",
4018 gtid, vec[0], lo, up));
4019 return;
4020 }
4021 iter_number = vec[0] - lo;
4022 } else if (st > 0) {
4023 if (vec[0] < lo || vec[0] > up) {
4024 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4025 "bounds [%lld,%lld]\n",
4026 gtid, vec[0], lo, up));
4027 return;
4028 }
4029 iter_number = (kmp_uint64)(vec[0] - lo) / st;
4030 } else { // negative increment
4031 if (vec[0] > lo || vec[0] < up) {
4032 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4033 "bounds [%lld,%lld]\n",
4034 gtid, vec[0], lo, up));
4035 return;
4036 }
4037 iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4038 }
4039 for (i = 1; i < num_dims; ++i) {
4040 kmp_int64 iter, ln;
4041 kmp_int32 j = i * 4;
4042 ln = pr_buf->th_doacross_info[j + 1];
4043 lo = pr_buf->th_doacross_info[j + 2];
4044 up = pr_buf->th_doacross_info[j + 3];
4045 st = pr_buf->th_doacross_info[j + 4];
4046 if (st == 1) {
4047 if (vec[i] < lo || vec[i] > up) {
4048 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4049 "bounds [%lld,%lld]\n",
4050 gtid, vec[i], lo, up));
4051 return;
4052 }
4053 iter = vec[i] - lo;
4054 } else if (st > 0) {
4055 if (vec[i] < lo || vec[i] > up) {
4056 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4057 "bounds [%lld,%lld]\n",
4058 gtid, vec[i], lo, up));
4059 return;
4060 }
4061 iter = (kmp_uint64)(vec[i] - lo) / st;
4062 } else { // st < 0
4063 if (vec[i] > lo || vec[i] < up) {
4064 KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
4065 "bounds [%lld,%lld]\n",
4066 gtid, vec[i], lo, up));
4067 return;
4068 }
4069 iter = (kmp_uint64)(lo - vec[i]) / (-st);
4070 }
4071 iter_number = iter + ln * iter_number;
4072 }
4073 shft = iter_number % 32; // use 32-bit granularity
4074 iter_number >>= 5; // divided by 32
4075 flag = 1 << shft;
4076 while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
4077 KMP_YIELD(TRUE);
4078 }
4079 KMP_MB();
4080 KA_TRACE(20,
4081 ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
4082 gtid, (iter_number << 5) + shft));
4083 }
4084
__kmpc_doacross_post(ident_t * loc,int gtid,const kmp_int64 * vec)4085 void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
4086 kmp_int32 shft, num_dims, i;
4087 kmp_uint32 flag;
4088 kmp_int64 iter_number; // iteration number of "collapsed" loop nest
4089 kmp_info_t *th = __kmp_threads[gtid];
4090 kmp_team_t *team = th->th.th_team;
4091 kmp_disp_t *pr_buf;
4092 kmp_int64 lo, st;
4093
4094 KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
4095 if (team->t.t_serialized) {
4096 KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
4097 return; // no dependencies if team is serialized
4098 }
4099
4100 // calculate sequential iteration number (same as in "wait" but no
4101 // out-of-bounds checks)
4102 pr_buf = th->th.th_dispatch;
4103 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
4104 num_dims = pr_buf->th_doacross_info[0];
4105 lo = pr_buf->th_doacross_info[2];
4106 st = pr_buf->th_doacross_info[4];
4107 if (st == 1) { // most common case
4108 iter_number = vec[0] - lo;
4109 } else if (st > 0) {
4110 iter_number = (kmp_uint64)(vec[0] - lo) / st;
4111 } else { // negative increment
4112 iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
4113 }
4114 for (i = 1; i < num_dims; ++i) {
4115 kmp_int64 iter, ln;
4116 kmp_int32 j = i * 4;
4117 ln = pr_buf->th_doacross_info[j + 1];
4118 lo = pr_buf->th_doacross_info[j + 2];
4119 st = pr_buf->th_doacross_info[j + 4];
4120 if (st == 1) {
4121 iter = vec[i] - lo;
4122 } else if (st > 0) {
4123 iter = (kmp_uint64)(vec[i] - lo) / st;
4124 } else { // st < 0
4125 iter = (kmp_uint64)(lo - vec[i]) / (-st);
4126 }
4127 iter_number = iter + ln * iter_number;
4128 }
4129 shft = iter_number % 32; // use 32-bit granularity
4130 iter_number >>= 5; // divided by 32
4131 flag = 1 << shft;
4132 KMP_MB();
4133 if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
4134 KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
4135 KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
4136 (iter_number << 5) + shft));
4137 }
4138
__kmpc_doacross_fini(ident_t * loc,int gtid)4139 void __kmpc_doacross_fini(ident_t *loc, int gtid) {
4140 kmp_int32 num_done;
4141 kmp_info_t *th = __kmp_threads[gtid];
4142 kmp_team_t *team = th->th.th_team;
4143 kmp_disp_t *pr_buf = th->th.th_dispatch;
4144
4145 KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
4146 if (team->t.t_serialized) {
4147 KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
4148 return; // nothing to do
4149 }
4150 num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
4151 if (num_done == th->th.th_team_nproc) {
4152 // we are the last thread, need to free shared resources
4153 int idx = pr_buf->th_doacross_buf_idx - 1;
4154 dispatch_shared_info_t *sh_buf =
4155 &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
4156 KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
4157 (kmp_int64)&sh_buf->doacross_num_done);
4158 KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
4159 KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
4160 __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
4161 sh_buf->doacross_flags = NULL;
4162 sh_buf->doacross_num_done = 0;
4163 sh_buf->doacross_buf_idx +=
4164 __kmp_dispatch_num_buffers; // free buffer for future re-use
4165 }
4166 // free private resources (need to keep buffer index forever)
4167 pr_buf->th_doacross_flags = NULL;
4168 __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
4169 pr_buf->th_doacross_info = NULL;
4170 KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
4171 }
4172
4173 /* omp_alloc/omp_free only defined for C/C++, not for Fortran */
omp_alloc(size_t size,omp_allocator_handle_t allocator)4174 void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
4175 return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
4176 }
4177
omp_free(void * ptr,omp_allocator_handle_t allocator)4178 void omp_free(void *ptr, omp_allocator_handle_t allocator) {
4179 __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
4180 }
4181
__kmpc_get_target_offload(void)4182 int __kmpc_get_target_offload(void) {
4183 if (!__kmp_init_serial) {
4184 __kmp_serial_initialize();
4185 }
4186 return __kmp_target_offload;
4187 }
4188
__kmpc_pause_resource(kmp_pause_status_t level)4189 int __kmpc_pause_resource(kmp_pause_status_t level) {
4190 if (!__kmp_init_serial) {
4191 return 1; // Can't pause if runtime is not initialized
4192 }
4193 return __kmp_pause_resource(level);
4194 }
4195