1 /*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 /* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16 * it may change values between parallel regions. __kmp_max_nth
17 * is the largest value __kmp_nth may take, 1 is the smallest.
18 */
19
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41
__kmp_dispatch_deo_error(int * gtid_ref,int * cid_ref,ident_t * loc_ref)42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56 }
57 }
58 }
59
__kmp_dispatch_dxo_error(int * gtid_ref,int * cid_ref,ident_t * loc_ref)60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69 }
70
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
__kmp_get_monotonicity(enum sched_type schedule,bool use_hier=false)72 static inline int __kmp_get_monotonicity(enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 int monotonicity;
76 // default to monotonic
77 monotonicity = SCHEDULE_MONOTONIC;
78 if (SCHEDULE_HAS_NONMONOTONIC(schedule))
79 monotonicity = SCHEDULE_NONMONOTONIC;
80 else if (SCHEDULE_HAS_MONOTONIC(schedule))
81 monotonicity = SCHEDULE_MONOTONIC;
82 return monotonicity;
83 }
84
85 // Initialize a dispatch_private_info_template<T> buffer for a particular
86 // type of schedule,chunk. The loop description is found in lb (lower bound),
87 // ub (upper bound), and st (stride). nproc is the number of threads relevant
88 // to the scheduling (often the number of threads in a team, but not always if
89 // hierarchical scheduling is used). tid is the id of the thread calling
90 // the function within the group of nproc threads. It will have a value
91 // between 0 and nproc - 1. This is often just the thread id within a team, but
92 // is not necessarily the case when using hierarchical scheduling.
93 // loc is the source file location of the corresponding loop
94 // gtid is the global thread id
95 template <typename T>
__kmp_dispatch_init_algorithm(ident_t * loc,int gtid,dispatch_private_info_template<T> * pr,enum sched_type schedule,T lb,T ub,typename traits_t<T>::signed_t st,kmp_uint64 * cur_chunk,typename traits_t<T>::signed_t chunk,T nproc,T tid)96 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
97 dispatch_private_info_template<T> *pr,
98 enum sched_type schedule, T lb, T ub,
99 typename traits_t<T>::signed_t st,
100 #if USE_ITT_BUILD
101 kmp_uint64 *cur_chunk,
102 #endif
103 typename traits_t<T>::signed_t chunk,
104 T nproc, T tid) {
105 typedef typename traits_t<T>::unsigned_t UT;
106 typedef typename traits_t<T>::floating_t DBL;
107
108 int active;
109 T tc;
110 kmp_info_t *th;
111 kmp_team_t *team;
112 int monotonicity;
113 bool use_hier;
114
115 #ifdef KMP_DEBUG
116 typedef typename traits_t<T>::signed_t ST;
117 {
118 char *buff;
119 // create format specifiers before the debug output
120 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
121 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
122 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
123 traits_t<T>::spec, traits_t<T>::spec,
124 traits_t<ST>::spec, traits_t<ST>::spec,
125 traits_t<T>::spec, traits_t<T>::spec);
126 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
127 __kmp_str_free(&buff);
128 }
129 #endif
130 /* setup data */
131 th = __kmp_threads[gtid];
132 team = th->th.th_team;
133 active = !team->t.t_serialized;
134
135 #if USE_ITT_BUILD
136 int itt_need_metadata_reporting =
137 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
138 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
139 team->t.t_active_level == 1;
140 #endif
141
142 #if KMP_USE_HIER_SCHED
143 use_hier = pr->flags.use_hier;
144 #else
145 use_hier = false;
146 #endif
147
148 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
149 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
150 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
151
152 /* Pick up the nomerge/ordered bits from the scheduling type */
153 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
154 pr->flags.nomerge = TRUE;
155 schedule =
156 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
157 } else {
158 pr->flags.nomerge = FALSE;
159 }
160 pr->type_size = traits_t<T>::type_size; // remember the size of variables
161 if (kmp_ord_lower & schedule) {
162 pr->flags.ordered = TRUE;
163 schedule =
164 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
165 } else {
166 pr->flags.ordered = FALSE;
167 }
168 // Ordered overrides nonmonotonic
169 if (pr->flags.ordered) {
170 monotonicity = SCHEDULE_MONOTONIC;
171 }
172
173 if (schedule == kmp_sch_static) {
174 schedule = __kmp_static;
175 } else {
176 if (schedule == kmp_sch_runtime) {
177 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
178 // not specified)
179 schedule = team->t.t_sched.r_sched_type;
180 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
181 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
182 // Detail the schedule if needed (global controls are differentiated
183 // appropriately)
184 if (schedule == kmp_sch_guided_chunked) {
185 schedule = __kmp_guided;
186 } else if (schedule == kmp_sch_static) {
187 schedule = __kmp_static;
188 }
189 // Use the chunk size specified by OMP_SCHEDULE (or default if not
190 // specified)
191 chunk = team->t.t_sched.chunk;
192 #if USE_ITT_BUILD
193 if (cur_chunk)
194 *cur_chunk = chunk;
195 #endif
196 #ifdef KMP_DEBUG
197 {
198 char *buff;
199 // create format specifiers before the debug output
200 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
201 "schedule:%%d chunk:%%%s\n",
202 traits_t<ST>::spec);
203 KD_TRACE(10, (buff, gtid, schedule, chunk));
204 __kmp_str_free(&buff);
205 }
206 #endif
207 } else {
208 if (schedule == kmp_sch_guided_chunked) {
209 schedule = __kmp_guided;
210 }
211 if (chunk <= 0) {
212 chunk = KMP_DEFAULT_CHUNK;
213 }
214 }
215
216 if (schedule == kmp_sch_auto) {
217 // mapping and differentiation: in the __kmp_do_serial_initialize()
218 schedule = __kmp_auto;
219 #ifdef KMP_DEBUG
220 {
221 char *buff;
222 // create format specifiers before the debug output
223 buff = __kmp_str_format(
224 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
225 "schedule:%%d chunk:%%%s\n",
226 traits_t<ST>::spec);
227 KD_TRACE(10, (buff, gtid, schedule, chunk));
228 __kmp_str_free(&buff);
229 }
230 #endif
231 }
232 #if KMP_STATIC_STEAL_ENABLED
233 // map nonmonotonic:dynamic to static steal
234 if (schedule == kmp_sch_dynamic_chunked) {
235 if (monotonicity == SCHEDULE_NONMONOTONIC)
236 schedule = kmp_sch_static_steal;
237 }
238 #endif
239 /* guided analytical not safe for too many threads */
240 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
241 schedule = kmp_sch_guided_iterative_chunked;
242 KMP_WARNING(DispatchManyThreads);
243 }
244 if (schedule == kmp_sch_runtime_simd) {
245 // compiler provides simd_width in the chunk parameter
246 schedule = team->t.t_sched.r_sched_type;
247 monotonicity = __kmp_get_monotonicity(schedule, use_hier);
248 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
249 // Detail the schedule if needed (global controls are differentiated
250 // appropriately)
251 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
252 schedule == __kmp_static) {
253 schedule = kmp_sch_static_balanced_chunked;
254 } else {
255 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
256 schedule = kmp_sch_guided_simd;
257 }
258 chunk = team->t.t_sched.chunk * chunk;
259 }
260 #if USE_ITT_BUILD
261 if (cur_chunk)
262 *cur_chunk = chunk;
263 #endif
264 #ifdef KMP_DEBUG
265 {
266 char *buff;
267 // create format specifiers before the debug output
268 buff = __kmp_str_format(
269 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
270 " chunk:%%%s\n",
271 traits_t<ST>::spec);
272 KD_TRACE(10, (buff, gtid, schedule, chunk));
273 __kmp_str_free(&buff);
274 }
275 #endif
276 }
277 pr->u.p.parm1 = chunk;
278 }
279 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
280 "unknown scheduling type");
281
282 pr->u.p.count = 0;
283
284 if (__kmp_env_consistency_check) {
285 if (st == 0) {
286 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
287 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
288 }
289 }
290 // compute trip count
291 if (st == 1) { // most common case
292 if (ub >= lb) {
293 tc = ub - lb + 1;
294 } else { // ub < lb
295 tc = 0; // zero-trip
296 }
297 } else if (st < 0) {
298 if (lb >= ub) {
299 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
300 // where the division needs to be unsigned regardless of the result type
301 tc = (UT)(lb - ub) / (-st) + 1;
302 } else { // lb < ub
303 tc = 0; // zero-trip
304 }
305 } else { // st > 0
306 if (ub >= lb) {
307 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
308 // where the division needs to be unsigned regardless of the result type
309 tc = (UT)(ub - lb) / st + 1;
310 } else { // ub < lb
311 tc = 0; // zero-trip
312 }
313 }
314
315 #if KMP_STATS_ENABLED
316 if (KMP_MASTER_GTID(gtid)) {
317 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
318 }
319 #endif
320
321 pr->u.p.lb = lb;
322 pr->u.p.ub = ub;
323 pr->u.p.st = st;
324 pr->u.p.tc = tc;
325
326 #if KMP_OS_WINDOWS
327 pr->u.p.last_upper = ub + st;
328 #endif /* KMP_OS_WINDOWS */
329
330 /* NOTE: only the active parallel region(s) has active ordered sections */
331
332 if (active) {
333 if (pr->flags.ordered) {
334 pr->ordered_bumped = 0;
335 pr->u.p.ordered_lower = 1;
336 pr->u.p.ordered_upper = 0;
337 }
338 }
339
340 switch (schedule) {
341 #if (KMP_STATIC_STEAL_ENABLED)
342 case kmp_sch_static_steal: {
343 T ntc, init;
344
345 KD_TRACE(100,
346 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
347 gtid));
348
349 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
350 if (nproc > 1 && ntc >= nproc) {
351 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
352 T id = tid;
353 T small_chunk, extras;
354
355 small_chunk = ntc / nproc;
356 extras = ntc % nproc;
357
358 init = id * small_chunk + (id < extras ? id : extras);
359 pr->u.p.count = init;
360 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
361
362 pr->u.p.parm2 = lb;
363 // parm3 is the number of times to attempt stealing which is
364 // proportional to the number of chunks per thread up until
365 // the maximum value of nproc.
366 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
367 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
368 pr->u.p.st = st;
369 if (traits_t<T>::type_size > 4) {
370 // AC: TODO: check if 16-byte CAS available and use it to
371 // improve performance (probably wait for explicit request
372 // before spending time on this).
373 // For now use dynamically allocated per-thread lock,
374 // free memory in __kmp_dispatch_next when status==0.
375 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
376 th->th.th_dispatch->th_steal_lock =
377 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
378 __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
379 }
380 break;
381 } else {
382 /* too few chunks: switching to kmp_sch_dynamic_chunked */
383 schedule = kmp_sch_dynamic_chunked;
384 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
385 "kmp_sch_dynamic_chunked\n",
386 gtid));
387 if (pr->u.p.parm1 <= 0)
388 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
389 break;
390 } // if
391 } // case
392 #endif
393 case kmp_sch_static_balanced: {
394 T init, limit;
395
396 KD_TRACE(
397 100,
398 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
399 gtid));
400
401 if (nproc > 1) {
402 T id = tid;
403
404 if (tc < nproc) {
405 if (id < tc) {
406 init = id;
407 limit = id;
408 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
409 } else {
410 pr->u.p.count = 1; /* means no more chunks to execute */
411 pr->u.p.parm1 = FALSE;
412 break;
413 }
414 } else {
415 T small_chunk = tc / nproc;
416 T extras = tc % nproc;
417 init = id * small_chunk + (id < extras ? id : extras);
418 limit = init + small_chunk - (id < extras ? 0 : 1);
419 pr->u.p.parm1 = (id == nproc - 1);
420 }
421 } else {
422 if (tc > 0) {
423 init = 0;
424 limit = tc - 1;
425 pr->u.p.parm1 = TRUE;
426 } else {
427 // zero trip count
428 pr->u.p.count = 1; /* means no more chunks to execute */
429 pr->u.p.parm1 = FALSE;
430 break;
431 }
432 }
433 #if USE_ITT_BUILD
434 // Calculate chunk for metadata report
435 if (itt_need_metadata_reporting)
436 if (cur_chunk)
437 *cur_chunk = limit - init + 1;
438 #endif
439 if (st == 1) {
440 pr->u.p.lb = lb + init;
441 pr->u.p.ub = lb + limit;
442 } else {
443 // calculated upper bound, "ub" is user-defined upper bound
444 T ub_tmp = lb + limit * st;
445 pr->u.p.lb = lb + init * st;
446 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
447 // it exactly
448 if (st > 0) {
449 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
450 } else {
451 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
452 }
453 }
454 if (pr->flags.ordered) {
455 pr->u.p.ordered_lower = init;
456 pr->u.p.ordered_upper = limit;
457 }
458 break;
459 } // case
460 case kmp_sch_static_balanced_chunked: {
461 // similar to balanced, but chunk adjusted to multiple of simd width
462 T nth = nproc;
463 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
464 " -> falling-through to static_greedy\n",
465 gtid));
466 schedule = kmp_sch_static_greedy;
467 if (nth > 1)
468 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
469 else
470 pr->u.p.parm1 = tc;
471 break;
472 } // case
473 case kmp_sch_guided_simd:
474 case kmp_sch_guided_iterative_chunked: {
475 KD_TRACE(
476 100,
477 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
478 " case\n",
479 gtid));
480
481 if (nproc > 1) {
482 if ((2L * chunk + 1) * nproc >= tc) {
483 /* chunk size too large, switch to dynamic */
484 schedule = kmp_sch_dynamic_chunked;
485 } else {
486 // when remaining iters become less than parm2 - switch to dynamic
487 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
488 *(double *)&pr->u.p.parm3 =
489 guided_flt_param / nproc; // may occupy parm3 and parm4
490 }
491 } else {
492 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
493 "kmp_sch_static_greedy\n",
494 gtid));
495 schedule = kmp_sch_static_greedy;
496 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
497 KD_TRACE(
498 100,
499 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
500 gtid));
501 pr->u.p.parm1 = tc;
502 } // if
503 } // case
504 break;
505 case kmp_sch_guided_analytical_chunked: {
506 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
507 "kmp_sch_guided_analytical_chunked case\n",
508 gtid));
509
510 if (nproc > 1) {
511 if ((2L * chunk + 1) * nproc >= tc) {
512 /* chunk size too large, switch to dynamic */
513 schedule = kmp_sch_dynamic_chunked;
514 } else {
515 /* commonly used term: (2 nproc - 1)/(2 nproc) */
516 DBL x;
517
518 #if KMP_USE_X87CONTROL
519 /* Linux* OS already has 64-bit computation by default for long double,
520 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
521 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
522 instead of the default 53-bit. Even though long double doesn't work
523 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
524 expected to impact the correctness of the algorithm, but this has not
525 been mathematically proven. */
526 // save original FPCW and set precision to 64-bit, as
527 // Windows* OS on IA-32 architecture defaults to 53-bit
528 unsigned int oldFpcw = _control87(0, 0);
529 _control87(_PC_64, _MCW_PC); // 0,0x30000
530 #endif
531 /* value used for comparison in solver for cross-over point */
532 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
533
534 /* crossover point--chunk indexes equal to or greater than
535 this point switch to dynamic-style scheduling */
536 UT cross;
537
538 /* commonly used term: (2 nproc - 1)/(2 nproc) */
539 x = (long double)1.0 - (long double)0.5 / nproc;
540
541 #ifdef KMP_DEBUG
542 { // test natural alignment
543 struct _test_a {
544 char a;
545 union {
546 char b;
547 DBL d;
548 };
549 } t;
550 ptrdiff_t natural_alignment =
551 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
552 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
553 // long)natural_alignment );
554 KMP_DEBUG_ASSERT(
555 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
556 }
557 #endif // KMP_DEBUG
558
559 /* save the term in thread private dispatch structure */
560 *(DBL *)&pr->u.p.parm3 = x;
561
562 /* solve for the crossover point to the nearest integer i for which C_i
563 <= chunk */
564 {
565 UT left, right, mid;
566 long double p;
567
568 /* estimate initial upper and lower bound */
569
570 /* doesn't matter what value right is as long as it is positive, but
571 it affects performance of the solver */
572 right = 229;
573 p = __kmp_pow<UT>(x, right);
574 if (p > target) {
575 do {
576 p *= p;
577 right <<= 1;
578 } while (p > target && right < (1 << 27));
579 /* lower bound is previous (failed) estimate of upper bound */
580 left = right >> 1;
581 } else {
582 left = 0;
583 }
584
585 /* bisection root-finding method */
586 while (left + 1 < right) {
587 mid = (left + right) / 2;
588 if (__kmp_pow<UT>(x, mid) > target) {
589 left = mid;
590 } else {
591 right = mid;
592 }
593 } // while
594 cross = right;
595 }
596 /* assert sanity of computed crossover point */
597 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
598 __kmp_pow<UT>(x, cross) <= target);
599
600 /* save the crossover point in thread private dispatch structure */
601 pr->u.p.parm2 = cross;
602
603 // C75803
604 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
605 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
606 #else
607 #define GUIDED_ANALYTICAL_WORKAROUND (x)
608 #endif
609 /* dynamic-style scheduling offset */
610 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
611 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
612 cross * chunk;
613 #if KMP_USE_X87CONTROL
614 // restore FPCW
615 _control87(oldFpcw, _MCW_PC);
616 #endif
617 } // if
618 } else {
619 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
620 "kmp_sch_static_greedy\n",
621 gtid));
622 schedule = kmp_sch_static_greedy;
623 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
624 pr->u.p.parm1 = tc;
625 } // if
626 } // case
627 break;
628 case kmp_sch_static_greedy:
629 KD_TRACE(
630 100,
631 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
632 gtid));
633 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
634 break;
635 case kmp_sch_static_chunked:
636 case kmp_sch_dynamic_chunked:
637 if (pr->u.p.parm1 <= 0) {
638 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
639 }
640 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
641 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
642 gtid));
643 break;
644 case kmp_sch_trapezoidal: {
645 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
646
647 T parm1, parm2, parm3, parm4;
648 KD_TRACE(100,
649 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
650 gtid));
651
652 parm1 = chunk;
653
654 /* F : size of the first cycle */
655 parm2 = (tc / (2 * nproc));
656
657 if (parm2 < 1) {
658 parm2 = 1;
659 }
660
661 /* L : size of the last cycle. Make sure the last cycle is not larger
662 than the first cycle. */
663 if (parm1 < 1) {
664 parm1 = 1;
665 } else if (parm1 > parm2) {
666 parm1 = parm2;
667 }
668
669 /* N : number of cycles */
670 parm3 = (parm2 + parm1);
671 parm3 = (2 * tc + parm3 - 1) / parm3;
672
673 if (parm3 < 2) {
674 parm3 = 2;
675 }
676
677 /* sigma : decreasing incr of the trapezoid */
678 parm4 = (parm3 - 1);
679 parm4 = (parm2 - parm1) / parm4;
680
681 // pointless check, because parm4 >= 0 always
682 // if ( parm4 < 0 ) {
683 // parm4 = 0;
684 //}
685
686 pr->u.p.parm1 = parm1;
687 pr->u.p.parm2 = parm2;
688 pr->u.p.parm3 = parm3;
689 pr->u.p.parm4 = parm4;
690 } // case
691 break;
692
693 default: {
694 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
695 KMP_HNT(GetNewerLibrary), // Hint
696 __kmp_msg_null // Variadic argument list terminator
697 );
698 } break;
699 } // switch
700 pr->schedule = schedule;
701 }
702
703 #if KMP_USE_HIER_SCHED
704 template <typename T>
705 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
706 typename traits_t<T>::signed_t st);
707 template <>
708 inline void
__kmp_dispatch_init_hier_runtime(ident_t * loc,kmp_int32 lb,kmp_int32 ub,kmp_int32 st)709 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
710 kmp_int32 ub, kmp_int32 st) {
711 __kmp_dispatch_init_hierarchy<kmp_int32>(
712 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
713 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
714 }
715 template <>
716 inline void
__kmp_dispatch_init_hier_runtime(ident_t * loc,kmp_uint32 lb,kmp_uint32 ub,kmp_int32 st)717 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
718 kmp_uint32 ub, kmp_int32 st) {
719 __kmp_dispatch_init_hierarchy<kmp_uint32>(
720 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
__kmp_dispatch_init_hier_runtime(ident_t * loc,kmp_int64 lb,kmp_int64 ub,kmp_int64 st)725 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
726 kmp_int64 ub, kmp_int64 st) {
727 __kmp_dispatch_init_hierarchy<kmp_int64>(
728 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
__kmp_dispatch_init_hier_runtime(ident_t * loc,kmp_uint64 lb,kmp_uint64 ub,kmp_int64 st)733 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
734 kmp_uint64 ub, kmp_int64 st) {
735 __kmp_dispatch_init_hierarchy<kmp_uint64>(
736 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739
740 // free all the hierarchy scheduling memory associated with the team
__kmp_dispatch_free_hierarchies(kmp_team_t * team)741 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
742 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
743 for (int i = 0; i < num_disp_buff; ++i) {
744 // type does not matter here so use kmp_int32
745 auto sh =
746 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
747 &team->t.t_disp_buffer[i]);
748 if (sh->hier) {
749 sh->hier->deallocate();
750 __kmp_free(sh->hier);
751 }
752 }
753 }
754 #endif
755
756 // UT - unsigned flavor of T, ST - signed flavor of T,
757 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
758 template <typename T>
759 static void
__kmp_dispatch_init(ident_t * loc,int gtid,enum sched_type schedule,T lb,T ub,typename traits_t<T>::signed_t st,typename traits_t<T>::signed_t chunk,int push_ws)760 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
761 T ub, typename traits_t<T>::signed_t st,
762 typename traits_t<T>::signed_t chunk, int push_ws) {
763 typedef typename traits_t<T>::unsigned_t UT;
764
765 int active;
766 kmp_info_t *th;
767 kmp_team_t *team;
768 kmp_uint32 my_buffer_index;
769 dispatch_private_info_template<T> *pr;
770 dispatch_shared_info_template<T> volatile *sh;
771
772 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
773 sizeof(dispatch_private_info));
774 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
775 sizeof(dispatch_shared_info));
776
777 if (!TCR_4(__kmp_init_parallel))
778 __kmp_parallel_initialize();
779
780 __kmp_resume_if_soft_paused();
781
782 #if INCLUDE_SSC_MARKS
783 SSC_MARK_DISPATCH_INIT();
784 #endif
785 #ifdef KMP_DEBUG
786 typedef typename traits_t<T>::signed_t ST;
787 {
788 char *buff;
789 // create format specifiers before the debug output
790 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
791 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
792 traits_t<ST>::spec, traits_t<T>::spec,
793 traits_t<T>::spec, traits_t<ST>::spec);
794 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
795 __kmp_str_free(&buff);
796 }
797 #endif
798 /* setup data */
799 th = __kmp_threads[gtid];
800 team = th->th.th_team;
801 active = !team->t.t_serialized;
802 th->th.th_ident = loc;
803
804 // Any half-decent optimizer will remove this test when the blocks are empty
805 // since the macros expand to nothing
806 // when statistics are disabled.
807 if (schedule == __kmp_static) {
808 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
809 } else {
810 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
811 }
812
813 #if KMP_USE_HIER_SCHED
814 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
815 // Hierarchical scheduling does not work with ordered, so if ordered is
816 // detected, then revert back to threaded scheduling.
817 bool ordered;
818 enum sched_type my_sched = schedule;
819 my_buffer_index = th->th.th_dispatch->th_disp_index;
820 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821 &th->th.th_dispatch
822 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
823 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
824 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
825 my_sched =
826 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
827 ordered = (kmp_ord_lower & my_sched);
828 if (pr->flags.use_hier) {
829 if (ordered) {
830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
831 "Disabling hierarchical scheduling.\n",
832 gtid));
833 pr->flags.use_hier = FALSE;
834 }
835 }
836 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
837 // Don't use hierarchical for ordered parallel loops and don't
838 // use the runtime hierarchy if one was specified in the program
839 if (!ordered && !pr->flags.use_hier)
840 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
841 }
842 #endif // KMP_USE_HIER_SCHED
843
844 #if USE_ITT_BUILD
845 kmp_uint64 cur_chunk = chunk;
846 int itt_need_metadata_reporting =
847 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
848 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
849 team->t.t_active_level == 1;
850 #endif
851 if (!active) {
852 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
853 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
854 } else {
855 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
856 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
857
858 my_buffer_index = th->th.th_dispatch->th_disp_index++;
859
860 /* What happens when number of threads changes, need to resize buffer? */
861 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862 &th->th.th_dispatch
863 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
864 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
865 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
866 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
867 my_buffer_index));
868 }
869
870 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
871 #if USE_ITT_BUILD
872 &cur_chunk,
873 #endif
874 chunk, (T)th->th.th_team_nproc,
875 (T)th->th.th_info.ds.ds_tid);
876 if (active) {
877 if (pr->flags.ordered == 0) {
878 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
879 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
880 } else {
881 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
882 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
883 }
884 }
885
886 if (active) {
887 /* The name of this buffer should be my_buffer_index when it's free to use
888 * it */
889
890 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
891 "sh->buffer_index:%d\n",
892 gtid, my_buffer_index, sh->buffer_index));
893 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
894 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
895 // Note: KMP_WAIT() cannot be used there: buffer index and
896 // my_buffer_index are *always* 32-bit integers.
897 KMP_MB(); /* is this necessary? */
898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
899 "sh->buffer_index:%d\n",
900 gtid, my_buffer_index, sh->buffer_index));
901
902 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
903 th->th.th_dispatch->th_dispatch_sh_current =
904 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
905 #if USE_ITT_BUILD
906 if (pr->flags.ordered) {
907 __kmp_itt_ordered_init(gtid);
908 }
909 // Report loop metadata
910 if (itt_need_metadata_reporting) {
911 // Only report metadata by master of active team at level 1
912 kmp_uint64 schedtype = 0;
913 switch (schedule) {
914 case kmp_sch_static_chunked:
915 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
916 break;
917 case kmp_sch_static_greedy:
918 cur_chunk = pr->u.p.parm1;
919 break;
920 case kmp_sch_dynamic_chunked:
921 schedtype = 1;
922 break;
923 case kmp_sch_guided_iterative_chunked:
924 case kmp_sch_guided_analytical_chunked:
925 case kmp_sch_guided_simd:
926 schedtype = 2;
927 break;
928 default:
929 // Should we put this case under "static"?
930 // case kmp_sch_static_steal:
931 schedtype = 3;
932 break;
933 }
934 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
935 }
936 #if KMP_USE_HIER_SCHED
937 if (pr->flags.use_hier) {
938 pr->u.p.count = 0;
939 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
940 }
941 #endif // KMP_USER_HIER_SCHED
942 #endif /* USE_ITT_BUILD */
943 }
944
945 #ifdef KMP_DEBUG
946 {
947 char *buff;
948 // create format specifiers before the debug output
949 buff = __kmp_str_format(
950 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
951 "lb:%%%s ub:%%%s"
952 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
953 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
954 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
955 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
956 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
957 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
958 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
959 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
960 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
961 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
962 __kmp_str_free(&buff);
963 }
964 #endif
965 #if (KMP_STATIC_STEAL_ENABLED)
966 // It cannot be guaranteed that after execution of a loop with some other
967 // schedule kind all the parm3 variables will contain the same value. Even if
968 // all parm3 will be the same, it still exists a bad case like using 0 and 1
969 // rather than program life-time increment. So the dedicated variable is
970 // required. The 'static_steal_counter' is used.
971 if (schedule == kmp_sch_static_steal) {
972 // Other threads will inspect this variable when searching for a victim.
973 // This is a flag showing that other threads may steal from this thread
974 // since then.
975 volatile T *p = &pr->u.p.static_steal_counter;
976 *p = *p + 1;
977 }
978 #endif // ( KMP_STATIC_STEAL_ENABLED )
979
980 #if OMPT_SUPPORT && OMPT_OPTIONAL
981 if (ompt_enabled.ompt_callback_work) {
982 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
983 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
984 ompt_callbacks.ompt_callback(ompt_callback_work)(
985 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
986 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
987 }
988 #endif
989 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
990 }
991
992 /* For ordered loops, either __kmp_dispatch_finish() should be called after
993 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
994 * every chunk of iterations. If the ordered section(s) were not executed
995 * for this iteration (or every iteration in this chunk), we need to set the
996 * ordered iteration counters so that the next thread can proceed. */
997 template <typename UT>
__kmp_dispatch_finish(int gtid,ident_t * loc)998 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
999 typedef typename traits_t<UT>::signed_t ST;
1000 kmp_info_t *th = __kmp_threads[gtid];
1001
1002 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1003 if (!th->th.th_team->t.t_serialized) {
1004
1005 dispatch_private_info_template<UT> *pr =
1006 reinterpret_cast<dispatch_private_info_template<UT> *>(
1007 th->th.th_dispatch->th_dispatch_pr_current);
1008 dispatch_shared_info_template<UT> volatile *sh =
1009 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1010 th->th.th_dispatch->th_dispatch_sh_current);
1011 KMP_DEBUG_ASSERT(pr);
1012 KMP_DEBUG_ASSERT(sh);
1013 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1014 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1015
1016 if (pr->ordered_bumped) {
1017 KD_TRACE(
1018 1000,
1019 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1020 gtid));
1021 pr->ordered_bumped = 0;
1022 } else {
1023 UT lower = pr->u.p.ordered_lower;
1024
1025 #ifdef KMP_DEBUG
1026 {
1027 char *buff;
1028 // create format specifiers before the debug output
1029 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1030 "ordered_iteration:%%%s lower:%%%s\n",
1031 traits_t<UT>::spec, traits_t<UT>::spec);
1032 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1033 __kmp_str_free(&buff);
1034 }
1035 #endif
1036
1037 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1038 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1039 KMP_MB(); /* is this necessary? */
1040 #ifdef KMP_DEBUG
1041 {
1042 char *buff;
1043 // create format specifiers before the debug output
1044 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1045 "ordered_iteration:%%%s lower:%%%s\n",
1046 traits_t<UT>::spec, traits_t<UT>::spec);
1047 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1048 __kmp_str_free(&buff);
1049 }
1050 #endif
1051
1052 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1053 } // if
1054 } // if
1055 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1056 }
1057
1058 #ifdef KMP_GOMP_COMPAT
1059
1060 template <typename UT>
__kmp_dispatch_finish_chunk(int gtid,ident_t * loc)1061 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1062 typedef typename traits_t<UT>::signed_t ST;
1063 kmp_info_t *th = __kmp_threads[gtid];
1064
1065 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1066 if (!th->th.th_team->t.t_serialized) {
1067 // int cid;
1068 dispatch_private_info_template<UT> *pr =
1069 reinterpret_cast<dispatch_private_info_template<UT> *>(
1070 th->th.th_dispatch->th_dispatch_pr_current);
1071 dispatch_shared_info_template<UT> volatile *sh =
1072 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1073 th->th.th_dispatch->th_dispatch_sh_current);
1074 KMP_DEBUG_ASSERT(pr);
1075 KMP_DEBUG_ASSERT(sh);
1076 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1077 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1078
1079 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1080 UT lower = pr->u.p.ordered_lower;
1081 UT upper = pr->u.p.ordered_upper;
1082 UT inc = upper - lower + 1;
1083
1084 if (pr->ordered_bumped == inc) {
1085 KD_TRACE(
1086 1000,
1087 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1088 gtid));
1089 pr->ordered_bumped = 0;
1090 } else {
1091 inc -= pr->ordered_bumped;
1092
1093 #ifdef KMP_DEBUG
1094 {
1095 char *buff;
1096 // create format specifiers before the debug output
1097 buff = __kmp_str_format(
1098 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1099 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1100 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1101 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1102 __kmp_str_free(&buff);
1103 }
1104 #endif
1105
1106 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1107 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1108
1109 KMP_MB(); /* is this necessary? */
1110 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1111 "ordered_bumped to zero\n",
1112 gtid));
1113 pr->ordered_bumped = 0;
1114 //!!!!! TODO check if the inc should be unsigned, or signed???
1115 #ifdef KMP_DEBUG
1116 {
1117 char *buff;
1118 // create format specifiers before the debug output
1119 buff = __kmp_str_format(
1120 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1121 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1122 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1123 traits_t<UT>::spec);
1124 KD_TRACE(1000,
1125 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1126 __kmp_str_free(&buff);
1127 }
1128 #endif
1129
1130 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1131 }
1132 // }
1133 }
1134 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1135 }
1136
1137 #endif /* KMP_GOMP_COMPAT */
1138
1139 template <typename T>
__kmp_dispatch_next_algorithm(int gtid,dispatch_private_info_template<T> * pr,dispatch_shared_info_template<T> volatile * sh,kmp_int32 * p_last,T * p_lb,T * p_ub,typename traits_t<T>::signed_t * p_st,T nproc,T tid)1140 int __kmp_dispatch_next_algorithm(int gtid,
1141 dispatch_private_info_template<T> *pr,
1142 dispatch_shared_info_template<T> volatile *sh,
1143 kmp_int32 *p_last, T *p_lb, T *p_ub,
1144 typename traits_t<T>::signed_t *p_st, T nproc,
1145 T tid) {
1146 typedef typename traits_t<T>::unsigned_t UT;
1147 typedef typename traits_t<T>::signed_t ST;
1148 typedef typename traits_t<T>::floating_t DBL;
1149 int status = 0;
1150 kmp_int32 last = 0;
1151 T start;
1152 ST incr;
1153 UT limit, trip, init;
1154 kmp_info_t *th = __kmp_threads[gtid];
1155 kmp_team_t *team = th->th.th_team;
1156
1157 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1158 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1159 KMP_DEBUG_ASSERT(pr);
1160 KMP_DEBUG_ASSERT(sh);
1161 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1162 #ifdef KMP_DEBUG
1163 {
1164 char *buff;
1165 // create format specifiers before the debug output
1166 buff =
1167 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1168 "sh:%%p nproc:%%%s tid:%%%s\n",
1169 traits_t<T>::spec, traits_t<T>::spec);
1170 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1171 __kmp_str_free(&buff);
1172 }
1173 #endif
1174
1175 // zero trip count
1176 if (pr->u.p.tc == 0) {
1177 KD_TRACE(10,
1178 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1179 "zero status:%d\n",
1180 gtid, status));
1181 return 0;
1182 }
1183
1184 switch (pr->schedule) {
1185 #if (KMP_STATIC_STEAL_ENABLED)
1186 case kmp_sch_static_steal: {
1187 T chunk = pr->u.p.parm1;
1188
1189 KD_TRACE(100,
1190 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1191 gtid));
1192
1193 trip = pr->u.p.tc - 1;
1194
1195 if (traits_t<T>::type_size > 4) {
1196 // use lock for 8-byte and CAS for 4-byte induction
1197 // variable. TODO (optional): check and use 16-byte CAS
1198 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1199 KMP_DEBUG_ASSERT(lck != NULL);
1200 if (pr->u.p.count < (UT)pr->u.p.ub) {
1201 __kmp_acquire_lock(lck, gtid);
1202 // try to get own chunk of iterations
1203 init = (pr->u.p.count)++;
1204 status = (init < (UT)pr->u.p.ub);
1205 __kmp_release_lock(lck, gtid);
1206 } else {
1207 status = 0; // no own chunks
1208 }
1209 if (!status) { // try to steal
1210 kmp_info_t **other_threads = team->t.t_threads;
1211 int while_limit = pr->u.p.parm3;
1212 int while_index = 0;
1213 // TODO: algorithm of searching for a victim
1214 // should be cleaned up and measured
1215 while ((!status) && (while_limit != ++while_index)) {
1216 T remaining;
1217 T victimIdx = pr->u.p.parm4;
1218 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1219 dispatch_private_info_template<T> *victim =
1220 reinterpret_cast<dispatch_private_info_template<T> *>(
1221 other_threads[victimIdx]
1222 ->th.th_dispatch->th_dispatch_pr_current);
1223 while ((victim == NULL || victim == pr ||
1224 (*(volatile T *)&victim->u.p.static_steal_counter !=
1225 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1226 oldVictimIdx != victimIdx) {
1227 victimIdx = (victimIdx + 1) % nproc;
1228 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1229 other_threads[victimIdx]
1230 ->th.th_dispatch->th_dispatch_pr_current);
1231 }
1232 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1233 *(volatile T *)&pr->u.p.static_steal_counter)) {
1234 continue; // try once more (nproc attempts in total)
1235 // no victim is ready yet to participate in stealing
1236 // because all victims are still in kmp_init_dispatch
1237 }
1238 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1239 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1240 continue; // not enough chunks to steal, goto next victim
1241 }
1242
1243 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1244 KMP_ASSERT(lck != NULL);
1245 __kmp_acquire_lock(lck, gtid);
1246 limit = victim->u.p.ub; // keep initial ub
1247 if (victim->u.p.count >= limit ||
1248 (remaining = limit - victim->u.p.count) < 2) {
1249 __kmp_release_lock(lck, gtid);
1250 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1251 continue; // not enough chunks to steal
1252 }
1253 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or
1254 // by 1
1255 if (remaining > 3) {
1256 // steal 1/4 of remaining
1257 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1258 init = (victim->u.p.ub -= (remaining >> 2));
1259 } else {
1260 // steal 1 chunk of 2 or 3 remaining
1261 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1262 init = (victim->u.p.ub -= 1);
1263 }
1264 __kmp_release_lock(lck, gtid);
1265
1266 KMP_DEBUG_ASSERT(init + 1 <= limit);
1267 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1268 status = 1;
1269 while_index = 0;
1270 // now update own count and ub with stolen range but init chunk
1271 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1272 pr->u.p.count = init + 1;
1273 pr->u.p.ub = limit;
1274 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1275 } // while (search for victim)
1276 } // if (try to find victim and steal)
1277 } else {
1278 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1279 typedef union {
1280 struct {
1281 UT count;
1282 T ub;
1283 } p;
1284 kmp_int64 b;
1285 } union_i4;
1286 // All operations on 'count' or 'ub' must be combined atomically
1287 // together.
1288 {
1289 union_i4 vold, vnew;
1290 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1291 vnew = vold;
1292 vnew.p.count++;
1293 while (!KMP_COMPARE_AND_STORE_ACQ64(
1294 (volatile kmp_int64 *)&pr->u.p.count,
1295 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1296 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1297 KMP_CPU_PAUSE();
1298 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1299 vnew = vold;
1300 vnew.p.count++;
1301 }
1302 vnew = vold;
1303 init = vnew.p.count;
1304 status = (init < (UT)vnew.p.ub);
1305 }
1306
1307 if (!status) {
1308 kmp_info_t **other_threads = team->t.t_threads;
1309 int while_limit = pr->u.p.parm3;
1310 int while_index = 0;
1311
1312 // TODO: algorithm of searching for a victim
1313 // should be cleaned up and measured
1314 while ((!status) && (while_limit != ++while_index)) {
1315 union_i4 vold, vnew;
1316 kmp_int32 remaining;
1317 T victimIdx = pr->u.p.parm4;
1318 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1319 dispatch_private_info_template<T> *victim =
1320 reinterpret_cast<dispatch_private_info_template<T> *>(
1321 other_threads[victimIdx]
1322 ->th.th_dispatch->th_dispatch_pr_current);
1323 while ((victim == NULL || victim == pr ||
1324 (*(volatile T *)&victim->u.p.static_steal_counter !=
1325 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1326 oldVictimIdx != victimIdx) {
1327 victimIdx = (victimIdx + 1) % nproc;
1328 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1329 other_threads[victimIdx]
1330 ->th.th_dispatch->th_dispatch_pr_current);
1331 }
1332 if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1333 *(volatile T *)&pr->u.p.static_steal_counter)) {
1334 continue; // try once more (nproc attempts in total)
1335 // no victim is ready yet to participate in stealing
1336 // because all victims are still in kmp_init_dispatch
1337 }
1338 pr->u.p.parm4 = victimIdx; // new victim found
1339 while (1) { // CAS loop if victim has enough chunks to steal
1340 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1341 vnew = vold;
1342
1343 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1344 if (vnew.p.count >= (UT)vnew.p.ub ||
1345 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1346 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1347 break; // not enough chunks to steal, goto next victim
1348 }
1349 if (remaining > 3) {
1350 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1351 } else {
1352 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1353 }
1354 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1355 // TODO: Should this be acquire or release?
1356 if (KMP_COMPARE_AND_STORE_ACQ64(
1357 (volatile kmp_int64 *)&victim->u.p.count,
1358 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1359 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1360 // stealing succedded
1361 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1362 vold.p.ub - vnew.p.ub);
1363 status = 1;
1364 while_index = 0;
1365 // now update own count and ub
1366 init = vnew.p.ub;
1367 vold.p.count = init + 1;
1368 #if KMP_ARCH_X86
1369 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1370 #else
1371 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1372 #endif
1373 break;
1374 } // if (check CAS result)
1375 KMP_CPU_PAUSE(); // CAS failed, repeate attempt
1376 } // while (try to steal from particular victim)
1377 } // while (search for victim)
1378 } // if (try to find victim and steal)
1379 } // if (4-byte induction variable)
1380 if (!status) {
1381 *p_lb = 0;
1382 *p_ub = 0;
1383 if (p_st != NULL)
1384 *p_st = 0;
1385 } else {
1386 start = pr->u.p.parm2;
1387 init *= chunk;
1388 limit = chunk + init - 1;
1389 incr = pr->u.p.st;
1390 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1391
1392 KMP_DEBUG_ASSERT(init <= trip);
1393 if ((last = (limit >= trip)) != 0)
1394 limit = trip;
1395 if (p_st != NULL)
1396 *p_st = incr;
1397
1398 if (incr == 1) {
1399 *p_lb = start + init;
1400 *p_ub = start + limit;
1401 } else {
1402 *p_lb = start + init * incr;
1403 *p_ub = start + limit * incr;
1404 }
1405
1406 if (pr->flags.ordered) {
1407 pr->u.p.ordered_lower = init;
1408 pr->u.p.ordered_upper = limit;
1409 } // if
1410 } // if
1411 break;
1412 } // case
1413 #endif // ( KMP_STATIC_STEAL_ENABLED )
1414 case kmp_sch_static_balanced: {
1415 KD_TRACE(
1416 10,
1417 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1418 gtid));
1419 /* check if thread has any iteration to do */
1420 if ((status = !pr->u.p.count) != 0) {
1421 pr->u.p.count = 1;
1422 *p_lb = pr->u.p.lb;
1423 *p_ub = pr->u.p.ub;
1424 last = pr->u.p.parm1;
1425 if (p_st != NULL)
1426 *p_st = pr->u.p.st;
1427 } else { /* no iterations to do */
1428 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1429 }
1430 } // case
1431 break;
1432 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1433 merged here */
1434 case kmp_sch_static_chunked: {
1435 T parm1;
1436
1437 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1438 "kmp_sch_static_[affinity|chunked] case\n",
1439 gtid));
1440 parm1 = pr->u.p.parm1;
1441
1442 trip = pr->u.p.tc - 1;
1443 init = parm1 * (pr->u.p.count + tid);
1444
1445 if ((status = (init <= trip)) != 0) {
1446 start = pr->u.p.lb;
1447 incr = pr->u.p.st;
1448 limit = parm1 + init - 1;
1449
1450 if ((last = (limit >= trip)) != 0)
1451 limit = trip;
1452
1453 if (p_st != NULL)
1454 *p_st = incr;
1455
1456 pr->u.p.count += nproc;
1457
1458 if (incr == 1) {
1459 *p_lb = start + init;
1460 *p_ub = start + limit;
1461 } else {
1462 *p_lb = start + init * incr;
1463 *p_ub = start + limit * incr;
1464 }
1465
1466 if (pr->flags.ordered) {
1467 pr->u.p.ordered_lower = init;
1468 pr->u.p.ordered_upper = limit;
1469 } // if
1470 } // if
1471 } // case
1472 break;
1473
1474 case kmp_sch_dynamic_chunked: {
1475 T chunk = pr->u.p.parm1;
1476
1477 KD_TRACE(
1478 100,
1479 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1480 gtid));
1481
1482 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1483 trip = pr->u.p.tc - 1;
1484
1485 if ((status = (init <= trip)) == 0) {
1486 *p_lb = 0;
1487 *p_ub = 0;
1488 if (p_st != NULL)
1489 *p_st = 0;
1490 } else {
1491 start = pr->u.p.lb;
1492 limit = chunk + init - 1;
1493 incr = pr->u.p.st;
1494
1495 if ((last = (limit >= trip)) != 0)
1496 limit = trip;
1497
1498 if (p_st != NULL)
1499 *p_st = incr;
1500
1501 if (incr == 1) {
1502 *p_lb = start + init;
1503 *p_ub = start + limit;
1504 } else {
1505 *p_lb = start + init * incr;
1506 *p_ub = start + limit * incr;
1507 }
1508
1509 if (pr->flags.ordered) {
1510 pr->u.p.ordered_lower = init;
1511 pr->u.p.ordered_upper = limit;
1512 } // if
1513 } // if
1514 } // case
1515 break;
1516
1517 case kmp_sch_guided_iterative_chunked: {
1518 T chunkspec = pr->u.p.parm1;
1519 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1520 "iterative case\n",
1521 gtid));
1522 trip = pr->u.p.tc;
1523 // Start atomic part of calculations
1524 while (1) {
1525 ST remaining; // signed, because can be < 0
1526 init = sh->u.s.iteration; // shared value
1527 remaining = trip - init;
1528 if (remaining <= 0) { // AC: need to compare with 0 first
1529 // nothing to do, don't try atomic op
1530 status = 0;
1531 break;
1532 }
1533 if ((T)remaining <
1534 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1535 // use dynamic-style shcedule
1536 // atomically increment iterations, get old value
1537 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1538 (ST)chunkspec);
1539 remaining = trip - init;
1540 if (remaining <= 0) {
1541 status = 0; // all iterations got by other threads
1542 } else {
1543 // got some iterations to work on
1544 status = 1;
1545 if ((T)remaining > chunkspec) {
1546 limit = init + chunkspec - 1;
1547 } else {
1548 last = 1; // the last chunk
1549 limit = init + remaining - 1;
1550 } // if
1551 } // if
1552 break;
1553 } // if
1554 limit = init +
1555 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1556 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1557 (ST)init, (ST)limit)) {
1558 // CAS was successful, chunk obtained
1559 status = 1;
1560 --limit;
1561 break;
1562 } // if
1563 } // while
1564 if (status != 0) {
1565 start = pr->u.p.lb;
1566 incr = pr->u.p.st;
1567 if (p_st != NULL)
1568 *p_st = incr;
1569 *p_lb = start + init * incr;
1570 *p_ub = start + limit * incr;
1571 if (pr->flags.ordered) {
1572 pr->u.p.ordered_lower = init;
1573 pr->u.p.ordered_upper = limit;
1574 } // if
1575 } else {
1576 *p_lb = 0;
1577 *p_ub = 0;
1578 if (p_st != NULL)
1579 *p_st = 0;
1580 } // if
1581 } // case
1582 break;
1583
1584 case kmp_sch_guided_simd: {
1585 // same as iterative but curr-chunk adjusted to be multiple of given
1586 // chunk
1587 T chunk = pr->u.p.parm1;
1588 KD_TRACE(100,
1589 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1590 gtid));
1591 trip = pr->u.p.tc;
1592 // Start atomic part of calculations
1593 while (1) {
1594 ST remaining; // signed, because can be < 0
1595 init = sh->u.s.iteration; // shared value
1596 remaining = trip - init;
1597 if (remaining <= 0) { // AC: need to compare with 0 first
1598 status = 0; // nothing to do, don't try atomic op
1599 break;
1600 }
1601 KMP_DEBUG_ASSERT(init % chunk == 0);
1602 // compare with K*nproc*(chunk+1), K=2 by default
1603 if ((T)remaining < pr->u.p.parm2) {
1604 // use dynamic-style shcedule
1605 // atomically increment iterations, get old value
1606 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1607 (ST)chunk);
1608 remaining = trip - init;
1609 if (remaining <= 0) {
1610 status = 0; // all iterations got by other threads
1611 } else {
1612 // got some iterations to work on
1613 status = 1;
1614 if ((T)remaining > chunk) {
1615 limit = init + chunk - 1;
1616 } else {
1617 last = 1; // the last chunk
1618 limit = init + remaining - 1;
1619 } // if
1620 } // if
1621 break;
1622 } // if
1623 // divide by K*nproc
1624 UT span = remaining * (*(double *)&pr->u.p.parm3);
1625 UT rem = span % chunk;
1626 if (rem) // adjust so that span%chunk == 0
1627 span += chunk - rem;
1628 limit = init + span;
1629 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1630 (ST)init, (ST)limit)) {
1631 // CAS was successful, chunk obtained
1632 status = 1;
1633 --limit;
1634 break;
1635 } // if
1636 } // while
1637 if (status != 0) {
1638 start = pr->u.p.lb;
1639 incr = pr->u.p.st;
1640 if (p_st != NULL)
1641 *p_st = incr;
1642 *p_lb = start + init * incr;
1643 *p_ub = start + limit * incr;
1644 if (pr->flags.ordered) {
1645 pr->u.p.ordered_lower = init;
1646 pr->u.p.ordered_upper = limit;
1647 } // if
1648 } else {
1649 *p_lb = 0;
1650 *p_ub = 0;
1651 if (p_st != NULL)
1652 *p_st = 0;
1653 } // if
1654 } // case
1655 break;
1656
1657 case kmp_sch_guided_analytical_chunked: {
1658 T chunkspec = pr->u.p.parm1;
1659 UT chunkIdx;
1660 #if KMP_USE_X87CONTROL
1661 /* for storing original FPCW value for Windows* OS on
1662 IA-32 architecture 8-byte version */
1663 unsigned int oldFpcw;
1664 unsigned int fpcwSet = 0;
1665 #endif
1666 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1667 "kmp_sch_guided_analytical_chunked case\n",
1668 gtid));
1669
1670 trip = pr->u.p.tc;
1671
1672 KMP_DEBUG_ASSERT(nproc > 1);
1673 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1674
1675 while (1) { /* this while loop is a safeguard against unexpected zero
1676 chunk sizes */
1677 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1678 if (chunkIdx >= (UT)pr->u.p.parm2) {
1679 --trip;
1680 /* use dynamic-style scheduling */
1681 init = chunkIdx * chunkspec + pr->u.p.count;
1682 /* need to verify init > 0 in case of overflow in the above
1683 * calculation */
1684 if ((status = (init > 0 && init <= trip)) != 0) {
1685 limit = init + chunkspec - 1;
1686
1687 if ((last = (limit >= trip)) != 0)
1688 limit = trip;
1689 }
1690 break;
1691 } else {
1692 /* use exponential-style scheduling */
1693 /* The following check is to workaround the lack of long double precision on
1694 Windows* OS.
1695 This check works around the possible effect that init != 0 for chunkIdx == 0.
1696 */
1697 #if KMP_USE_X87CONTROL
1698 /* If we haven't already done so, save original
1699 FPCW and set precision to 64-bit, as Windows* OS
1700 on IA-32 architecture defaults to 53-bit */
1701 if (!fpcwSet) {
1702 oldFpcw = _control87(0, 0);
1703 _control87(_PC_64, _MCW_PC);
1704 fpcwSet = 0x30000;
1705 }
1706 #endif
1707 if (chunkIdx) {
1708 init = __kmp_dispatch_guided_remaining<T>(
1709 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1710 KMP_DEBUG_ASSERT(init);
1711 init = trip - init;
1712 } else
1713 init = 0;
1714 limit = trip - __kmp_dispatch_guided_remaining<T>(
1715 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1716 KMP_ASSERT(init <= limit);
1717 if (init < limit) {
1718 KMP_DEBUG_ASSERT(limit <= trip);
1719 --limit;
1720 status = 1;
1721 break;
1722 } // if
1723 } // if
1724 } // while (1)
1725 #if KMP_USE_X87CONTROL
1726 /* restore FPCW if necessary
1727 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1728 */
1729 if (fpcwSet && (oldFpcw & fpcwSet))
1730 _control87(oldFpcw, _MCW_PC);
1731 #endif
1732 if (status != 0) {
1733 start = pr->u.p.lb;
1734 incr = pr->u.p.st;
1735 if (p_st != NULL)
1736 *p_st = incr;
1737 *p_lb = start + init * incr;
1738 *p_ub = start + limit * incr;
1739 if (pr->flags.ordered) {
1740 pr->u.p.ordered_lower = init;
1741 pr->u.p.ordered_upper = limit;
1742 }
1743 } else {
1744 *p_lb = 0;
1745 *p_ub = 0;
1746 if (p_st != NULL)
1747 *p_st = 0;
1748 }
1749 } // case
1750 break;
1751
1752 case kmp_sch_trapezoidal: {
1753 UT index;
1754 T parm2 = pr->u.p.parm2;
1755 T parm3 = pr->u.p.parm3;
1756 T parm4 = pr->u.p.parm4;
1757 KD_TRACE(100,
1758 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1759 gtid));
1760
1761 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1762
1763 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1764 trip = pr->u.p.tc - 1;
1765
1766 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1767 *p_lb = 0;
1768 *p_ub = 0;
1769 if (p_st != NULL)
1770 *p_st = 0;
1771 } else {
1772 start = pr->u.p.lb;
1773 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1774 incr = pr->u.p.st;
1775
1776 if ((last = (limit >= trip)) != 0)
1777 limit = trip;
1778
1779 if (p_st != NULL)
1780 *p_st = incr;
1781
1782 if (incr == 1) {
1783 *p_lb = start + init;
1784 *p_ub = start + limit;
1785 } else {
1786 *p_lb = start + init * incr;
1787 *p_ub = start + limit * incr;
1788 }
1789
1790 if (pr->flags.ordered) {
1791 pr->u.p.ordered_lower = init;
1792 pr->u.p.ordered_upper = limit;
1793 } // if
1794 } // if
1795 } // case
1796 break;
1797 default: {
1798 status = 0; // to avoid complaints on uninitialized variable use
1799 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1800 KMP_HNT(GetNewerLibrary), // Hint
1801 __kmp_msg_null // Variadic argument list terminator
1802 );
1803 } break;
1804 } // switch
1805 if (p_last)
1806 *p_last = last;
1807 #ifdef KMP_DEBUG
1808 if (pr->flags.ordered) {
1809 char *buff;
1810 // create format specifiers before the debug output
1811 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1812 "ordered_lower:%%%s ordered_upper:%%%s\n",
1813 traits_t<UT>::spec, traits_t<UT>::spec);
1814 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1815 __kmp_str_free(&buff);
1816 }
1817 {
1818 char *buff;
1819 // create format specifiers before the debug output
1820 buff = __kmp_str_format(
1821 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1822 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1823 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1824 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1825 __kmp_str_free(&buff);
1826 }
1827 #endif
1828 return status;
1829 }
1830
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
// OMPT_LOOP_END expands to a bare `if` statement (no do/while wrapper) and
// relies on two names being visible at the point of expansion: `status` and
// `codeptr`. When `status` is 0 and a work callback is registered, it invokes
// that callback with ompt_work_loop / ompt_scope_end for the current team and
// task. The count argument is passed as 0 (see the TODO below).
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
// TODO: implement count
#else
// Without optional OMPT support the macro expands to nothing.
#define OMPT_LOOP_END // no-op
#endif
1849
#if KMP_STATS_ENABLED
// KMP_STATS_LOOP_END records, via KMP_COUNT_VALUE, how many iterations the
// chunk described by *p_lb, *p_ub and pr->u.p.st contains. It expects
// `status`, `p_lb`, `p_ub` and `pr` to be visible at the point of expansion.
// When `status` is 0 the count is 0 and the partitioned timer is popped.
// An empty range (bounds crossed with respect to the stride's sign) also
// counts as 0 iterations.
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    const kmp_int64 lo_ = (kmp_int64)(*p_lb);                                  \
    const kmp_int64 hi_ = (kmp_int64)(*p_ub);                                  \
    const kmp_int64 step_ = (kmp_int64)(pr->u.p.st);                           \
    kmp_int64 iters_ = 0;                                                      \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (step_ == 1) {                                                   \
      iters_ = (hi_ >= lo_) ? (hi_ - lo_ + 1) : 0;                             \
    } else if (step_ < 0) {                                                    \
      iters_ = (lo_ >= hi_) ? ((lo_ - hi_) / (-step_) + 1) : 0;                \
    } else {                                                                   \
      iters_ = (hi_ >= lo_) ? ((hi_ - lo_) / step_ + 1) : 0;                   \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, iters_);                      \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
1881
1882 template <typename T>
__kmp_dispatch_next(ident_t * loc,int gtid,kmp_int32 * p_last,T * p_lb,T * p_ub,typename traits_t<T>::signed_t * p_st,void * codeptr)1883 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1884 T *p_lb, T *p_ub,
1885 typename traits_t<T>::signed_t *p_st
1886 #if OMPT_SUPPORT && OMPT_OPTIONAL
1887 ,
1888 void *codeptr
1889 #endif
1890 ) {
1891
1892 typedef typename traits_t<T>::unsigned_t UT;
1893 typedef typename traits_t<T>::signed_t ST;
1894 // This is potentially slightly misleading, schedule(runtime) will appear here
1895 // even if the actual runtme schedule is static. (Which points out a
1896 // disadvantage of schedule(runtime): even when static scheduling is used it
1897 // costs more than a compile time choice to use static scheduling would.)
1898 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1899
1900 int status;
1901 dispatch_private_info_template<T> *pr;
1902 kmp_info_t *th = __kmp_threads[gtid];
1903 kmp_team_t *team = th->th.th_team;
1904
1905 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1906 KD_TRACE(
1907 1000,
1908 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1909 gtid, p_lb, p_ub, p_st, p_last));
1910
1911 if (team->t.t_serialized) {
1912 /* NOTE: serialize this dispatch becase we are not at the active level */
1913 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1914 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1915 KMP_DEBUG_ASSERT(pr);
1916
1917 if ((status = (pr->u.p.tc != 0)) == 0) {
1918 *p_lb = 0;
1919 *p_ub = 0;
1920 // if ( p_last != NULL )
1921 // *p_last = 0;
1922 if (p_st != NULL)
1923 *p_st = 0;
1924 if (__kmp_env_consistency_check) {
1925 if (pr->pushed_ws != ct_none) {
1926 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1927 }
1928 }
1929 } else if (pr->flags.nomerge) {
1930 kmp_int32 last;
1931 T start;
1932 UT limit, trip, init;
1933 ST incr;
1934 T chunk = pr->u.p.parm1;
1935
1936 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1937 gtid));
1938
1939 init = chunk * pr->u.p.count++;
1940 trip = pr->u.p.tc - 1;
1941
1942 if ((status = (init <= trip)) == 0) {
1943 *p_lb = 0;
1944 *p_ub = 0;
1945 // if ( p_last != NULL )
1946 // *p_last = 0;
1947 if (p_st != NULL)
1948 *p_st = 0;
1949 if (__kmp_env_consistency_check) {
1950 if (pr->pushed_ws != ct_none) {
1951 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1952 }
1953 }
1954 } else {
1955 start = pr->u.p.lb;
1956 limit = chunk + init - 1;
1957 incr = pr->u.p.st;
1958
1959 if ((last = (limit >= trip)) != 0) {
1960 limit = trip;
1961 #if KMP_OS_WINDOWS
1962 pr->u.p.last_upper = pr->u.p.ub;
1963 #endif /* KMP_OS_WINDOWS */
1964 }
1965 if (p_last != NULL)
1966 *p_last = last;
1967 if (p_st != NULL)
1968 *p_st = incr;
1969 if (incr == 1) {
1970 *p_lb = start + init;
1971 *p_ub = start + limit;
1972 } else {
1973 *p_lb = start + init * incr;
1974 *p_ub = start + limit * incr;
1975 }
1976
1977 if (pr->flags.ordered) {
1978 pr->u.p.ordered_lower = init;
1979 pr->u.p.ordered_upper = limit;
1980 #ifdef KMP_DEBUG
1981 {
1982 char *buff;
1983 // create format specifiers before the debug output
1984 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1985 "ordered_lower:%%%s ordered_upper:%%%s\n",
1986 traits_t<UT>::spec, traits_t<UT>::spec);
1987 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1988 pr->u.p.ordered_upper));
1989 __kmp_str_free(&buff);
1990 }
1991 #endif
1992 } // if
1993 } // if
1994 } else {
1995 pr->u.p.tc = 0;
1996 *p_lb = pr->u.p.lb;
1997 *p_ub = pr->u.p.ub;
1998 #if KMP_OS_WINDOWS
1999 pr->u.p.last_upper = *p_ub;
2000 #endif /* KMP_OS_WINDOWS */
2001 if (p_last != NULL)
2002 *p_last = TRUE;
2003 if (p_st != NULL)
2004 *p_st = pr->u.p.st;
2005 } // if
2006 #ifdef KMP_DEBUG
2007 {
2008 char *buff;
2009 // create format specifiers before the debug output
2010 buff = __kmp_str_format(
2011 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2012 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2013 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2014 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
2015 __kmp_str_free(&buff);
2016 }
2017 #endif
2018 #if INCLUDE_SSC_MARKS
2019 SSC_MARK_DISPATCH_NEXT();
2020 #endif
2021 OMPT_LOOP_END;
2022 KMP_STATS_LOOP_END;
2023 return status;
2024 } else {
2025 kmp_int32 last = 0;
2026 dispatch_shared_info_template<T> volatile *sh;
2027
2028 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2029 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2030
2031 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2032 th->th.th_dispatch->th_dispatch_pr_current);
2033 KMP_DEBUG_ASSERT(pr);
2034 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2035 th->th.th_dispatch->th_dispatch_sh_current);
2036 KMP_DEBUG_ASSERT(sh);
2037
2038 #if KMP_USE_HIER_SCHED
2039 if (pr->flags.use_hier)
2040 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2041 else
2042 #endif // KMP_USE_HIER_SCHED
2043 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2044 p_st, th->th.th_team_nproc,
2045 th->th.th_info.ds.ds_tid);
2046 // status == 0: no more iterations to execute
2047 if (status == 0) {
2048 UT num_done;
2049
2050 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2051 #ifdef KMP_DEBUG
2052 {
2053 char *buff;
2054 // create format specifiers before the debug output
2055 buff = __kmp_str_format(
2056 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2057 traits_t<UT>::spec);
2058 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2059 __kmp_str_free(&buff);
2060 }
2061 #endif
2062
2063 #if KMP_USE_HIER_SCHED
2064 pr->flags.use_hier = FALSE;
2065 #endif
2066 if ((ST)num_done == th->th.th_team_nproc - 1) {
2067 #if (KMP_STATIC_STEAL_ENABLED)
2068 if (pr->schedule == kmp_sch_static_steal &&
2069 traits_t<T>::type_size > 4) {
2070 int i;
2071 kmp_info_t **other_threads = team->t.t_threads;
2072 // loop complete, safe to destroy locks used for stealing
2073 for (i = 0; i < th->th.th_team_nproc; ++i) {
2074 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2075 KMP_ASSERT(lck != NULL);
2076 __kmp_destroy_lock(lck);
2077 __kmp_free(lck);
2078 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2079 }
2080 }
2081 #endif
2082 /* NOTE: release this buffer to be reused */
2083
2084 KMP_MB(); /* Flush all pending memory write invalidates. */
2085
2086 sh->u.s.num_done = 0;
2087 sh->u.s.iteration = 0;
2088
2089 /* TODO replace with general release procedure? */
2090 if (pr->flags.ordered) {
2091 sh->u.s.ordered_iteration = 0;
2092 }
2093
2094 KMP_MB(); /* Flush all pending memory write invalidates. */
2095
2096 sh->buffer_index += __kmp_dispatch_num_buffers;
2097 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2098 gtid, sh->buffer_index));
2099
2100 KMP_MB(); /* Flush all pending memory write invalidates. */
2101
2102 } // if
2103 if (__kmp_env_consistency_check) {
2104 if (pr->pushed_ws != ct_none) {
2105 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2106 }
2107 }
2108
2109 th->th.th_dispatch->th_deo_fcn = NULL;
2110 th->th.th_dispatch->th_dxo_fcn = NULL;
2111 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2112 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2113 } // if (status == 0)
2114 #if KMP_OS_WINDOWS
2115 else if (last) {
2116 pr->u.p.last_upper = pr->u.p.ub;
2117 }
2118 #endif /* KMP_OS_WINDOWS */
2119 if (p_last != NULL && status != 0)
2120 *p_last = last;
2121 } // if
2122
2123 #ifdef KMP_DEBUG
2124 {
2125 char *buff;
2126 // create format specifiers before the debug output
2127 buff = __kmp_str_format(
2128 "__kmp_dispatch_next: T#%%d normal case: "
2129 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2130 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2131 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2132 (p_last ? *p_last : 0), status));
2133 __kmp_str_free(&buff);
2134 }
2135 #endif
2136 #if INCLUDE_SSC_MARKS
2137 SSC_MARK_DISPATCH_NEXT();
2138 #endif
2139 OMPT_LOOP_END;
2140 KMP_STATS_LOOP_END;
2141 return status;
2142 }
2143
2144 template <typename T>
__kmp_dist_get_bounds(ident_t * loc,kmp_int32 gtid,kmp_int32 * plastiter,T * plower,T * pupper,typename traits_t<T>::signed_t incr)2145 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2146 kmp_int32 *plastiter, T *plower, T *pupper,
2147 typename traits_t<T>::signed_t incr) {
2148 typedef typename traits_t<T>::unsigned_t UT;
2149 kmp_uint32 team_id;
2150 kmp_uint32 nteams;
2151 UT trip_count;
2152 kmp_team_t *team;
2153 kmp_info_t *th;
2154
2155 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2156 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2157 #ifdef KMP_DEBUG
2158 typedef typename traits_t<T>::signed_t ST;
2159 {
2160 char *buff;
2161 // create format specifiers before the debug output
2162 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2163 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2164 traits_t<T>::spec, traits_t<T>::spec,
2165 traits_t<ST>::spec, traits_t<T>::spec);
2166 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2167 __kmp_str_free(&buff);
2168 }
2169 #endif
2170
2171 if (__kmp_env_consistency_check) {
2172 if (incr == 0) {
2173 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2174 loc);
2175 }
2176 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2177 // The loop is illegal.
2178 // Some zero-trip loops maintained by compiler, e.g.:
2179 // for(i=10;i<0;++i) // lower >= upper - run-time check
2180 // for(i=0;i>10;--i) // lower <= upper - run-time check
2181 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2182 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2183 // Compiler does not check the following illegal loops:
2184 // for(i=0;i<10;i+=incr) // where incr<0
2185 // for(i=10;i>0;i-=incr) // where incr<0
2186 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2187 }
2188 }
2189 th = __kmp_threads[gtid];
2190 team = th->th.th_team;
2191 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2192 nteams = th->th.th_teams_size.nteams;
2193 team_id = team->t.t_master_tid;
2194 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2195
2196 // compute global trip count
2197 if (incr == 1) {
2198 trip_count = *pupper - *plower + 1;
2199 } else if (incr == -1) {
2200 trip_count = *plower - *pupper + 1;
2201 } else if (incr > 0) {
2202 // upper-lower can exceed the limit of signed type
2203 trip_count = (UT)(*pupper - *plower) / incr + 1;
2204 } else {
2205 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2206 }
2207
2208 if (trip_count <= nteams) {
2209 KMP_DEBUG_ASSERT(
2210 __kmp_static == kmp_sch_static_greedy ||
2211 __kmp_static ==
2212 kmp_sch_static_balanced); // Unknown static scheduling type.
2213 // only some teams get single iteration, others get nothing
2214 if (team_id < trip_count) {
2215 *pupper = *plower = *plower + team_id * incr;
2216 } else {
2217 *plower = *pupper + incr; // zero-trip loop
2218 }
2219 if (plastiter != NULL)
2220 *plastiter = (team_id == trip_count - 1);
2221 } else {
2222 if (__kmp_static == kmp_sch_static_balanced) {
2223 UT chunk = trip_count / nteams;
2224 UT extras = trip_count % nteams;
2225 *plower +=
2226 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2227 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2228 if (plastiter != NULL)
2229 *plastiter = (team_id == nteams - 1);
2230 } else {
2231 T chunk_inc_count =
2232 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2233 T upper = *pupper;
2234 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2235 // Unknown static scheduling type.
2236 *plower += team_id * chunk_inc_count;
2237 *pupper = *plower + chunk_inc_count - incr;
2238 // Check/correct bounds if needed
2239 if (incr > 0) {
2240 if (*pupper < *plower)
2241 *pupper = traits_t<T>::max_value;
2242 if (plastiter != NULL)
2243 *plastiter = *plower <= upper && *pupper > upper - incr;
2244 if (*pupper > upper)
2245 *pupper = upper; // tracker C73258
2246 } else {
2247 if (*pupper > *plower)
2248 *pupper = traits_t<T>::min_value;
2249 if (plastiter != NULL)
2250 *plastiter = *plower >= upper && *pupper < upper - incr;
2251 if (*pupper < upper)
2252 *pupper = upper; // tracker C73258
2253 }
2254 }
2255 }
2256 }
2257
2258 //-----------------------------------------------------------------------------
2259 // Dispatch routines
2260 // Transfer call to template< type T >
2261 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2262 // T lb, T ub, ST st, ST chunk )
2263 extern "C" {
2264
2265 /*!
2266 @ingroup WORK_SHARING
2267 @{
2268 @param loc Source location
2269 @param gtid Global thread id
2270 @param schedule Schedule type
2271 @param lb Lower bound
2272 @param ub Upper bound
2273 @param st Step (or increment if you prefer)
2274 @param chunk The chunk size to block with
2275
2276 This function prepares the runtime to start a dynamically scheduled for loop,
2277 saving the loop arguments.
2278 These functions are all identical apart from the types of the arguments.
2279 */
2280
__kmpc_dispatch_init_4(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int32 lb,kmp_int32 ub,kmp_int32 st,kmp_int32 chunk)2281 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2282 enum sched_type schedule, kmp_int32 lb,
2283 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2284 KMP_DEBUG_ASSERT(__kmp_init_serial);
2285 #if OMPT_SUPPORT && OMPT_OPTIONAL
2286 OMPT_STORE_RETURN_ADDRESS(gtid);
2287 #endif
2288 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2289 }
2290 /*!
2291 See @ref __kmpc_dispatch_init_4
2292 */
__kmpc_dispatch_init_4u(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_uint32 lb,kmp_uint32 ub,kmp_int32 st,kmp_int32 chunk)2293 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2294 enum sched_type schedule, kmp_uint32 lb,
2295 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2296 KMP_DEBUG_ASSERT(__kmp_init_serial);
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298 OMPT_STORE_RETURN_ADDRESS(gtid);
2299 #endif
2300 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2301 }
2302
2303 /*!
2304 See @ref __kmpc_dispatch_init_4
2305 */
__kmpc_dispatch_init_8(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int64 lb,kmp_int64 ub,kmp_int64 st,kmp_int64 chunk)2306 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2307 enum sched_type schedule, kmp_int64 lb,
2308 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2309 KMP_DEBUG_ASSERT(__kmp_init_serial);
2310 #if OMPT_SUPPORT && OMPT_OPTIONAL
2311 OMPT_STORE_RETURN_ADDRESS(gtid);
2312 #endif
2313 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2314 }
2315
2316 /*!
2317 See @ref __kmpc_dispatch_init_4
2318 */
__kmpc_dispatch_init_8u(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_uint64 lb,kmp_uint64 ub,kmp_int64 st,kmp_int64 chunk)2319 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2320 enum sched_type schedule, kmp_uint64 lb,
2321 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2322 KMP_DEBUG_ASSERT(__kmp_init_serial);
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324 OMPT_STORE_RETURN_ADDRESS(gtid);
2325 #endif
2326 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2327 }
2328
2329 /*!
2330 See @ref __kmpc_dispatch_init_4
2331
2332 Difference from __kmpc_dispatch_init set of functions is these functions
2333 are called for composite distribute parallel for construct. Thus before
2334 regular iterations dispatching we need to calc per-team iteration space.
2335
2336 These functions are all identical apart from the types of the arguments.
2337 */
__kmpc_dist_dispatch_init_4(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int32 * p_last,kmp_int32 lb,kmp_int32 ub,kmp_int32 st,kmp_int32 chunk)2338 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2339 enum sched_type schedule, kmp_int32 *p_last,
2340 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2341 kmp_int32 chunk) {
2342 KMP_DEBUG_ASSERT(__kmp_init_serial);
2343 #if OMPT_SUPPORT && OMPT_OPTIONAL
2344 OMPT_STORE_RETURN_ADDRESS(gtid);
2345 #endif
2346 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2347 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2348 }
2349
__kmpc_dist_dispatch_init_4u(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int32 * p_last,kmp_uint32 lb,kmp_uint32 ub,kmp_int32 st,kmp_int32 chunk)2350 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2351 enum sched_type schedule, kmp_int32 *p_last,
2352 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2353 kmp_int32 chunk) {
2354 KMP_DEBUG_ASSERT(__kmp_init_serial);
2355 #if OMPT_SUPPORT && OMPT_OPTIONAL
2356 OMPT_STORE_RETURN_ADDRESS(gtid);
2357 #endif
2358 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2359 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2360 }
2361
__kmpc_dist_dispatch_init_8(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int32 * p_last,kmp_int64 lb,kmp_int64 ub,kmp_int64 st,kmp_int64 chunk)2362 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2363 enum sched_type schedule, kmp_int32 *p_last,
2364 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2365 kmp_int64 chunk) {
2366 KMP_DEBUG_ASSERT(__kmp_init_serial);
2367 #if OMPT_SUPPORT && OMPT_OPTIONAL
2368 OMPT_STORE_RETURN_ADDRESS(gtid);
2369 #endif
2370 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2371 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2372 }
2373
__kmpc_dist_dispatch_init_8u(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int32 * p_last,kmp_uint64 lb,kmp_uint64 ub,kmp_int64 st,kmp_int64 chunk)2374 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2375 enum sched_type schedule, kmp_int32 *p_last,
2376 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2377 kmp_int64 chunk) {
2378 KMP_DEBUG_ASSERT(__kmp_init_serial);
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380 OMPT_STORE_RETURN_ADDRESS(gtid);
2381 #endif
2382 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2383 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2384 }
2385
2386 /*!
2387 @param loc Source code location
2388 @param gtid Global thread id
2389 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2390 otherwise
2391 @param p_lb Pointer to the lower bound for the next chunk of work
2392 @param p_ub Pointer to the upper bound for the next chunk of work
2393 @param p_st Pointer to the stride for the next chunk of work
2394 @return one if there is work to be done, zero otherwise
2395
2396 Get the next dynamically allocated chunk of work for this thread.
2397 If there is no more work, then the lb,ub and stride need not be modified.
2398 */
__kmpc_dispatch_next_4(ident_t * loc,kmp_int32 gtid,kmp_int32 * p_last,kmp_int32 * p_lb,kmp_int32 * p_ub,kmp_int32 * p_st)2399 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2400 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2401 #if OMPT_SUPPORT && OMPT_OPTIONAL
2402 OMPT_STORE_RETURN_ADDRESS(gtid);
2403 #endif
2404 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2405 #if OMPT_SUPPORT && OMPT_OPTIONAL
2406 ,
2407 OMPT_LOAD_RETURN_ADDRESS(gtid)
2408 #endif
2409 );
2410 }
2411
2412 /*!
2413 See @ref __kmpc_dispatch_next_4
2414 */
__kmpc_dispatch_next_4u(ident_t * loc,kmp_int32 gtid,kmp_int32 * p_last,kmp_uint32 * p_lb,kmp_uint32 * p_ub,kmp_int32 * p_st)2415 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2416 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2417 kmp_int32 *p_st) {
2418 #if OMPT_SUPPORT && OMPT_OPTIONAL
2419 OMPT_STORE_RETURN_ADDRESS(gtid);
2420 #endif
2421 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2422 #if OMPT_SUPPORT && OMPT_OPTIONAL
2423 ,
2424 OMPT_LOAD_RETURN_ADDRESS(gtid)
2425 #endif
2426 );
2427 }
2428
2429 /*!
2430 See @ref __kmpc_dispatch_next_4
2431 */
__kmpc_dispatch_next_8(ident_t * loc,kmp_int32 gtid,kmp_int32 * p_last,kmp_int64 * p_lb,kmp_int64 * p_ub,kmp_int64 * p_st)2432 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2433 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2434 #if OMPT_SUPPORT && OMPT_OPTIONAL
2435 OMPT_STORE_RETURN_ADDRESS(gtid);
2436 #endif
2437 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2438 #if OMPT_SUPPORT && OMPT_OPTIONAL
2439 ,
2440 OMPT_LOAD_RETURN_ADDRESS(gtid)
2441 #endif
2442 );
2443 }
2444
2445 /*!
2446 See @ref __kmpc_dispatch_next_4
2447 */
__kmpc_dispatch_next_8u(ident_t * loc,kmp_int32 gtid,kmp_int32 * p_last,kmp_uint64 * p_lb,kmp_uint64 * p_ub,kmp_int64 * p_st)2448 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2449 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2450 kmp_int64 *p_st) {
2451 #if OMPT_SUPPORT && OMPT_OPTIONAL
2452 OMPT_STORE_RETURN_ADDRESS(gtid);
2453 #endif
2454 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2455 #if OMPT_SUPPORT && OMPT_OPTIONAL
2456 ,
2457 OMPT_LOAD_RETURN_ADDRESS(gtid)
2458 #endif
2459 );
2460 }
2461
2462 /*!
2463 @param loc Source code location
2464 @param gtid Global thread id
2465
2466 Mark the end of a dynamic loop.
2467 */
__kmpc_dispatch_fini_4(ident_t * loc,kmp_int32 gtid)2468 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2469 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2470 }
2471
2472 /*!
2473 See @ref __kmpc_dispatch_fini_4
2474 */
__kmpc_dispatch_fini_8(ident_t * loc,kmp_int32 gtid)2475 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2476 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2477 }
2478
2479 /*!
2480 See @ref __kmpc_dispatch_fini_4
2481 */
__kmpc_dispatch_fini_4u(ident_t * loc,kmp_int32 gtid)2482 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2483 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2484 }
2485
2486 /*!
2487 See @ref __kmpc_dispatch_fini_4
2488 */
__kmpc_dispatch_fini_8u(ident_t * loc,kmp_int32 gtid)2489 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2490 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2491 }
2492 /*! @} */
2493
2494 //-----------------------------------------------------------------------------
2495 // Non-template routines from kmp_dispatch.cpp used in other sources
2496
__kmp_eq_4(kmp_uint32 value,kmp_uint32 checker)2497 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2498 return value == checker;
2499 }
2500
__kmp_neq_4(kmp_uint32 value,kmp_uint32 checker)2501 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2502 return value != checker;
2503 }
2504
__kmp_lt_4(kmp_uint32 value,kmp_uint32 checker)2505 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2506 return value < checker;
2507 }
2508
__kmp_ge_4(kmp_uint32 value,kmp_uint32 checker)2509 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2510 return value >= checker;
2511 }
2512
__kmp_le_4(kmp_uint32 value,kmp_uint32 checker)2513 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2514 return value <= checker;
2515 }
2516
2517 kmp_uint32
__kmp_wait_4(volatile kmp_uint32 * spinner,kmp_uint32 checker,kmp_uint32 (* pred)(kmp_uint32,kmp_uint32),void * obj)2518 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2519 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2520 void *obj // Higher-level synchronization object, or NULL.
2521 ) {
2522 // note: we may not belong to a team at this point
2523 volatile kmp_uint32 *spin = spinner;
2524 kmp_uint32 check = checker;
2525 kmp_uint32 spins;
2526 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2527 kmp_uint32 r;
2528
2529 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2530 KMP_INIT_YIELD(spins);
2531 // main wait spin loop
2532 while (!f(r = TCR_4(*spin), check)) {
2533 KMP_FSYNC_SPIN_PREPARE(obj);
2534 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2535 split. It causes problems with infinite recursion because of exit lock */
2536 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2537 __kmp_abort_thread(); */
2538 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2539 }
2540 KMP_FSYNC_SPIN_ACQUIRED(obj);
2541 return r;
2542 }
2543
__kmp_wait_4_ptr(void * spinner,kmp_uint32 checker,kmp_uint32 (* pred)(void *,kmp_uint32),void * obj)2544 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2545 kmp_uint32 (*pred)(void *, kmp_uint32),
2546 void *obj // Higher-level synchronization object, or NULL.
2547 ) {
2548 // note: we may not belong to a team at this point
2549 void *spin = spinner;
2550 kmp_uint32 check = checker;
2551 kmp_uint32 spins;
2552 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2553
2554 KMP_FSYNC_SPIN_INIT(obj, spin);
2555 KMP_INIT_YIELD(spins);
2556 // main wait spin loop
2557 while (!f(spin, check)) {
2558 KMP_FSYNC_SPIN_PREPARE(obj);
2559 /* if we have waited a bit, or are noversubscribed, yield */
2560 /* pause is in the following code */
2561 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2562 }
2563 KMP_FSYNC_SPIN_ACQUIRED(obj);
2564 }
2565
2566 } // extern "C"
2567
2568 #ifdef KMP_GOMP_COMPAT
2569
__kmp_aux_dispatch_init_4(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int32 lb,kmp_int32 ub,kmp_int32 st,kmp_int32 chunk,int push_ws)2570 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2571 enum sched_type schedule, kmp_int32 lb,
2572 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2573 int push_ws) {
2574 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2575 push_ws);
2576 }
2577
__kmp_aux_dispatch_init_4u(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_uint32 lb,kmp_uint32 ub,kmp_int32 st,kmp_int32 chunk,int push_ws)2578 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2579 enum sched_type schedule, kmp_uint32 lb,
2580 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2581 int push_ws) {
2582 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2583 push_ws);
2584 }
2585
__kmp_aux_dispatch_init_8(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_int64 lb,kmp_int64 ub,kmp_int64 st,kmp_int64 chunk,int push_ws)2586 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2587 enum sched_type schedule, kmp_int64 lb,
2588 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2589 int push_ws) {
2590 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2591 push_ws);
2592 }
2593
__kmp_aux_dispatch_init_8u(ident_t * loc,kmp_int32 gtid,enum sched_type schedule,kmp_uint64 lb,kmp_uint64 ub,kmp_int64 st,kmp_int64 chunk,int push_ws)2594 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2595 enum sched_type schedule, kmp_uint64 lb,
2596 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2597 int push_ws) {
2598 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2599 push_ws);
2600 }
2601
__kmp_aux_dispatch_fini_chunk_4(ident_t * loc,kmp_int32 gtid)2602 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2603 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2604 }
2605
__kmp_aux_dispatch_fini_chunk_8(ident_t * loc,kmp_int32 gtid)2606 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2607 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2608 }
2609
__kmp_aux_dispatch_fini_chunk_4u(ident_t * loc,kmp_int32 gtid)2610 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2611 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2612 }
2613
__kmp_aux_dispatch_fini_chunk_8u(ident_t * loc,kmp_int32 gtid)2614 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2615 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2616 }
2617
2618 #endif /* KMP_GOMP_COMPAT */
2619
2620 /* ------------------------------------------------------------------------ */
2621