kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files as it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
132  the caller. KMP_GTID_DNE must be handled at all call sites, or else
133  __kmp_init_gtid must be guaranteed, for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
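 /* Fallback: identify the calling thread by finding which registered
    thread's recorded stack range [stackbase - stacksize, stackbase]
    contains the address of the local variable stack_data. */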
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* get specific to try and determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
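 /* Refine the recorded stack window of this uber thread so that it covers the
    current stack address: either raise the recorded base to stack_addr, or
    extend the recorded size down to stack_addr. */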
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
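 /* The stacks overlap if either end of this thread's stack lies strictly
    inside the other thread's recorded stack range. */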
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
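 /* The caller's format string is spliced (via %s) into a prefix that already
    has p1, p2 and size formatted in; the resulting buffer is then used as the
    format string for the caller's variadic arguments. */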
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  p1 = (char *)p1 + page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS by default abort() causes pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for the DLL, but it is a problem for the
445  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446  help, at least in some versions of MS C RTL.
447 
448  It seems the following sequence is the only way to simulate abort() and
449  avoid pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort just call abort(), it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new sibling threads every time, call
592  * __kmp_get_gtid() here; */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
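  /* Wait until the team's ordered ticket (t_ordered.dt.t_value) equals this
     thread's tid; __kmp_parallel_dxo advances the ticket to the next tid,
     releasing the next thread in turn. */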
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
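  /* Each thread advances its private this_construct counter; the first thread
     to CAS the team's t_construct from the shared old value to its new value
     claims the single region. */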
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
722 /* determine if we can go parallel or must use a serialized parallel region and
723  * how many threads we can use
724  * set_nthreads is the number of threads requested for the team
725  * returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
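 // When dyn-var is off, the empty branch above leaves new_nthreads equal to
 // the requested set_nthreads; otherwise one of the dynamic_mode branches
 // below (load balance, thread limit, or random) may reduce it.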
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked on that
915  earlier within the forkjoin critical section. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's setup the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014 
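  // The distributed barrier keeps its own thread count, so it is updated
  // here along with the team's thread list.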
1015  if (team->t.t_nproc > 1 &&
1016  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1017  team->t.b->update_num_threads(team->t.t_nproc);
1018  __kmp_add_threads_to_team(team, team->t.t_nproc);
1019  }
1020  }
1021 
1022  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1023  for (i = 0; i < team->t.t_nproc; i++) {
1024  kmp_info_t *thr = team->t.t_threads[i];
1025  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1026  thr->th.th_prev_level != team->t.t_level) {
1027  team->t.t_display_affinity = 1;
1028  break;
1029  }
1030  }
1031  }
1032 
1033  KMP_MB();
1034 }
1035 
1036 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1037 // Propagate any changes to the floating point control registers out to the team
1038 // We try to avoid unnecessary writes to the relevant cache line in the team
1039 // structure, so we don't make changes unless they are needed.
1040 inline static void propagateFPControl(kmp_team_t *team) {
1041  if (__kmp_inherit_fp_control) {
1042  kmp_int16 x87_fpu_control_word;
1043  kmp_uint32 mxcsr;
1044 
1045  // Get primary thread's values of FPU control flags (both X87 and vector)
1046  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1047  __kmp_store_mxcsr(&mxcsr);
1048  mxcsr &= KMP_X86_MXCSR_MASK;
1049 
1050  // There is no point looking at t_fp_control_saved here.
1051  // If it is TRUE, we still have to update the values if they are different
1052  // from those we now have. If it is FALSE we didn't save anything yet, but
1053  // our objective is the same. We have to ensure that the values in the team
1054  // are the same as those we have.
1055  // So, this code achieves what we need whether or not t_fp_control_saved is
1056  // true. By checking whether the value needs updating we avoid unnecessary
1057  // writes that would put the cache-line into a written state, causing all
1058  // threads in the team to have to read it again.
1059  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1060  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1061  // Although we don't use this value, other code in the runtime wants to know
1062  // whether it should restore them. So we must ensure it is correct.
1063  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1064  } else {
1065  // Similarly here. Don't write to this cache-line in the team structure
1066  // unless we have to.
1067  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1068  }
1069 }
1070 
1071 // Do the opposite, setting the hardware registers to the updated values from
1072 // the team.
1073 inline static void updateHWFPControl(kmp_team_t *team) {
1074  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1075  // Only reset the fp control regs if they have been changed in the team
1076  // by the parallel region that we are exiting.
1077  kmp_int16 x87_fpu_control_word;
1078  kmp_uint32 mxcsr;
1079  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1080  __kmp_store_mxcsr(&mxcsr);
1081  mxcsr &= KMP_X86_MXCSR_MASK;
1082 
1083  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1084  __kmp_clear_x87_fpu_status_word();
1085  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1086  }
1087 
1088  if (team->t.t_mxcsr != mxcsr) {
1089  __kmp_load_mxcsr(&team->t.t_mxcsr);
1090  }
1091  }
1092 }
1093 #else
1094 #define propagateFPControl(x) ((void)0)
1095 #define updateHWFPControl(x) ((void)0)
1096 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1097 
1098 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1099  int realloc); // forward declaration
1100 
1101 /* Run a parallel region that has been serialized, so it runs only in a team of
1102  the single primary thread. */
1103 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1104  kmp_info_t *this_thr;
1105  kmp_team_t *serial_team;
1106 
1107  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1108 
1109  /* Skip all this code for autopar serialized loops since it results in
1110  unacceptable overhead */
1111  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1112  return;
1113 
1114  if (!TCR_4(__kmp_init_parallel))
1115  __kmp_parallel_initialize();
1116  __kmp_resume_if_soft_paused();
1117 
1118  this_thr = __kmp_threads[global_tid];
1119  serial_team = this_thr->th.th_serial_team;
1120 
1121  /* utilize the serialized team held by this thread */
1122  KMP_DEBUG_ASSERT(serial_team);
1123  KMP_MB();
1124 
1125  if (__kmp_tasking_mode != tskm_immediate_exec) {
1126  KMP_DEBUG_ASSERT(
1127  this_thr->th.th_task_team ==
1128  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1129  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1130  NULL);
1131  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1132  "team %p, new task_team = NULL\n",
1133  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1134  this_thr->th.th_task_team = NULL;
1135  }
1136 
1137  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1138  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1139  proc_bind = proc_bind_false;
1140  } else if (proc_bind == proc_bind_default) {
1141  // No proc_bind clause was specified, so use the current value
1142  // of proc-bind-var for this parallel region.
1143  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1144  }
1145  // Reset for next parallel region
1146  this_thr->th.th_set_proc_bind = proc_bind_default;
1147 
1148 #if OMPT_SUPPORT
1149  ompt_data_t ompt_parallel_data = ompt_data_none;
1150  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1151  if (ompt_enabled.enabled &&
1152  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1153 
1154  ompt_task_info_t *parent_task_info;
1155  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1156 
1157  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1158  if (ompt_enabled.ompt_callback_parallel_begin) {
1159  int team_size = 1;
1160 
1161  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1162  &(parent_task_info->task_data), &(parent_task_info->frame),
1163  &ompt_parallel_data, team_size,
1164  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1165  }
1166  }
1167 #endif // OMPT_SUPPORT
1168 
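 /* Two cases follow: if this thread is not already running on its serial team,
    install it as the current team (allocating a fresh one if the cached serial
    team is already in use); otherwise just bump t_serialized to record another
    nested serialized level. */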
1169  if (this_thr->th.th_team != serial_team) {
1170  // Nested level will be an index in the nested nthreads array
1171  int level = this_thr->th.th_team->t.t_level;
1172 
1173  if (serial_team->t.t_serialized) {
1174  /* this serial team was already used
1175  TODO: increase performance by making these locks more specific */
1176  kmp_team_t *new_team;
1177 
1178  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1179 
1180  new_team =
1181  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1182 #if OMPT_SUPPORT
1183  ompt_parallel_data,
1184 #endif
1185  proc_bind, &this_thr->th.th_current_task->td_icvs,
1186  0 USE_NESTED_HOT_ARG(NULL));
1187  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1188  KMP_ASSERT(new_team);
1189 
1190  /* setup new serialized team and install it */
1191  new_team->t.t_threads[0] = this_thr;
1192  new_team->t.t_parent = this_thr->th.th_team;
1193  serial_team = new_team;
1194  this_thr->th.th_serial_team = serial_team;
1195 
1196  KF_TRACE(
1197  10,
1198  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1199  global_tid, serial_team));
1200 
1201  /* TODO the above breaks the requirement that if we run out of resources,
1202  then we can still guarantee that serialized teams are ok, since we may
1203  need to allocate a new one */
1204  } else {
1205  KF_TRACE(
1206  10,
1207  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1208  global_tid, serial_team));
1209  }
1210 
1211  /* we have to initialize this serial team */
1212  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1213  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1214  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1215  serial_team->t.t_ident = loc;
1216  serial_team->t.t_serialized = 1;
1217  serial_team->t.t_nproc = 1;
1218  serial_team->t.t_parent = this_thr->th.th_team;
1219  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1220  this_thr->th.th_team = serial_team;
1221  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1222 
1223  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1224  this_thr->th.th_current_task));
1225  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1226  this_thr->th.th_current_task->td_flags.executing = 0;
1227 
1228  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1229 
1230  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1231  implicit task for each serialized task represented by
1232  team->t.t_serialized? */
1233  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1234  &this_thr->th.th_current_task->td_parent->td_icvs);
1235 
1236  // Thread value exists in the nested nthreads array for the next nested
1237  // level
1238  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1239  this_thr->th.th_current_task->td_icvs.nproc =
1240  __kmp_nested_nth.nth[level + 1];
1241  }
1242 
1243  if (__kmp_nested_proc_bind.used &&
1244  (level + 1 < __kmp_nested_proc_bind.used)) {
1245  this_thr->th.th_current_task->td_icvs.proc_bind =
1246  __kmp_nested_proc_bind.bind_types[level + 1];
1247  }
1248 
1249 #if USE_DEBUGGER
1250  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1251 #endif
1252  this_thr->th.th_info.ds.ds_tid = 0;
1253 
1254  /* set thread cache values */
1255  this_thr->th.th_team_nproc = 1;
1256  this_thr->th.th_team_master = this_thr;
1257  this_thr->th.th_team_serialized = 1;
1258 
1259  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1260  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1261  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1262 
1263  propagateFPControl(serial_team);
1264 
1265  /* check if we need to allocate dispatch buffers stack */
1266  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1267  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1268  serial_team->t.t_dispatch->th_disp_buffer =
1269  (dispatch_private_info_t *)__kmp_allocate(
1270  sizeof(dispatch_private_info_t));
1271  }
1272  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1273 
1274  KMP_MB();
1275 
1276  } else {
1277  /* this serialized team is already being used,
1278  * that's fine, just add another nested level */
1279  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1280  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1281  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1282  ++serial_team->t.t_serialized;
1283  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1284 
1285  // Nested level will be an index in the nested nthreads array
1286  int level = this_thr->th.th_team->t.t_level;
1287  // Thread value exists in the nested nthreads array for the next nested
1288  // level
1289  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1290  this_thr->th.th_current_task->td_icvs.nproc =
1291  __kmp_nested_nth.nth[level + 1];
1292  }
1293  serial_team->t.t_level++;
1294  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1295  "of serial team %p to %d\n",
1296  global_tid, serial_team, serial_team->t.t_level));
1297 
1298  /* allocate/push dispatch buffers stack */
1299  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1300  {
1301  dispatch_private_info_t *disp_buffer =
1302  (dispatch_private_info_t *)__kmp_allocate(
1303  sizeof(dispatch_private_info_t));
1304  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1305  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1306  }
1307  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1308 
1309  KMP_MB();
1310  }
1311  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1312 
1313  // Perform the display affinity functionality for
1314  // serialized parallel regions
1315  if (__kmp_display_affinity) {
1316  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1317  this_thr->th.th_prev_num_threads != 1) {
1318  // NULL means use the affinity-format-var ICV
1319  __kmp_aux_display_affinity(global_tid, NULL);
1320  this_thr->th.th_prev_level = serial_team->t.t_level;
1321  this_thr->th.th_prev_num_threads = 1;
1322  }
1323  }
1324 
1325  if (__kmp_env_consistency_check)
1326  __kmp_push_parallel(global_tid, NULL);
1327 #if OMPT_SUPPORT
1328  serial_team->t.ompt_team_info.master_return_address = codeptr;
1329  if (ompt_enabled.enabled &&
1330  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1331  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1332  OMPT_GET_FRAME_ADDRESS(0);
1333 
1334  ompt_lw_taskteam_t lw_taskteam;
1335  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1336  &ompt_parallel_data, codeptr);
1337 
1338  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1339  // don't use lw_taskteam after linking. Content was swapped.
1340 
1341  /* OMPT implicit task begin */
1342  if (ompt_enabled.ompt_callback_implicit_task) {
1343  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1344  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1345  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1346  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1347  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1348  __kmp_tid_from_gtid(global_tid);
1349  }
1350 
1351  /* OMPT state */
1352  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1353  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1354  OMPT_GET_FRAME_ADDRESS(0);
1355  }
1356 #endif
1357 }
1358 
1359 // Test if this fork is for a team closely nested in a teams construct
1360 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1361  microtask_t microtask, int level,
1362  int teams_level, kmp_va_list ap) {
1363  return (master_th->th.th_teams_microtask && ap &&
1364  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1365 }
1366 
1367 // Test if this fork is for the teams construct, i.e. to form the outer league
1368 // of teams
1369 static inline bool __kmp_is_entering_teams(int active_level, int level,
1370  int teams_level, kmp_va_list ap) {
1371  return ((ap == NULL && active_level == 0) ||
1372  (ap && teams_level > 0 && teams_level == level));
1373 }
1374 
1375 // AC: This is start of parallel that is nested inside teams construct.
1376 // The team is actual (hot), all workers are ready at the fork barrier.
1377 // No lock needed to initialize the team a bit, then free workers.
1378 static inline int
1379 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1380  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1381  enum fork_context_e call_context, microtask_t microtask,
1382  launch_t invoker, int master_set_numthreads, int level,
1383 #if OMPT_SUPPORT
1384  ompt_data_t ompt_parallel_data, void *return_address,
1385 #endif
1386  kmp_va_list ap) {
1387  void **argv;
1388  int i;
1389 
1390  parent_team->t.t_ident = loc;
1391  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1392  parent_team->t.t_argc = argc;
1393  argv = (void **)parent_team->t.t_argv;
1394  for (i = argc - 1; i >= 0; --i) {
1395  *argv++ = va_arg(kmp_va_deref(ap), void *);
1396  }
1397  // Increment our nested depth levels, but do not increase the serialization
1398  if (parent_team == master_th->th.th_serial_team) {
1399  // AC: we are in serialized parallel
1400  __kmpc_serialized_parallel(loc, gtid);
1401  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1402 
1403  if (call_context == fork_context_gnu) {
1404  // AC: need to decrement t_serialized for enquiry functions to work
1405  // correctly, will restore at join time
1406  parent_team->t.t_serialized--;
1407  return TRUE;
1408  }
1409 
1410 #if OMPD_SUPPORT
1411  parent_team->t.t_pkfn = microtask;
1412 #endif
1413 
1414 #if OMPT_SUPPORT
1415  void *dummy;
1416  void **exit_frame_p;
1417  ompt_data_t *implicit_task_data;
1418  ompt_lw_taskteam_t lw_taskteam;
1419 
1420  if (ompt_enabled.enabled) {
1421  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1422  &ompt_parallel_data, return_address);
1423  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1424 
1425  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1426  // Don't use lw_taskteam after linking. Content was swapped.
1427 
1428  /* OMPT implicit task begin */
1429  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1430  if (ompt_enabled.ompt_callback_implicit_task) {
1431  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1432  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1433  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1434  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1435  }
1436 
1437  /* OMPT state */
1438  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1439  } else {
1440  exit_frame_p = &dummy;
1441  }
1442 #endif
1443 
1444  // AC: need to decrement t_serialized for enquiry functions to work
1445  // correctly, will restore at join time
1446  parent_team->t.t_serialized--;
1447 
1448  {
1449  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1450  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1451  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1452 #if OMPT_SUPPORT
1453  ,
1454  exit_frame_p
1455 #endif
1456  );
1457  }
1458 
1459 #if OMPT_SUPPORT
1460  if (ompt_enabled.enabled) {
1461  *exit_frame_p = NULL;
1462  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1463  if (ompt_enabled.ompt_callback_implicit_task) {
1464  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1465  ompt_scope_end, NULL, implicit_task_data, 1,
1466  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1467  }
1468  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1469  __ompt_lw_taskteam_unlink(master_th);
1470  if (ompt_enabled.ompt_callback_parallel_end) {
1471  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1472  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1473  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1474  }
1475  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1476  }
1477 #endif
1478  return TRUE;
1479  }
1480 
1481  parent_team->t.t_pkfn = microtask;
1482  parent_team->t.t_invoke = invoker;
1483  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1484  parent_team->t.t_active_level++;
1485  parent_team->t.t_level++;
1486  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1487 
1488  // If the threads allocated to the team are less than the thread limit, update
1489  // the thread limit here. th_teams_size.nth is specific to this team nested
1490  // in a teams construct, the team is fully created, and we're about to do
1491  // the actual fork. Best to do this here so that the subsequent uses below
1492  // and in the join have the correct value.
1493  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1494 
1495 #if OMPT_SUPPORT
1496  if (ompt_enabled.enabled) {
1497  ompt_lw_taskteam_t lw_taskteam;
1498  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1499  return_address);
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1501  }
1502 #endif
1503 
1504  /* Change number of threads in the team if requested */
1505  if (master_set_numthreads) { // The parallel has num_threads clause
1506  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1507  // AC: we can only reduce the number of threads dynamically, not increase it
1508  kmp_info_t **other_threads = parent_team->t.t_threads;
1509  // NOTE: if using distributed barrier, we need to run this code block
1510  // even when the team size appears not to have changed from the max.
1511  int old_proc = master_th->th.th_teams_size.nth;
1512  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1513  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1514  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1515  }
1516  parent_team->t.t_nproc = master_set_numthreads;
1517  for (i = 0; i < master_set_numthreads; ++i) {
1518  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1519  }
1520  }
1521  // Keep extra threads hot in the team for possible next parallels
1522  master_th->th.th_set_nproc = 0;
1523  }
1524 
1525 #if USE_DEBUGGER
1526  if (__kmp_debugging) { // Let debugger override number of threads.
1527  int nth = __kmp_omp_num_threads(loc);
1528  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1529  master_set_numthreads = nth;
1530  }
1531  }
1532 #endif
1533 
1534  // Figure out the proc_bind policy for the nested parallel within teams
1535  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1536  // proc_bind_default means don't update
1537  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1538  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1539  proc_bind = proc_bind_false;
1540  } else {
1541  // No proc_bind clause specified; use current proc-bind-var
1542  if (proc_bind == proc_bind_default) {
1543  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1544  }
1545  /* else: The proc_bind policy was specified explicitly on parallel clause.
1546  This overrides proc-bind-var for this parallel region, but does not
1547  change proc-bind-var. */
1548  // Figure the value of proc-bind-var for the child threads.
1549  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1550  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1551  master_th->th.th_current_task->td_icvs.proc_bind)) {
1552  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1553  }
1554  }
1555  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1556  // Need to change the bind-var ICV to correct value for each implicit task
1557  if (proc_bind_icv != proc_bind_default &&
1558  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1559  kmp_info_t **other_threads = parent_team->t.t_threads;
1560  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1561  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1562  }
1563  }
1564  // Reset for next parallel region
1565  master_th->th.th_set_proc_bind = proc_bind_default;
1566 
1567 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1568  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1569  KMP_ITT_DEBUG) &&
1570  __kmp_forkjoin_frames_mode == 3 &&
1571  parent_team->t.t_active_level == 1 // only report frames at level 1
1572  && master_th->th.th_teams_size.nteams == 1) {
1573  kmp_uint64 tmp_time = __itt_get_timestamp();
1574  master_th->th.th_frame_time = tmp_time;
1575  parent_team->t.t_region_time = tmp_time;
1576  }
1577  if (__itt_stack_caller_create_ptr) {
1578  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1579  // create new stack stitching id before entering fork barrier
1580  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1581  }
1582 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1583 #if KMP_AFFINITY_SUPPORTED
1584  __kmp_partition_places(parent_team);
1585 #endif
1586 
1587  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1588  "master_th=%p, gtid=%d\n",
1589  root, parent_team, master_th, gtid));
1590  __kmp_internal_fork(loc, gtid, parent_team);
1591  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1592  "master_th=%p, gtid=%d\n",
1593  root, parent_team, master_th, gtid));
1594 
1595  if (call_context == fork_context_gnu)
1596  return TRUE;
1597 
1598  /* Invoke microtask for PRIMARY thread */
1599  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1600  parent_team->t.t_id, parent_team->t.t_pkfn));
1601 
1602  if (!parent_team->t.t_invoke(gtid)) {
1603  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1604  }
1605  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1606  parent_team->t.t_id, parent_team->t.t_pkfn));
1607  KMP_MB(); /* Flush all pending memory write invalidates. */
1608 
1609  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1610 
1611  return TRUE;
1612 }
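// A user-level sketch (guarded out of compilation) of the construct that takes
// the fork-in-teams path above: a parallel region closely nested inside a
// teams construct, where each team's primary thread forks within its
// already-created team instead of allocating a new one.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp teams num_teams(2) thread_limit(4)
#pragma omp parallel num_threads(4) // nested in teams: fork-in-teams path
  printf("team %d, thread %d\n", omp_get_team_num(), omp_get_thread_num());
  return 0;
}
#endif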
1613 
1614 // Create a serialized parallel region
1615 static inline int
1616 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1617  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1618  kmp_info_t *master_th, kmp_team_t *parent_team,
1619 #if OMPT_SUPPORT
1620  ompt_data_t *ompt_parallel_data, void **return_address,
1621  ompt_data_t **parent_task_data,
1622 #endif
1623  kmp_va_list ap) {
1624  kmp_team_t *team;
1625  int i;
1626  void **argv;
1627 
1628 /* TODO (josh): hypothetical question: what should we do for OS X*? */
1629 #if KMP_OS_LINUX && \
1630  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1631  void *args[argc];
1632 #else
1633  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1634 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1635  KMP_ARCH_AARCH64) */
1636 
1637  KA_TRACE(
1638  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1639 
1640  __kmpc_serialized_parallel(loc, gtid);
1641 
1642 #if OMPD_SUPPORT
1643  master_th->th.th_serial_team->t.t_pkfn = microtask;
1644 #endif
1645 
1646  if (call_context == fork_context_intel) {
1647  /* TODO: have the compiler pass the args directly instead of copying them here */
1648  master_th->th.th_serial_team->t.t_ident = loc;
1649  if (!ap) {
1650  // revert change made in __kmpc_serialized_parallel()
1651  master_th->th.th_serial_team->t.t_level--;
1652 // Get args from parent team for teams construct
1653 
1654 #if OMPT_SUPPORT
1655  void *dummy;
1656  void **exit_frame_p;
1657  ompt_task_info_t *task_info;
1658  ompt_lw_taskteam_t lw_taskteam;
1659 
1660  if (ompt_enabled.enabled) {
1661  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1662  ompt_parallel_data, *return_address);
1663 
1664  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1665  // don't use lw_taskteam after linking. content was swapped
1666  task_info = OMPT_CUR_TASK_INFO(master_th);
1667  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1668  if (ompt_enabled.ompt_callback_implicit_task) {
1669  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1670  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1671  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1672  &(task_info->task_data), 1,
1673  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1674  }
1675 
1676  /* OMPT state */
1677  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1678  } else {
1679  exit_frame_p = &dummy;
1680  }
1681 #endif
1682 
1683  {
1684  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1685  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1686  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1687 #if OMPT_SUPPORT
1688  ,
1689  exit_frame_p
1690 #endif
1691  );
1692  }
1693 
1694 #if OMPT_SUPPORT
1695  if (ompt_enabled.enabled) {
1696  *exit_frame_p = NULL;
1697  if (ompt_enabled.ompt_callback_implicit_task) {
1698  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1699  ompt_scope_end, NULL, &(task_info->task_data), 1,
1700  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1701  }
1702  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1703  __ompt_lw_taskteam_unlink(master_th);
1704  if (ompt_enabled.ompt_callback_parallel_end) {
1705  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1706  ompt_parallel_data, *parent_task_data,
1707  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1708  }
1709  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1710  }
1711 #endif
1712  } else if (microtask == (microtask_t)__kmp_teams_master) {
1713  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1714  team = master_th->th.th_team;
1715  // team->t.t_pkfn = microtask;
1716  team->t.t_invoke = invoker;
1717  __kmp_alloc_argv_entries(argc, team, TRUE);
1718  team->t.t_argc = argc;
1719  argv = (void **)team->t.t_argv;
1720  if (ap) {
1721  for (i = argc - 1; i >= 0; --i)
1722  *argv++ = va_arg(kmp_va_deref(ap), void *);
1723  } else {
1724  for (i = 0; i < argc; ++i)
1725  // Get args from parent team for teams construct
1726  argv[i] = parent_team->t.t_argv[i];
1727  }
1728  // AC: revert change made in __kmpc_serialized_parallel()
1729  // because initial code in teams should have level=0
1730  team->t.t_level--;
1731  // AC: call special invoker for outer "parallel" of teams construct
1732  invoker(gtid);
1733 #if OMPT_SUPPORT
1734  if (ompt_enabled.enabled) {
1735  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1736  if (ompt_enabled.ompt_callback_implicit_task) {
1737  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1738  ompt_scope_end, NULL, &(task_info->task_data), 0,
1739  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1740  }
1741  if (ompt_enabled.ompt_callback_parallel_end) {
1742  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1743  ompt_parallel_data, *parent_task_data,
1744  OMPT_INVOKER(call_context) | ompt_parallel_league,
1745  *return_address);
1746  }
1747  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1748  }
1749 #endif
1750  } else {
1751  argv = args;
1752  for (i = argc - 1; i >= 0; --i)
1753  *argv++ = va_arg(kmp_va_deref(ap), void *);
1754  KMP_MB();
1755 
1756 #if OMPT_SUPPORT
1757  void *dummy;
1758  void **exit_frame_p;
1759  ompt_task_info_t *task_info;
1760  ompt_lw_taskteam_t lw_taskteam;
1761  ompt_data_t *implicit_task_data;
1762 
1763  if (ompt_enabled.enabled) {
1764  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1765  ompt_parallel_data, *return_address);
1766  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767  // don't use lw_taskteam after linking. content was swapped
1768  task_info = OMPT_CUR_TASK_INFO(master_th);
1769  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1770 
1771  /* OMPT implicit task begin */
1772  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1773  if (ompt_enabled.ompt_callback_implicit_task) {
1774  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1777  ompt_task_implicit);
1778  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1779  }
1780 
1781  /* OMPT state */
1782  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783  } else {
1784  exit_frame_p = &dummy;
1785  }
1786 #endif
1787 
1788  {
1789  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1792 #if OMPT_SUPPORT
1793  ,
1794  exit_frame_p
1795 #endif
1796  );
1797  }
1798 
1799 #if OMPT_SUPPORT
1800  if (ompt_enabled.enabled) {
1801  *exit_frame_p = NULL;
1802  if (ompt_enabled.ompt_callback_implicit_task) {
1803  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1804  ompt_scope_end, NULL, &(task_info->task_data), 1,
1805  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1806  }
1807 
1808  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1809  __ompt_lw_taskteam_unlink(master_th);
1810  if (ompt_enabled.ompt_callback_parallel_end) {
1811  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812  ompt_parallel_data, *parent_task_data,
1813  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1814  }
1815  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1816  }
1817 #endif
1818  }
1819  } else if (call_context == fork_context_gnu) {
1820 #if OMPT_SUPPORT
1821  if (ompt_enabled.enabled) {
1822  ompt_lw_taskteam_t lwt;
1823  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1824  *return_address);
1825 
1826  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1827  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1828  }
1829 // don't use lw_taskteam after linking. content was swapped
1830 #endif
1831 
1832  // we were called from GNU native code
1833  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1834  return FALSE;
1835  } else {
1836  KMP_ASSERT2(call_context < fork_context_last,
1837  "__kmp_serial_fork_call: unknown fork_context parameter");
1838  }
1839 
1840  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1841  KMP_MB();
1842  return FALSE;
1843 }
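// An illustrative sketch (excluded from the build) of when the serialized path
// above is taken: the fork resolves to a team of one, e.g. because of
// num_threads(1), an exhausted max-active-levels limit, or a serial library
// mode, and the microtask runs directly on the encountering thread.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel num_threads(1) // team of one: serialized by the runtime
  printf("threads in region: %d\n", omp_get_num_threads()); // prints 1
  return 0;
}
#endif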
1844 
1845 /* most of the work for a fork */
1846 /* return true if we really went parallel, false if serialized */
1847 int __kmp_fork_call(ident_t *loc, int gtid,
1848  enum fork_context_e call_context, // Intel, GNU, ...
1849  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1850  kmp_va_list ap) {
1851  void **argv;
1852  int i;
1853  int master_tid;
1854  int master_this_cons;
1855  kmp_team_t *team;
1856  kmp_team_t *parent_team;
1857  kmp_info_t *master_th;
1858  kmp_root_t *root;
1859  int nthreads;
1860  int master_active;
1861  int master_set_numthreads;
1862  int level;
1863  int active_level;
1864  int teams_level;
1865 #if KMP_NESTED_HOT_TEAMS
1866  kmp_hot_team_ptr_t **p_hot_teams;
1867 #endif
1868  { // KMP_TIME_BLOCK
1869  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1870  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1871 
1872  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1873  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1874  /* Some systems prefer the stack for the root thread(s) to start with */
1875  /* some gap from the parent stack to prevent false sharing. */
1876  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1877  /* These 2 lines below are so this does not get optimized out */
1878  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1879  __kmp_stkpadding += (short)((kmp_int64)dummy);
1880  }
1881 
1882  /* initialize if needed */
1883  KMP_DEBUG_ASSERT(
1884  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1885  if (!TCR_4(__kmp_init_parallel))
1886  __kmp_parallel_initialize();
1887  __kmp_resume_if_soft_paused();
1888 
1889  /* setup current data */
1890  // AC: potentially unsafe, not in sync with library shutdown,
1891  // __kmp_threads can be freed
1892  master_th = __kmp_threads[gtid];
1893 
1894  parent_team = master_th->th.th_team;
1895  master_tid = master_th->th.th_info.ds.ds_tid;
1896  master_this_cons = master_th->th.th_local.this_construct;
1897  root = master_th->th.th_root;
1898  master_active = root->r.r_active;
1899  master_set_numthreads = master_th->th.th_set_nproc;
1900 
1901 #if OMPT_SUPPORT
1902  ompt_data_t ompt_parallel_data = ompt_data_none;
1903  ompt_data_t *parent_task_data;
1904  ompt_frame_t *ompt_frame;
1905  void *return_address = NULL;
1906 
1907  if (ompt_enabled.enabled) {
1908  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1909  NULL, NULL);
1910  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1911  }
1912 #endif
1913 
1914  // Assign affinity to root thread if it hasn't happened yet
1915  __kmp_assign_root_init_mask();
1916 
1917  // Nested level will be an index in the nested nthreads array
1918  level = parent_team->t.t_level;
1919  // used to launch non-serial teams even if nested is not allowed
1920  active_level = parent_team->t.t_active_level;
1921  // needed to check nesting inside the teams
1922  teams_level = master_th->th.th_teams_level;
1923 #if KMP_NESTED_HOT_TEAMS
1924  p_hot_teams = &master_th->th.th_hot_teams;
1925  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1926  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1927  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1928  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1929  // it is either the actual hot team or not needed (when active_level > 0)
1930  (*p_hot_teams)[0].hot_team_nth = 1;
1931  }
1932 #endif
1933 
1934 #if OMPT_SUPPORT
1935  if (ompt_enabled.enabled) {
1936  if (ompt_enabled.ompt_callback_parallel_begin) {
1937  int team_size = master_set_numthreads
1938  ? master_set_numthreads
1939  : get__nproc_2(parent_team, master_tid);
1940  int flags = OMPT_INVOKER(call_context) |
1941  ((microtask == (microtask_t)__kmp_teams_master)
1942  ? ompt_parallel_league
1943  : ompt_parallel_team);
1944  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1945  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1946  return_address);
1947  }
1948  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1949  }
1950 #endif
1951 
1952  master_th->th.th_ident = loc;
1953 
1954  // Parallel closely nested in teams construct:
1955  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1956  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1957  call_context, microtask, invoker,
1958  master_set_numthreads, level,
1959 #if OMPT_SUPPORT
1960  ompt_parallel_data, return_address,
1961 #endif
1962  ap);
1963  } // End parallel closely nested in teams construct
1964 
1965 #if KMP_DEBUG
1966  if (__kmp_tasking_mode != tskm_immediate_exec) {
1967  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1968  parent_team->t.t_task_team[master_th->th.th_task_state]);
1969  }
1970 #endif
1971 
1972  // Need this to happen before we determine the number of threads, not while
1973  // we are allocating the team
1974  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1975 
1976  // Determine the number of threads
1977  int enter_teams =
1978  __kmp_is_entering_teams(active_level, level, teams_level, ap);
1979  if ((!enter_teams &&
1980  (parent_team->t.t_active_level >=
1981  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1982  (__kmp_library == library_serial)) {
1983  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1984  nthreads = 1;
1985  } else {
1986  nthreads = master_set_numthreads
1987  ? master_set_numthreads
1988  // TODO: get nproc directly from current task
1989  : get__nproc_2(parent_team, master_tid);
1990  // Check whether we need to take the forkjoin lock (no need for a
1991  // serialized parallel outside of a teams construct).
1992  if (nthreads > 1) {
1993  /* determine how many new threads we can use */
1994  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1995  /* AC: If we execute teams from parallel region (on host), then teams
1996  should be created but each can only have 1 thread if nesting is
1997  disabled. If teams called from serial region, then teams and their
1998  threads should be created regardless of the nesting setting. */
1999  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2000  nthreads, enter_teams);
2001  if (nthreads == 1) {
2002  // Free lock for single thread execution here; for multi-thread
2003  // execution it will be freed later after team of threads created
2004  // and initialized
2005  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2006  }
2007  }
2008  }
2009  KMP_DEBUG_ASSERT(nthreads > 0);
2010 
2011  // If we temporarily changed the set number of threads then restore it now
2012  master_th->th.th_set_nproc = 0;
2013 
2014  if (nthreads == 1) {
2015  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2016  invoker, master_th, parent_team,
2017 #if OMPT_SUPPORT
2018  &ompt_parallel_data, &return_address,
2019  &parent_task_data,
2020 #endif
2021  ap);
2022  } // if (nthreads == 1)
2023 
2024  // GEH: only modify the executing flag in the case when not serialized
2025  // serialized case is handled in kmpc_serialized_parallel
2026  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2027  "curtask=%p, curtask_max_aclevel=%d\n",
2028  parent_team->t.t_active_level, master_th,
2029  master_th->th.th_current_task,
2030  master_th->th.th_current_task->td_icvs.max_active_levels));
2031  // TODO: GEH - cannot do this assertion because root thread not set up as
2032  // executing
2033  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2034  master_th->th.th_current_task->td_flags.executing = 0;
2035 
2036  if (!master_th->th.th_teams_microtask || level > teams_level) {
2037  /* Increment our nested depth level */
2038  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2039  }
2040 
2041  // See if we need to make a copy of the ICVs.
2042  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2043  if ((level + 1 < __kmp_nested_nth.used) &&
2044  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2045  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2046  } else {
2047  nthreads_icv = 0; // don't update
2048  }
2049 
2050  // Figure out the proc_bind_policy for the new team.
2051  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2052  // proc_bind_default means don't update
2053  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2054  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2055  proc_bind = proc_bind_false;
2056  } else {
2057  // No proc_bind clause specified; use current proc-bind-var for this
2058  // parallel region
2059  if (proc_bind == proc_bind_default) {
2060  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2061  }
2062  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2063  if (master_th->th.th_teams_microtask &&
2064  microtask == (microtask_t)__kmp_teams_master) {
2065  proc_bind = __kmp_teams_proc_bind;
2066  }
2067  /* else: The proc_bind policy was specified explicitly on parallel clause.
2068  This overrides proc-bind-var for this parallel region, but does not
2069  change proc-bind-var. */
2070  // Figure the value of proc-bind-var for the child threads.
2071  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2072  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2073  master_th->th.th_current_task->td_icvs.proc_bind)) {
2074  // Do not modify the proc bind icv for the two teams construct forks
2075  // They just let the proc bind icv pass through
2076  if (!master_th->th.th_teams_microtask ||
2077  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2078  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2079  }
2080  }
2081 
2082  // Reset for next parallel region
2083  master_th->th.th_set_proc_bind = proc_bind_default;
2084 
2085  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2086  kmp_internal_control_t new_icvs;
2087  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2088  new_icvs.next = NULL;
2089  if (nthreads_icv > 0) {
2090  new_icvs.nproc = nthreads_icv;
2091  }
2092  if (proc_bind_icv != proc_bind_default) {
2093  new_icvs.proc_bind = proc_bind_icv;
2094  }
2095 
2096  /* allocate a new parallel team */
2097  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2098  team = __kmp_allocate_team(root, nthreads, nthreads,
2099 #if OMPT_SUPPORT
2100  ompt_parallel_data,
2101 #endif
2102  proc_bind, &new_icvs,
2103  argc USE_NESTED_HOT_ARG(master_th));
2104  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2105  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2106  } else {
2107  /* allocate a new parallel team */
2108  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2109  team = __kmp_allocate_team(root, nthreads, nthreads,
2110 #if OMPT_SUPPORT
2111  ompt_parallel_data,
2112 #endif
2113  proc_bind,
2114  &master_th->th.th_current_task->td_icvs,
2115  argc USE_NESTED_HOT_ARG(master_th));
2116  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2117  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2118  &master_th->th.th_current_task->td_icvs);
2119  }
2120  KF_TRACE(
2121  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2122 
2123  /* setup the new team */
2124  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2125  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2126  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2127  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2128  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2129 #if OMPT_SUPPORT
2130  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2131  return_address);
2132 #endif
2133  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2134  // TODO: parent_team->t.t_level == INT_MAX ???
2135  if (!master_th->th.th_teams_microtask || level > teams_level) {
2136  int new_level = parent_team->t.t_level + 1;
2137  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2138  new_level = parent_team->t.t_active_level + 1;
2139  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2140  } else {
2141  // AC: Do not increase parallel level at start of the teams construct
2142  int new_level = parent_team->t.t_level;
2143  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2144  new_level = parent_team->t.t_active_level;
2145  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2146  }
2147  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2148  // set primary thread's schedule as new run-time schedule
2149  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2150 
2151  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2152  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2153 
2154  // Update the floating point rounding in the team if required.
2155  propagateFPControl(team);
2156 #if OMPD_SUPPORT
2157  if (ompd_state & OMPD_ENABLE_BP)
2158  ompd_bp_parallel_begin();
2159 #endif
2160 
2161  if (__kmp_tasking_mode != tskm_immediate_exec) {
2162  // Set primary thread's task team to team's task team. Unless this is hot
2163  // team, it should be NULL.
2164  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2165  parent_team->t.t_task_team[master_th->th.th_task_state]);
2166  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2167  "%p, new task_team %p / team %p\n",
2168  __kmp_gtid_from_thread(master_th),
2169  master_th->th.th_task_team, parent_team,
2170  team->t.t_task_team[master_th->th.th_task_state], team));
2171 
2172  if (active_level || master_th->th.th_task_team) {
2173  // Take a memo of primary thread's task_state
2174  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2175  if (master_th->th.th_task_state_top >=
2176  master_th->th.th_task_state_stack_sz) { // increase size
2177  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2178  kmp_uint8 *old_stack, *new_stack;
2179  kmp_uint32 i;
2180  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2181  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2182  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2183  }
2184  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2185  ++i) { // zero-init rest of stack
2186  new_stack[i] = 0;
2187  }
2188  old_stack = master_th->th.th_task_state_memo_stack;
2189  master_th->th.th_task_state_memo_stack = new_stack;
2190  master_th->th.th_task_state_stack_sz = new_size;
2191  __kmp_free(old_stack);
2192  }
2193  // Store primary thread's task_state on stack
2194  master_th->th
2195  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2196  master_th->th.th_task_state;
2197  master_th->th.th_task_state_top++;
2198 #if KMP_NESTED_HOT_TEAMS
2199  if (master_th->th.th_hot_teams &&
2200  active_level < __kmp_hot_teams_max_level &&
2201  team == master_th->th.th_hot_teams[active_level].hot_team) {
2202  // Restore primary thread's nested state if nested hot team
2203  master_th->th.th_task_state =
2204  master_th->th
2205  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2206  } else {
2207 #endif
2208  master_th->th.th_task_state = 0;
2209 #if KMP_NESTED_HOT_TEAMS
2210  }
2211 #endif
2212  }
2213 #if !KMP_NESTED_HOT_TEAMS
2214  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2215  (team == root->r.r_hot_team));
2216 #endif
2217  }
2218 
2219  KA_TRACE(
2220  20,
2221  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2222  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2223  team->t.t_nproc));
2224  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2225  (team->t.t_master_tid == 0 &&
2226  (team->t.t_parent == root->r.r_root_team ||
2227  team->t.t_parent->t.t_serialized)));
2228  KMP_MB();
2229 
2230  /* now, setup the arguments */
2231  argv = (void **)team->t.t_argv;
2232  if (ap) {
2233  for (i = argc - 1; i >= 0; --i) {
2234  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2235  KMP_CHECK_UPDATE(*argv, new_argv);
2236  argv++;
2237  }
2238  } else {
2239  for (i = 0; i < argc; ++i) {
2240  // Get args from parent team for teams construct
2241  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2242  }
2243  }
2244 
2245  /* now actually fork the threads */
2246  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2247  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2248  root->r.r_active = TRUE;
2249 
2250  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2251  __kmp_setup_icv_copy(team, nthreads,
2252  &master_th->th.th_current_task->td_icvs, loc);
2253 
2254 #if OMPT_SUPPORT
2255  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2256 #endif
2257 
2258  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2259 
2260 #if USE_ITT_BUILD
2261  if (team->t.t_active_level == 1 // only report frames at level 1
2262  && !master_th->th.th_teams_microtask) { // not in teams construct
2263 #if USE_ITT_NOTIFY
2264  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2265  (__kmp_forkjoin_frames_mode == 3 ||
2266  __kmp_forkjoin_frames_mode == 1)) {
2267  kmp_uint64 tmp_time = 0;
2268  if (__itt_get_timestamp_ptr)
2269  tmp_time = __itt_get_timestamp();
2270  // Internal fork - report frame begin
2271  master_th->th.th_frame_time = tmp_time;
2272  if (__kmp_forkjoin_frames_mode == 3)
2273  team->t.t_region_time = tmp_time;
2274  } else
2275 // only one notification scheme (either "submit" or "forking/joined", not both)
2276 #endif /* USE_ITT_NOTIFY */
2277  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2278  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2279  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2280  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2281  }
2282  }
2283 #endif /* USE_ITT_BUILD */
2284 
2285  /* now go on and do the work */
2286  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2287  KMP_MB();
2288  KF_TRACE(10,
2289  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2290  root, team, master_th, gtid));
2291 
2292 #if USE_ITT_BUILD
2293  if (__itt_stack_caller_create_ptr) {
2294  // create new stack stitching id before entering fork barrier
2295  if (!enter_teams) {
2296  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2297  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2298  } else if (parent_team->t.t_serialized) {
2299  // keep stack stitching id in the serialized parent_team;
2300  // current team will be used for parallel inside the teams;
2301  // if parent_team is active, then it already keeps stack stitching id
2302  // for the league of teams
2303  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2304  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2305  }
2306  }
2307 #endif /* USE_ITT_BUILD */
2308 
2309  // AC: skip __kmp_internal_fork at teams construct, let only primary
2310  // threads execute
2311  if (ap) {
2312  __kmp_internal_fork(loc, gtid, team);
2313  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2314  "master_th=%p, gtid=%d\n",
2315  root, team, master_th, gtid));
2316  }
2317 
2318  if (call_context == fork_context_gnu) {
2319  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2320  return TRUE;
2321  }
2322 
2323  /* Invoke microtask for PRIMARY thread */
2324  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2325  team->t.t_id, team->t.t_pkfn));
2326  } // END of timer KMP_fork_call block
2327 
2328 #if KMP_STATS_ENABLED
2329  // If beginning a teams construct, then change thread state
2330  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2331  if (!ap) {
2332  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2333  }
2334 #endif
2335 
2336  if (!team->t.t_invoke(gtid)) {
2337  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2338  }
2339 
2340 #if KMP_STATS_ENABLED
2341  // If was beginning of a teams construct, then reset thread state
2342  if (!ap) {
2343  KMP_SET_THREAD_STATE(previous_state);
2344  }
2345 #endif
2346 
2347  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2348  team->t.t_id, team->t.t_pkfn));
2349  KMP_MB(); /* Flush all pending memory write invalidates. */
2350 
2351  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352 #if OMPT_SUPPORT
2353  if (ompt_enabled.enabled) {
2354  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2355  }
2356 #endif
2357 
2358  return TRUE;
2359 }
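// For orientation, an illustrative sketch (guarded out of compilation):
// __kmp_fork_call() is reached from the compiler-generated entry point for a
// parallel construct (__kmpc_fork_call in kmp_csupport.cpp). The lowering shown
// in the comment below is schematic, not the exact code the compiler emits.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  int n = 4;
  omp_set_num_threads(n);
  // Roughly lowered to:
  //   __kmpc_fork_call(&loc, /*argc=*/1, (kmpc_micro)outlined_body, &n);
  // where outlined_body is the compiler-outlined microtask for the region.
#pragma omp parallel
  printf("hello from thread %d of %d\n", omp_get_thread_num(),
         omp_get_num_threads());
  return 0;
}
#endif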
2360 
2361 #if OMPT_SUPPORT
2362 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2363  kmp_team_t *team) {
2364  // restore state outside the region
2365  thread->th.ompt_thread_info.state =
2366  ((team->t.t_serialized) ? ompt_state_work_serial
2367  : ompt_state_work_parallel);
2368 }
2369 
2370 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2371  kmp_team_t *team, ompt_data_t *parallel_data,
2372  int flags, void *codeptr) {
2373  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2374  if (ompt_enabled.ompt_callback_parallel_end) {
2375  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2376  parallel_data, &(task_info->task_data), flags, codeptr);
2377  }
2378 
2379  task_info->frame.enter_frame = ompt_data_none;
2380  __kmp_join_restore_state(thread, team);
2381 }
2382 #endif
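// An illustrative sketch (excluded from the build): a minimal first-party OMPT
// tool that observes the parallel-end event dispatched by __kmp_join_ompt()
// above. It uses only the standard omp-tools.h interface; it would be loaded
// via OMP_TOOL_LIBRARIES or linked into the application.
#if 0
#include <omp-tools.h>
#include <stdio.h>

static void on_parallel_end(ompt_data_t *parallel_data,
                            ompt_data_t *encountering_task_data, int flags,
                            const void *codeptr_ra) {
  printf("parallel region ended, flags=0x%x\n", flags);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_parallel_end, (ompt_callback_t)on_parallel_end);
  return 1; // nonzero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) {}

ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_initialize, tool_finalize,
                                            {0}};
  return &result;
}
#endif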
2383 
2384 void __kmp_join_call(ident_t *loc, int gtid
2385 #if OMPT_SUPPORT
2386  ,
2387  enum fork_context_e fork_context
2388 #endif
2389  ,
2390  int exit_teams) {
2391  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2392  kmp_team_t *team;
2393  kmp_team_t *parent_team;
2394  kmp_info_t *master_th;
2395  kmp_root_t *root;
2396  int master_active;
2397 
2398  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2399 
2400  /* setup current data */
2401  master_th = __kmp_threads[gtid];
2402  root = master_th->th.th_root;
2403  team = master_th->th.th_team;
2404  parent_team = team->t.t_parent;
2405 
2406  master_th->th.th_ident = loc;
2407 
2408 #if OMPT_SUPPORT
2409  void *team_microtask = (void *)team->t.t_pkfn;
2410  // For the GOMP interface with a serialized parallel region, we need
2411  // __kmpc_end_serialized_parallel to invoke the hooks for the OMPT
2412  // end-implicit-task and end-parallel events.
2413  if (ompt_enabled.enabled &&
2414  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2415  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2416  }
2417 #endif
2418 
2419 #if KMP_DEBUG
2420  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2421  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2422  "th_task_team = %p\n",
2423  __kmp_gtid_from_thread(master_th), team,
2424  team->t.t_task_team[master_th->th.th_task_state],
2425  master_th->th.th_task_team));
2426  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2427  team->t.t_task_team[master_th->th.th_task_state]);
2428  }
2429 #endif
2430 
2431  if (team->t.t_serialized) {
2432  if (master_th->th.th_teams_microtask) {
2433  // We are in teams construct
2434  int level = team->t.t_level;
2435  int tlevel = master_th->th.th_teams_level;
2436  if (level == tlevel) {
2437  // AC: we haven't incremented it earlier at start of teams construct,
2438  // so do it here - at the end of teams construct
2439  team->t.t_level++;
2440  } else if (level == tlevel + 1) {
2441  // AC: we are exiting parallel inside teams, need to increment
2442  // serialization in order to restore it in the next call to
2443  // __kmpc_end_serialized_parallel
2444  team->t.t_serialized++;
2445  }
2446  }
2447  __kmpc_end_serialized_parallel(loc, gtid);
2448 
2449 #if OMPT_SUPPORT
2450  if (ompt_enabled.enabled) {
2451  if (fork_context == fork_context_gnu) {
2452  __ompt_lw_taskteam_unlink(master_th);
2453  }
2454  __kmp_join_restore_state(master_th, parent_team);
2455  }
2456 #endif
2457 
2458  return;
2459  }
2460 
2461  master_active = team->t.t_master_active;
2462 
2463  if (!exit_teams) {
2464  // AC: No barrier for internal teams at exit from teams construct.
2465  // But there is barrier for external team (league).
2466  __kmp_internal_join(loc, gtid, team);
2467 #if USE_ITT_BUILD
2468  if (__itt_stack_caller_create_ptr) {
2469  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2470  // destroy the stack stitching id after join barrier
2471  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2472  team->t.t_stack_id = NULL;
2473  }
2474 #endif
2475  } else {
2476  master_th->th.th_task_state =
2477  0; // AC: no tasking in teams (out of any parallel)
2478 #if USE_ITT_BUILD
2479  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2480  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2481  // destroy the stack stitching id on exit from the teams construct
2482  // if parent_team is active, then the id will be destroyed later on
2483  // by master of the league of teams
2484  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2485  parent_team->t.t_stack_id = NULL;
2486  }
2487 #endif
2488  }
2489 
2490  KMP_MB();
2491 
2492 #if OMPT_SUPPORT
2493  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2494  void *codeptr = team->t.ompt_team_info.master_return_address;
2495 #endif
2496 
2497 #if USE_ITT_BUILD
2498  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2499  if (team->t.t_active_level == 1 &&
2500  (!master_th->th.th_teams_microtask || /* not in teams construct */
2501  master_th->th.th_teams_size.nteams == 1)) {
2502  master_th->th.th_ident = loc;
2503  // only one notification scheme (either "submit" or "forking/joined", not
2504  // both)
2505  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2506  __kmp_forkjoin_frames_mode == 3)
2507  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2508  master_th->th.th_frame_time, 0, loc,
2509  master_th->th.th_team_nproc, 1);
2510  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2511  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2512  __kmp_itt_region_joined(gtid);
2513  } // active_level == 1
2514 #endif /* USE_ITT_BUILD */
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517  if (!exit_teams) {
2518  // Restore master thread's partition.
2519  master_th->th.th_first_place = team->t.t_first_place;
2520  master_th->th.th_last_place = team->t.t_last_place;
2521  }
2522 #endif // KMP_AFFINITY_SUPPORTED
2523 
2524  if (master_th->th.th_teams_microtask && !exit_teams &&
2525  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2526  team->t.t_level == master_th->th.th_teams_level + 1) {
2527 // AC: We need to leave the team structure intact at the end of parallel
2528 // inside the teams construct, so that at the next parallel same (hot) team
2529 // works, only adjust nesting levels
2530 #if OMPT_SUPPORT
2531  ompt_data_t ompt_parallel_data = ompt_data_none;
2532  if (ompt_enabled.enabled) {
2533  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2534  if (ompt_enabled.ompt_callback_implicit_task) {
2535  int ompt_team_size = team->t.t_nproc;
2536  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2537  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2538  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2539  }
2540  task_info->frame.exit_frame = ompt_data_none;
2541  task_info->task_data = ompt_data_none;
2542  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2543  __ompt_lw_taskteam_unlink(master_th);
2544  }
2545 #endif
2546  /* Decrement our nested depth level */
2547  team->t.t_level--;
2548  team->t.t_active_level--;
2549  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2550 
2551  // Restore number of threads in the team if needed. This code relies on
2552  // the proper adjustment of th_teams_size.nth after the fork in
2553  // __kmp_teams_master on each teams primary thread in the case that
2554  // __kmp_reserve_threads reduced it.
2555  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2556  int old_num = master_th->th.th_team_nproc;
2557  int new_num = master_th->th.th_teams_size.nth;
2558  kmp_info_t **other_threads = team->t.t_threads;
2559  team->t.t_nproc = new_num;
2560  for (int i = 0; i < old_num; ++i) {
2561  other_threads[i]->th.th_team_nproc = new_num;
2562  }
2563  // Adjust states of non-used threads of the team
2564  // Adjust the state of the team's unused threads
2565  // Re-initialize thread's barrier data.
2566  KMP_DEBUG_ASSERT(other_threads[i]);
2567  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2568  for (int b = 0; b < bs_last_barrier; ++b) {
2569  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2570  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2571 #if USE_DEBUGGER
2572  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2573 #endif
2574  }
2575  if (__kmp_tasking_mode != tskm_immediate_exec) {
2576  // Synchronize thread's task state
2577  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2578  }
2579  }
2580  }
2581 
2582 #if OMPT_SUPPORT
2583  if (ompt_enabled.enabled) {
2584  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2585  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2586  }
2587 #endif
2588 
2589  return;
2590  }
2591 
2592  /* do cleanup and restore the parent team */
2593  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2594  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2595 
2596  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2597 
2598  /* jc: The following lock has instructions with REL and ACQ semantics,
2599  separating the parallel user code called in this parallel region
2600  from the serial user code called after this function returns. */
2601  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2602 
2603  if (!master_th->th.th_teams_microtask ||
2604  team->t.t_level > master_th->th.th_teams_level) {
2605  /* Decrement our nested depth level */
2606  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2607  }
2608  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2609 
2610 #if OMPT_SUPPORT
2611  if (ompt_enabled.enabled) {
2612  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2613  if (ompt_enabled.ompt_callback_implicit_task) {
2614  int flags = (team_microtask == (void *)__kmp_teams_master)
2615  ? ompt_task_initial
2616  : ompt_task_implicit;
2617  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2618  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2619  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2620  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2621  }
2622  task_info->frame.exit_frame = ompt_data_none;
2623  task_info->task_data = ompt_data_none;
2624  }
2625 #endif
2626 
2627  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2628  master_th, team));
2629  __kmp_pop_current_task_from_thread(master_th);
2630 
2631  master_th->th.th_def_allocator = team->t.t_def_allocator;
2632 
2633 #if OMPD_SUPPORT
2634  if (ompd_state & OMPD_ENABLE_BP)
2635  ompd_bp_parallel_end();
2636 #endif
2637  updateHWFPControl(team);
2638 
2639  if (root->r.r_active != master_active)
2640  root->r.r_active = master_active;
2641 
2642  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2643  master_th)); // this will free worker threads
2644 
2645  /* This race was fun to find. Keep the following inside the critical
2646  region, otherwise assertions may occasionally fail: the old team may be
2647  reallocated and the hierarchy would then appear inconsistent. It is actually
2648  safe to run and won't cause any bugs, but it will trigger those assertion
2649  failures. It's only one deref & assign, so keep it in the critical region */
2650  master_th->th.th_team = parent_team;
2651  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2652  master_th->th.th_team_master = parent_team->t.t_threads[0];
2653  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2654 
2655  /* restore serialized team, if need be */
2656  if (parent_team->t.t_serialized &&
2657  parent_team != master_th->th.th_serial_team &&
2658  parent_team != root->r.r_root_team) {
2659  __kmp_free_team(root,
2660  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2661  master_th->th.th_serial_team = parent_team;
2662  }
2663 
2664  if (__kmp_tasking_mode != tskm_immediate_exec) {
2665  if (master_th->th.th_task_state_top >
2666  0) { // Restore task state from memo stack
2667  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2668  // Remember primary thread's state if we re-use this nested hot team
2669  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2670  master_th->th.th_task_state;
2671  --master_th->th.th_task_state_top; // pop
2672  // Now restore state at this level
2673  master_th->th.th_task_state =
2674  master_th->th
2675  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2676  } else if (team != root->r.r_hot_team) {
2677  // Reset the task state of the primary thread if we are not the hot team,
2678  // because in that case all the worker threads will be freed and their task
2679  // state will be reset. If the primary's state is not reset as well, the
2680  // task state becomes inconsistent.
2681  master_th->th.th_task_state = 0;
2682  }
2683  // Copy the task team from the parent team to the primary thread
2684  master_th->th.th_task_team =
2685  parent_team->t.t_task_team[master_th->th.th_task_state];
2686  KA_TRACE(20,
2687  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2688  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2689  parent_team));
2690  }
2691 
2692  // TODO: GEH - cannot do this assertion because root thread not set up as
2693  // executing
2694  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2695  master_th->th.th_current_task->td_flags.executing = 1;
2696 
2697  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2698 
2699 #if KMP_AFFINITY_SUPPORTED
2700  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2701  __kmp_reset_root_init_mask(gtid);
2702  }
2703 #endif
2704 #if OMPT_SUPPORT
2705  int flags =
2706  OMPT_INVOKER(fork_context) |
2707  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2708  : ompt_parallel_team);
2709  if (ompt_enabled.enabled) {
2710  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2711  codeptr);
2712  }
2713 #endif
2714 
2715  KMP_MB();
2716  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2717 }
2718 
2719 /* Check whether we should push an internal control record onto the
2720  serial team stack. If so, do it. */
2721 void __kmp_save_internal_controls(kmp_info_t *thread) {
2722 
2723  if (thread->th.th_team != thread->th.th_serial_team) {
2724  return;
2725  }
2726  if (thread->th.th_team->t.t_serialized > 1) {
2727  int push = 0;
2728 
2729  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2730  push = 1;
2731  } else {
2732  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2733  thread->th.th_team->t.t_serialized) {
2734  push = 1;
2735  }
2736  }
2737  if (push) { /* push a record on the serial team's stack */
2738  kmp_internal_control_t *control =
2739  (kmp_internal_control_t *)__kmp_allocate(
2740  sizeof(kmp_internal_control_t));
2741 
2742  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2743 
2744  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2745 
2746  control->next = thread->th.th_team->t.t_control_stack_top;
2747  thread->th.th_team->t.t_control_stack_top = control;
2748  }
2749  }
2750 }
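// An illustrative sketch (guarded out of compilation) of the behavior this
// bookkeeping supports. A single serial team is reused across nested
// serialized levels, so per-level ICV snapshots are kept on this stack (pushed
// once t_serialized > 1) and are expected to be restored in
// __kmpc_end_serialized_parallel. The user-visible effect is the usual ICV
// scoping for serialized regions:
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(1); // nested regions below will be serialized
  omp_set_num_threads(4);
#pragma omp parallel num_threads(2)
  {
#pragma omp parallel // inactive (serialized) nested region
    omp_set_num_threads(16); // scoped to the serialized region's implicit task
  }
  printf("nthreads-var outside: %d\n", omp_get_max_threads()); // still 4
  return 0;
}
#endif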
2751 
2752 /* Changes set_nproc */
2753 void __kmp_set_num_threads(int new_nth, int gtid) {
2754  kmp_info_t *thread;
2755  kmp_root_t *root;
2756 
2757  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2758  KMP_DEBUG_ASSERT(__kmp_init_serial);
2759 
2760  if (new_nth < 1)
2761  new_nth = 1;
2762  else if (new_nth > __kmp_max_nth)
2763  new_nth = __kmp_max_nth;
2764 
2765  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2766  thread = __kmp_threads[gtid];
2767  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2768  return; // nothing to do
2769 
2770  __kmp_save_internal_controls(thread);
2771 
2772  set__nproc(thread, new_nth);
2773 
2774  // If this omp_set_num_threads() call will cause the hot team size to be
2775  // reduced (in the absence of a num_threads clause), then reduce it now,
2776  // rather than waiting for the next parallel region.
2777  root = thread->th.th_root;
2778  if (__kmp_init_parallel && (!root->r.r_active) &&
2779  (root->r.r_hot_team->t.t_nproc > new_nth)
2780 #if KMP_NESTED_HOT_TEAMS
2781  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2782 #endif
2783  ) {
2784  kmp_team_t *hot_team = root->r.r_hot_team;
2785  int f;
2786 
2787  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2788 
2789  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2790  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2791  }
2792  // Release the extra threads we don't need any more.
2793  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2794  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2795  if (__kmp_tasking_mode != tskm_immediate_exec) {
2796  // When decreasing team size, threads no longer in the team should unref
2797  // task team.
2798  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2799  }
2800  __kmp_free_thread(hot_team->t.t_threads[f]);
2801  hot_team->t.t_threads[f] = NULL;
2802  }
2803  hot_team->t.t_nproc = new_nth;
2804 #if KMP_NESTED_HOT_TEAMS
2805  if (thread->th.th_hot_teams) {
2806  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2807  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2808  }
2809 #endif
2810 
2811  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2812  hot_team->t.b->update_num_threads(new_nth);
2813  __kmp_add_threads_to_team(hot_team, new_nth);
2814  }
2815 
2816  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2817 
2818  // Update the t_nproc field in the threads that are still active.
2819  for (f = 0; f < new_nth; f++) {
2820  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2821  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2822  }
2823  // Special flag to mark that the team size was changed by omp_set_num_threads()
2824  hot_team->t.t_size_changed = -1;
2825  }
2826 }
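// A user-level sketch (excluded from the build): the standard
// omp_set_num_threads() entry point (see kmp_ftn_entry.h) is expected to land
// in __kmp_set_num_threads() above; a shrinking request may release surplus
// hot-team threads immediately rather than at the next parallel region.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel // first region creates the hot team at the default size
  {}
  omp_set_num_threads(2); // may shrink the hot team right away (see above)
#pragma omp parallel
  {
#pragma omp single
    printf("second region ran with %d threads\n", omp_get_num_threads());
  }
  return 0;
}
#endif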
2827 
2828 /* Changes max_active_levels */
2829 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2830  kmp_info_t *thread;
2831 
2832  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2833  "%d = (%d)\n",
2834  gtid, max_active_levels));
2835  KMP_DEBUG_ASSERT(__kmp_init_serial);
2836 
2837  // validate max_active_levels
2838  if (max_active_levels < 0) {
2839  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2840  // We ignore this call if the user has specified a negative value.
2841  // The current setting won't be changed. The last valid setting will be
2842  // used. A warning will be issued (if warnings are allowed as controlled by
2843  // the KMP_WARNINGS env var).
2844  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2845  "max_active_levels for thread %d = (%d)\n",
2846  gtid, max_active_levels));
2847  return;
2848  }
2849  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2850  // it's OK, max_active_levels is within the valid range:
2851  // [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2852  // A zero value is allowed (implementation-defined behavior).
2853  } else {
2854  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2855  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2856  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2857  // The current upper limit is MAX_INT (implementation-defined behavior).
2858  // If the input exceeds the upper limit, we clamp it to the upper limit
2859  // (implementation-defined behavior).
2860  // In practice this branch is unreachable while the upper limit is MAX_INT.
2861  }
2862  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2863  "max_active_levels for thread %d = (%d)\n",
2864  gtid, max_active_levels));
2865 
2866  thread = __kmp_threads[gtid];
2867 
2868  __kmp_save_internal_controls(thread);
2869 
2870  set__max_active_levels(thread, max_active_levels);
2871 }
2872 
2873 /* Gets max_active_levels */
2874 int __kmp_get_max_active_levels(int gtid) {
2875  kmp_info_t *thread;
2876 
2877  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2878  KMP_DEBUG_ASSERT(__kmp_init_serial);
2879 
2880  thread = __kmp_threads[gtid];
2881  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2882  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2883  "curtask_maxaclevel=%d\n",
2884  gtid, thread->th.th_current_task,
2885  thread->th.th_current_task->td_icvs.max_active_levels));
2886  return thread->th.th_current_task->td_icvs.max_active_levels;
2887 }
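// An illustrative sketch (guarded out of compilation):
// omp_set_max_active_levels() and omp_get_max_active_levels() are the standard
// routines expected to reach the two functions above, controlling the per-task
// max-active-levels-var ICV.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(2); // allow two active (non-serialized) levels
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(2) // level 2 is still active
  {
#pragma omp single
    printf("active level %d, limit %d\n", omp_get_active_level(),
           omp_get_max_active_levels());
  }
  return 0;
}
#endif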
2888 
2889 // nteams-var per-device ICV
2890 void __kmp_set_num_teams(int num_teams) {
2891  if (num_teams > 0)
2892  __kmp_nteams = num_teams;
2893 }
2894 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2895 // teams-thread-limit-var per-device ICV
2896 void __kmp_set_teams_thread_limit(int limit) {
2897  if (limit > 0)
2898  __kmp_teams_thread_limit = limit;
2899 }
2900 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
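// An illustrative sketch (excluded from the build): in OpenMP 5.1 these
// per-device ICVs are surfaced through omp_set_num_teams()/omp_get_max_teams()
// and omp_set_teams_thread_limit()/omp_get_teams_thread_limit(), which are
// expected to map onto the four functions above; OMP_NUM_TEAMS and
// OMP_TEAMS_THREAD_LIMIT set the same ICVs from the environment.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_num_teams(4);          // nteams-var
  omp_set_teams_thread_limit(2); // teams-thread-limit-var
#pragma omp teams
#pragma omp parallel
  if (omp_get_team_num() == 0 && omp_get_thread_num() == 0)
    printf("league of %d teams, thread limit %d\n", omp_get_num_teams(),
           omp_get_teams_thread_limit());
  return 0;
}
#endif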
2901 
2902 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2903 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2904 
2905 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2906 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2907  kmp_info_t *thread;
2908  kmp_sched_t orig_kind;
2909  // kmp_team_t *team;
2910 
2911  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2912  gtid, (int)kind, chunk));
2913  KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915  // Check if the kind parameter is valid, correct if needed.
2916  // Valid parameters should fit in one of two intervals - standard or extended:
2917  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2918  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2919  orig_kind = kind;
2920  kind = __kmp_sched_without_mods(kind);
2921 
2922  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2923  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2924  // TODO: Hint needs attention in case we change the default schedule.
2925  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2926  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2927  __kmp_msg_null);
2928  kind = kmp_sched_default;
2929  chunk = 0; // ignore chunk value in case of bad kind
2930  }
2931 
2932  thread = __kmp_threads[gtid];
2933 
2934  __kmp_save_internal_controls(thread);
2935 
2936  if (kind < kmp_sched_upper_std) {
2937  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2938  // differentiate static chunked vs. unchunked: chunk should be invalid to
2939  // indicate an unchunked schedule (which is the default)
2940  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2941  } else {
2942  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2943  __kmp_sch_map[kind - kmp_sched_lower - 1];
2944  }
2945  } else {
2946  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2947  // kmp_sched_lower - 2 ];
2948  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2949  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2950  kmp_sched_lower - 2];
2951  }
2952  __kmp_sched_apply_mods_intkind(
2953  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2954  if (kind == kmp_sched_auto || chunk < 1) {
2955  // ignore parameter chunk for schedule auto
2956  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2957  } else {
2958  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2959  }
2960 }
2961 
2962 /* Gets def_sched_var ICV values */
2963 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2964  kmp_info_t *thread;
2965  enum sched_type th_type;
2966 
2967  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2968  KMP_DEBUG_ASSERT(__kmp_init_serial);
2969 
2970  thread = __kmp_threads[gtid];
2971 
2972  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2973  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2974  case kmp_sch_static:
2975  case kmp_sch_static_greedy:
2976  case kmp_sch_static_balanced:
2977  *kind = kmp_sched_static;
2978  __kmp_sched_apply_mods_stdkind(kind, th_type);
2979  *chunk = 0; // chunk was not set, try to show this fact via zero value
2980  return;
2981  case kmp_sch_static_chunked:
2982  *kind = kmp_sched_static;
2983  break;
2984  case kmp_sch_dynamic_chunked:
2985  *kind = kmp_sched_dynamic;
2986  break;
2987  case kmp_sch_guided_chunked:
2988  case kmp_sch_guided_iterative_chunked:
2989  case kmp_sch_guided_analytical_chunked:
2990  *kind = kmp_sched_guided;
2991  break;
2992  case kmp_sch_auto:
2993  *kind = kmp_sched_auto;
2994  break;
2995  case kmp_sch_trapezoidal:
2996  *kind = kmp_sched_trapezoidal;
2997  break;
2998 #if KMP_STATIC_STEAL_ENABLED
2999  case kmp_sch_static_steal:
3000  *kind = kmp_sched_static_steal;
3001  break;
3002 #endif
3003  default:
3004  KMP_FATAL(UnknownSchedulingType, th_type);
3005  }
3006 
3007  __kmp_sched_apply_mods_stdkind(kind, th_type);
3008  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3009 }
3010 
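// A minimal usage sketch (not compiled): the user-facing omp_set_schedule()
// and omp_get_schedule() entry points are expected to funnel into the two
// routines above with the caller's gtid. The wrapper name below is
// hypothetical and only illustrates the intended round trip.
#if 0
static void example_schedule_round_trip(void) {
  int gtid = __kmp_entry_gtid();
  __kmp_set_schedule(gtid, kmp_sched_dynamic, 4); // run-time schedule (dynamic,4)
  kmp_sched_t kind;
  int chunk;
  __kmp_get_schedule(gtid, &kind, &chunk); // kind == kmp_sched_dynamic, chunk == 4
}
#endif
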
3011 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3012 
3013  int ii, dd;
3014  kmp_team_t *team;
3015  kmp_info_t *thr;
3016 
3017  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3018  KMP_DEBUG_ASSERT(__kmp_init_serial);
3019 
3020  // validate level
3021  if (level == 0)
3022  return 0;
3023  if (level < 0)
3024  return -1;
3025  thr = __kmp_threads[gtid];
3026  team = thr->th.th_team;
3027  ii = team->t.t_level;
3028  if (level > ii)
3029  return -1;
3030 
3031  if (thr->th.th_teams_microtask) {
3032  // AC: we are in a teams region where multiple nested teams have the same level
3033  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3034  if (level <=
3035  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3036  KMP_DEBUG_ASSERT(ii >= tlevel);
3037  // AC: As we need to pass by the teams league, we need to artificially
3038  // increase ii
3039  if (ii == tlevel) {
3040  ii += 2; // three teams have same level
3041  } else {
3042  ii++; // two teams have same level
3043  }
3044  }
3045  }
3046 
3047  if (ii == level)
3048  return __kmp_tid_from_gtid(gtid);
3049 
3050  dd = team->t.t_serialized;
3051  level++;
3052  while (ii > level) {
3053  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3054  }
3055  if ((team->t.t_serialized) && (!dd)) {
3056  team = team->t.t_parent;
3057  continue;
3058  }
3059  if (ii > level) {
3060  team = team->t.t_parent;
3061  dd = team->t.t_serialized;
3062  ii--;
3063  }
3064  }
3065 
3066  return (dd > 1) ? (0) : (team->t.t_master_tid);
3067 }
3068 
3069 int __kmp_get_team_size(int gtid, int level) {
3070 
3071  int ii, dd;
3072  kmp_team_t *team;
3073  kmp_info_t *thr;
3074 
3075  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3076  KMP_DEBUG_ASSERT(__kmp_init_serial);
3077 
3078  // validate level
3079  if (level == 0)
3080  return 1;
3081  if (level < 0)
3082  return -1;
3083  thr = __kmp_threads[gtid];
3084  team = thr->th.th_team;
3085  ii = team->t.t_level;
3086  if (level > ii)
3087  return -1;
3088 
3089  if (thr->th.th_teams_microtask) {
3090  // AC: we are in a teams region where multiple nested teams have the same level
3091  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3092  if (level <=
3093  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3094  KMP_DEBUG_ASSERT(ii >= tlevel);
3095  // AC: As we need to pass by the teams league, we need to artificially
3096  // increase ii
3097  if (ii == tlevel) {
3098  ii += 2; // three teams have same level
3099  } else {
3100  ii++; // two teams have same level
3101  }
3102  }
3103  }
3104 
3105  while (ii > level) {
3106  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3107  }
3108  if (team->t.t_serialized && (!dd)) {
3109  team = team->t.t_parent;
3110  continue;
3111  }
3112  if (ii > level) {
3113  team = team->t.t_parent;
3114  ii--;
3115  }
3116  }
3117 
3118  return team->t.t_nproc;
3119 }
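
// Illustrative sketch (not compiled): the two routines above back the OpenMP
// API calls omp_get_ancestor_thread_num() and omp_get_team_size(). The helper
// below is hypothetical; level 0 always reports the implicit initial team
// (thread 0, size 1), while the current nesting level reports the caller's
// own team.
#if 0
static void example_ancestor_queries(void) {
  int gtid = __kmp_entry_gtid();
  int cur_level = __kmp_threads[gtid]->th.th_team->t.t_level;
  int my_tid = __kmp_get_ancestor_thread_num(gtid, cur_level); // my tid
  int my_team_size = __kmp_get_team_size(gtid, cur_level); // my team's size
  int root_tid = __kmp_get_ancestor_thread_num(gtid, 0); // always 0
  int root_size = __kmp_get_team_size(gtid, 0); // always 1
}
#endif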
3120 
3121 kmp_r_sched_t __kmp_get_schedule_global() {
3122  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3123  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3124  // independently, so the updated schedule can be obtained here.
3125 
3126  kmp_r_sched_t r_sched;
3127 
3128  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3129  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3130  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3131  // different roots (even in OMP 2.5)
3132  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3133  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3134  if (s == kmp_sch_static) {
3135  // replace STATIC with more detailed schedule (balanced or greedy)
3136  r_sched.r_sched_type = __kmp_static;
3137  } else if (s == kmp_sch_guided_chunked) {
3138  // replace GUIDED with more detailed schedule (iterative or analytical)
3139  r_sched.r_sched_type = __kmp_guided;
3140  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3141  r_sched.r_sched_type = __kmp_sched;
3142  }
3143  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3144 
3145  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3146  // __kmp_chunk may be wrong here (if it was not ever set)
3147  r_sched.chunk = KMP_DEFAULT_CHUNK;
3148  } else {
3149  r_sched.chunk = __kmp_chunk;
3150  }
3151 
3152  return r_sched;
3153 }
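
// Hedged illustration: if the run-time schedule globals were set to guided
// with chunk 7 (e.g. via OMP_SCHEDULE="guided,7"), __kmp_sched would be
// kmp_sch_guided_chunked and __kmp_chunk would be 7, so the pair returned
// above becomes { __kmp_guided, 7 }; an unset chunk (< KMP_DEFAULT_CHUNK)
// falls back to KMP_DEFAULT_CHUNK.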
3154 
3155 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3156  at least argc *t_argv entries for the requested team. */
3157 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3158 
3159  KMP_DEBUG_ASSERT(team);
3160  if (!realloc || argc > team->t.t_max_argc) {
3161 
3162  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3163  "current entries=%d\n",
3164  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3165  /* if previously allocated heap space for args, free them */
3166  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3167  __kmp_free((void *)team->t.t_argv);
3168 
3169  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3170  /* use unused space in the cache line for arguments */
3171  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3172  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3173  "argv entries\n",
3174  team->t.t_id, team->t.t_max_argc));
3175  team->t.t_argv = &team->t.t_inline_argv[0];
3176  if (__kmp_storage_map) {
3177  __kmp_print_storage_map_gtid(
3178  -1, &team->t.t_inline_argv[0],
3179  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3180  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3181  team->t.t_id);
3182  }
3183  } else {
3184  /* allocate space for arguments in the heap */
3185  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3186  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3187  : 2 * argc;
3188  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3189  "argv entries\n",
3190  team->t.t_id, team->t.t_max_argc));
3191  team->t.t_argv =
3192  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3193  if (__kmp_storage_map) {
3194  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3195  &team->t.t_argv[team->t.t_max_argc],
3196  sizeof(void *) * team->t.t_max_argc,
3197  "team_%d.t_argv", team->t.t_id);
3198  }
3199  }
3200  }
3201 }
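
// Sizing note: small requests reuse the inline cache-line space; anything
// larger gets roughly max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) heap
// entries, so the argv array grows geometrically rather than being resized
// on every call with a slightly larger argc.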
3202 
3203 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3204  int i;
3205  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3206  team->t.t_threads =
3207  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3208  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3209  sizeof(dispatch_shared_info_t) * num_disp_buff);
3210  team->t.t_dispatch =
3211  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3212  team->t.t_implicit_task_taskdata =
3213  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3214  team->t.t_max_nproc = max_nth;
3215 
3216  /* setup dispatch buffers */
3217  for (i = 0; i < num_disp_buff; ++i) {
3218  team->t.t_disp_buffer[i].buffer_index = i;
3219  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3220  }
3221 }
3222 
3223 static void __kmp_free_team_arrays(kmp_team_t *team) {
3224  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3225  int i;
3226  for (i = 0; i < team->t.t_max_nproc; ++i) {
3227  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3228  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3229  team->t.t_dispatch[i].th_disp_buffer = NULL;
3230  }
3231  }
3232 #if KMP_USE_HIER_SCHED
3233  __kmp_dispatch_free_hierarchies(team);
3234 #endif
3235  __kmp_free(team->t.t_threads);
3236  __kmp_free(team->t.t_disp_buffer);
3237  __kmp_free(team->t.t_dispatch);
3238  __kmp_free(team->t.t_implicit_task_taskdata);
3239  team->t.t_threads = NULL;
3240  team->t.t_disp_buffer = NULL;
3241  team->t.t_dispatch = NULL;
3242  team->t.t_implicit_task_taskdata = 0;
3243 }
3244 
3245 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3246  kmp_info_t **oldThreads = team->t.t_threads;
3247 
3248  __kmp_free(team->t.t_disp_buffer);
3249  __kmp_free(team->t.t_dispatch);
3250  __kmp_free(team->t.t_implicit_task_taskdata);
3251  __kmp_allocate_team_arrays(team, max_nth);
3252 
3253  KMP_MEMCPY(team->t.t_threads, oldThreads,
3254  team->t.t_nproc * sizeof(kmp_info_t *));
3255 
3256  __kmp_free(oldThreads);
3257 }
3258 
3259 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3260 
3261  kmp_r_sched_t r_sched =
3262  __kmp_get_schedule_global(); // get current state of scheduling globals
3263 
3264  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3265 
3266  kmp_internal_control_t g_icvs = {
3267  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3268  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3269  // adjustment of threads (per thread)
3270  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3271  // whether blocktime is explicitly set
3272  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3273 #if KMP_USE_MONITOR
3274  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3275 // intervals
3276 #endif
3277  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3278  // next parallel region (per thread)
3279  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3280  __kmp_cg_max_nth, // int thread_limit;
3281  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3282  // for max_active_levels
3283  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3284  // {sched,chunk} pair
3285  __kmp_nested_proc_bind.bind_types[0],
3286  __kmp_default_device,
3287  NULL // struct kmp_internal_control *next;
3288  };
3289 
3290  return g_icvs;
3291 }
3292 
3293 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3294 
3295  kmp_internal_control_t gx_icvs;
3296  gx_icvs.serial_nesting_level =
3297  0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3298  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3299  gx_icvs.next = NULL;
3300 
3301  return gx_icvs;
3302 }
3303 
3304 static void __kmp_initialize_root(kmp_root_t *root) {
3305  int f;
3306  kmp_team_t *root_team;
3307  kmp_team_t *hot_team;
3308  int hot_team_max_nth;
3309  kmp_r_sched_t r_sched =
3310  __kmp_get_schedule_global(); // get current state of scheduling globals
3311  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3312  KMP_DEBUG_ASSERT(root);
3313  KMP_ASSERT(!root->r.r_begin);
3314 
3315  /* setup the root state structure */
3316  __kmp_init_lock(&root->r.r_begin_lock);
3317  root->r.r_begin = FALSE;
3318  root->r.r_active = FALSE;
3319  root->r.r_in_parallel = 0;
3320  root->r.r_blocktime = __kmp_dflt_blocktime;
3321 #if KMP_AFFINITY_SUPPORTED
3322  root->r.r_affinity_assigned = FALSE;
3323 #endif
3324 
3325  /* setup the root team for this task */
3326  /* allocate the root team structure */
3327  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3328 
3329  root_team =
3330  __kmp_allocate_team(root,
3331  1, // new_nproc
3332  1, // max_nproc
3333 #if OMPT_SUPPORT
3334  ompt_data_none, // root parallel id
3335 #endif
3336  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3337  0 // argc
3338  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3339  );
3340 #if USE_DEBUGGER
3341  // Non-NULL value should be assigned to make the debugger display the root
3342  // team.
3343  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3344 #endif
3345 
3346  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3347 
3348  root->r.r_root_team = root_team;
3349  root_team->t.t_control_stack_top = NULL;
3350 
3351  /* initialize root team */
3352  root_team->t.t_threads[0] = NULL;
3353  root_team->t.t_nproc = 1;
3354  root_team->t.t_serialized = 1;
3355  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3356  root_team->t.t_sched.sched = r_sched.sched;
3357  KA_TRACE(
3358  20,
3359  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3360  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3361 
3362  /* setup the hot team for this task */
3363  /* allocate the hot team structure */
3364  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3365 
3366  hot_team =
3367  __kmp_allocate_team(root,
3368  1, // new_nproc
3369  __kmp_dflt_team_nth_ub * 2, // max_nproc
3370 #if OMPT_SUPPORT
3371  ompt_data_none, // root parallel id
3372 #endif
3373  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3374  0 // argc
3375  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3376  );
3377  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3378 
3379  root->r.r_hot_team = hot_team;
3380  root_team->t.t_control_stack_top = NULL;
3381 
3382  /* first-time initialization */
3383  hot_team->t.t_parent = root_team;
3384 
3385  /* initialize hot team */
3386  hot_team_max_nth = hot_team->t.t_max_nproc;
3387  for (f = 0; f < hot_team_max_nth; ++f) {
3388  hot_team->t.t_threads[f] = NULL;
3389  }
3390  hot_team->t.t_nproc = 1;
3391  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392  hot_team->t.t_sched.sched = r_sched.sched;
3393  hot_team->t.t_size_changed = 0;
3394 }
3395 
3396 #ifdef KMP_DEBUG
3397 
3398 typedef struct kmp_team_list_item {
3399  kmp_team_p const *entry;
3400  struct kmp_team_list_item *next;
3401 } kmp_team_list_item_t;
3402 typedef kmp_team_list_item_t *kmp_team_list_t;
3403 
3404 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3405  kmp_team_list_t list, // List of teams.
3406  kmp_team_p const *team // Team to add.
3407 ) {
3408 
3409  // List must terminate with item where both entry and next are NULL.
3410  // Team is added to the list only once.
3411  // List is sorted in ascending order by team id.
3412  // Team id is *not* a key.
3413 
3414  kmp_team_list_t l;
3415 
3416  KMP_DEBUG_ASSERT(list != NULL);
3417  if (team == NULL) {
3418  return;
3419  }
3420 
3421  __kmp_print_structure_team_accum(list, team->t.t_parent);
3422  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3423 
3424  // Search list for the team.
3425  l = list;
3426  while (l->next != NULL && l->entry != team) {
3427  l = l->next;
3428  }
3429  if (l->next != NULL) {
3430  return; // Team has been added before, exit.
3431  }
3432 
3433  // Team is not found. Search list again for insertion point.
3434  l = list;
3435  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3436  l = l->next;
3437  }
3438 
3439  // Insert team.
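  // (The freshly allocated item takes over the contents of *l, and *l is then
  // rewritten in place to hold the new team; this inserts the team before
  // node l without needing a back pointer.)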
3440  {
3441  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3442  sizeof(kmp_team_list_item_t));
3443  *item = *l;
3444  l->entry = team;
3445  l->next = item;
3446  }
3447 }
3448 
3449 static void __kmp_print_structure_team(char const *title,
3450  kmp_team_p const *team) {
3451 
3452  __kmp_printf("%s", title);
3453  if (team != NULL) {
3454  __kmp_printf("%2x %p\n", team->t.t_id, team);
3455  } else {
3456  __kmp_printf(" - (nil)\n");
3457  }
3458 }
3459 
3460 static void __kmp_print_structure_thread(char const *title,
3461  kmp_info_p const *thread) {
3462  __kmp_printf("%s", title);
3463  if (thread != NULL) {
3464  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3465  } else {
3466  __kmp_printf(" - (nil)\n");
3467  }
3468 }
3469 
3470 void __kmp_print_structure(void) {
3471 
3472  kmp_team_list_t list;
3473 
3474  // Initialize list of teams.
3475  list =
3476  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3477  list->entry = NULL;
3478  list->next = NULL;
3479 
3480  __kmp_printf("\n------------------------------\nGlobal Thread "
3481  "Table\n------------------------------\n");
3482  {
3483  int gtid;
3484  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3485  __kmp_printf("%2d", gtid);
3486  if (__kmp_threads != NULL) {
3487  __kmp_printf(" %p", __kmp_threads[gtid]);
3488  }
3489  if (__kmp_root != NULL) {
3490  __kmp_printf(" %p", __kmp_root[gtid]);
3491  }
3492  __kmp_printf("\n");
3493  }
3494  }
3495 
3496  // Print out __kmp_threads array.
3497  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3498  "----------\n");
3499  if (__kmp_threads != NULL) {
3500  int gtid;
3501  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3502  kmp_info_t const *thread = __kmp_threads[gtid];
3503  if (thread != NULL) {
3504  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3505  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3506  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3507  __kmp_print_structure_team(" Serial Team: ",
3508  thread->th.th_serial_team);
3509  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3510  __kmp_print_structure_thread(" Primary: ",
3511  thread->th.th_team_master);
3512  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3513  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3514  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3515  __kmp_print_structure_thread(" Next in pool: ",
3516  thread->th.th_next_pool);
3517  __kmp_printf("\n");
3518  __kmp_print_structure_team_accum(list, thread->th.th_team);
3519  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3520  }
3521  }
3522  } else {
3523  __kmp_printf("Threads array is not allocated.\n");
3524  }
3525 
3526  // Print out __kmp_root array.
3527  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3528  "--------\n");
3529  if (__kmp_root != NULL) {
3530  int gtid;
3531  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3532  kmp_root_t const *root = __kmp_root[gtid];
3533  if (root != NULL) {
3534  __kmp_printf("GTID %2d %p:\n", gtid, root);
3535  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3536  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3537  __kmp_print_structure_thread(" Uber Thread: ",
3538  root->r.r_uber_thread);
3539  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3540  __kmp_printf(" In Parallel: %2d\n",
3541  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3542  __kmp_printf("\n");
3543  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3544  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3545  }
3546  }
3547  } else {
3548  __kmp_printf("Ubers array is not allocated.\n");
3549  }
3550 
3551  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3552  "--------\n");
3553  while (list->next != NULL) {
3554  kmp_team_p const *team = list->entry;
3555  int i;
3556  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3557  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3558  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3559  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3560  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3561  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3562  for (i = 0; i < team->t.t_nproc; ++i) {
3563  __kmp_printf(" Thread %2d: ", i);
3564  __kmp_print_structure_thread("", team->t.t_threads[i]);
3565  }
3566  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3567  __kmp_printf("\n");
3568  list = list->next;
3569  }
3570 
3571  // Print out __kmp_thread_pool and __kmp_team_pool.
3572  __kmp_printf("\n------------------------------\nPools\n----------------------"
3573  "--------\n");
3574  __kmp_print_structure_thread("Thread pool: ",
3575  CCAST(kmp_info_t *, __kmp_thread_pool));
3576  __kmp_print_structure_team("Team pool: ",
3577  CCAST(kmp_team_t *, __kmp_team_pool));
3578  __kmp_printf("\n");
3579 
3580  // Free team list.
3581  while (list != NULL) {
3582  kmp_team_list_item_t *item = list;
3583  list = list->next;
3584  KMP_INTERNAL_FREE(item);
3585  }
3586 }
3587 
3588 #endif
3589 
3590 //---------------------------------------------------------------------------
3591 // Stuff for per-thread fast random number generator
3592 // Table of primes
3593 static const unsigned __kmp_primes[] = {
3594  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3595  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3596  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3597  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3598  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3599  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3600  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3601  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3602  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3603  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3604  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3605 
3606 //---------------------------------------------------------------------------
3607 // __kmp_get_random: Get a random number using a linear congruential method.
3608 unsigned short __kmp_get_random(kmp_info_t *thread) {
3609  unsigned x = thread->th.th_x;
3610  unsigned short r = (unsigned short)(x >> 16);
3611 
3612  thread->th.th_x = x * thread->th.th_a + 1;
3613 
3614  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3615  thread->th.th_info.ds.ds_tid, r));
3616 
3617  return r;
3618 }
3619 //--------------------------------------------------------
3620 // __kmp_init_random: Initialize a random number generator
3621 void __kmp_init_random(kmp_info_t *thread) {
3622  unsigned seed = thread->th.th_info.ds.ds_tid;
3623 
3624  thread->th.th_a =
3625  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3626  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3627  KA_TRACE(30,
3628  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3629 }
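
// In other words, each thread runs its own linear congruential generator
// x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread multiplier 'a' drawn
// from __kmp_primes; __kmp_get_random() returns the upper 16 bits of the
// state, which are better distributed than the low-order bits of an LCG.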
3630 
3631 #if KMP_OS_WINDOWS
3632 /* reclaim array entries for root threads that are already dead, returns number
3633  * reclaimed */
3634 static int __kmp_reclaim_dead_roots(void) {
3635  int i, r = 0;
3636 
3637  for (i = 0; i < __kmp_threads_capacity; ++i) {
3638  if (KMP_UBER_GTID(i) &&
3639  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3640  !__kmp_root[i]
3641  ->r.r_active) { // AC: reclaim only roots died in non-active state
3642  r += __kmp_unregister_root_other_thread(i);
3643  }
3644  }
3645  return r;
3646 }
3647 #endif
3648 
3649 /* This function attempts to create free entries in __kmp_threads and
3650  __kmp_root, and returns the number of free entries generated.
3651 
3652  For Windows* OS static library, the first mechanism used is to reclaim array
3653  entries for root threads that are already dead.
3654 
3655  On all platforms, expansion is attempted on the arrays __kmp_threads and
3656  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3657  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3658  threadprivate cache array has been created. Synchronization with
3659  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3660 
3661  After any dead root reclamation, if the clipping value allows array expansion
3662  to result in the generation of a total of nNeed free slots, the function does
3663  that expansion. If not, nothing is done beyond the possible initial root
3664  thread reclamation.
3665 
3666  If any argument is negative, the behavior is undefined. */
3667 static int __kmp_expand_threads(int nNeed) {
3668  int added = 0;
3669  int minimumRequiredCapacity;
3670  int newCapacity;
3671  kmp_info_t **newThreads;
3672  kmp_root_t **newRoot;
3673 
3674  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3675  // resizing __kmp_threads does not need additional protection if foreign
3676  // threads are present
3677 
3678 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3679  /* only for Windows static library */
3680  /* reclaim array entries for root threads that are already dead */
3681  added = __kmp_reclaim_dead_roots();
3682 
3683  if (nNeed) {
3684  nNeed -= added;
3685  if (nNeed < 0)
3686  nNeed = 0;
3687  }
3688 #endif
3689  if (nNeed <= 0)
3690  return added;
3691 
3692  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3693  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3694  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3695  // > __kmp_max_nth in one of two ways:
3696  //
3697  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3698  // may not be reused by another thread, so we may need to increase
3699  // __kmp_threads_capacity to __kmp_max_nth + 1.
3700  //
3701  // 2) New foreign root(s) are encountered. We always register new foreign
3702  // roots. This may cause a smaller # of threads to be allocated at
3703  // subsequent parallel regions, but the worker threads hang around (and
3704  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3705  //
3706  // Anyway, that is the reason for moving the check to see if
3707  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3708  // instead of having it performed here. -BB
3709 
3710  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3711 
3712  /* compute expansion headroom to check if we can expand */
3713  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3714  /* possible expansion too small -- give up */
3715  return added;
3716  }
3717  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3718 
3719  newCapacity = __kmp_threads_capacity;
3720  do {
3721  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3722  : __kmp_sys_max_nth;
3723  } while (newCapacity < minimumRequiredCapacity);
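  // Worked example (hypothetical numbers): with __kmp_threads_capacity == 64
  // and nNeed == 100, minimumRequiredCapacity is 164 and the loop doubles
  // 64 -> 128 -> 256, clipping at __kmp_sys_max_nth if doubling would exceed
  // the system maximum.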
3724  newThreads = (kmp_info_t **)__kmp_allocate(
3725  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3726  newRoot =
3727  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3728  KMP_MEMCPY(newThreads, __kmp_threads,
3729  __kmp_threads_capacity * sizeof(kmp_info_t *));
3730  KMP_MEMCPY(newRoot, __kmp_root,
3731  __kmp_threads_capacity * sizeof(kmp_root_t *));
3732  // Put the old __kmp_threads array on a list. Any ongoing references to the
3733  // old array remain valid. This list is cleaned up at library shutdown.
3734  kmp_old_threads_list_t *node =
3735  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3736  node->threads = __kmp_threads;
3737  node->next = __kmp_old_threads_list;
3738  __kmp_old_threads_list = node;
3739 
3740  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3741  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3742  added += newCapacity - __kmp_threads_capacity;
3743  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3744 
3745  if (newCapacity > __kmp_tp_capacity) {
3746  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3747  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3748  __kmp_threadprivate_resize_cache(newCapacity);
3749  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3750  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3751  }
3752  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3753  }
3754 
3755  return added;
3756 }
3757 
3758 /* Register the current thread as a root thread and obtain our gtid. We must
3759  have the __kmp_initz_lock held at this point. The argument is TRUE only if
3760  we are the thread that calls from __kmp_do_serial_initialize() */
3761 int __kmp_register_root(int initial_thread) {
3762  kmp_info_t *root_thread;
3763  kmp_root_t *root;
3764  int gtid;
3765  int capacity;
3766  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3767  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3768  KMP_MB();
3769 
3770  /* 2007-03-02:
3771  If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3772  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3773  does not work as expected -- it may return false (meaning there is at least
3774  one empty slot in the __kmp_threads array), but it is possible the only free
3775  slot is #0, which is reserved for the initial thread and so cannot be used
3776  for this one. The following code works around this bug.
3777 
3778  However, the right solution seems to be not to reserve slot #0 for the
3779  initial thread, because:
3780  (1) there is no magic in slot #0,
3781  (2) we cannot detect the initial thread reliably (the first thread that does
3782  serial initialization may not be a real initial thread).
3783  */
3784  capacity = __kmp_threads_capacity;
3785  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3786  --capacity;
3787  }
3788 
3789  // If it is not for initializing the hidden helper team, we need to take
3790  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3791  // in __kmp_threads_capacity.
3792  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3793  capacity -= __kmp_hidden_helper_threads_num;
3794  }
3795 
3796  /* see if there are too many threads */
3797  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3798  if (__kmp_tp_cached) {
3799  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3800  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3801  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3802  } else {
3803  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3804  __kmp_msg_null);
3805  }
3806  }
3807 
3808  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3809  // 0: initial thread, also a regular OpenMP thread.
3810  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3811  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3812  // regular OpenMP threads.
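  // For example, with a hypothetical __kmp_hidden_helper_threads_num of 8,
  // gtids 1..8 are reserved for hidden helper threads and the search for a
  // regular root below starts at gtid 9.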
3813  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3814  // Find an available thread slot for hidden helper thread. Slots for hidden
3815  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3816  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3817  gtid <= __kmp_hidden_helper_threads_num;
3818  gtid++)
3819  ;
3820  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3821  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3822  "hidden helper thread: T#%d\n",
3823  gtid));
3824  } else {
3825  /* find an available thread slot */
3826  // Don't reassign the zero slot since we need that to be used only by the
3827  // initial thread. Slots for hidden helper threads should also be skipped.
3828  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3829  gtid = 0;
3830  } else {
3831  for (gtid = __kmp_hidden_helper_threads_num + 1;
3832  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3833  ;
3834  }
3835  KA_TRACE(
3836  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3837  KMP_ASSERT(gtid < __kmp_threads_capacity);
3838  }
3839 
3840  /* update global accounting */
3841  __kmp_all_nth++;
3842  TCW_4(__kmp_nth, __kmp_nth + 1);
3843 
3844  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3845  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3846  if (__kmp_adjust_gtid_mode) {
3847  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3848  if (TCR_4(__kmp_gtid_mode) != 2) {
3849  TCW_4(__kmp_gtid_mode, 2);
3850  }
3851  } else {
3852  if (TCR_4(__kmp_gtid_mode) != 1) {
3853  TCW_4(__kmp_gtid_mode, 1);
3854  }
3855  }
3856  }
3857 
3858 #ifdef KMP_ADJUST_BLOCKTIME
3859  /* Adjust blocktime to zero if necessary */
3860  /* Middle initialization might not have occurred yet */
3861  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3862  if (__kmp_nth > __kmp_avail_proc) {
3863  __kmp_zero_bt = TRUE;
3864  }
3865  }
3866 #endif /* KMP_ADJUST_BLOCKTIME */
3867 
3868  /* setup this new hierarchy */
3869  if (!(root = __kmp_root[gtid])) {
3870  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3871  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3872  }
3873 
3874 #if KMP_STATS_ENABLED
3875  // Initialize stats as soon as possible (right after gtid assignment).
3876  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3877  __kmp_stats_thread_ptr->startLife();
3878  KMP_SET_THREAD_STATE(SERIAL_REGION);
3879  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3880 #endif
3881  __kmp_initialize_root(root);
3882 
3883  /* setup new root thread structure */
3884  if (root->r.r_uber_thread) {
3885  root_thread = root->r.r_uber_thread;
3886  } else {
3887  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3888  if (__kmp_storage_map) {
3889  __kmp_print_thread_storage_map(root_thread, gtid);
3890  }
3891  root_thread->th.th_info.ds.ds_gtid = gtid;
3892 #if OMPT_SUPPORT
3893  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3894 #endif
3895  root_thread->th.th_root = root;
3896  if (__kmp_env_consistency_check) {
3897  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3898  }
3899 #if USE_FAST_MEMORY
3900  __kmp_initialize_fast_memory(root_thread);
3901 #endif /* USE_FAST_MEMORY */
3902 
3903 #if KMP_USE_BGET
3904  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3905  __kmp_initialize_bget(root_thread);
3906 #endif
3907  __kmp_init_random(root_thread); // Initialize random number generator
3908  }
3909 
3910  /* setup the serial team held in reserve by the root thread */
3911  if (!root_thread->th.th_serial_team) {
3912  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3913  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3914  root_thread->th.th_serial_team = __kmp_allocate_team(
3915  root, 1, 1,
3916 #if OMPT_SUPPORT
3917  ompt_data_none, // root parallel id
3918 #endif
3919  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3920  }
3921  KMP_ASSERT(root_thread->th.th_serial_team);
3922  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3923  root_thread->th.th_serial_team));
3924 
3925  /* drop root_thread into place */
3926  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3927 
3928  root->r.r_root_team->t.t_threads[0] = root_thread;
3929  root->r.r_hot_team->t.t_threads[0] = root_thread;
3930  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3931  // AC: the team is created in reserve, not for execution (it is unused for now).
3932  root_thread->th.th_serial_team->t.t_serialized = 0;
3933  root->r.r_uber_thread = root_thread;
3934 
3935  /* initialize the thread, get it ready to go */
3936  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3937  TCW_4(__kmp_init_gtid, TRUE);
3938 
3939  /* prepare the primary thread for get_gtid() */
3940  __kmp_gtid_set_specific(gtid);
3941 
3942 #if USE_ITT_BUILD
3943  __kmp_itt_thread_name(gtid);
3944 #endif /* USE_ITT_BUILD */
3945 
3946 #ifdef KMP_TDATA_GTID
3947  __kmp_gtid = gtid;
3948 #endif
3949  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3950  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3951 
3952  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3953  "plain=%u\n",
3954  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3955  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3956  KMP_INIT_BARRIER_STATE));
3957  { // Initialize barrier data.
3958  int b;
3959  for (b = 0; b < bs_last_barrier; ++b) {
3960  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3961 #if USE_DEBUGGER
3962  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3963 #endif
3964  }
3965  }
3966  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3967  KMP_INIT_BARRIER_STATE);
3968 
3969 #if KMP_AFFINITY_SUPPORTED
3970  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3971  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3972  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3973  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3974 #endif /* KMP_AFFINITY_SUPPORTED */
3975  root_thread->th.th_def_allocator = __kmp_def_allocator;
3976  root_thread->th.th_prev_level = 0;
3977  root_thread->th.th_prev_num_threads = 1;
3978 
3979  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3980  tmp->cg_root = root_thread;
3981  tmp->cg_thread_limit = __kmp_cg_max_nth;
3982  tmp->cg_nthreads = 1;
3983  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3984  " cg_nthreads init to 1\n",
3985  root_thread, tmp));
3986  tmp->up = NULL;
3987  root_thread->th.th_cg_roots = tmp;
3988 
3989  __kmp_root_counter++;
3990 
3991 #if OMPT_SUPPORT
3992  if (!initial_thread && ompt_enabled.enabled) {
3993 
3994  kmp_info_t *root_thread = ompt_get_thread();
3995 
3996  ompt_set_thread_state(root_thread, ompt_state_overhead);
3997 
3998  if (ompt_enabled.ompt_callback_thread_begin) {
3999  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4000  ompt_thread_initial, __ompt_get_thread_data_internal());
4001  }
4002  ompt_data_t *task_data;
4003  ompt_data_t *parallel_data;
4004  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4005  NULL);
4006  if (ompt_enabled.ompt_callback_implicit_task) {
4007  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4008  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4009  }
4010 
4011  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4012  }
4013 #endif
4014 #if OMPD_SUPPORT
4015  if (ompd_state & OMPD_ENABLE_BP)
4016  ompd_bp_thread_begin();
4017 #endif
4018 
4019  KMP_MB();
4020  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4021 
4022  return gtid;
4023 }
4024 
4025 #if KMP_NESTED_HOT_TEAMS
4026 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4027  const int max_level) {
4028  int i, n, nth;
4029  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4030  if (!hot_teams || !hot_teams[level].hot_team) {
4031  return 0;
4032  }
4033  KMP_DEBUG_ASSERT(level < max_level);
4034  kmp_team_t *team = hot_teams[level].hot_team;
4035  nth = hot_teams[level].hot_team_nth;
4036  n = nth - 1; // primary thread is not freed
4037  if (level < max_level - 1) {
4038  for (i = 0; i < nth; ++i) {
4039  kmp_info_t *th = team->t.t_threads[i];
4040  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4041  if (i > 0 && th->th.th_hot_teams) {
4042  __kmp_free(th->th.th_hot_teams);
4043  th->th.th_hot_teams = NULL;
4044  }
4045  }
4046  }
4047  __kmp_free_team(root, team, NULL);
4048  return n;
4049 }
4050 #endif
4051 
4052 // Resets a root thread and clears its root and hot teams.
4053 // Returns the number of __kmp_threads entries directly and indirectly freed.
4054 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4055  kmp_team_t *root_team = root->r.r_root_team;
4056  kmp_team_t *hot_team = root->r.r_hot_team;
4057  int n = hot_team->t.t_nproc;
4058  int i;
4059 
4060  KMP_DEBUG_ASSERT(!root->r.r_active);
4061 
4062  root->r.r_root_team = NULL;
4063  root->r.r_hot_team = NULL;
4064  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4065  // before call to __kmp_free_team().
4066  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4067 #if KMP_NESTED_HOT_TEAMS
4068  if (__kmp_hot_teams_max_level >
4069  0) { // need to free nested hot teams and their threads if any
4070  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4071  kmp_info_t *th = hot_team->t.t_threads[i];
4072  if (__kmp_hot_teams_max_level > 1) {
4073  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4074  }
4075  if (th->th.th_hot_teams) {
4076  __kmp_free(th->th.th_hot_teams);
4077  th->th.th_hot_teams = NULL;
4078  }
4079  }
4080  }
4081 #endif
4082  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4083 
4084  // Before we can reap the thread, we need to make certain that all other
4085  // threads in the teams that had this root as ancestor have stopped trying to
4086  // steal tasks.
4087  if (__kmp_tasking_mode != tskm_immediate_exec) {
4088  __kmp_wait_to_unref_task_teams();
4089  }
4090 
4091 #if KMP_OS_WINDOWS
4092  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4093  KA_TRACE(
4094  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4095  "\n",
4096  (LPVOID) & (root->r.r_uber_thread->th),
4097  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4098  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4099 #endif /* KMP_OS_WINDOWS */
4100 
4101 #if OMPD_SUPPORT
4102  if (ompd_state & OMPD_ENABLE_BP)
4103  ompd_bp_thread_end();
4104 #endif
4105 
4106 #if OMPT_SUPPORT
4107  ompt_data_t *task_data;
4108  ompt_data_t *parallel_data;
4109  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4110  NULL);
4111  if (ompt_enabled.ompt_callback_implicit_task) {
4112  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4113  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4114  }
4115  if (ompt_enabled.ompt_callback_thread_end) {
4116  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4117  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4118  }
4119 #endif
4120 
4121  TCW_4(__kmp_nth,
4122  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4123  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4124  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4125  " to %d\n",
4126  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4127  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4128  if (i == 1) {
4129  // need to free contention group structure
4130  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4131  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4132  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4133  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4134  root->r.r_uber_thread->th.th_cg_roots = NULL;
4135  }
4136  __kmp_reap_thread(root->r.r_uber_thread, 1);
4137 
4138  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4139  // it instead of freeing it.
4140  root->r.r_uber_thread = NULL;
4141  /* mark root as no longer in use */
4142  root->r.r_begin = FALSE;
4143 
4144  return n;
4145 }
4146 
4147 void __kmp_unregister_root_current_thread(int gtid) {
4148  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4149  /* this lock should be ok, since unregister_root_current_thread is never
4150  called during an abort, only during a normal close. furthermore, if you
4151  have the forkjoin lock, you should never try to get the initz lock */
4152  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4153  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4154  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4155  "exiting T#%d\n",
4156  gtid));
4157  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4158  return;
4159  }
4160  kmp_root_t *root = __kmp_root[gtid];
4161 
4162  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4163  KMP_ASSERT(KMP_UBER_GTID(gtid));
4164  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4165  KMP_ASSERT(root->r.r_active == FALSE);
4166 
4167  KMP_MB();
4168 
4169  kmp_info_t *thread = __kmp_threads[gtid];
4170  kmp_team_t *team = thread->th.th_team;
4171  kmp_task_team_t *task_team = thread->th.th_task_team;
4172 
4173  // we need to wait for the proxy tasks before finishing the thread
4174  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4175  task_team->tt.tt_hidden_helper_task_encountered)) {
4176 #if OMPT_SUPPORT
4177  // the runtime is shutting down so we won't report any events
4178  thread->th.ompt_thread_info.state = ompt_state_undefined;
4179 #endif
4180  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4181  }
4182 
4183  __kmp_reset_root(gtid, root);
4184 
4185  KMP_MB();
4186  KC_TRACE(10,
4187  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4188 
4189  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4190 }
4191 
4192 #if KMP_OS_WINDOWS
4193 /* __kmp_forkjoin_lock must be already held
4194  Unregisters a root thread that is not the current thread. Returns the number
4195  of __kmp_threads entries freed as a result. */
4196 static int __kmp_unregister_root_other_thread(int gtid) {
4197  kmp_root_t *root = __kmp_root[gtid];
4198  int r;
4199 
4200  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4201  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4202  KMP_ASSERT(KMP_UBER_GTID(gtid));
4203  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4204  KMP_ASSERT(root->r.r_active == FALSE);
4205 
4206  r = __kmp_reset_root(gtid, root);
4207  KC_TRACE(10,
4208  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4209  return r;
4210 }
4211 #endif
4212 
4213 #if KMP_DEBUG
4214 void __kmp_task_info() {
4215 
4216  kmp_int32 gtid = __kmp_entry_gtid();
4217  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4218  kmp_info_t *this_thr = __kmp_threads[gtid];
4219  kmp_team_t *steam = this_thr->th.th_serial_team;
4220  kmp_team_t *team = this_thr->th.th_team;
4221 
4222  __kmp_printf(
4223  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4224  "ptask=%p\n",
4225  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4226  team->t.t_implicit_task_taskdata[tid].td_parent);
4227 }
4228 #endif // KMP_DEBUG
4229 
4230 /* TODO optimize with one big memclr, take out what isn't needed, split
4231  responsibility to workers as much as possible, and delay initialization of
4232  features as much as possible */
4233 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4234  int tid, int gtid) {
4235  /* this_thr->th.th_info.ds.ds_gtid is setup in
4236  kmp_allocate_thread/create_worker.
4237  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4238  KMP_DEBUG_ASSERT(this_thr != NULL);
4239  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4240  KMP_DEBUG_ASSERT(team);
4241  KMP_DEBUG_ASSERT(team->t.t_threads);
4242  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4243  kmp_info_t *master = team->t.t_threads[0];
4244  KMP_DEBUG_ASSERT(master);
4245  KMP_DEBUG_ASSERT(master->th.th_root);
4246 
4247  KMP_MB();
4248 
4249  TCW_SYNC_PTR(this_thr->th.th_team, team);
4250 
4251  this_thr->th.th_info.ds.ds_tid = tid;
4252  this_thr->th.th_set_nproc = 0;
4253  if (__kmp_tasking_mode != tskm_immediate_exec)
4254  // When tasking is possible, threads are not safe to reap until they are
4255  // done tasking; this will be set when tasking code is exited in wait
4256  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4257  else // no tasking --> always safe to reap
4258  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4259  this_thr->th.th_set_proc_bind = proc_bind_default;
4260 #if KMP_AFFINITY_SUPPORTED
4261  this_thr->th.th_new_place = this_thr->th.th_current_place;
4262 #endif
4263  this_thr->th.th_root = master->th.th_root;
4264 
4265  /* setup the thread's cache of the team structure */
4266  this_thr->th.th_team_nproc = team->t.t_nproc;
4267  this_thr->th.th_team_master = master;
4268  this_thr->th.th_team_serialized = team->t.t_serialized;
4269 
4270  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4271 
4272  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4273  tid, gtid, this_thr, this_thr->th.th_current_task));
4274 
4275  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4276  team, tid, TRUE);
4277 
4278  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4279  tid, gtid, this_thr, this_thr->th.th_current_task));
4280  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4281  // __kmp_initialize_team()?
4282 
4283  /* TODO no worksharing in speculative threads */
4284  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4285 
4286  this_thr->th.th_local.this_construct = 0;
4287 
4288  if (!this_thr->th.th_pri_common) {
4289  this_thr->th.th_pri_common =
4290  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4291  if (__kmp_storage_map) {
4292  __kmp_print_storage_map_gtid(
4293  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4294  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4295  }
4296  this_thr->th.th_pri_head = NULL;
4297  }
4298 
4299  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4300  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4301  // Make new thread's CG root same as primary thread's
4302  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4303  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4304  if (tmp) {
4305  // worker changes CG, need to check if old CG should be freed
4306  int i = tmp->cg_nthreads--;
4307  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4308  " on node %p of thread %p to %d\n",
4309  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4310  if (i == 1) {
4311  __kmp_free(tmp); // last thread left CG --> free it
4312  }
4313  }
4314  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4315  // Increment new thread's CG root's counter to add the new thread
4316  this_thr->th.th_cg_roots->cg_nthreads++;
4317  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4318  " node %p of thread %p to %d\n",
4319  this_thr, this_thr->th.th_cg_roots,
4320  this_thr->th.th_cg_roots->cg_root,
4321  this_thr->th.th_cg_roots->cg_nthreads));
4322  this_thr->th.th_current_task->td_icvs.thread_limit =
4323  this_thr->th.th_cg_roots->cg_thread_limit;
4324  }
4325 
4326  /* Initialize dynamic dispatch */
4327  {
4328  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4329  // Use team max_nproc since this will never change for the team.
4330  size_t disp_size =
4331  sizeof(dispatch_private_info_t) *
4332  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4333  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4334  team->t.t_max_nproc));
4335  KMP_ASSERT(dispatch);
4336  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4337  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4338 
4339  dispatch->th_disp_index = 0;
4340  dispatch->th_doacross_buf_idx = 0;
4341  if (!dispatch->th_disp_buffer) {
4342  dispatch->th_disp_buffer =
4343  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4344 
4345  if (__kmp_storage_map) {
4346  __kmp_print_storage_map_gtid(
4347  gtid, &dispatch->th_disp_buffer[0],
4348  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4349  ? 1
4350  : __kmp_dispatch_num_buffers],
4351  disp_size,
4352  "th_%d.th_dispatch.th_disp_buffer "
4353  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4354  gtid, team->t.t_id, gtid);
4355  }
4356  } else {
4357  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4358  }
4359 
4360  dispatch->th_dispatch_pr_current = 0;
4361  dispatch->th_dispatch_sh_current = 0;
4362 
4363  dispatch->th_deo_fcn = 0; /* ORDERED */
4364  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4365  }
4366 
4367  this_thr->th.th_next_pool = NULL;
4368 
4369  if (!this_thr->th.th_task_state_memo_stack) {
4370  size_t i;
4371  this_thr->th.th_task_state_memo_stack =
4372  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4373  this_thr->th.th_task_state_top = 0;
4374  this_thr->th.th_task_state_stack_sz = 4;
4375  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4376  ++i) // zero init the stack
4377  this_thr->th.th_task_state_memo_stack[i] = 0;
4378  }
4379 
4380  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4381  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4382 
4383  KMP_MB();
4384 }
4385 
4386 /* allocate a new thread for the requesting team. this is only called from
4387  within a forkjoin critical section. we will first try to get an available
4388  thread from the thread pool. if none is available, we will fork a new one
4389  assuming we are able to create a new one. this should be assured, as the
4390  caller should check on this first. */
4391 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4392  int new_tid) {
4393  kmp_team_t *serial_team;
4394  kmp_info_t *new_thr;
4395  int new_gtid;
4396 
4397  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4398  KMP_DEBUG_ASSERT(root && team);
4399 #if !KMP_NESTED_HOT_TEAMS
4400  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4401 #endif
4402  KMP_MB();
4403 
4404  /* first, try to get one from the thread pool */
4405  if (__kmp_thread_pool) {
4406  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4407  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4408  if (new_thr == __kmp_thread_pool_insert_pt) {
4409  __kmp_thread_pool_insert_pt = NULL;
4410  }
4411  TCW_4(new_thr->th.th_in_pool, FALSE);
4412  __kmp_suspend_initialize_thread(new_thr);
4413  __kmp_lock_suspend_mx(new_thr);
4414  if (new_thr->th.th_active_in_pool == TRUE) {
4415  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4416  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4417  new_thr->th.th_active_in_pool = FALSE;
4418  }
4419  __kmp_unlock_suspend_mx(new_thr);
4420 
4421  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4422  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4423  KMP_ASSERT(!new_thr->th.th_team);
4424  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4425 
4426  /* setup the thread structure */
4427  __kmp_initialize_info(new_thr, team, new_tid,
4428  new_thr->th.th_info.ds.ds_gtid);
4429  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4430 
4431  TCW_4(__kmp_nth, __kmp_nth + 1);
4432 
4433  new_thr->th.th_task_state = 0;
4434  new_thr->th.th_task_state_top = 0;
4435  new_thr->th.th_task_state_stack_sz = 4;
4436 
4437  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4438  // Make sure pool thread has transitioned to waiting on own thread struct
4439  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4440  // Thread activated in __kmp_allocate_team when increasing team size
4441  }
4442 
4443 #ifdef KMP_ADJUST_BLOCKTIME
4444  /* Adjust blocktime back to zero if necessary */
4445  /* Middle initialization might not have occurred yet */
4446  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4447  if (__kmp_nth > __kmp_avail_proc) {
4448  __kmp_zero_bt = TRUE;
4449  }
4450  }
4451 #endif /* KMP_ADJUST_BLOCKTIME */
4452 
4453 #if KMP_DEBUG
4454  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4455  // not equal KMP_BARRIER_PARENT_FLAG.
4456  int b;
4457  kmp_balign_t *balign = new_thr->th.th_bar;
4458  for (b = 0; b < bs_last_barrier; ++b)
4459  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4460 #endif
4461 
4462  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4463  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4464 
4465  KMP_MB();
4466  return new_thr;
4467  }
4468 
4469  /* no, we'll fork a new one */
4470  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4471  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4472 
4473 #if KMP_USE_MONITOR
4474  // If this is the first worker thread the RTL is creating, then also
4475  // launch the monitor thread. We try to do this as early as possible.
4476  if (!TCR_4(__kmp_init_monitor)) {
4477  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4478  if (!TCR_4(__kmp_init_monitor)) {
4479  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4480  TCW_4(__kmp_init_monitor, 1);
4481  __kmp_create_monitor(&__kmp_monitor);
4482  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4483 #if KMP_OS_WINDOWS
4484  // AC: wait until monitor has started. This is a fix for CQ232808.
4485  // The reason is that if the library is loaded/unloaded in a loop with
4486  // small (parallel) work in between, then there is a high probability that
4487  // the monitor thread starts after the library shutdown. At shutdown it is
4488  // too late to cope with the problem, because when the primary thread is
4489  // in DllMain (process detach) the monitor has no chance to start (it is
4490  // blocked), and the primary thread has no means to inform the monitor that
4491  // the library has gone, because all the memory which the monitor can
4492  // access is going to be released/reset.
4493  while (TCR_4(__kmp_init_monitor) < 2) {
4494  KMP_YIELD(TRUE);
4495  }
4496  KF_TRACE(10, ("after monitor thread has started\n"));
4497 #endif
4498  }
4499  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4500  }
4501 #endif
4502 
4503  KMP_MB();
4504 
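  // Find the lowest unused gtid for the new thread. Hidden helper threads
  // occupy gtids 1..__kmp_hidden_helper_threads_num, so the search for a
  // regular worker starts just past them, unless we are currently creating
  // the hidden helper threads themselves (then the search starts at gtid 1).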
4505  {
4506  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4507  ? 1
4508  : __kmp_hidden_helper_threads_num + 1;
4509 
4510  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4511  ++new_gtid) {
4512  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4513  }
4514 
4515  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4516  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4517  }
4518  }
4519 
4520  /* allocate space for it. */
4521  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4522 
4523  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4524 
4525 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4526  // suppress race condition detection on synchronization flags in debug mode;
4527  // this helps analyze library internals by eliminating false positives
4528  __itt_suppress_mark_range(
4529  __itt_suppress_range, __itt_suppress_threading_errors,
4530  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4531  __itt_suppress_mark_range(
4532  __itt_suppress_range, __itt_suppress_threading_errors,
4533  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4534 #if KMP_OS_WINDOWS
4535  __itt_suppress_mark_range(
4536  __itt_suppress_range, __itt_suppress_threading_errors,
4537  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4538 #else
4539  __itt_suppress_mark_range(__itt_suppress_range,
4540  __itt_suppress_threading_errors,
4541  &new_thr->th.th_suspend_init_count,
4542  sizeof(new_thr->th.th_suspend_init_count));
4543 #endif
4544  // TODO: check if we need to also suppress b_arrived flags
4545  __itt_suppress_mark_range(__itt_suppress_range,
4546  __itt_suppress_threading_errors,
4547  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4548  sizeof(new_thr->th.th_bar[0].bb.b_go));
4549  __itt_suppress_mark_range(__itt_suppress_range,
4550  __itt_suppress_threading_errors,
4551  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4552  sizeof(new_thr->th.th_bar[1].bb.b_go));
4553  __itt_suppress_mark_range(__itt_suppress_range,
4554  __itt_suppress_threading_errors,
4555  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4556  sizeof(new_thr->th.th_bar[2].bb.b_go));
4557 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4558  if (__kmp_storage_map) {
4559  __kmp_print_thread_storage_map(new_thr, new_gtid);
4560  }
4561 
4562  // add the reserve serialized team, initialized from the team's primary thread
4563  {
4564  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4565  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4566  new_thr->th.th_serial_team = serial_team =
4567  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4568 #if OMPT_SUPPORT
4569  ompt_data_none, // root parallel id
4570 #endif
4571  proc_bind_default, &r_icvs,
4572  0 USE_NESTED_HOT_ARG(NULL));
4573  }
4574  KMP_ASSERT(serial_team);
4575  serial_team->t.t_serialized = 0; // AC: the team is created in reserve,
4576  // not for execution (it is unused for now).
4577  serial_team->t.t_threads[0] = new_thr;
4578  KF_TRACE(10,
4579  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4580  new_thr));
4581 
4582  /* setup the thread structures */
4583  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4584 
4585 #if USE_FAST_MEMORY
4586  __kmp_initialize_fast_memory(new_thr);
4587 #endif /* USE_FAST_MEMORY */
4588 
4589 #if KMP_USE_BGET
4590  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4591  __kmp_initialize_bget(new_thr);
4592 #endif
4593 
4594  __kmp_init_random(new_thr); // Initialize random number generator
4595 
4596  /* Initialize these only once when thread is grabbed for a team allocation */
4597  KA_TRACE(20,
4598  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4599  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4600 
4601  int b;
4602  kmp_balign_t *balign = new_thr->th.th_bar;
4603  for (b = 0; b < bs_last_barrier; ++b) {
4604  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4605  balign[b].bb.team = NULL;
4606  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4607  balign[b].bb.use_oncore_barrier = 0;
4608  }
4609 
4610  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4611  new_thr->th.th_sleep_loc_type = flag_unset;
4612 
4613  new_thr->th.th_spin_here = FALSE;
4614  new_thr->th.th_next_waiting = 0;
4615 #if KMP_OS_UNIX
4616  new_thr->th.th_blocking = false;
4617 #endif
4618 
4619 #if KMP_AFFINITY_SUPPORTED
4620  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4621  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4622  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4623  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4624 #endif
4625  new_thr->th.th_def_allocator = __kmp_def_allocator;
4626  new_thr->th.th_prev_level = 0;
4627  new_thr->th.th_prev_num_threads = 1;
4628 
4629  TCW_4(new_thr->th.th_in_pool, FALSE);
4630  new_thr->th.th_active_in_pool = FALSE;
4631  TCW_4(new_thr->th.th_active, TRUE);
4632 
4633  /* adjust the global counters */
4634  __kmp_all_nth++;
4635  __kmp_nth++;
4636 
4637  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4638  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4639  if (__kmp_adjust_gtid_mode) {
4640  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4641  if (TCR_4(__kmp_gtid_mode) != 2) {
4642  TCW_4(__kmp_gtid_mode, 2);
4643  }
4644  } else {
4645  if (TCR_4(__kmp_gtid_mode) != 1) {
4646  TCW_4(__kmp_gtid_mode, 1);
4647  }
4648  }
4649  }
4650 
4651 #ifdef KMP_ADJUST_BLOCKTIME
4652  /* Adjust blocktime back to zero if necessary */
4653  /* Middle initialization might not have occurred yet */
4654  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4655  if (__kmp_nth > __kmp_avail_proc) {
4656  __kmp_zero_bt = TRUE;
4657  }
4658  }
4659 #endif /* KMP_ADJUST_BLOCKTIME */
4660 
4661  /* actually fork it and create the new worker thread */
4662  KF_TRACE(
4663  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4664  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4665  KF_TRACE(10,
4666  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4667 
4668  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4669  new_gtid));
4670  KMP_MB();
4671  return new_thr;
4672 }
4673 
4674 /* Reinitialize team for reuse.
4675  The hot team code calls this routine at every fork barrier, so the EPCC
4676  barrier tests are extremely sensitive to changes in it, esp. writes to the
4677  team struct, which cause cache invalidations in all threads.
4678  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4679 static void __kmp_reinitialize_team(kmp_team_t *team,
4680  kmp_internal_control_t *new_icvs,
4681  ident_t *loc) {
4682  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4683  team->t.t_threads[0], team));
4684  KMP_DEBUG_ASSERT(team && new_icvs);
4685  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4686  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4687 
4688  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4689  // Copy ICVs to the primary thread's implicit taskdata
4690  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4691  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4692 
4693  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4694  team->t.t_threads[0], team));
4695 }
4696 
4697 /* Initialize the team data structure.
4698  This assumes the t_threads and t_max_nproc are already set.
4699  Also, we don't touch the arguments */
4700 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4701  kmp_internal_control_t *new_icvs,
4702  ident_t *loc) {
4703  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4704 
4705  /* verify */
4706  KMP_DEBUG_ASSERT(team);
4707  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4708  KMP_DEBUG_ASSERT(team->t.t_threads);
4709  KMP_MB();
4710 
4711  team->t.t_master_tid = 0; /* not needed */
4712  /* team->t.t_master_bar; not needed */
4713  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4714  team->t.t_nproc = new_nproc;
4715 
4716  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4717  team->t.t_next_pool = NULL;
4718  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4719  * up hot team */
4720 
4721  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4722  team->t.t_invoke = NULL; /* not needed */
4723 
4724  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4725  team->t.t_sched.sched = new_icvs->sched.sched;
4726 
4727 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4728  team->t.t_fp_control_saved = FALSE; /* not needed */
4729  team->t.t_x87_fpu_control_word = 0; /* not needed */
4730  team->t.t_mxcsr = 0; /* not needed */
4731 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4732 
4733  team->t.t_construct = 0;
4734 
4735  team->t.t_ordered.dt.t_value = 0;
4736  team->t.t_master_active = FALSE;
4737 
4738 #ifdef KMP_DEBUG
4739  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4740 #endif
4741 #if KMP_OS_WINDOWS
4742  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4743 #endif
4744 
4745  team->t.t_control_stack_top = NULL;
4746 
4747  __kmp_reinitialize_team(team, new_icvs, loc);
4748 
4749  KMP_MB();
4750  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4751 }
4752 
4753 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4754 /* Sets the full mask for the thread; stores the old mask in *old_mask if non-NULL. No changes to internal structures. */
4755 static void
4756 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4757  if (KMP_AFFINITY_CAPABLE()) {
4758  int status;
4759  if (old_mask != NULL) {
4760  status = __kmp_get_system_affinity(old_mask, TRUE);
4761  int error = errno;
4762  if (status != 0) {
4763  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4764  __kmp_msg_null);
4765  }
4766  }
4767  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4768  }
4769 }
4770 #endif
4771 
4772 #if KMP_AFFINITY_SUPPORTED
4773 
4774 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4775 // It calculates the worker + primary thread's partition based upon the parent
4776 // thread's partition, and binds each worker to a thread in their partition.
4777 // The primary thread's partition should already include its current binding.
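// An illustrative example (place numbers chosen for exposition only): with a
// place partition of [0,3] (four places), proc_bind_close, a team of four
// threads, and the primary thread bound to place 1, the workers are assigned
// places 2, 3, and 0 in turn, wrapping around within the partition.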
4778 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4779  // Do not partition places for the hidden helper team
4780  if (KMP_HIDDEN_HELPER_TEAM(team))
4781  return;
4782  // Copy the primary thread's place partition to the team struct
4783  kmp_info_t *master_th = team->t.t_threads[0];
4784  KMP_DEBUG_ASSERT(master_th != NULL);
4785  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4786  int first_place = master_th->th.th_first_place;
4787  int last_place = master_th->th.th_last_place;
4788  int masters_place = master_th->th.th_current_place;
4789  int num_masks = __kmp_affinity.num_masks;
4790  team->t.t_first_place = first_place;
4791  team->t.t_last_place = last_place;
4792 
4793  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4794  "bound to place %d partition = [%d,%d]\n",
4795  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4796  team->t.t_id, masters_place, first_place, last_place));
4797 
4798  switch (proc_bind) {
4799 
4800  case proc_bind_default:
4801  // Serial teams might have the proc_bind policy set to proc_bind_default.
4802  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4803  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4804  break;
4805 
4806  case proc_bind_primary: {
4807  int f;
4808  int n_th = team->t.t_nproc;
4809  for (f = 1; f < n_th; f++) {
4810  kmp_info_t *th = team->t.t_threads[f];
4811  KMP_DEBUG_ASSERT(th != NULL);
4812  th->th.th_first_place = first_place;
4813  th->th.th_last_place = last_place;
4814  th->th.th_new_place = masters_place;
4815  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4816  team->t.t_display_affinity != 1) {
4817  team->t.t_display_affinity = 1;
4818  }
4819 
4820  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4821  "partition = [%d,%d]\n",
4822  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4823  f, masters_place, first_place, last_place));
4824  }
4825  } break;
4826 
4827  case proc_bind_close: {
4828  int f;
4829  int n_th = team->t.t_nproc;
4830  int n_places;
4831  if (first_place <= last_place) {
4832  n_places = last_place - first_place + 1;
4833  } else {
4834  n_places = num_masks - first_place + last_place + 1;
4835  }
4836  if (n_th <= n_places) {
4837  int place = masters_place;
4838  for (f = 1; f < n_th; f++) {
4839  kmp_info_t *th = team->t.t_threads[f];
4840  KMP_DEBUG_ASSERT(th != NULL);
4841 
4842  if (place == last_place) {
4843  place = first_place;
4844  } else if (place == (num_masks - 1)) {
4845  place = 0;
4846  } else {
4847  place++;
4848  }
4849  th->th.th_first_place = first_place;
4850  th->th.th_last_place = last_place;
4851  th->th.th_new_place = place;
4852  if (__kmp_display_affinity && place != th->th.th_current_place &&
4853  team->t.t_display_affinity != 1) {
4854  team->t.t_display_affinity = 1;
4855  }
4856 
4857  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4858  "partition = [%d,%d]\n",
4859  __kmp_gtid_from_thread(team->t.t_threads[f]),
4860  team->t.t_id, f, place, first_place, last_place));
4861  }
4862  } else {
4863  int S, rem, gap, s_count;
4864  S = n_th / n_places;
4865  s_count = 0;
4866  rem = n_th - (S * n_places);
4867  gap = rem > 0 ? n_places / rem : n_places;
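  // More threads than places: assign S = n_th/n_places threads to every place,
  // and spread the rem leftover threads so that roughly every gap-th place in
  // the partition receives one extra thread.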
4868  int place = masters_place;
4869  int gap_ct = gap;
4870  for (f = 0; f < n_th; f++) {
4871  kmp_info_t *th = team->t.t_threads[f];
4872  KMP_DEBUG_ASSERT(th != NULL);
4873 
4874  th->th.th_first_place = first_place;
4875  th->th.th_last_place = last_place;
4876  th->th.th_new_place = place;
4877  if (__kmp_display_affinity && place != th->th.th_current_place &&
4878  team->t.t_display_affinity != 1) {
4879  team->t.t_display_affinity = 1;
4880  }
4881  s_count++;
4882 
4883  if ((s_count == S) && rem && (gap_ct == gap)) {
4884  // do nothing, add an extra thread to place on next iteration
4885  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4886  // we added an extra thread to this place; move to next place
4887  if (place == last_place) {
4888  place = first_place;
4889  } else if (place == (num_masks - 1)) {
4890  place = 0;
4891  } else {
4892  place++;
4893  }
4894  s_count = 0;
4895  gap_ct = 1;
4896  rem--;
4897  } else if (s_count == S) { // place full; don't add extra
4898  if (place == last_place) {
4899  place = first_place;
4900  } else if (place == (num_masks - 1)) {
4901  place = 0;
4902  } else {
4903  place++;
4904  }
4905  gap_ct++;
4906  s_count = 0;
4907  }
4908 
4909  KA_TRACE(100,
4910  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4911  "partition = [%d,%d]\n",
4912  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4913  th->th.th_new_place, first_place, last_place));
4914  }
4915  KMP_DEBUG_ASSERT(place == masters_place);
4916  }
4917  } break;
4918 
4919  case proc_bind_spread: {
4920  int f;
4921  int n_th = team->t.t_nproc;
4922  int n_places;
4923  int thidx;
4924  if (first_place <= last_place) {
4925  n_places = last_place - first_place + 1;
4926  } else {
4927  n_places = num_masks - first_place + last_place + 1;
4928  }
4929  if (n_th <= n_places) {
4930  int place = -1;
4931 
4932  if (n_places != num_masks) {
4933  int S = n_places / n_th;
4934  int s_count, rem, gap, gap_ct;
4935 
4936  place = masters_place;
4937  rem = n_places - n_th * S;
4938  gap = rem ? n_th / rem : 1;
4939  gap_ct = gap;
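  // Fewer threads than places: each thread receives a contiguous sub-partition
  // of S = n_places/n_th places; the rem leftover places are handed out one at
  // a time to roughly every gap-th thread, so the sub-partitions tile the
  // primary thread's partition.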
4940  thidx = n_th;
4941  if (update_master_only == 1)
4942  thidx = 1;
4943  for (f = 0; f < thidx; f++) {
4944  kmp_info_t *th = team->t.t_threads[f];
4945  KMP_DEBUG_ASSERT(th != NULL);
4946 
4947  th->th.th_first_place = place;
4948  th->th.th_new_place = place;
4949  if (__kmp_display_affinity && place != th->th.th_current_place &&
4950  team->t.t_display_affinity != 1) {
4951  team->t.t_display_affinity = 1;
4952  }
4953  s_count = 1;
4954  while (s_count < S) {
4955  if (place == last_place) {
4956  place = first_place;
4957  } else if (place == (num_masks - 1)) {
4958  place = 0;
4959  } else {
4960  place++;
4961  }
4962  s_count++;
4963  }
4964  if (rem && (gap_ct == gap)) {
4965  if (place == last_place) {
4966  place = first_place;
4967  } else if (place == (num_masks - 1)) {
4968  place = 0;
4969  } else {
4970  place++;
4971  }
4972  rem--;
4973  gap_ct = 0;
4974  }
4975  th->th.th_last_place = place;
4976  gap_ct++;
4977 
4978  if (place == last_place) {
4979  place = first_place;
4980  } else if (place == (num_masks - 1)) {
4981  place = 0;
4982  } else {
4983  place++;
4984  }
4985 
4986  KA_TRACE(100,
4987  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988  "partition = [%d,%d], num_masks: %u\n",
4989  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4990  f, th->th.th_new_place, th->th.th_first_place,
4991  th->th.th_last_place, num_masks));
4992  }
4993  } else {
4994  /* Having a uniform space of available computation places, we can create
4995  T partitions of roughly P/T places each and put each thread into the
4996  first place of its partition. */
4997  double current = static_cast<double>(masters_place);
4998  double spacing =
4999  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
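  // Illustrative example (values chosen for exposition only): with
  // n_places == 8, n_th == 3, and the primary thread on place 0, spacing is
  // 3.0 and the threads receive the sub-partitions [0,2], [3,5], and [6,7]
  // (the last range is clipped to the number of available places).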
5000  int first, last;
5001  kmp_info_t *th;
5002 
5003  thidx = n_th + 1;
5004  if (update_master_only == 1)
5005  thidx = 1;
5006  for (f = 0; f < thidx; f++) {
5007  first = static_cast<int>(current);
5008  last = static_cast<int>(current + spacing) - 1;
5009  KMP_DEBUG_ASSERT(last >= first);
5010  if (first >= n_places) {
5011  if (masters_place) {
5012  first -= n_places;
5013  last -= n_places;
5014  if (first == (masters_place + 1)) {
5015  KMP_DEBUG_ASSERT(f == n_th);
5016  first--;
5017  }
5018  if (last == masters_place) {
5019  KMP_DEBUG_ASSERT(f == (n_th - 1));
5020  last--;
5021  }
5022  } else {
5023  KMP_DEBUG_ASSERT(f == n_th);
5024  first = 0;
5025  last = 0;
5026  }
5027  }
5028  if (last >= n_places) {
5029  last = (n_places - 1);
5030  }
5031  place = first;
5032  current += spacing;
5033  if (f < n_th) {
5034  KMP_DEBUG_ASSERT(0 <= first);
5035  KMP_DEBUG_ASSERT(n_places > first);
5036  KMP_DEBUG_ASSERT(0 <= last);
5037  KMP_DEBUG_ASSERT(n_places > last);
5038  KMP_DEBUG_ASSERT(last_place >= first_place);
5039  th = team->t.t_threads[f];
5040  KMP_DEBUG_ASSERT(th);
5041  th->th.th_first_place = first;
5042  th->th.th_new_place = place;
5043  th->th.th_last_place = last;
5044  if (__kmp_display_affinity && place != th->th.th_current_place &&
5045  team->t.t_display_affinity != 1) {
5046  team->t.t_display_affinity = 1;
5047  }
5048  KA_TRACE(100,
5049  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5050  "partition = [%d,%d], spacing = %.4f\n",
5051  __kmp_gtid_from_thread(team->t.t_threads[f]),
5052  team->t.t_id, f, th->th.th_new_place,
5053  th->th.th_first_place, th->th.th_last_place, spacing));
5054  }
5055  }
5056  }
5057  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5058  } else {
5059  int S, rem, gap, s_count;
5060  S = n_th / n_places;
5061  s_count = 0;
5062  rem = n_th - (S * n_places);
5063  gap = rem > 0 ? n_places / rem : n_places;
5064  int place = masters_place;
5065  int gap_ct = gap;
5066  thidx = n_th;
5067  if (update_master_only == 1)
5068  thidx = 1;
5069  for (f = 0; f < thidx; f++) {
5070  kmp_info_t *th = team->t.t_threads[f];
5071  KMP_DEBUG_ASSERT(th != NULL);
5072 
5073  th->th.th_first_place = place;
5074  th->th.th_last_place = place;
5075  th->th.th_new_place = place;
5076  if (__kmp_display_affinity && place != th->th.th_current_place &&
5077  team->t.t_display_affinity != 1) {
5078  team->t.t_display_affinity = 1;
5079  }
5080  s_count++;
5081 
5082  if ((s_count == S) && rem && (gap_ct == gap)) {
5083  // do nothing, add an extra thread to place on next iteration
5084  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085  // we added an extra thread to this place; move on to next place
5086  if (place == last_place) {
5087  place = first_place;
5088  } else if (place == (num_masks - 1)) {
5089  place = 0;
5090  } else {
5091  place++;
5092  }
5093  s_count = 0;
5094  gap_ct = 1;
5095  rem--;
5096  } else if (s_count == S) { // place is full; don't add extra thread
5097  if (place == last_place) {
5098  place = first_place;
5099  } else if (place == (num_masks - 1)) {
5100  place = 0;
5101  } else {
5102  place++;
5103  }
5104  gap_ct++;
5105  s_count = 0;
5106  }
5107 
5108  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109  "partition = [%d,%d]\n",
5110  __kmp_gtid_from_thread(team->t.t_threads[f]),
5111  team->t.t_id, f, th->th.th_new_place,
5112  th->th.th_first_place, th->th.th_last_place));
5113  }
5114  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115  }
5116  } break;
5117 
5118  default:
5119  break;
5120  }
5121 
5122  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123 }
5124 
5125 #endif // KMP_AFFINITY_SUPPORTED
5126 
5127 /* allocate a new team data structure to use. take one off of the free pool if
5128  available */
5129 kmp_team_t *
5130 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131 #if OMPT_SUPPORT
5132  ompt_data_t ompt_parallel_data,
5133 #endif
5134  kmp_proc_bind_t new_proc_bind,
5135  kmp_internal_control_t *new_icvs,
5136  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138  int f;
5139  kmp_team_t *team;
5140  int use_hot_team = !root->r.r_active;
5141  int level = 0;
5142  int do_place_partition = 1;
5143 
5144  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147  KMP_MB();
5148 
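  // Three strategies are tried in order: (1) reuse the root's (or a nested)
  // hot team, resizing it if the requested thread count changed; (2) take a
  // team with sufficient capacity from the team pool; (3) allocate and
  // initialize a fresh team.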
5149 #if KMP_NESTED_HOT_TEAMS
5150  kmp_hot_team_ptr_t *hot_teams;
5151  if (master) {
5152  team = master->th.th_team;
5153  level = team->t.t_active_level;
5154  if (master->th.th_teams_microtask) { // in teams construct?
5155  if (master->th.th_teams_size.nteams > 1 &&
5156  ( // #teams > 1
5157  team->t.t_pkfn ==
5158  (microtask_t)__kmp_teams_master || // inner fork of the teams
5159  master->th.th_teams_level <
5160  team->t.t_level)) { // or nested parallel inside the teams
5161  ++level; // do not increment if #teams==1 or for the outer fork of the
5162  // teams; increment otherwise
5163  }
5164  // Do not perform the place partition if inner fork of the teams
5165  // Wait until nested parallel region encountered inside teams construct
5166  if ((master->th.th_teams_size.nteams == 1 &&
5167  master->th.th_teams_level >= team->t.t_level) ||
5168  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169  do_place_partition = 0;
5170  }
5171  hot_teams = master->th.th_hot_teams;
5172  if (level < __kmp_hot_teams_max_level && hot_teams &&
5173  hot_teams[level].hot_team) {
5174  // hot team has already been allocated for given level
5175  use_hot_team = 1;
5176  } else {
5177  use_hot_team = 0;
5178  }
5179  } else {
5180  // check we won't access uninitialized hot_teams, just in case
5181  KMP_DEBUG_ASSERT(new_nproc == 1);
5182  }
5183 #endif
5184  // Optimization to use a "hot" team
5185  if (use_hot_team && new_nproc > 1) {
5186  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187 #if KMP_NESTED_HOT_TEAMS
5188  team = hot_teams[level].hot_team;
5189 #else
5190  team = root->r.r_hot_team;
5191 #endif
5192 #if KMP_DEBUG
5193  if (__kmp_tasking_mode != tskm_immediate_exec) {
5194  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195  "task_team[1] = %p before reinit\n",
5196  team->t.t_task_team[0], team->t.t_task_team[1]));
5197  }
5198 #endif
5199 
5200  if (team->t.t_nproc != new_nproc &&
5201  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202  // Distributed barrier may need a resize
5203  int old_nthr = team->t.t_nproc;
5204  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205  }
5206 
5207  // If not doing the place partition, then reset the team's proc bind
5208  // to indicate that partitioning of all threads still needs to take place
5209  if (do_place_partition == 0)
5210  team->t.t_proc_bind = proc_bind_default;
5211  // Has the number of threads changed?
5212  /* Let's assume the most common case is that the number of threads is
5213  unchanged, and put that case first. */
5214  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216  // This case can mean that omp_set_num_threads() was called and the hot
5217  // team size was already reduced, so we check the special flag
5218  if (team->t.t_size_changed == -1) {
5219  team->t.t_size_changed = 1;
5220  } else {
5221  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222  }
5223 
5224  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225  kmp_r_sched_t new_sched = new_icvs->sched;
5226  // set primary thread's schedule as new run-time schedule
5227  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228 
5229  __kmp_reinitialize_team(team, new_icvs,
5230  root->r.r_uber_thread->th.th_ident);
5231 
5232  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233  team->t.t_threads[0], team));
5234  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235 
5236 #if KMP_AFFINITY_SUPPORTED
5237  if ((team->t.t_size_changed == 0) &&
5238  (team->t.t_proc_bind == new_proc_bind)) {
5239  if (new_proc_bind == proc_bind_spread) {
5240  if (do_place_partition) {
5241  // add flag to update only master for spread
5242  __kmp_partition_places(team, 1);
5243  }
5244  }
5245  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246  "proc_bind = %d, partition = [%d,%d]\n",
5247  team->t.t_id, new_proc_bind, team->t.t_first_place,
5248  team->t.t_last_place));
5249  } else {
5250  if (do_place_partition) {
5251  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252  __kmp_partition_places(team);
5253  }
5254  }
5255 #else
5256  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257 #endif /* KMP_AFFINITY_SUPPORTED */
5258  } else if (team->t.t_nproc > new_nproc) {
5259  KA_TRACE(20,
5260  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261  new_nproc));
5262 
5263  team->t.t_size_changed = 1;
5264  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265  // Barrier size already reduced earlier in this function
5266  // Activate team threads via th_used_in_team
5267  __kmp_add_threads_to_team(team, new_nproc);
5268  }
5269 #if KMP_NESTED_HOT_TEAMS
5270  if (__kmp_hot_teams_mode == 0) {
5271  // AC: the saved number of threads should correspond to the team's value in this
5272  // mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5273  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274  hot_teams[level].hot_team_nth = new_nproc;
5275 #endif // KMP_NESTED_HOT_TEAMS
5276  /* release the extra threads we don't need any more */
5277  for (f = new_nproc; f < team->t.t_nproc; f++) {
5278  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279  if (__kmp_tasking_mode != tskm_immediate_exec) {
5280  // When decreasing team size, threads no longer in the team should
5281  // unref task team.
5282  team->t.t_threads[f]->th.th_task_team = NULL;
5283  }
5284  __kmp_free_thread(team->t.t_threads[f]);
5285  team->t.t_threads[f] = NULL;
5286  }
5287 #if KMP_NESTED_HOT_TEAMS
5288  } // (__kmp_hot_teams_mode == 0)
5289  else {
5290  // When keeping extra threads in team, switch threads to wait on own
5291  // b_go flag
5292  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295  for (int b = 0; b < bs_last_barrier; ++b) {
5296  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298  }
5299  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300  }
5301  }
5302  }
5303 #endif // KMP_NESTED_HOT_TEAMS
5304  team->t.t_nproc = new_nproc;
5305  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307  __kmp_reinitialize_team(team, new_icvs,
5308  root->r.r_uber_thread->th.th_ident);
5309 
5310  // Update remaining threads
5311  for (f = 0; f < new_nproc; ++f) {
5312  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313  }
5314 
5315  // restore the current task state of the primary thread: should be the
5316  // implicit task
5317  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318  team->t.t_threads[0], team));
5319 
5320  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321 
5322 #ifdef KMP_DEBUG
5323  for (f = 0; f < team->t.t_nproc; f++) {
5324  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325  team->t.t_threads[f]->th.th_team_nproc ==
5326  team->t.t_nproc);
5327  }
5328 #endif
5329 
5330  if (do_place_partition) {
5331  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333  __kmp_partition_places(team);
5334 #endif
5335  }
5336  } else { // team->t.t_nproc < new_nproc
5337 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5338  kmp_affin_mask_t *old_mask;
5339  if (KMP_AFFINITY_CAPABLE()) {
5340  KMP_CPU_ALLOC(old_mask);
5341  }
5342 #endif
5343 
5344  KA_TRACE(20,
5345  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5346  new_nproc));
5347  int old_nproc = team->t.t_nproc; // save old value and use to update only
5348  team->t.t_size_changed = 1;
5349 
5350 #if KMP_NESTED_HOT_TEAMS
5351  int avail_threads = hot_teams[level].hot_team_nth;
5352  if (new_nproc < avail_threads)
5353  avail_threads = new_nproc;
5354  kmp_info_t **other_threads = team->t.t_threads;
5355  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5356  // Adjust barrier data of reserved threads (if any) of the team
5357  // Other data will be set in __kmp_initialize_info() below.
5358  int b;
5359  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5360  for (b = 0; b < bs_last_barrier; ++b) {
5361  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5362  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5363 #if USE_DEBUGGER
5364  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5365 #endif
5366  }
5367  }
5368  if (hot_teams[level].hot_team_nth >= new_nproc) {
5369  // we have all needed threads in reserve, no need to allocate any
5370  // this is only possible in mode 1; mode 0 cannot have reserved threads
5371  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5372  team->t.t_nproc = new_nproc; // just get reserved threads involved
5373  } else {
5374  // We may have some threads in reserve, but not enough;
5375  // get reserved threads involved if any.
5376  team->t.t_nproc = hot_teams[level].hot_team_nth;
5377  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5378 #endif // KMP_NESTED_HOT_TEAMS
5379  if (team->t.t_max_nproc < new_nproc) {
5380  /* reallocate larger arrays */
5381  __kmp_reallocate_team_arrays(team, new_nproc);
5382  __kmp_reinitialize_team(team, new_icvs, NULL);
5383  }
5384 
5385 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5386  /* Temporarily set full mask for primary thread before creation of
5387  workers. The reason is that workers inherit the affinity from the
5388  primary thread, so if a lot of workers are created on a single
5389  core quickly, they don't get a chance to set their own affinity for
5390  a long time. */
5391  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5392 #endif
5393 
5394  /* allocate new threads for the hot team */
5395  for (f = team->t.t_nproc; f < new_nproc; f++) {
5396  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5397  KMP_DEBUG_ASSERT(new_worker);
5398  team->t.t_threads[f] = new_worker;
5399 
5400  KA_TRACE(20,
5401  ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5402  "join=%llu, plain=%llu\n",
5403  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5404  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5405  team->t.t_bar[bs_plain_barrier].b_arrived));
5406 
5407  { // Initialize barrier data for new threads.
5408  int b;
5409  kmp_balign_t *balign = new_worker->th.th_bar;
5410  for (b = 0; b < bs_last_barrier; ++b) {
5411  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5412  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5413  KMP_BARRIER_PARENT_FLAG);
5414 #if USE_DEBUGGER
5415  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5416 #endif
5417  }
5418  }
5419  }
5420 
5421 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5422  if (KMP_AFFINITY_CAPABLE()) {
5423  /* Restore initial primary thread's affinity mask */
5424  __kmp_set_system_affinity(old_mask, TRUE);
5425  KMP_CPU_FREE(old_mask);
5426  }
5427 #endif
5428 #if KMP_NESTED_HOT_TEAMS
5429  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5430 #endif // KMP_NESTED_HOT_TEAMS
5431  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5432  // Barrier size already increased earlier in this function
5433  // Activate team threads via th_used_in_team
5434  __kmp_add_threads_to_team(team, new_nproc);
5435  }
5436  /* make sure everyone is synchronized */
5437  // new threads below
5438  __kmp_initialize_team(team, new_nproc, new_icvs,
5439  root->r.r_uber_thread->th.th_ident);
5440 
5441  /* reinitialize the threads */
5442  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5443  for (f = 0; f < team->t.t_nproc; ++f)
5444  __kmp_initialize_info(team->t.t_threads[f], team, f,
5445  __kmp_gtid_from_tid(f, team));
5446 
5447  if (level) { // set th_task_state for new threads in nested hot team
5448  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5449  // only need to set the th_task_state for the new threads. th_task_state
5450  // for primary thread will not be accurate until after this in
5451  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5452  // get the correct value.
5453  for (f = old_nproc; f < team->t.t_nproc; ++f)
5454  team->t.t_threads[f]->th.th_task_state =
5455  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5456  } else { // set th_task_state for new threads in non-nested hot team
5457  // copy primary thread's state
5458  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5459  for (f = old_nproc; f < team->t.t_nproc; ++f)
5460  team->t.t_threads[f]->th.th_task_state = old_state;
5461  }
5462 
5463 #ifdef KMP_DEBUG
5464  for (f = 0; f < team->t.t_nproc; ++f) {
5465  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5466  team->t.t_threads[f]->th.th_team_nproc ==
5467  team->t.t_nproc);
5468  }
5469 #endif
5470 
5471  if (do_place_partition) {
5472  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5473 #if KMP_AFFINITY_SUPPORTED
5474  __kmp_partition_places(team);
5475 #endif
5476  }
5477  } // Check changes in number of threads
5478 
5479  kmp_info_t *master = team->t.t_threads[0];
5480  if (master->th.th_teams_microtask) {
5481  for (f = 1; f < new_nproc; ++f) {
5482  // propagate teams construct specific info to workers
5483  kmp_info_t *thr = team->t.t_threads[f];
5484  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5485  thr->th.th_teams_level = master->th.th_teams_level;
5486  thr->th.th_teams_size = master->th.th_teams_size;
5487  }
5488  }
5489 #if KMP_NESTED_HOT_TEAMS
5490  if (level) {
5491  // Sync barrier state for nested hot teams, not needed for outermost hot
5492  // team.
5493  for (f = 1; f < new_nproc; ++f) {
5494  kmp_info_t *thr = team->t.t_threads[f];
5495  int b;
5496  kmp_balign_t *balign = thr->th.th_bar;
5497  for (b = 0; b < bs_last_barrier; ++b) {
5498  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5499  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5500 #if USE_DEBUGGER
5501  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5502 #endif
5503  }
5504  }
5505  }
5506 #endif // KMP_NESTED_HOT_TEAMS
5507 
5508  /* reallocate space for arguments if necessary */
5509  __kmp_alloc_argv_entries(argc, team, TRUE);
5510  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5511  // The hot team re-uses the previous task team,
5512  // if untouched during the previous release->gather phase.
5513 
5514  KF_TRACE(10, (" hot_team = %p\n", team));
5515 
5516 #if KMP_DEBUG
5517  if (__kmp_tasking_mode != tskm_immediate_exec) {
5518  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5519  "task_team[1] = %p after reinit\n",
5520  team->t.t_task_team[0], team->t.t_task_team[1]));
5521  }
5522 #endif
5523 
5524 #if OMPT_SUPPORT
5525  __ompt_team_assign_id(team, ompt_parallel_data);
5526 #endif
5527 
5528  KMP_MB();
5529 
5530  return team;
5531  }
5532 
5533  /* next, let's try to take one from the team pool */
5534  KMP_MB();
5535  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5536  /* TODO: consider resizing undersized teams instead of reaping them, now
5537  that we have a resizing mechanism */
5538  if (team->t.t_max_nproc >= max_nproc) {
5539  /* take this team from the team pool */
5540  __kmp_team_pool = team->t.t_next_pool;
5541 
5542  if (max_nproc > 1 &&
5543  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5544  if (!team->t.b) { // Allocate barrier structure
5545  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5546  }
5547  }
5548 
5549  /* setup the team for fresh use */
5550  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5551 
5552  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5553  "task_team[1] %p to NULL\n",
5554  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5555  team->t.t_task_team[0] = NULL;
5556  team->t.t_task_team[1] = NULL;
5557 
5558  /* reallocate space for arguments if necessary */
5559  __kmp_alloc_argv_entries(argc, team, TRUE);
5560  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5561 
5562  KA_TRACE(
5563  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5564  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5565  { // Initialize barrier data.
5566  int b;
5567  for (b = 0; b < bs_last_barrier; ++b) {
5568  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5569 #if USE_DEBUGGER
5570  team->t.t_bar[b].b_master_arrived = 0;
5571  team->t.t_bar[b].b_team_arrived = 0;
5572 #endif
5573  }
5574  }
5575 
5576  team->t.t_proc_bind = new_proc_bind;
5577 
5578  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5579  team->t.t_id));
5580 
5581 #if OMPT_SUPPORT
5582  __ompt_team_assign_id(team, ompt_parallel_data);
5583 #endif
5584 
5585  KMP_MB();
5586 
5587  return team;
5588  }
5589 
5590  /* reap team if it is too small, then loop back and check the next one */
5591  // not sure if this is wise, but it will be redone during the hot-teams
5592  // rewrite.
5593  /* TODO: Use technique to find the right size hot-team, don't reap them */
5594  team = __kmp_reap_team(team);
5595  __kmp_team_pool = team;
5596  }
5597 
5598  /* nothing available in the pool, no matter, make a new team! */
5599  KMP_MB();
5600  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5601 
5602  /* and set it up */
5603  team->t.t_max_nproc = max_nproc;
5604  if (max_nproc > 1 &&
5605  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5606  // Allocate barrier structure
5607  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5608  }
5609 
5610  /* NOTE well, for some reason allocating one big buffer and dividing it up
5611  seems to really hurt performance a lot on the P4, so let's not use this */
5612  __kmp_allocate_team_arrays(team, max_nproc);
5613 
5614  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5615  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5616 
5617  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5618  "%p to NULL\n",
5619  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5620  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5621  // memory, no need to duplicate
5622  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5623  // memory, no need to duplicate
5624 
5625  if (__kmp_storage_map) {
5626  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5627  }
5628 
5629  /* allocate space for arguments */
5630  __kmp_alloc_argv_entries(argc, team, FALSE);
5631  team->t.t_argc = argc;
5632 
5633  KA_TRACE(20,
5634  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5635  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5636  { // Initialize barrier data.
5637  int b;
5638  for (b = 0; b < bs_last_barrier; ++b) {
5639  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5640 #if USE_DEBUGGER
5641  team->t.t_bar[b].b_master_arrived = 0;
5642  team->t.t_bar[b].b_team_arrived = 0;
5643 #endif
5644  }
5645  }
5646 
5647  team->t.t_proc_bind = new_proc_bind;
5648 
5649 #if OMPT_SUPPORT
5650  __ompt_team_assign_id(team, ompt_parallel_data);
5651  team->t.ompt_serialized_team_info = NULL;
5652 #endif
5653 
5654  KMP_MB();
5655 
5656  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5657  team->t.t_id));
5658 
5659  return team;
5660 }
5661 
5662 /* TODO implement hot-teams at all levels */
5663 /* TODO implement lazy thread release on demand (disband request) */
5664 
5665 /* free the team. return it to the team pool. release all the threads
5666  * associated with it */
5667 void __kmp_free_team(kmp_root_t *root,
5668  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5669  int f;
5670  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5671  team->t.t_id));
5672 
5673  /* verify state */
5674  KMP_DEBUG_ASSERT(root);
5675  KMP_DEBUG_ASSERT(team);
5676  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5677  KMP_DEBUG_ASSERT(team->t.t_threads);
5678 
5679  int use_hot_team = team == root->r.r_hot_team;
5680 #if KMP_NESTED_HOT_TEAMS
5681  int level;
5682  if (master) {
5683  level = team->t.t_active_level - 1;
5684  if (master->th.th_teams_microtask) { // in teams construct?
5685  if (master->th.th_teams_size.nteams > 1) {
5686  ++level; // level was not increased in teams construct for
5687  // team_of_masters
5688  }
5689  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5690  master->th.th_teams_level == team->t.t_level) {
5691  ++level; // level was not increased in teams construct for
5692  // team_of_workers before the parallel
5693  } // team->t.t_level will be increased inside parallel
5694  }
5695 #if KMP_DEBUG
5696  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5697 #endif
5698  if (level < __kmp_hot_teams_max_level) {
5699  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5700  use_hot_team = 1;
5701  }
5702  }
5703 #endif // KMP_NESTED_HOT_TEAMS
5704 
5705  /* team is done working */
5706  TCW_SYNC_PTR(team->t.t_pkfn,
5707  NULL); // Important for Debugging Support Library.
5708 #if KMP_OS_WINDOWS
5709  team->t.t_copyin_counter = 0; // init counter for possible reuse
5710 #endif
5711  // Do not reset pointer to parent team to NULL for hot teams.
5712 
5713  /* if we are non-hot team, release our threads */
5714  if (!use_hot_team) {
5715  if (__kmp_tasking_mode != tskm_immediate_exec) {
5716  // Wait for threads to reach reapable state
5717  for (f = 1; f < team->t.t_nproc; ++f) {
5718  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5719  kmp_info_t *th = team->t.t_threads[f];
5720  volatile kmp_uint32 *state = &th->th.th_reap_state;
5721  while (*state != KMP_SAFE_TO_REAP) {
5722 #if KMP_OS_WINDOWS
5723  // On Windows a thread can be killed at any time, check this
5724  DWORD ecode;
5725  if (!__kmp_is_thread_alive(th, &ecode)) {
5726  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5727  break;
5728  }
5729 #endif
5730  // first check if thread is sleeping
5731  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5732  if (fl.is_sleeping())
5733  fl.resume(__kmp_gtid_from_thread(th));
5734  KMP_CPU_PAUSE();
5735  }
5736  }
5737 
5738  // Delete task teams
5739  int tt_idx;
5740  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5741  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5742  if (task_team != NULL) {
5743  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5744  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745  team->t.t_threads[f]->th.th_task_team = NULL;
5746  }
5747  KA_TRACE(
5748  20,
5749  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5750  __kmp_get_gtid(), task_team, team->t.t_id));
5751 #if KMP_NESTED_HOT_TEAMS
5752  __kmp_free_task_team(master, task_team);
5753 #endif
5754  team->t.t_task_team[tt_idx] = NULL;
5755  }
5756  }
5757  }
5758 
5759  // Reset pointer to parent team only for non-hot teams.
5760  team->t.t_parent = NULL;
5761  team->t.t_level = 0;
5762  team->t.t_active_level = 0;
5763 
5764  /* free the worker threads */
5765  for (f = 1; f < team->t.t_nproc; ++f) {
5766  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5767  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5768  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5769  1, 2);
5770  }
5771  __kmp_free_thread(team->t.t_threads[f]);
5772  }
5773 
5774  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775  if (team->t.b) {
5776  // wake up thread at old location
5777  team->t.b->go_release();
5778  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5779  for (f = 1; f < team->t.t_nproc; ++f) {
5780  if (team->t.b->sleep[f].sleep) {
5781  __kmp_atomic_resume_64(
5782  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5783  (kmp_atomic_flag_64<> *)NULL);
5784  }
5785  }
5786  }
5787  // Wait for threads to be removed from team
5788  for (int f = 1; f < team->t.t_nproc; ++f) {
5789  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5790  KMP_CPU_PAUSE();
5791  }
5792  }
5793  }
5794 
5795  for (f = 1; f < team->t.t_nproc; ++f) {
5796  team->t.t_threads[f] = NULL;
5797  }
5798 
5799  if (team->t.t_max_nproc > 1 &&
5800  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5801  distributedBarrier::deallocate(team->t.b);
5802  team->t.b = NULL;
5803  }
5804  /* put the team back in the team pool */
5805  /* TODO limit size of team pool, call reap_team if pool too large */
5806  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5807  __kmp_team_pool = (volatile kmp_team_t *)team;
5808  } else { // Check if team was created for primary threads in teams construct
5809  // See if first worker is a CG root
5810  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5811  team->t.t_threads[1]->th.th_cg_roots);
5812  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5813  // Clean up the CG root nodes on workers so that this team can be re-used
5814  for (f = 1; f < team->t.t_nproc; ++f) {
5815  kmp_info_t *thr = team->t.t_threads[f];
5816  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5817  thr->th.th_cg_roots->cg_root == thr);
5818  // Pop current CG root off list
5819  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5820  thr->th.th_cg_roots = tmp->up;
5821  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5822  " up to node %p. cg_nthreads was %d\n",
5823  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5824  int i = tmp->cg_nthreads--;
5825  if (i == 1) {
5826  __kmp_free(tmp); // free CG if we are the last thread in it
5827  }
5828  // Restore current task's thread_limit from CG root
5829  if (thr->th.th_cg_roots)
5830  thr->th.th_current_task->td_icvs.thread_limit =
5831  thr->th.th_cg_roots->cg_thread_limit;
5832  }
5833  }
5834  }
5835 
5836  KMP_MB();
5837 }
5838 
5839 /* reap the team. destroy it, reclaim all its resources and free its memory */
5840 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5841  kmp_team_t *next_pool = team->t.t_next_pool;
5842 
5843  KMP_DEBUG_ASSERT(team);
5844  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5845  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5846  KMP_DEBUG_ASSERT(team->t.t_threads);
5847  KMP_DEBUG_ASSERT(team->t.t_argv);
5848 
5849  /* TODO clean the threads that are a part of this? */
5850 
5851  /* free stuff */
5852  __kmp_free_team_arrays(team);
5853  if (team->t.t_argv != &team->t.t_inline_argv[0])
5854  __kmp_free((void *)team->t.t_argv);
5855  __kmp_free(team);
5856 
5857  KMP_MB();
5858  return next_pool;
5859 }
5860 
5861 // Free the thread. Don't reap it, just place it on the pool of available
5862 // threads.
5863 //
5864 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5865 // binding for the affinity mechanism to be useful.
5866 //
5867 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5868 // However, we want to avoid a potential performance problem by always
5869 // scanning through the list to find the correct point at which to insert
5870 // the thread (potential N**2 behavior). To do this we keep track of the
5871 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5872 // With single-level parallelism, threads will always be added to the tail
5873 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5874 // parallelism, all bets are off and we may need to scan through the entire
5875 // free list.
5876 //
5877 // This change also has a potentially large performance benefit for some
5878 // applications. Previously, as threads were freed from the hot team, they
5879 // would be placed back on the free list in inverse order. If the hot team
5880 // grew back to its original size, then the freed threads would be placed
5881 // back on the hot team in reverse order. This could cause bad cache
5882 // locality problems in programs where the size of the hot team regularly
5883 // grew and shrank.
5884 //
5885 // Now, for single-level parallelism, the OMP tid is always == gtid.
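// Illustrative example (gtids chosen for exposition only): if the pool holds
// threads with gtids {2, 3, 5} and the thread with gtid 4 is freed, the scan
// below starts from __kmp_thread_pool_insert_pt (or from the pool head if the
// hint already lies past gtid 4), stops at the link following gtid 3, inserts
// gtid 4 there, and records it as the new insertion hint.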
5886 void __kmp_free_thread(kmp_info_t *this_th) {
5887  int gtid;
5888  kmp_info_t **scan;
5889 
5890  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5891  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5892 
5893  KMP_DEBUG_ASSERT(this_th);
5894 
5895  // When moving the thread to the pool, switch it to wait on its own b_go flag
5896  // and clear its team pointer (NULL team).
5897  int b;
5898  kmp_balign_t *balign = this_th->th.th_bar;
5899  for (b = 0; b < bs_last_barrier; ++b) {
5900  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5901  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5902  balign[b].bb.team = NULL;
5903  balign[b].bb.leaf_kids = 0;
5904  }
5905  this_th->th.th_task_state = 0;
5906  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5907 
5908  /* put thread back on the free pool */
5909  TCW_PTR(this_th->th.th_team, NULL);
5910  TCW_PTR(this_th->th.th_root, NULL);
5911  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5912 
5913  while (this_th->th.th_cg_roots) {
5914  this_th->th.th_cg_roots->cg_nthreads--;
5915  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5916  " %p of thread %p to %d\n",
5917  this_th, this_th->th.th_cg_roots,
5918  this_th->th.th_cg_roots->cg_root,
5919  this_th->th.th_cg_roots->cg_nthreads));
5920  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5921  if (tmp->cg_root == this_th) { // Thread is a cg_root
5922  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5923  KA_TRACE(
5924  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5925  this_th->th.th_cg_roots = tmp->up;
5926  __kmp_free(tmp);
5927  } else { // Worker thread
5928  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5929  __kmp_free(tmp);
5930  }
5931  this_th->th.th_cg_roots = NULL;
5932  break;
5933  }
5934  }
5935 
5936  /* If the implicit task assigned to this thread can be used by other threads,
5937  * multiple threads can share the data and try to free the task in
5938  * __kmp_reap_thread at exit. This duplicate use of the task data happens
5939  * with higher probability when the hot team is disabled, but it can occur
5940  * even when the hot team is enabled. */
5941  __kmp_free_implicit_task(this_th);
5942  this_th->th.th_current_task = NULL;
5943 
5944  // If the __kmp_thread_pool_insert_pt is already past the new insert
5945  // point, then we need to re-scan the entire list.
5946  gtid = this_th->th.th_info.ds.ds_gtid;
5947  if (__kmp_thread_pool_insert_pt != NULL) {
5948  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5949  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5950  __kmp_thread_pool_insert_pt = NULL;
5951  }
5952  }
5953 
5954  // Scan down the list to find the place to insert the thread.
5955  // scan is the address of a link in the list, possibly the address of
5956  // __kmp_thread_pool itself.
5957  //
5958  // In the absence of nested parallelism, the for loop will have 0 iterations.
5959  if (__kmp_thread_pool_insert_pt != NULL) {
5960  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5961  } else {
5962  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5963  }
5964  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5965  scan = &((*scan)->th.th_next_pool))
5966  ;
5967 
5968  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5969  // to its address.
5970  TCW_PTR(this_th->th.th_next_pool, *scan);
5971  __kmp_thread_pool_insert_pt = *scan = this_th;
5972  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5973  (this_th->th.th_info.ds.ds_gtid <
5974  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5975  TCW_4(this_th->th.th_in_pool, TRUE);
5976  __kmp_suspend_initialize_thread(this_th);
5977  __kmp_lock_suspend_mx(this_th);
5978  if (this_th->th.th_active == TRUE) {
5979  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5980  this_th->th.th_active_in_pool = TRUE;
5981  }
5982 #if KMP_DEBUG
5983  else {
5984  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5985  }
5986 #endif
5987  __kmp_unlock_suspend_mx(this_th);
5988 
5989  TCW_4(__kmp_nth, __kmp_nth - 1);
5990 
5991 #ifdef KMP_ADJUST_BLOCKTIME
5992  /* Adjust blocktime back to user setting or default if necessary */
5993  /* Middle initialization might never have occurred */
5994  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5995  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5996  if (__kmp_nth <= __kmp_avail_proc) {
5997  __kmp_zero_bt = FALSE;
5998  }
5999  }
6000 #endif /* KMP_ADJUST_BLOCKTIME */
6001 
6002  KMP_MB();
6003 }
6004 
6005 /* ------------------------------------------------------------------------ */
6006 
6007 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6008 #if OMP_PROFILING_SUPPORT
6009  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6010  // TODO: add a configuration option for time granularity
6011  if (ProfileTraceFile)
6012  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6013 #endif
6014 
6015  int gtid = this_thr->th.th_info.ds.ds_gtid;
6016  /* void *stack_data;*/
6017  kmp_team_t **volatile pteam;
6018 
6019  KMP_MB();
6020  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6021 
6022  if (__kmp_env_consistency_check) {
6023  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6024  }
6025 
6026 #if OMPD_SUPPORT
6027  if (ompd_state & OMPD_ENABLE_BP)
6028  ompd_bp_thread_begin();
6029 #endif
6030 
6031 #if OMPT_SUPPORT
6032  ompt_data_t *thread_data = nullptr;
6033  if (ompt_enabled.enabled) {
6034  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6035  *thread_data = ompt_data_none;
6036 
6037  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6038  this_thr->th.ompt_thread_info.wait_id = 0;
6039  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6040  this_thr->th.ompt_thread_info.parallel_flags = 0;
6041  if (ompt_enabled.ompt_callback_thread_begin) {
6042  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6043  ompt_thread_worker, thread_data);
6044  }
6045  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6046  }
6047 #endif
6048 
6049  /* This is the place where threads wait for work */
6050  while (!TCR_4(__kmp_global.g.g_done)) {
6051  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6052  KMP_MB();
6053 
6054  /* wait for work to do */
6055  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6056 
6057  /* No tid yet since not part of a team */
6058  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6059 
6060 #if OMPT_SUPPORT
6061  if (ompt_enabled.enabled) {
6062  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6063  }
6064 #endif
6065 
6066  pteam = &this_thr->th.th_team;
6067 
6068  /* have we been allocated? */
6069  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6070  /* we were just woken up, so run our new task */
6071  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6072  int rc;
6073  KA_TRACE(20,
6074  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6075  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6076  (*pteam)->t.t_pkfn));
6077 
6078  updateHWFPControl(*pteam);
6079 
6080 #if OMPT_SUPPORT
6081  if (ompt_enabled.enabled) {
6082  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6083  }
6084 #endif
6085 
6086  rc = (*pteam)->t.t_invoke(gtid);
6087  KMP_ASSERT(rc);
6088 
6089  KMP_MB();
6090  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6091  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6092  (*pteam)->t.t_pkfn));
6093  }
6094 #if OMPT_SUPPORT
6095  if (ompt_enabled.enabled) {
6096  /* no frame set while outside task */
6097  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6098 
6099  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6100  }
6101 #endif
6102  /* join barrier after parallel region */
6103  __kmp_join_barrier(gtid);
6104  }
6105  }
6106  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6107 
6108 #if OMPD_SUPPORT
6109  if (ompd_state & OMPD_ENABLE_BP)
6110  ompd_bp_thread_end();
6111 #endif
6112 
6113 #if OMPT_SUPPORT
6114  if (ompt_enabled.ompt_callback_thread_end) {
6115  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6116  }
6117 #endif
6118 
6119  this_thr->th.th_task_team = NULL;
6120  /* run the destructors for the threadprivate data for this thread */
6121  __kmp_common_destroy_gtid(gtid);
6122 
6123  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6124  KMP_MB();
6125 
6126 #if OMP_PROFILING_SUPPORT
6127  llvm::timeTraceProfilerFinishThread();
6128 #endif
6129  return this_thr;
6130 }
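// __kmp_launch_thread above is the whole life cycle of a worker: park at the
// fork barrier, run the team's microtask if one was assigned, rejoin at the
// join barrier, and repeat until g_done is set. Condensed, illustrative
// pseudo-code of that loop (hypothetical helper names):
//
//   while (!global_done) {
//     fork_barrier_wait();            // released when a team is (re)assigned
//     if (my_team && !global_done) {
//       if (my_team->pkfn)
//         my_team->invoke(gtid);      // run the outlined parallel region
//       join_barrier_wait();          // synchronize with the rest of the team
//     }
//   }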
6131 
6132 /* ------------------------------------------------------------------------ */
6133 
6134 void __kmp_internal_end_dest(void *specific_gtid) {
6135  // Make sure no significant bits are lost
6136  int gtid;
6137  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6138 
6139  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6140  /* NOTE: the gtid is stored as gtid+1 in thread-local storage
6141  * because 0 is reserved for the nothing-stored case */
6142 
6143  __kmp_internal_end_thread(gtid);
6144 }
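// A sketch of the gtid+1 convention decoded above, with hypothetical helpers
// (the runtime uses its own thread-specific-data accessors):
//
//   static void *encode_gtid(int gtid) {
//     return (void *)(intptr_t)(gtid + 1); // keep 0 free for "nothing stored"
//   }
//   static int decode_gtid(void *stored) {
//     return (int)((intptr_t)stored) - 1;  // yields -1 when nothing was stored
//   }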
6145 
6146 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6147 
6148 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6149  __kmp_internal_end_atexit();
6150 }
6151 
6152 #endif
6153 
6154 /* [Windows] josh: when the atexit handler is called, there may still be more
6155  than one thread alive */
6156 void __kmp_internal_end_atexit(void) {
6157  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6158  /* [Windows]
6159  josh: ideally, we want to completely shutdown the library in this atexit
6160  handler, but stat code that depends on thread specific data for gtid fails
6161  because that data becomes unavailable at some point during the shutdown, so
6162  we call __kmp_internal_end_thread instead. We should eventually remove the
6163  dependency on __kmp_get_specific_gtid in the stat code and use
6164  __kmp_internal_end_library to cleanly shutdown the library.
6165 
6166  // TODO: Can some of this comment about GVS be removed?
6167  I suspect that the offending stat code is executed when the calling thread
6168  tries to clean up a dead root thread's data structures, resulting in GVS
6169  code trying to close the GVS structures for that thread, but since the stat
6170  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6171  the calling thread is cleaning up itself instead of another thread, it gets
6172  confused. This happens because allowing a thread to unregister and cleanup
6173  another thread is a recent modification for addressing an issue.
6174  Based on the current design (20050722), a thread may end up
6175  trying to unregister another thread only if thread death does not trigger
6176  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6177  thread specific data destructor function to detect thread death. For
6178  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6179  is nothing. Thus, the workaround is applicable only for Windows static
6180  stat library. */
6181  __kmp_internal_end_library(-1);
6182 #if KMP_OS_WINDOWS
6183  __kmp_close_console();
6184 #endif
6185 }
6186 
6187 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6188  // It is assumed __kmp_forkjoin_lock is acquired.
6189 
6190  int gtid;
6191 
6192  KMP_DEBUG_ASSERT(thread != NULL);
6193 
6194  gtid = thread->th.th_info.ds.ds_gtid;
6195 
6196  if (!is_root) {
6197  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6198  /* Assume the threads are at the fork barrier here */
6199  KA_TRACE(
6200  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6201  gtid));
6202  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6203  while (
6204  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6205  KMP_CPU_PAUSE();
6206  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6207  } else {
6208  /* Need release fence here to prevent seg faults for tree forkjoin
6209  barrier (GEH) */
6210  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6211  thread);
6212  __kmp_release_64(&flag);
6213  }
6214  }
6215 
6216  // Terminate OS thread.
6217  __kmp_reap_worker(thread);
6218 
6219  // The thread was killed asynchronously. If it was actively
6220  // spinning in the thread pool, decrement the global count.
6221  //
6222  // There is a small timing hole here - if the worker thread was just waking
6223  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6224  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6225  // the global counter might not get updated.
6226  //
6227  // Currently, this can only happen as the library is unloaded,
6228  // so there are no harmful side effects.
6229  if (thread->th.th_active_in_pool) {
6230  thread->th.th_active_in_pool = FALSE;
6231  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6232  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6233  }
6234  }
6235 
6236  __kmp_free_implicit_task(thread);
6237 
6238 // Free the fast memory for tasking
6239 #if USE_FAST_MEMORY
6240  __kmp_free_fast_memory(thread);
6241 #endif /* USE_FAST_MEMORY */
6242 
6243  __kmp_suspend_uninitialize_thread(thread);
6244 
6245  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6246  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6247 
6248  --__kmp_all_nth;
6249  // __kmp_nth was decremented when thread is added to the pool.
6250 
6251 #ifdef KMP_ADJUST_BLOCKTIME
6252  /* Adjust blocktime back to user setting or default if necessary */
6253  /* Middle initialization might never have occurred */
6254  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6255  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6256  if (__kmp_nth <= __kmp_avail_proc) {
6257  __kmp_zero_bt = FALSE;
6258  }
6259  }
6260 #endif /* KMP_ADJUST_BLOCKTIME */
6261 
6262  /* free the memory being used */
6263  if (__kmp_env_consistency_check) {
6264  if (thread->th.th_cons) {
6265  __kmp_free_cons_stack(thread->th.th_cons);
6266  thread->th.th_cons = NULL;
6267  }
6268  }
6269 
6270  if (thread->th.th_pri_common != NULL) {
6271  __kmp_free(thread->th.th_pri_common);
6272  thread->th.th_pri_common = NULL;
6273  }
6274 
6275  if (thread->th.th_task_state_memo_stack != NULL) {
6276  __kmp_free(thread->th.th_task_state_memo_stack);
6277  thread->th.th_task_state_memo_stack = NULL;
6278  }
6279 
6280 #if KMP_USE_BGET
6281  if (thread->th.th_local.bget_data != NULL) {
6282  __kmp_finalize_bget(thread);
6283  }
6284 #endif
6285 
6286 #if KMP_AFFINITY_SUPPORTED
6287  if (thread->th.th_affin_mask != NULL) {
6288  KMP_CPU_FREE(thread->th.th_affin_mask);
6289  thread->th.th_affin_mask = NULL;
6290  }
6291 #endif /* KMP_AFFINITY_SUPPORTED */
6292 
6293 #if KMP_USE_HIER_SCHED
6294  if (thread->th.th_hier_bar_data != NULL) {
6295  __kmp_free(thread->th.th_hier_bar_data);
6296  thread->th.th_hier_bar_data = NULL;
6297  }
6298 #endif
6299 
6300  __kmp_reap_team(thread->th.th_serial_team);
6301  thread->th.th_serial_team = NULL;
6302  __kmp_free(thread);
6303 
6304  KMP_MB();
6305 
6306 } // __kmp_reap_thread
6307 
6308 static void __kmp_itthash_clean(kmp_info_t *th) {
6309 #if USE_ITT_NOTIFY
6310  if (__kmp_itt_region_domains.count > 0) {
6311  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6312  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6313  while (bucket) {
6314  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6315  __kmp_thread_free(th, bucket);
6316  bucket = next;
6317  }
6318  }
6319  }
6320  if (__kmp_itt_barrier_domains.count > 0) {
6321  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6322  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6323  while (bucket) {
6324  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6325  __kmp_thread_free(th, bucket);
6326  bucket = next;
6327  }
6328  }
6329  }
6330 #endif
6331 }
6332 
6333 static void __kmp_internal_end(void) {
6334  int i;
6335 
6336  /* First, unregister the library */
6337  __kmp_unregister_library();
6338 
6339 #if KMP_OS_WINDOWS
6340  /* In Win static library, we can't tell when a root actually dies, so we
6341  reclaim the data structures for any root threads that have died but not
6342  unregistered themselves, in order to shut down cleanly.
6343  In Win dynamic library we also can't tell when a thread dies. */
6344  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6345 // dead roots
6346 #endif
6347 
6348  for (i = 0; i < __kmp_threads_capacity; i++)
6349  if (__kmp_root[i])
6350  if (__kmp_root[i]->r.r_active)
6351  break;
6352  KMP_MB(); /* Flush all pending memory write invalidates. */
6353  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6354 
6355  if (i < __kmp_threads_capacity) {
6356 #if KMP_USE_MONITOR
6357  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6358  KMP_MB(); /* Flush all pending memory write invalidates. */
6359 
6360  // Need to check that monitor was initialized before reaping it. If we are
6361  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6362  // __kmp_monitor will appear to contain valid data, but it is only valid in
6363  // the parent process, not the child.
6364  // New behavior (201008): instead of keying off of the flag
6365  // __kmp_init_parallel, the monitor thread creation is keyed off
6366  // of the new flag __kmp_init_monitor.
6367  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6368  if (TCR_4(__kmp_init_monitor)) {
6369  __kmp_reap_monitor(&__kmp_monitor);
6370  TCW_4(__kmp_init_monitor, 0);
6371  }
6372  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6373  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6374 #endif // KMP_USE_MONITOR
6375  } else {
6376 /* TODO move this to cleanup code */
6377 #ifdef KMP_DEBUG
6378  /* make sure that everything has properly ended */
6379  for (i = 0; i < __kmp_threads_capacity; i++) {
6380  if (__kmp_root[i]) {
6381  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6382  // there can be uber threads alive here
6383  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6384  }
6385  }
6386 #endif
6387 
6388  KMP_MB();
6389 
6390  // Reap the worker threads.
6391  // This is valid for now, but be careful if threads are reaped sooner.
6392  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6393  // Get the next thread from the pool.
6394  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6395  __kmp_thread_pool = thread->th.th_next_pool;
6396  // Reap it.
6397  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6398  thread->th.th_next_pool = NULL;
6399  thread->th.th_in_pool = FALSE;
6400  __kmp_reap_thread(thread, 0);
6401  }
6402  __kmp_thread_pool_insert_pt = NULL;
6403 
6404  // Reap teams.
6405  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6406  // Get the next team from the pool.
6407  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6408  __kmp_team_pool = team->t.t_next_pool;
6409  // Reap it.
6410  team->t.t_next_pool = NULL;
6411  __kmp_reap_team(team);
6412  }
6413 
6414  __kmp_reap_task_teams();
6415 
6416 #if KMP_OS_UNIX
6417  // Threads that are not reaped should not access any resources since they
6418  // are going to be deallocated soon, so the shutdown sequence should wait
6419  // until all threads either exit the final spin-waiting loop or begin
6420  // sleeping after the given blocktime.
6421  for (i = 0; i < __kmp_threads_capacity; i++) {
6422  kmp_info_t *thr = __kmp_threads[i];
6423  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6424  KMP_CPU_PAUSE();
6425  }
6426 #endif
6427 
6428  for (i = 0; i < __kmp_threads_capacity; ++i) {
6429  // TBD: Add some checking...
6430  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6431  }
6432 
6433  /* Make sure all threadprivate destructors get run by joining with all
6434  worker threads before resetting this flag */
6435  TCW_SYNC_4(__kmp_init_common, FALSE);
6436 
6437  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6438  KMP_MB();
6439 
6440 #if KMP_USE_MONITOR
6441  // See note above: One of the possible fixes for CQ138434 / CQ140126
6442  //
6443  // FIXME: push both code fragments down and CSE them?
6444  // push them into __kmp_cleanup() ?
6445  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6446  if (TCR_4(__kmp_init_monitor)) {
6447  __kmp_reap_monitor(&__kmp_monitor);
6448  TCW_4(__kmp_init_monitor, 0);
6449  }
6450  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6451  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6452 #endif
6453  } /* else !__kmp_global.t_active */
6454  TCW_4(__kmp_init_gtid, FALSE);
6455  KMP_MB(); /* Flush all pending memory write invalidates. */
6456 
6457  __kmp_cleanup();
6458 #if OMPT_SUPPORT
6459  ompt_fini();
6460 #endif
6461 }
6462 
6463 void __kmp_internal_end_library(int gtid_req) {
6464  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6465  /* this shouldn't be a race condition because __kmp_internal_end() is the
6466  only place to clear __kmp_serial_init */
6467  /* we'll check this later too, after we get the lock */
6468  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6469  // redundant, because the next check will work in any case.
6470  if (__kmp_global.g.g_abort) {
6471  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6472  /* TODO abort? */
6473  return;
6474  }
6475  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6476  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6477  return;
6478  }
6479 
6480  // If hidden helper team has been initialized, we need to deinit it
6481  if (TCR_4(__kmp_init_hidden_helper) &&
6482  !TCR_4(__kmp_hidden_helper_team_done)) {
6483  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6484  // First release the main thread to let it continue its work
6485  __kmp_hidden_helper_main_thread_release();
6486  // Wait until the hidden helper team has been destroyed
6487  __kmp_hidden_helper_threads_deinitz_wait();
6488  }
6489 
6490  KMP_MB(); /* Flush all pending memory write invalidates. */
6491  /* find out who we are and what we should do */
6492  {
6493  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6494  KA_TRACE(
6495  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6496  if (gtid == KMP_GTID_SHUTDOWN) {
6497  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6498  "already shutdown\n"));
6499  return;
6500  } else if (gtid == KMP_GTID_MONITOR) {
6501  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6502  "registered, or system shutdown\n"));
6503  return;
6504  } else if (gtid == KMP_GTID_DNE) {
6505  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6506  "shutdown\n"));
6507  /* we don't know who we are, but we may still shutdown the library */
6508  } else if (KMP_UBER_GTID(gtid)) {
6509  /* unregister ourselves as an uber thread. gtid is no longer valid */
6510  if (__kmp_root[gtid]->r.r_active) {
6511  __kmp_global.g.g_abort = -1;
6512  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6513  __kmp_unregister_library();
6514  KA_TRACE(10,
6515  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6516  gtid));
6517  return;
6518  } else {
6519  __kmp_itthash_clean(__kmp_threads[gtid]);
6520  KA_TRACE(
6521  10,
6522  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6523  __kmp_unregister_root_current_thread(gtid);
6524  }
6525  } else {
6526 /* worker threads may call this function through the atexit handler, if they
6527  * call exit() */
6528 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6529  TODO: do a thorough shutdown instead */
6530 #ifdef DUMP_DEBUG_ON_EXIT
6531  if (__kmp_debug_buf)
6532  __kmp_dump_debug_buffer();
6533 #endif
6534  // The unregister-library call was added here for the shared-memory (shm)
6535  // path on Linux; without it, stale registration files pile up in /dev/shm.
6536  // Clean up the shared memory file before exiting.
6537  __kmp_unregister_library();
6538  return;
6539  }
6540  }
6541  /* synchronize the termination process */
6542  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6543 
6544  /* have we already finished */
6545  if (__kmp_global.g.g_abort) {
6546  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6547  /* TODO abort? */
6548  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6549  return;
6550  }
6551  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6552  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6553  return;
6554  }
6555 
6556  /* We need this lock to enforce mutex between this reading of
6557  __kmp_threads_capacity and the writing by __kmp_register_root.
6558  Alternatively, we can use a counter of roots that is atomically updated by
6559  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6560  __kmp_internal_end_*. */
6561  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6562 
6563  /* now we can safely conduct the actual termination */
6564  __kmp_internal_end();
6565 
6566  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6567  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6568 
6569  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6570 
6571 #ifdef DUMP_DEBUG_ON_EXIT
6572  if (__kmp_debug_buf)
6573  __kmp_dump_debug_buffer();
6574 #endif
6575 
6576 #if KMP_OS_WINDOWS
6577  __kmp_close_console();
6578 #endif
6579 
6580  __kmp_fini_allocator();
6581 
6582 } // __kmp_internal_end_library
6583 
6584 void __kmp_internal_end_thread(int gtid_req) {
6585  int i;
6586 
6587  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6588  /* this shouldn't be a race condition because __kmp_internal_end() is the
6589  * only place to clear __kmp_serial_init */
6590  /* we'll check this later too, after we get the lock */
6591  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6592  // redundant, because the next check will work in any case.
6593  if (__kmp_global.g.g_abort) {
6594  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6595  /* TODO abort? */
6596  return;
6597  }
6598  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6599  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6600  return;
6601  }
6602 
6603  // If hidden helper team has been initialized, we need to deinit it
6604  if (TCR_4(__kmp_init_hidden_helper) &&
6605  !TCR_4(__kmp_hidden_helper_team_done)) {
6606  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6607  // First release the main thread to let it continue its work
6608  __kmp_hidden_helper_main_thread_release();
6609  // Wait until the hidden helper team has been destroyed
6610  __kmp_hidden_helper_threads_deinitz_wait();
6611  }
6612 
6613  KMP_MB(); /* Flush all pending memory write invalidates. */
6614 
6615  /* find out who we are and what we should do */
6616  {
6617  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6618  KA_TRACE(10,
6619  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6620  if (gtid == KMP_GTID_SHUTDOWN) {
6621  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6622  "already shutdown\n"));
6623  return;
6624  } else if (gtid == KMP_GTID_MONITOR) {
6625  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6626  "registered, or system shutdown\n"));
6627  return;
6628  } else if (gtid == KMP_GTID_DNE) {
6629  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6630  "shutdown\n"));
6631  return;
6632  /* we don't know who we are */
6633  } else if (KMP_UBER_GTID(gtid)) {
6634  /* unregister ourselves as an uber thread. gtid is no longer valid */
6635  if (__kmp_root[gtid]->r.r_active) {
6636  __kmp_global.g.g_abort = -1;
6637  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6638  KA_TRACE(10,
6639  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6640  gtid));
6641  return;
6642  } else {
6643  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6644  gtid));
6645  __kmp_unregister_root_current_thread(gtid);
6646  }
6647  } else {
6648  /* just a worker thread, let's leave */
6649  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6650 
6651  if (gtid >= 0) {
6652  __kmp_threads[gtid]->th.th_task_team = NULL;
6653  }
6654 
6655  KA_TRACE(10,
6656  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6657  gtid));
6658  return;
6659  }
6660  }
6661 #if KMP_DYNAMIC_LIB
6662  if (__kmp_pause_status != kmp_hard_paused)
6663  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6664  // it is better to shut down later, in the library destructor.
6665  {
6666  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6667  return;
6668  }
6669 #endif
6670  /* synchronize the termination process */
6671  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6672 
6673  /* have we already finished */
6674  if (__kmp_global.g.g_abort) {
6675  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6676  /* TODO abort? */
6677  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678  return;
6679  }
6680  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6681  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6682  return;
6683  }
6684 
6685  /* We need this lock to enforce mutex between this reading of
6686  __kmp_threads_capacity and the writing by __kmp_register_root.
6687  Alternatively, we can use a counter of roots that is atomically updated by
6688  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6689  __kmp_internal_end_*. */
6690 
6691  /* should we finish the run-time? are all siblings done? */
6692  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6693 
6694  for (i = 0; i < __kmp_threads_capacity; ++i) {
6695  if (KMP_UBER_GTID(i)) {
6696  KA_TRACE(
6697  10,
6698  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6699  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6700  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6701  return;
6702  }
6703  }
6704 
6705  /* now we can safely conduct the actual termination */
6706 
6707  __kmp_internal_end();
6708 
6709  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6710  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6711 
6712  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6713 
6714 #ifdef DUMP_DEBUG_ON_EXIT
6715  if (__kmp_debug_buf)
6716  __kmp_dump_debug_buffer();
6717 #endif
6718 } // __kmp_internal_end_thread
6719 
6720 // -----------------------------------------------------------------------------
6721 // Library registration stuff.
6722 
6723 static long __kmp_registration_flag = 0;
6724 // Random value used to indicate library initialization.
6725 static char *__kmp_registration_str = NULL;
6726 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6727 
6728 static inline char *__kmp_reg_status_name() {
6729 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6730  each thread. If registration and unregistration happen in different threads
6731  (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6732  cannot be found, because its name will contain a different pid. */
6733 // macOS* complains about the name being too long when getuid() is appended
6734 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6735  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6736  (int)getuid());
6737 #else
6738  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6739 #endif
6740 } // __kmp_reg_status_name
6741 
6742 #if defined(KMP_USE_SHM)
6743 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6744 char *temp_reg_status_file_name = nullptr;
6745 #endif
6746 
6747 void __kmp_register_library_startup(void) {
6748 
6749  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6750  int done = 0;
6751  union {
6752  double dtime;
6753  long ltime;
6754  } time;
6755 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6756  __kmp_initialize_system_tick();
6757 #endif
6758  __kmp_read_system_time(&time.dtime);
6759  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6760  __kmp_registration_str =
6761  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6762  __kmp_registration_flag, KMP_LIBRARY_FILE);
6763 
6764  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6765  __kmp_registration_str));
6766 
6767  while (!done) {
6768 
6769  char *value = NULL; // Actual value of the environment variable.
6770 
6771 #if defined(KMP_USE_SHM)
6772  char *shm_name = __kmp_str_format("/%s", name);
6773  int shm_preexist = 0;
6774  char *data1;
6775  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6776  if ((fd1 == -1) && (errno == EEXIST)) {
6777  // file didn't open because it already exists.
6778  // try opening existing file
6779  fd1 = shm_open(shm_name, O_RDWR, 0666);
6780  if (fd1 == -1) { // file didn't open
6781  // error out here
6782  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6783  __kmp_msg_null);
6784  } else {
6785  // able to open existing file
6786  shm_preexist = 1;
6787  }
6788  } else if (fd1 == -1) {
6789  // SHM didn't open due to an error other than EEXIST. Try to
6790  // create a temp file under /tmp.
6791  // TODO: /tmp might not always be the temporary directory. For now we will
6792  // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6793  char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6794  fd1 = mkstemp(temp_file_name);
6795  if (fd1 == -1) {
6796  // error out here.
6797  __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6798  __kmp_msg_null);
6799  }
6800  temp_reg_status_file_name = temp_file_name;
6801  }
6802  if (shm_preexist == 0) {
6803  // we created the SHM; now set its size
6804  if (ftruncate(fd1, SHM_SIZE) == -1) {
6805  // error occurred setting size;
6806  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6807  KMP_ERR(errno), __kmp_msg_null);
6808  }
6809  }
6810  data1 =
6811  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6812  if (data1 == MAP_FAILED) {
6813  // failed to map shared memory
6814  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6815  __kmp_msg_null);
6816  }
6817  if (shm_preexist == 0) { // set data to SHM, set value
6818  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6819  }
6820  // Read value from either what we just wrote or existing file.
6821  value = __kmp_str_format("%s", data1); // read value from SHM
6822  munmap(data1, SHM_SIZE);
6823  close(fd1);
6824 #else // Windows and unix with static library
6825  // Set the environment variable, but do not overwrite it if it already exists.
6826  __kmp_env_set(name, __kmp_registration_str, 0);
6827  // read value to see if it got set
6828  value = __kmp_env_get(name);
6829 #endif
6830 
6831  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6832  done = 1; // Ok, environment variable set successfully, exit the loop.
6833  } else {
6834  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6835  // Check whether it is alive or dead.
6836  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6837  char *tail = value;
6838  char *flag_addr_str = NULL;
6839  char *flag_val_str = NULL;
6840  char const *file_name = NULL;
6841  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6842  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6843  file_name = tail;
6844  if (tail != NULL) {
6845  unsigned long *flag_addr = 0;
6846  unsigned long flag_val = 0;
6847  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6848  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6849  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6850  // First, check whether the environment-encoded address is mapped into
6851  // the address space.
6852  // If so, dereference it to see if it still has the right value.
6853  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6854  neighbor = 1;
6855  } else {
6856  // If not, then we know the other copy of the library is no longer
6857  // running.
6858  neighbor = 2;
6859  }
6860  }
6861  }
6862  switch (neighbor) {
6863  case 0: // Cannot parse environment variable -- neighbor status unknown.
6864  // Assume it is the incompatible format of a future version of the
6865  // library, and assume the other library is alive.
6866  // WARN( ... ); // TODO: Issue a warning.
6867  file_name = "unknown library";
6868  KMP_FALLTHROUGH();
6869  // Attention! Falling through to the next case. That's intentional.
6870  case 1: { // Neighbor is alive.
6871  // Check it is allowed.
6872  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6873  if (!__kmp_str_match_true(duplicate_ok)) {
6874  // That's not allowed. Issue fatal error.
6875  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6876  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6877  }
6878  KMP_INTERNAL_FREE(duplicate_ok);
6879  __kmp_duplicate_library_ok = 1;
6880  done = 1; // Exit the loop.
6881  } break;
6882  case 2: { // Neighbor is dead.
6883 
6884 #if defined(KMP_USE_SHM)
6885  // close shared memory.
6886  shm_unlink(shm_name); // this removes file in /dev/shm
6887 #else
6888  // Clear the variable and try to register library again.
6889  __kmp_env_unset(name);
6890 #endif
6891  } break;
6892  default: {
6893  KMP_DEBUG_ASSERT(0);
6894  } break;
6895  }
6896  }
6897  KMP_INTERNAL_FREE((void *)value);
6898 #if defined(KMP_USE_SHM)
6899  KMP_INTERNAL_FREE((void *)shm_name);
6900 #endif
6901  } // while
6902  KMP_INTERNAL_FREE((void *)name);
6903 
6904 } // func __kmp_register_library_startup
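// The registration value written above has the form
// "<flag address>-<flag value>-<library file>". A sibling copy of the runtime
// parses it and treats the neighbor as alive only if the encoded address is
// still mapped and still holds the encoded value. A standalone sketch of
// composing and parsing that triple with plain C library calls (hypothetical
// names; the runtime itself uses __kmp_str_format / __kmp_str_split):
//
//   #include <cstdio>
//   #include <cstring>
//   struct reg_record { void *flag_addr; unsigned long flag_val; char *file; };
//   static void compose(char *buf, size_t n, long *flag, const char *file) {
//     snprintf(buf, n, "%p-%lx-%s", (void *)flag, (unsigned long)*flag, file);
//   }
//   static bool parse(char *buf, reg_record *out) { // splits buf in place
//     char *addr = strtok(buf, "-");
//     char *val = strtok(NULL, "-");
//     char *file = strtok(NULL, "");                // remainder, may hold '-'
//     if (!addr || !val || !file)
//       return false;
//     sscanf(addr, "%p", &out->flag_addr);
//     sscanf(val, "%lx", &out->flag_val);
//     out->file = file;
//     return true;
//   }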
6905 
6906 void __kmp_unregister_library(void) {
6907 
6908  char *name = __kmp_reg_status_name();
6909  char *value = NULL;
6910 
6911 #if defined(KMP_USE_SHM)
6912  bool use_shm = true;
6913  char *shm_name = __kmp_str_format("/%s", name);
6914  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6915  if (fd1 == -1) {
6916  // File did not open. Try the temporary file.
6917  use_shm = false;
6918  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6919  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6920  if (fd1 == -1) {
6921  // give up now.
6922  return;
6923  }
6924  }
6925  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6926  if (data1 != MAP_FAILED) {
6927  value = __kmp_str_format("%s", data1); // read value from SHM
6928  munmap(data1, SHM_SIZE);
6929  }
6930  close(fd1);
6931 #else
6932  value = __kmp_env_get(name);
6933 #endif
6934 
6935  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6936  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6937  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6938 // Ok, this is our variable. Delete it.
6939 #if defined(KMP_USE_SHM)
6940  if (use_shm) {
6941  shm_unlink(shm_name); // this removes file in /dev/shm
6942  } else {
6943  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6944  unlink(temp_reg_status_file_name); // this removes the temp file
6945  }
6946 #else
6947  __kmp_env_unset(name);
6948 #endif
6949  }
6950 
6951 #if defined(KMP_USE_SHM)
6952  KMP_INTERNAL_FREE(shm_name);
6953  if (!use_shm) {
6954  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6955  KMP_INTERNAL_FREE(temp_reg_status_file_name);
6956  }
6957 #endif
6958 
6959  KMP_INTERNAL_FREE(__kmp_registration_str);
6960  KMP_INTERNAL_FREE(value);
6961  KMP_INTERNAL_FREE(name);
6962 
6963  __kmp_registration_flag = 0;
6964  __kmp_registration_str = NULL;
6965 
6966 } // __kmp_unregister_library
6967 
6968 // End of Library registration stuff.
6969 // -----------------------------------------------------------------------------
6970 
6971 #if KMP_MIC_SUPPORTED
6972 
6973 static void __kmp_check_mic_type() {
6974  kmp_cpuid_t cpuid_state = {0};
6975  kmp_cpuid_t *cs_p = &cpuid_state;
6976  __kmp_x86_cpuid(1, 0, cs_p);
6977  // We don't support mic1 at the moment
6978  if ((cs_p->eax & 0xff0) == 0xB10) {
6979  __kmp_mic_type = mic2;
6980  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6981  __kmp_mic_type = mic3;
6982  } else {
6983  __kmp_mic_type = non_mic;
6984  }
6985 }
6986 
6987 #endif /* KMP_MIC_SUPPORTED */
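// The masks used above select fields of the CPUID.1:EAX signature
// (stepping[3:0], model[7:4], family[11:8], ext.model[19:16],
// ext.family[27:20]). A sketch of the standard decoding, for reference
// (cpuid_1_eax is a hypothetical name for the EAX value read above):
//
//   unsigned eax = cpuid_1_eax;
//   unsigned model = (eax >> 4) & 0xf;
//   unsigned family = (eax >> 8) & 0xf;
//   unsigned ext_model = (eax >> 16) & 0xf;
//   unsigned display_model =
//       (family == 0x6 || family == 0xf) ? (ext_model << 4) | model : model;
//   // 0xB10 above matches family 0xB, model 0x1 (KNC);
//   // 0x50670 matches family 0x6, display model 0x57 (KNL).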
6988 
6989 #if KMP_HAVE_UMWAIT
6990 static void __kmp_user_level_mwait_init() {
6991  struct kmp_cpuid buf;
6992  __kmp_x86_cpuid(7, 0, &buf);
6993  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6994  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6995  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6996  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6997  __kmp_umwait_enabled));
6998 }
6999 #elif KMP_HAVE_MWAIT
7000 #ifndef AT_INTELPHIUSERMWAIT
7001 // Spurious, non-existent value that should always fail to return anything.
7002 // Will be replaced with the correct value once it is known.
7003 #define AT_INTELPHIUSERMWAIT 10000
7004 #endif
7005 // getauxval() function is available in RHEL7 and SLES12. If a system with an
7006 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
7007 // function when the entry is not found.
7008 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7009 unsigned long getauxval(unsigned long) { return 0; }
7010 
7011 static void __kmp_user_level_mwait_init() {
7012  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7013  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7014  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7015  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7016  if (__kmp_mic_type == mic3) {
7017  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7018  if ((res & 0x1) || __kmp_user_level_mwait) {
7019  __kmp_mwait_enabled = TRUE;
7020  if (__kmp_user_level_mwait) {
7021  KMP_INFORM(EnvMwaitWarn);
7022  }
7023  } else {
7024  __kmp_mwait_enabled = FALSE;
7025  }
7026  }
7027  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7028  "__kmp_mwait_enabled = %d\n",
7029  __kmp_mic_type, __kmp_mwait_enabled));
7030 }
7031 #endif /* KMP_HAVE_UMWAIT */
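// The KMP_HAVE_UMWAIT probe above reads CPUID.(EAX=7,ECX=0):ECX and tests
// bit 5 (WAITPKG). A standalone sketch of the same probe using the GCC/Clang
// <cpuid.h> helper (illustrative; the runtime uses its own __kmp_x86_cpuid):
//
//   #include <cpuid.h>
//   static bool has_waitpkg() {
//     unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
//     if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
//       return false;               // leaf 7 not supported
//     return (ecx >> 5) & 1;        // CPUID.7.0:ECX.WAITPKG[bit 5]
//   }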
7032 
7033 static void __kmp_do_serial_initialize(void) {
7034  int i, gtid;
7035  size_t size;
7036 
7037  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7038 
7039  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7040  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7041  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7042  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7043  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7044 
7045 #if OMPT_SUPPORT
7046  ompt_pre_init();
7047 #endif
7048 #if OMPD_SUPPORT
7049  __kmp_env_dump();
7050  ompd_init();
7051 #endif
7052 
7053  __kmp_validate_locks();
7054 
7055  /* Initialize internal memory allocator */
7056  __kmp_init_allocator();
7057 
7058  /* Register the library startup via an environment variable or via mapped
7059  shared memory file and check to see whether another copy of the library is
7060  already registered. Since a forked child process is often terminated, we
7061  postpone the registration until middle initialization in the child. */
7062  if (__kmp_need_register_serial)
7063  __kmp_register_library_startup();
7064 
7065  /* TODO reinitialization of library */
7066  if (TCR_4(__kmp_global.g.g_done)) {
7067  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7068  }
7069 
7070  __kmp_global.g.g_abort = 0;
7071  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7072 
7073 /* initialize the locks */
7074 #if KMP_USE_ADAPTIVE_LOCKS
7075 #if KMP_DEBUG_ADAPTIVE_LOCKS
7076  __kmp_init_speculative_stats();
7077 #endif
7078 #endif
7079 #if KMP_STATS_ENABLED
7080  __kmp_stats_init();
7081 #endif
7082  __kmp_init_lock(&__kmp_global_lock);
7083  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7084  __kmp_init_lock(&__kmp_debug_lock);
7085  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7086  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7087  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7088  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7089  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7090  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7091  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7092  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7093  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7094  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7095  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7096  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7097  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7098  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7099  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7100 #if KMP_USE_MONITOR
7101  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7102 #endif
7103  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7104 
7105  /* conduct initialization and initial setup of configuration */
7106 
7107  __kmp_runtime_initialize();
7108 
7109 #if KMP_MIC_SUPPORTED
7110  __kmp_check_mic_type();
7111 #endif
7112 
7113 // Some global variable initialization moved here from kmp_env_initialize()
7114 #ifdef KMP_DEBUG
7115  kmp_diag = 0;
7116 #endif
7117  __kmp_abort_delay = 0;
7118 
7119  // From __kmp_init_dflt_team_nth()
7120  /* assume the entire machine will be used */
7121  __kmp_dflt_team_nth_ub = __kmp_xproc;
7122  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7123  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7124  }
7125  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7126  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7127  }
7128  __kmp_max_nth = __kmp_sys_max_nth;
7129  __kmp_cg_max_nth = __kmp_sys_max_nth;
7130  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7131  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7132  __kmp_teams_max_nth = __kmp_sys_max_nth;
7133  }
7134 
7135  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7136  // part
7137  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7138 #if KMP_USE_MONITOR
7139  __kmp_monitor_wakeups =
7140  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7141  __kmp_bt_intervals =
7142  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7143 #endif
7144  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7145  __kmp_library = library_throughput;
7146  // From KMP_SCHEDULE initialization
7147  __kmp_static = kmp_sch_static_balanced;
7148 // AC: do not use analytical here, because it is non-monotonic
7149 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7150 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7151 // need to repeat assignment
7152 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7153 // bit control and barrier method control parts
7154 #if KMP_FAST_REDUCTION_BARRIER
7155 #define kmp_reduction_barrier_gather_bb ((int)1)
7156 #define kmp_reduction_barrier_release_bb ((int)1)
7157 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7158 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7159 #endif // KMP_FAST_REDUCTION_BARRIER
7160  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7161  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7162  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7163  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7164  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7165 #if KMP_FAST_REDUCTION_BARRIER
7166  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7167  // lin_64 ): hyper,1
7168  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7169  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7170  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7171  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7172  }
7173 #endif // KMP_FAST_REDUCTION_BARRIER
7174  }
7175 #if KMP_FAST_REDUCTION_BARRIER
7176 #undef kmp_reduction_barrier_release_pat
7177 #undef kmp_reduction_barrier_gather_pat
7178 #undef kmp_reduction_barrier_release_bb
7179 #undef kmp_reduction_barrier_gather_bb
7180 #endif // KMP_FAST_REDUCTION_BARRIER
7181 #if KMP_MIC_SUPPORTED
7182  if (__kmp_mic_type == mic2) { // KNC
7183  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7184  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7185  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7186  1; // forkjoin release
7187  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7188  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7189  }
7190 #if KMP_FAST_REDUCTION_BARRIER
7191  if (__kmp_mic_type == mic2) { // KNC
7192  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7193  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7194  }
7195 #endif // KMP_FAST_REDUCTION_BARRIER
7196 #endif // KMP_MIC_SUPPORTED
7197 
7198 // From KMP_CHECKS initialization
7199 #ifdef KMP_DEBUG
7200  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7201 #else
7202  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7203 #endif
7204 
7205  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7206  __kmp_foreign_tp = TRUE;
7207 
7208  __kmp_global.g.g_dynamic = FALSE;
7209  __kmp_global.g.g_dynamic_mode = dynamic_default;
7210 
7211  __kmp_init_nesting_mode();
7212 
7213  __kmp_env_initialize(NULL);
7214 
7215 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7216  __kmp_user_level_mwait_init();
7217 #endif
7218 // Print all messages in message catalog for testing purposes.
7219 #ifdef KMP_DEBUG
7220  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7221  if (__kmp_str_match_true(val)) {
7222  kmp_str_buf_t buffer;
7223  __kmp_str_buf_init(&buffer);
7224  __kmp_i18n_dump_catalog(&buffer);
7225  __kmp_printf("%s", buffer.str);
7226  __kmp_str_buf_free(&buffer);
7227  }
7228  __kmp_env_free(&val);
7229 #endif
7230 
7231  __kmp_threads_capacity =
7232  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7233  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7234  __kmp_tp_capacity = __kmp_default_tp_capacity(
7235  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7236 
7237  // If the library is shut down properly, both pools must be NULL. Just in
7238  // case, set them to NULL -- some memory may leak, but subsequent code will
7239  // work even if pools are not freed.
7240  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7241  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7242  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7243  __kmp_thread_pool = NULL;
7244  __kmp_thread_pool_insert_pt = NULL;
7245  __kmp_team_pool = NULL;
7246 
7247  /* Allocate all of the variable sized records */
7248  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7249  * expandable */
7250  /* Since allocation is cache-aligned, just add extra padding at the end */
7251  size =
7252  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7253  CACHE_LINE;
7254  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7255  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7256  sizeof(kmp_info_t *) * __kmp_threads_capacity);
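  // The single cache-aligned allocation above is carved into two consecutive
  // pointer arrays, with the CACHE_LINE bytes added to the size acting as
  // slack at the end. Illustrative layout:
  //
  //   [ kmp_info_t* x capacity ][ kmp_root_t* x capacity ][ <= CACHE_LINE pad ]
  //     ^ __kmp_threads           ^ __kmp_root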
7257 
7258  /* init thread counts */
7259  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7260  0); // Asserts fail if the library is reinitializing and
7261  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7262  __kmp_all_nth = 0;
7263  __kmp_nth = 0;
7264 
7265  /* setup the uber master thread and hierarchy */
7266  gtid = __kmp_register_root(TRUE);
7267  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7268  KMP_ASSERT(KMP_UBER_GTID(gtid));
7269  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7270 
7271  KMP_MB(); /* Flush all pending memory write invalidates. */
7272 
7273  __kmp_common_initialize();
7274 
7275 #if KMP_OS_UNIX
7276  /* invoke the child fork handler */
7277  __kmp_register_atfork();
7278 #endif
7279 
7280 #if !KMP_DYNAMIC_LIB || \
7281  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7282  {
7283  /* Invoke the exit handler when the program finishes, only for static
7284  library and macOS* dynamic. For other dynamic libraries, we already
7285  have _fini and DllMain. */
7286  int rc = atexit(__kmp_internal_end_atexit);
7287  if (rc != 0) {
7288  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7289  __kmp_msg_null);
7290  }
7291  }
7292 #endif
7293 
7294 #if KMP_HANDLE_SIGNALS
7295 #if KMP_OS_UNIX
7296  /* NOTE: make sure that this is called before the user installs their own
7297  signal handlers so that the user handlers are called first. This way they
7298  can return false, not call our handler, avoid terminating the library, and
7299  continue execution where they left off. */
7300  __kmp_install_signals(FALSE);
7301 #endif /* KMP_OS_UNIX */
7302 #if KMP_OS_WINDOWS
7303  __kmp_install_signals(TRUE);
7304 #endif /* KMP_OS_WINDOWS */
7305 #endif
7306 
7307  /* we have finished the serial initialization */
7308  __kmp_init_counter++;
7309 
7310  __kmp_init_serial = TRUE;
7311 
7312  if (__kmp_settings) {
7313  __kmp_env_print();
7314  }
7315 
7316  if (__kmp_display_env || __kmp_display_env_verbose) {
7317  __kmp_env_print_2();
7318  }
7319 
7320 #if OMPT_SUPPORT
7321  ompt_post_init();
7322 #endif
7323 
7324  KMP_MB();
7325 
7326  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7327 }
7328 
7329 void __kmp_serial_initialize(void) {
7330  if (__kmp_init_serial) {
7331  return;
7332  }
7333  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7334  if (__kmp_init_serial) {
7335  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7336  return;
7337  }
7338  __kmp_do_serial_initialize();
7339  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7340 }
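// __kmp_serial_initialize (and the middle/parallel variants below) follow the
// usual check / lock / re-check shape so that concurrent callers initialize
// exactly once and the common already-initialized path takes no lock. The
// same pattern in miniature (hypothetical names):
//
//   static bool initialized = false;
//   void ensure_initialized(lock_t *lk) {
//     if (initialized)              // fast path: no lock once initialized
//       return;
//     acquire(lk);
//     if (!initialized) {           // re-check under the lock
//       do_initialize();
//       initialized = true;
//     }
//     release(lk);
//   }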
7341 
7342 static void __kmp_do_middle_initialize(void) {
7343  int i, j;
7344  int prev_dflt_team_nth;
7345 
7346  if (!__kmp_init_serial) {
7347  __kmp_do_serial_initialize();
7348  }
7349 
7350  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7351 
7352  if (UNLIKELY(!__kmp_need_register_serial)) {
7353  // We are in a forked child process. The registration was skipped during
7354  // serial initialization in the __kmp_atfork_child handler. Do it here.
7355  __kmp_register_library_startup();
7356  }
7357 
7358  // Save the previous value for the __kmp_dflt_team_nth so that
7359  // we can avoid some reinitialization if it hasn't changed.
7360  prev_dflt_team_nth = __kmp_dflt_team_nth;
7361 
7362 #if KMP_AFFINITY_SUPPORTED
7363  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7364  // number of cores on the machine.
7365  __kmp_affinity_initialize(__kmp_affinity);
7366 
7367 #endif /* KMP_AFFINITY_SUPPORTED */
7368 
7369  KMP_ASSERT(__kmp_xproc > 0);
7370  if (__kmp_avail_proc == 0) {
7371  __kmp_avail_proc = __kmp_xproc;
7372  }
7373 
7374  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7375  // correct them now
7376  j = 0;
7377  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7378  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7379  __kmp_avail_proc;
7380  j++;
7381  }
7382 
7383  if (__kmp_dflt_team_nth == 0) {
7384 #ifdef KMP_DFLT_NTH_CORES
7385  // Default #threads = #cores
7386  __kmp_dflt_team_nth = __kmp_ncores;
7387  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7388  "__kmp_ncores (%d)\n",
7389  __kmp_dflt_team_nth));
7390 #else
7391  // Default #threads = #available OS procs
7392  __kmp_dflt_team_nth = __kmp_avail_proc;
7393  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7394  "__kmp_avail_proc(%d)\n",
7395  __kmp_dflt_team_nth));
7396 #endif /* KMP_DFLT_NTH_CORES */
7397  }
7398 
7399  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7400  __kmp_dflt_team_nth = KMP_MIN_NTH;
7401  }
7402  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7403  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7404  }
7405 
7406  if (__kmp_nesting_mode > 0)
7407  __kmp_set_nesting_mode_threads();
7408 
7409  // There's no harm in continuing if the following check fails,
7410  // but it indicates an error in the previous logic.
7411  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7412 
7413  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7414  // Run through the __kmp_threads array and set the num threads icv for each
7415  // root thread that is currently registered with the RTL (which has not
7416  // already explicitly set its nthreads-var with a call to
7417  // omp_set_num_threads()).
7418  for (i = 0; i < __kmp_threads_capacity; i++) {
7419  kmp_info_t *thread = __kmp_threads[i];
7420  if (thread == NULL)
7421  continue;
7422  if (thread->th.th_current_task->td_icvs.nproc != 0)
7423  continue;
7424 
7425  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7426  }
7427  }
7428  KA_TRACE(
7429  20,
7430  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7431  __kmp_dflt_team_nth));
7432 
7433 #ifdef KMP_ADJUST_BLOCKTIME
7434  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7435  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7436  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7437  if (__kmp_nth > __kmp_avail_proc) {
7438  __kmp_zero_bt = TRUE;
7439  }
7440  }
7441 #endif /* KMP_ADJUST_BLOCKTIME */
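// The rule implemented above (and mirrored where threads return to the pool)
// is: when the user did not pin KMP_BLOCKTIME, force zero blocktime only
// while the process is oversubscribed. In miniature (hypothetical names):
//
//   if (!env_blocktime && avail_proc > 0)
//     zero_bt = (nth > avail_proc);  // busy-wait only when nth <= avail_proc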
7442 
7443  /* we have finished middle initialization */
7444  TCW_SYNC_4(__kmp_init_middle, TRUE);
7445 
7446  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7447 }
7448 
7449 void __kmp_middle_initialize(void) {
7450  if (__kmp_init_middle) {
7451  return;
7452  }
7453  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7454  if (__kmp_init_middle) {
7455  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7456  return;
7457  }
7458  __kmp_do_middle_initialize();
7459  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7460 }
7461 
7462 void __kmp_parallel_initialize(void) {
7463  int gtid = __kmp_entry_gtid(); // this might be a new root
7464 
7465  /* synchronize parallel initialization (for sibling) */
7466  if (TCR_4(__kmp_init_parallel))
7467  return;
7468  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7469  if (TCR_4(__kmp_init_parallel)) {
7470  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7471  return;
7472  }
7473 
7474  /* TODO reinitialization after we have already shut down */
7475  if (TCR_4(__kmp_global.g.g_done)) {
7476  KA_TRACE(
7477  10,
7478  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7479  __kmp_infinite_loop();
7480  }
7481 
7482  /* jc: The lock __kmp_initz_lock is already held, so calling
7483  __kmp_serial_initialize would cause a deadlock. So we call
7484  __kmp_do_serial_initialize directly. */
7485  if (!__kmp_init_middle) {
7486  __kmp_do_middle_initialize();
7487  }
7488  __kmp_assign_root_init_mask();
7489  __kmp_resume_if_hard_paused();
7490 
7491  /* begin initialization */
7492  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7493  KMP_ASSERT(KMP_UBER_GTID(gtid));
7494 
7495 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7496  // Save the FP control regs.
7497  // Worker threads will set theirs to these values at thread startup.
7498  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7499  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7500  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7501 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7502 
7503 #if KMP_OS_UNIX
7504 #if KMP_HANDLE_SIGNALS
7505  /* must be after __kmp_serial_initialize */
7506  __kmp_install_signals(TRUE);
7507 #endif
7508 #endif
7509 
7510  __kmp_suspend_initialize();
7511 
7512 #if defined(USE_LOAD_BALANCE)
7513  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7514  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7515  }
7516 #else
7517  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7518  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7519  }
7520 #endif
7521 
7522  if (__kmp_version) {
7523  __kmp_print_version_2();
7524  }
7525 
7526  /* we have finished parallel initialization */
7527  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7528 
7529  KMP_MB();
7530  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7531 
7532  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7533 }
7534 
7535 void __kmp_hidden_helper_initialize() {
7536  if (TCR_4(__kmp_init_hidden_helper))
7537  return;
7538 
7539  // __kmp_parallel_initialize is required before we initialize hidden helper
7540  if (!TCR_4(__kmp_init_parallel))
7541  __kmp_parallel_initialize();
7542 
7543  // Double check. Note that this double check should not be placed before
7544  // __kmp_parallel_initialize, as that would cause a deadlock.
7545  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7546  if (TCR_4(__kmp_init_hidden_helper)) {
7547  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7548  return;
7549  }
7550 
7551 #if KMP_AFFINITY_SUPPORTED
7552  // Initialize hidden helper affinity settings.
7553  // The above __kmp_parallel_initialize() will initialize
7554  // regular affinity (and topology) if not already done.
7555  if (!__kmp_hh_affinity.flags.initialized)
7556  __kmp_affinity_initialize(__kmp_hh_affinity);
7557 #endif
7558 
7559  // Set the count of hidden helper tasks to be executed to zero
7560  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7561 
7562  // Set the global variable indicating that we're initializing hidden helper
7563  // team/threads
7564  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7565 
7566  // Platform independent initialization
7567  __kmp_do_initialize_hidden_helper_threads();
7568 
7569  // Wait here for the finish of initialization of hidden helper teams
7570  __kmp_hidden_helper_threads_initz_wait();
7571 
7572  // We have finished hidden helper initialization
7573  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7574 
7575  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7576 }
7577 
7578 /* ------------------------------------------------------------------------ */
7579 
7580 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7581  kmp_team_t *team) {
7582  kmp_disp_t *dispatch;
7583 
7584  KMP_MB();
7585 
7586  /* none of the threads have encountered any constructs, yet. */
7587  this_thr->th.th_local.this_construct = 0;
7588 #if KMP_CACHE_MANAGE
7589  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7590 #endif /* KMP_CACHE_MANAGE */
7591  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7592  KMP_DEBUG_ASSERT(dispatch);
7593  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7594  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7595  // this_thr->th.th_info.ds.ds_tid ] );
7596 
7597  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7598  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7599  if (__kmp_env_consistency_check)
7600  __kmp_push_parallel(gtid, team->t.t_ident);
7601 
7602  KMP_MB(); /* Flush all pending memory write invalidates. */
7603 }
7604 
7605 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7606  kmp_team_t *team) {
7607  if (__kmp_env_consistency_check)
7608  __kmp_pop_parallel(gtid, team->t.t_ident);
7609 
7610  __kmp_finish_implicit_task(this_thr);
7611 }
7612 
7613 int __kmp_invoke_task_func(int gtid) {
7614  int rc;
7615  int tid = __kmp_tid_from_gtid(gtid);
7616  kmp_info_t *this_thr = __kmp_threads[gtid];
7617  kmp_team_t *team = this_thr->th.th_team;
7618 
7619  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7620 #if USE_ITT_BUILD
7621  if (__itt_stack_caller_create_ptr) {
7622  // inform ittnotify about entering user's code
7623  if (team->t.t_stack_id != NULL) {
7624  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7625  } else {
7626  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7627  __kmp_itt_stack_callee_enter(
7628  (__itt_caller)team->t.t_parent->t.t_stack_id);
7629  }
7630  }
7631 #endif /* USE_ITT_BUILD */
7632 #if INCLUDE_SSC_MARKS
7633  SSC_MARK_INVOKING();
7634 #endif
7635 
7636 #if OMPT_SUPPORT
7637  void *dummy;
7638  void **exit_frame_p;
7639  ompt_data_t *my_task_data;
7640  ompt_data_t *my_parallel_data;
7641  int ompt_team_size;
7642 
7643  if (ompt_enabled.enabled) {
7644  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7645  .ompt_task_info.frame.exit_frame.ptr);
7646  } else {
7647  exit_frame_p = &dummy;
7648  }
7649 
7650  my_task_data =
7651  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7652  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7653  if (ompt_enabled.ompt_callback_implicit_task) {
7654  ompt_team_size = team->t.t_nproc;
7655  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7656  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7657  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7658  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7659  }
7660 #endif
7661 
7662 #if KMP_STATS_ENABLED
7663  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7664  if (previous_state == stats_state_e::TEAMS_REGION) {
7665  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7666  } else {
7667  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7668  }
7669  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7670 #endif
7671 
7672  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7673  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7674 #if OMPT_SUPPORT
7675  ,
7676  exit_frame_p
7677 #endif
7678  );
7679 #if OMPT_SUPPORT
7680  *exit_frame_p = NULL;
7681  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7682 #endif
7683 
7684 #if KMP_STATS_ENABLED
7685  if (previous_state == stats_state_e::TEAMS_REGION) {
7686  KMP_SET_THREAD_STATE(previous_state);
7687  }
7688  KMP_POP_PARTITIONED_TIMER();
7689 #endif
7690 
7691 #if USE_ITT_BUILD
7692  if (__itt_stack_caller_create_ptr) {
7693  // inform ittnotify about leaving user's code
7694  if (team->t.t_stack_id != NULL) {
7695  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7696  } else {
7697  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7698  __kmp_itt_stack_callee_leave(
7699  (__itt_caller)team->t.t_parent->t.t_stack_id);
7700  }
7701  }
7702 #endif /* USE_ITT_BUILD */
7703  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7704 
7705  return rc;
7706 }
7707 
7708 void __kmp_teams_master(int gtid) {
7709  // This routine is called by all primary threads in a teams construct
7710  kmp_info_t *thr = __kmp_threads[gtid];
7711  kmp_team_t *team = thr->th.th_team;
7712  ident_t *loc = team->t.t_ident;
7713  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7714  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7715  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7716  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7717  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7718 
7719  // This thread is a new CG root. Set up the proper variables.
7720  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7721  tmp->cg_root = thr; // Make thr the CG root
7722  // Init to thread limit stored when league primary threads were forked
7723  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7724  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7725  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7726  " cg_nthreads to 1\n",
7727  thr, tmp));
7728  tmp->up = thr->th.th_cg_roots;
7729  thr->th.th_cg_roots = tmp;
7730 
7731 // Launch the league of teams now, but do not let the workers execute
7732 // (they wait on the fork barrier until the next parallel region)
7733 #if INCLUDE_SSC_MARKS
7734  SSC_MARK_FORKING();
7735 #endif
7736  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7737  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7738  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7739 #if INCLUDE_SSC_MARKS
7740  SSC_MARK_JOINING();
7741 #endif
7742  // If the team size was reduced from the limit, set it to the new size
7743  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7744  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7745  // AC: last parameter "1" eliminates the join barrier, which won't work because
7746  // worker threads are in a fork barrier waiting for more parallel regions
7747  __kmp_join_call(loc, gtid
7748 #if OMPT_SUPPORT
7749  ,
7750  fork_context_intel
7751 #endif
7752  ,
7753  1);
7754 }
7755 
7756 int __kmp_invoke_teams_master(int gtid) {
7757  kmp_info_t *this_thr = __kmp_threads[gtid];
7758  kmp_team_t *team = this_thr->th.th_team;
7759 #if KMP_DEBUG
7760  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7761  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7762  (void *)__kmp_teams_master);
7763 #endif
7764  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7765 #if OMPT_SUPPORT
7766  int tid = __kmp_tid_from_gtid(gtid);
7767  ompt_data_t *task_data =
7768  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7769  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7770  if (ompt_enabled.ompt_callback_implicit_task) {
7771  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7772  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7773  ompt_task_initial);
7774  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7775  }
7776 #endif
7777  __kmp_teams_master(gtid);
7778 #if OMPT_SUPPORT
7779  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7780 #endif
7781  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7782  return 1;
7783 }
7784 
7785 /* This sets the requested number of threads for the next parallel region
7786  encountered by this team. Since this should be enclosed in the fork/join
7787  critical section, it should avoid race conditions with asymmetrical nested
7788  parallelism. */
7789 
7790 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7791  kmp_info_t *thr = __kmp_threads[gtid];
7792 
7793  if (num_threads > 0)
7794  thr->th.th_set_nproc = num_threads;
7795 }
7796 
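/* Editorial example (not part of the original source): a minimal user-level
   sketch of how a num_threads clause typically reaches this routine. The
   clause is lowered by the compiler to a call through the
   __kmpc_push_num_threads entry point immediately before the fork, so the
   request only affects the very next parallel region.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       // num_threads(4) ends up in __kmp_push_num_threads via
       // __kmpc_push_num_threads right before __kmpc_fork_call.
       #pragma omp parallel num_threads(4)
       {
         printf("thread %d of %d\n", omp_get_thread_num(),
                omp_get_num_threads());
       }
       return 0;
     }
*/
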
7797 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7798  int num_threads) {
7799  KMP_DEBUG_ASSERT(thr);
7800  // Remember the number of threads for inner parallel regions
7801  if (!TCR_4(__kmp_init_middle))
7802  __kmp_middle_initialize(); // get internal globals calculated
7803  __kmp_assign_root_init_mask();
7804  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7805  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7806 
7807  if (num_threads == 0) {
7808  if (__kmp_teams_thread_limit > 0) {
7809  num_threads = __kmp_teams_thread_limit;
7810  } else {
7811  num_threads = __kmp_avail_proc / num_teams;
7812  }
7813  // adjust num_threads w/o warning as it is not a user setting
7814  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7815  // no thread_limit clause specified - do not change thread-limit-var ICV
7816  if (num_threads > __kmp_dflt_team_nth) {
7817  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7818  }
7819  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7820  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7821  } // prevent team size from exceeding thread-limit-var
7822  if (num_teams * num_threads > __kmp_teams_max_nth) {
7823  num_threads = __kmp_teams_max_nth / num_teams;
7824  }
7825  if (num_threads == 0) {
7826  num_threads = 1;
7827  }
7828  } else {
7829  if (num_threads < 0) {
7830  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7831  __kmp_msg_null);
7832  num_threads = 1;
7833  }
7834  // This thread will be the primary thread of the league of primary threads
7835  // Store new thread limit; old limit is saved in th_cg_roots list
7836  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7837  // num_threads = min(num_threads, nthreads-var)
7838  if (num_threads > __kmp_dflt_team_nth) {
7839  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7840  }
7841  if (num_teams * num_threads > __kmp_teams_max_nth) {
7842  int new_threads = __kmp_teams_max_nth / num_teams;
7843  if (new_threads == 0) {
7844  new_threads = 1;
7845  }
7846  if (new_threads != num_threads) {
7847  if (!__kmp_reserve_warn) { // user asked for too many threads
7848  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7849  __kmp_msg(kmp_ms_warning,
7850  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7851  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7852  }
7853  }
7854  num_threads = new_threads;
7855  }
7856  }
7857  thr->th.th_teams_size.nth = num_threads;
7858 }
7859 
7860 /* this sets the requested number of teams for the teams region and/or
7861  the number of threads for the next parallel region encountered */
7862 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7863  int num_threads) {
7864  kmp_info_t *thr = __kmp_threads[gtid];
7865  if (num_teams < 0) {
7866  // The OpenMP specification requires requested values to be positive,
7867  // but users can pass any value, so we'd better check
7868  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7869  __kmp_msg_null);
7870  num_teams = 1;
7871  }
7872  if (num_teams == 0) {
7873  if (__kmp_nteams > 0) {
7874  num_teams = __kmp_nteams;
7875  } else {
7876  num_teams = 1; // default number of teams is 1.
7877  }
7878  }
7879  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7880  if (!__kmp_reserve_warn) {
7881  __kmp_reserve_warn = 1;
7882  __kmp_msg(kmp_ms_warning,
7883  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7884  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7885  }
7886  num_teams = __kmp_teams_max_nth;
7887  }
7888  // Set number of teams (number of threads in the outer "parallel" of the
7889  // teams)
7890  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7891 
7892  __kmp_push_thread_limit(thr, num_teams, num_threads);
7893 }
7894 
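/* Editorial example (not part of the original source): a hedged sketch of how
   the num_teams and thread_limit clauses reach __kmp_push_num_teams (and,
   through it, __kmp_push_thread_limit), assuming a compiler that lowers them
   to the __kmpc_push_num_teams entry point and a runtime built with host
   teams support.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp teams num_teams(4) thread_limit(2)
       {
         #pragma omp parallel
         printf("team %d of %d, thread %d\n", omp_get_team_num(),
                omp_get_num_teams(), omp_get_thread_num());
       }
       return 0;
     }
*/
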
7895 /* This sets the requested number of teams for the teams region and/or
7896  the number of threads for the next parallel region encountered */
7897 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7898  int num_teams_ub, int num_threads) {
7899  kmp_info_t *thr = __kmp_threads[gtid];
7900  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7901  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7902  KMP_DEBUG_ASSERT(num_threads >= 0);
7903 
7904  if (num_teams_lb > num_teams_ub) {
7905  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7906  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7907  }
7908 
7909  int num_teams = 1; // default number of teams is 1.
7910 
7911  if (num_teams_lb == 0 && num_teams_ub > 0)
7912  num_teams_lb = num_teams_ub;
7913 
7914  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7915  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7916  if (num_teams > __kmp_teams_max_nth) {
7917  if (!__kmp_reserve_warn) {
7918  __kmp_reserve_warn = 1;
7919  __kmp_msg(kmp_ms_warning,
7920  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7921  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7922  }
7923  num_teams = __kmp_teams_max_nth;
7924  }
7925  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7926  num_teams = num_teams_ub;
7927  } else { // num_teams_lb <= num_teams <= num_teams_ub
7928  if (num_threads <= 0) {
7929  if (num_teams_ub > __kmp_teams_max_nth) {
7930  num_teams = num_teams_lb;
7931  } else {
7932  num_teams = num_teams_ub;
7933  }
7934  } else {
7935  num_teams = (num_threads > __kmp_teams_max_nth)
7936  ? num_teams
7937  : __kmp_teams_max_nth / num_threads;
7938  if (num_teams < num_teams_lb) {
7939  num_teams = num_teams_lb;
7940  } else if (num_teams > num_teams_ub) {
7941  num_teams = num_teams_ub;
7942  }
7943  }
7944  }
7945  // Set number of teams (number of threads in the outer "parallel" of the
7946  // teams)
7947  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7948 
7949  __kmp_push_thread_limit(thr, num_teams, num_threads);
7950 }
7951 
7952 // Set the proc_bind var to use in the following parallel region.
7953 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7954  kmp_info_t *thr = __kmp_threads[gtid];
7955  thr->th.th_set_proc_bind = proc_bind;
7956 }
7957 
7958 /* Launch the worker threads into the microtask. */
7959 
7960 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7961  kmp_info_t *this_thr = __kmp_threads[gtid];
7962 
7963 #ifdef KMP_DEBUG
7964  int f;
7965 #endif /* KMP_DEBUG */
7966 
7967  KMP_DEBUG_ASSERT(team);
7968  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7969  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7970  KMP_MB(); /* Flush all pending memory write invalidates. */
7971 
7972  team->t.t_construct = 0; /* no single directives seen yet */
7973  team->t.t_ordered.dt.t_value =
7974  0; /* thread 0 enters the ordered section first */
7975 
7976  /* Reset the identifiers on the dispatch buffer */
7977  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7978  if (team->t.t_max_nproc > 1) {
7979  int i;
7980  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7981  team->t.t_disp_buffer[i].buffer_index = i;
7982  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7983  }
7984  } else {
7985  team->t.t_disp_buffer[0].buffer_index = 0;
7986  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7987  }
7988 
7989  KMP_MB(); /* Flush all pending memory write invalidates. */
7990  KMP_ASSERT(this_thr->th.th_team == team);
7991 
7992 #ifdef KMP_DEBUG
7993  for (f = 0; f < team->t.t_nproc; f++) {
7994  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7995  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7996  }
7997 #endif /* KMP_DEBUG */
7998 
7999  /* release the worker threads so they may begin working */
8000  __kmp_fork_barrier(gtid, 0);
8001 }
8002 
8003 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8004  kmp_info_t *this_thr = __kmp_threads[gtid];
8005 
8006  KMP_DEBUG_ASSERT(team);
8007  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8008  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8009  KMP_MB(); /* Flush all pending memory write invalidates. */
8010 
8011  /* Join barrier after fork */
8012 
8013 #ifdef KMP_DEBUG
8014  if (__kmp_threads[gtid] &&
8015  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8016  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8017  __kmp_threads[gtid]);
8018  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8019  "team->t.t_nproc=%d\n",
8020  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8021  team->t.t_nproc);
8022  __kmp_print_structure();
8023  }
8024  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8025  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8026 #endif /* KMP_DEBUG */
8027 
8028  __kmp_join_barrier(gtid); /* wait for everyone */
8029 #if OMPT_SUPPORT
8030  if (ompt_enabled.enabled &&
8031  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8032  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8033  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8034  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8035 #if OMPT_OPTIONAL
8036  void *codeptr = NULL;
8037  if (KMP_MASTER_TID(ds_tid) &&
8038  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8039  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8040  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8041 
8042  if (ompt_enabled.ompt_callback_sync_region_wait) {
8043  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8044  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8045  codeptr);
8046  }
8047  if (ompt_enabled.ompt_callback_sync_region) {
8048  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8049  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8050  codeptr);
8051  }
8052 #endif
8053  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8054  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8055  ompt_scope_end, NULL, task_data, 0, ds_tid,
8056  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8057  }
8058  }
8059 #endif
8060 
8061  KMP_MB(); /* Flush all pending memory write invalidates. */
8062  KMP_ASSERT(this_thr->th.th_team == team);
8063 }
8064 
8065 /* ------------------------------------------------------------------------ */
8066 
8067 #ifdef USE_LOAD_BALANCE
8068 
8069 // Return the worker threads actively spinning in the hot team, if we
8070 // are at the outermost level of parallelism. Otherwise, return 0.
8071 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8072  int i;
8073  int retval;
8074  kmp_team_t *hot_team;
8075 
8076  if (root->r.r_active) {
8077  return 0;
8078  }
8079  hot_team = root->r.r_hot_team;
8080  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8081  return hot_team->t.t_nproc - 1; // Don't count primary thread
8082  }
8083 
8084  // Skip the primary thread - it is accounted for elsewhere.
8085  retval = 0;
8086  for (i = 1; i < hot_team->t.t_nproc; i++) {
8087  if (hot_team->t.t_threads[i]->th.th_active) {
8088  retval++;
8089  }
8090  }
8091  return retval;
8092 }
8093 
8094 // Perform an automatic adjustment to the number of
8095 // threads used by the next parallel region.
8096 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8097  int retval;
8098  int pool_active;
8099  int hot_team_active;
8100  int team_curr_active;
8101  int system_active;
8102 
8103  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8104  set_nproc));
8105  KMP_DEBUG_ASSERT(root);
8106  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8107  ->th.th_current_task->td_icvs.dynamic == TRUE);
8108  KMP_DEBUG_ASSERT(set_nproc > 1);
8109 
8110  if (set_nproc == 1) {
8111  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8112  return 1;
8113  }
8114 
8115  // Threads that are active in the thread pool, active in the hot team for this
8116  // particular root (if we are at the outer par level), and the currently
8117  // executing thread (to become the primary thread) are available to add to the
8118  // new team, but are currently contributing to the system load, and must be
8119  // accounted for.
8120  pool_active = __kmp_thread_pool_active_nth;
8121  hot_team_active = __kmp_active_hot_team_nproc(root);
8122  team_curr_active = pool_active + hot_team_active + 1;
8123 
8124  // Check the system load.
8125  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8126  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8127  "hot team active = %d\n",
8128  system_active, pool_active, hot_team_active));
8129 
8130  if (system_active < 0) {
8131  // There was an error reading the necessary info from /proc, so use the
8132  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8133  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8134  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8135  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8136 
8137  // Make this call behave like the thread limit algorithm.
8138  retval = __kmp_avail_proc - __kmp_nth +
8139  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8140  if (retval > set_nproc) {
8141  retval = set_nproc;
8142  }
8143  if (retval < KMP_MIN_NTH) {
8144  retval = KMP_MIN_NTH;
8145  }
8146 
8147  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8148  retval));
8149  return retval;
8150  }
8151 
8152  // There is a slight delay in the load balance algorithm in detecting new
8153  // running procs. The real system load at this instant should be at least as
8154  // large as the number of active OMP threads available to add to the team.
8155  if (system_active < team_curr_active) {
8156  system_active = team_curr_active;
8157  }
8158  retval = __kmp_avail_proc - system_active + team_curr_active;
8159  if (retval > set_nproc) {
8160  retval = set_nproc;
8161  }
8162  if (retval < KMP_MIN_NTH) {
8163  retval = KMP_MIN_NTH;
8164  }
8165 
8166  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8167  return retval;
8168 } // __kmp_load_balance_nproc()
8169 
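/* Editorial worked example (not part of the original source), assuming
   __kmp_avail_proc = 8, KMP_MIN_NTH = 1 and set_nproc = 8: with
   pool_active = 1 and hot_team_active = 1 the routine computes
   team_curr_active = 1 + 1 + 1 = 3. If __kmp_get_load_balance() reports
   system_active = 10, then retval = 8 - 10 + 3 = 1, i.e. the next region
   runs serially because the machine is oversubscribed; if it reports
   system_active = 3 (only this process is active), retval = 8 - 3 + 3 = 8
   and the full request is granted. */
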
8170 #endif /* USE_LOAD_BALANCE */
8171 
8172 /* ------------------------------------------------------------------------ */
8173 
8174 /* NOTE: this is called with the __kmp_init_lock held */
8175 void __kmp_cleanup(void) {
8176  int f;
8177 
8178  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8179 
8180  if (TCR_4(__kmp_init_parallel)) {
8181 #if KMP_HANDLE_SIGNALS
8182  __kmp_remove_signals();
8183 #endif
8184  TCW_4(__kmp_init_parallel, FALSE);
8185  }
8186 
8187  if (TCR_4(__kmp_init_middle)) {
8188 #if KMP_AFFINITY_SUPPORTED
8189  __kmp_affinity_uninitialize();
8190 #endif /* KMP_AFFINITY_SUPPORTED */
8191  __kmp_cleanup_hierarchy();
8192  TCW_4(__kmp_init_middle, FALSE);
8193  }
8194 
8195  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8196 
8197  if (__kmp_init_serial) {
8198  __kmp_runtime_destroy();
8199  __kmp_init_serial = FALSE;
8200  }
8201 
8202  __kmp_cleanup_threadprivate_caches();
8203 
8204  for (f = 0; f < __kmp_threads_capacity; f++) {
8205  if (__kmp_root[f] != NULL) {
8206  __kmp_free(__kmp_root[f]);
8207  __kmp_root[f] = NULL;
8208  }
8209  }
8210  __kmp_free(__kmp_threads);
8211  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8212  // there is no need to free __kmp_root separately.
8213  __kmp_threads = NULL;
8214  __kmp_root = NULL;
8215  __kmp_threads_capacity = 0;
8216 
8217  // Free old __kmp_threads arrays if they exist.
8218  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8219  while (ptr) {
8220  kmp_old_threads_list_t *next = ptr->next;
8221  __kmp_free(ptr->threads);
8222  __kmp_free(ptr);
8223  ptr = next;
8224  }
8225 
8226 #if KMP_USE_DYNAMIC_LOCK
8227  __kmp_cleanup_indirect_user_locks();
8228 #else
8229  __kmp_cleanup_user_locks();
8230 #endif
8231 #if OMPD_SUPPORT
8232  if (ompd_state) {
8233  __kmp_free(ompd_env_block);
8234  ompd_env_block = NULL;
8235  ompd_env_block_size = 0;
8236  }
8237 #endif
8238 
8239 #if KMP_AFFINITY_SUPPORTED
8240  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8241  __kmp_cpuinfo_file = NULL;
8242 #endif /* KMP_AFFINITY_SUPPORTED */
8243 
8244 #if KMP_USE_ADAPTIVE_LOCKS
8245 #if KMP_DEBUG_ADAPTIVE_LOCKS
8246  __kmp_print_speculative_stats();
8247 #endif
8248 #endif
8249  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8250  __kmp_nested_nth.nth = NULL;
8251  __kmp_nested_nth.size = 0;
8252  __kmp_nested_nth.used = 0;
8253  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8254  __kmp_nested_proc_bind.bind_types = NULL;
8255  __kmp_nested_proc_bind.size = 0;
8256  __kmp_nested_proc_bind.used = 0;
8257  if (__kmp_affinity_format) {
8258  KMP_INTERNAL_FREE(__kmp_affinity_format);
8259  __kmp_affinity_format = NULL;
8260  }
8261 
8262  __kmp_i18n_catclose();
8263 
8264 #if KMP_USE_HIER_SCHED
8265  __kmp_hier_scheds.deallocate();
8266 #endif
8267 
8268 #if KMP_STATS_ENABLED
8269  __kmp_stats_fini();
8270 #endif
8271 
8272  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8273 }
8274 
8275 /* ------------------------------------------------------------------------ */
8276 
8277 int __kmp_ignore_mppbeg(void) {
8278  char *env;
8279 
8280  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8281  if (__kmp_str_match_false(env))
8282  return FALSE;
8283  }
8284  // By default __kmpc_begin() is no-op.
8285  return TRUE;
8286 }
8287 
8288 int __kmp_ignore_mppend(void) {
8289  char *env;
8290 
8291  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8292  if (__kmp_str_match_false(env))
8293  return FALSE;
8294  }
8295  // By default __kmpc_end() is no-op.
8296  return TRUE;
8297 }
8298 
8299 void __kmp_internal_begin(void) {
8300  int gtid;
8301  kmp_root_t *root;
8302 
8303  /* this is a very important step as it will register new sibling threads
8304  and assign these new uber threads a new gtid */
8305  gtid = __kmp_entry_gtid();
8306  root = __kmp_threads[gtid]->th.th_root;
8307  KMP_ASSERT(KMP_UBER_GTID(gtid));
8308 
8309  if (root->r.r_begin)
8310  return;
8311  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8312  if (root->r.r_begin) {
8313  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8314  return;
8315  }
8316 
8317  root->r.r_begin = TRUE;
8318 
8319  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8320 }
8321 
8322 /* ------------------------------------------------------------------------ */
8323 
8324 void __kmp_user_set_library(enum library_type arg) {
8325  int gtid;
8326  kmp_root_t *root;
8327  kmp_info_t *thread;
8328 
8329  /* first, make sure we are initialized so we can get our gtid */
8330 
8331  gtid = __kmp_entry_gtid();
8332  thread = __kmp_threads[gtid];
8333 
8334  root = thread->th.th_root;
8335 
8336  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8337  library_serial));
8338  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8339  thread */
8340  KMP_WARNING(SetLibraryIncorrectCall);
8341  return;
8342  }
8343 
8344  switch (arg) {
8345  case library_serial:
8346  thread->th.th_set_nproc = 0;
8347  set__nproc(thread, 1);
8348  break;
8349  case library_turnaround:
8350  thread->th.th_set_nproc = 0;
8351  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8352  : __kmp_dflt_team_nth_ub);
8353  break;
8354  case library_throughput:
8355  thread->th.th_set_nproc = 0;
8356  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8357  : __kmp_dflt_team_nth_ub);
8358  break;
8359  default:
8360  KMP_FATAL(UnknownLibraryType, arg);
8361  }
8362 
8363  __kmp_aux_set_library(arg);
8364 }
8365 
8366 void __kmp_aux_set_stacksize(size_t arg) {
8367  if (!__kmp_init_serial)
8368  __kmp_serial_initialize();
8369 
8370 #if KMP_OS_DARWIN
8371  if (arg & (0x1000 - 1)) {
8372  arg &= ~(0x1000 - 1);
8373  if (arg + 0x1000) /* check for overflow if we round up */
8374  arg += 0x1000;
8375  }
8376 #endif
8377  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8378 
8379  /* only change the default stacksize before the first parallel region */
8380  if (!TCR_4(__kmp_init_parallel)) {
8381  size_t value = arg; /* argument is in bytes */
8382 
8383  if (value < __kmp_sys_min_stksize)
8384  value = __kmp_sys_min_stksize;
8385  else if (value > KMP_MAX_STKSIZE)
8386  value = KMP_MAX_STKSIZE;
8387 
8388  __kmp_stksize = value;
8389 
8390  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8391  }
8392 
8393  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8394 }
8395 
8396 /* set the behaviour of the runtime library */
8397 /* TODO this can cause some odd behaviour with sibling parallelism... */
8398 void __kmp_aux_set_library(enum library_type arg) {
8399  __kmp_library = arg;
8400 
8401  switch (__kmp_library) {
8402  case library_serial: {
8403  KMP_INFORM(LibraryIsSerial);
8404  } break;
8405  case library_turnaround:
8406  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8407  __kmp_use_yield = 2; // only yield when oversubscribed
8408  break;
8409  case library_throughput:
8410  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8411  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8412  break;
8413  default:
8414  KMP_FATAL(UnknownLibraryType, arg);
8415  }
8416 }
8417 
8418 /* Get team information common to all teams-construct API routines */
8419 // Returns NULL if not in a teams construct
8420 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8421  kmp_info_t *thr = __kmp_entry_thread();
8422  teams_serialized = 0;
8423  if (thr->th.th_teams_microtask) {
8424  kmp_team_t *team = thr->th.th_team;
8425  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8426  int ii = team->t.t_level;
8427  teams_serialized = team->t.t_serialized;
8428  int level = tlevel + 1;
8429  KMP_DEBUG_ASSERT(ii >= tlevel);
8430  while (ii > level) {
8431  for (teams_serialized = team->t.t_serialized;
8432  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8433  }
8434  if (team->t.t_serialized && (!teams_serialized)) {
8435  team = team->t.t_parent;
8436  continue;
8437  }
8438  if (ii > level) {
8439  team = team->t.t_parent;
8440  ii--;
8441  }
8442  }
8443  return team;
8444  }
8445  return NULL;
8446 }
8447 
8448 int __kmp_aux_get_team_num() {
8449  int serialized;
8450  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8451  if (team) {
8452  if (serialized > 1) {
8453  return 0; // teams region is serialized ( 1 team of 1 thread ).
8454  } else {
8455  return team->t.t_master_tid;
8456  }
8457  }
8458  return 0;
8459 }
8460 
8461 int __kmp_aux_get_num_teams() {
8462  int serialized;
8463  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8464  if (team) {
8465  if (serialized > 1) {
8466  return 1;
8467  } else {
8468  return team->t.t_parent->t.t_nproc;
8469  }
8470  }
8471  return 1;
8472 }
8473 
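/* Editorial example (not part of the original source): a user-level sketch of
   the API that funnels into the two helpers above (assuming the usual
   omp_get_team_num()/omp_get_num_teams() entry points of this library).

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp teams num_teams(4)
       {
         // In a serialized teams region both calls report a single team 0.
         printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
       }
       return 0;
     }
*/
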
8474 /* ------------------------------------------------------------------------ */
8475 
8476 /*
8477  * Affinity Format Parser
8478  *
8479  * Field is in form of: %[[[0].]size]type
8480  * % and type are required (%% means print a literal '%')
8481  * type is either single char or long name surrounded by {},
8482  * e.g., N or {num_threads}
8483  * 0 => leading zeros
8484  * . => right justified when size is specified
8485  * by default output is left justified
8486  * size is the *minimum* field length
8487  * All other characters are printed as is
8488  *
8489  * Available field types (these match __kmp_affinity_format_table below):
8490  * t {team_num} - omp_get_team_num()
8491  * T {num_teams} - omp_get_num_teams()
8492  * L {nesting_level} - omp_get_level()
8493  * n {thread_num} - omp_get_thread_num()
8494  * N {num_threads} - omp_get_num_threads()
8495  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8496  * H {host} - name of host machine
8497  * P {process_id} - process id (integer)
8498  * i {native_thread_id} - native thread identifier (integer)
8499  * A {thread_affinity} - comma separated list of integers or integer ranges (values of the affinity mask)
8500  * Implementation-specific field types can be added
8501  * If a type is unknown, print "undefined"
8502  */
8503 
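/* Editorial example (not part of the original source): a user-level sketch of
   the format described above, using the standard OpenMP 5.0 affinity-format
   API. The output shown in the comment is illustrative only.

     #include <omp.h>

     int main(void) {
       // Equivalent to setting OMP_AFFINITY_FORMAT; each %-field below is
       // handled by __kmp_aux_capture_affinity_field().
       omp_set_affinity_format("host=%H pid=%P tid=%0.4n aff={%A}");
       #pragma omp parallel
       omp_display_affinity(NULL); // NULL or "" => use affinity-format-var
       // Possible output line: host=node01 pid=12345 tid=0002 aff={0,1}
       return 0;
     }
*/
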
8504 // Structure holding the short name, long name, and corresponding data type
8505 // for snprintf. A table of these represents the full set of valid keyword
8506 // field types.
8507 typedef struct kmp_affinity_format_field_t {
8508  char short_name; // from spec e.g., L -> thread level
8509  const char *long_name; // from spec thread_level -> thread level
8510  char field_format; // data type for snprintf (typically 'd' or 's'
8511  // for integer or string)
8512 } kmp_affinity_format_field_t;
8513 
8514 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8515 #if KMP_AFFINITY_SUPPORTED
8516  {'A', "thread_affinity", 's'},
8517 #endif
8518  {'t', "team_num", 'd'},
8519  {'T', "num_teams", 'd'},
8520  {'L', "nesting_level", 'd'},
8521  {'n', "thread_num", 'd'},
8522  {'N', "num_threads", 'd'},
8523  {'a', "ancestor_tnum", 'd'},
8524  {'H', "host", 's'},
8525  {'P', "process_id", 'd'},
8526  {'i', "native_thread_id", 'd'}};
8527 
8528 // Returns the number of characters needed to hold the field
8529 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8530  const char **ptr,
8531  kmp_str_buf_t *field_buffer) {
8532  int rc, format_index, field_value;
8533  const char *width_left, *width_right;
8534  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8535  static const int FORMAT_SIZE = 20;
8536  char format[FORMAT_SIZE] = {0};
8537  char absolute_short_name = 0;
8538 
8539  KMP_DEBUG_ASSERT(gtid >= 0);
8540  KMP_DEBUG_ASSERT(th);
8541  KMP_DEBUG_ASSERT(**ptr == '%');
8542  KMP_DEBUG_ASSERT(field_buffer);
8543 
8544  __kmp_str_buf_clear(field_buffer);
8545 
8546  // Skip the initial %
8547  (*ptr)++;
8548 
8549  // Check for %% first
8550  if (**ptr == '%') {
8551  __kmp_str_buf_cat(field_buffer, "%", 1);
8552  (*ptr)++; // skip over the second %
8553  return 1;
8554  }
8555 
8556  // Parse field modifiers if they are present
8557  pad_zeros = false;
8558  if (**ptr == '0') {
8559  pad_zeros = true;
8560  (*ptr)++; // skip over 0
8561  }
8562  right_justify = false;
8563  if (**ptr == '.') {
8564  right_justify = true;
8565  (*ptr)++; // skip over .
8566  }
8567  // Parse width of field: [width_left, width_right)
8568  width_left = width_right = NULL;
8569  if (**ptr >= '0' && **ptr <= '9') {
8570  width_left = *ptr;
8571  SKIP_DIGITS(*ptr);
8572  width_right = *ptr;
8573  }
8574 
8575  // Create the format for KMP_SNPRINTF based on flags parsed above
8576  format_index = 0;
8577  format[format_index++] = '%';
8578  if (!right_justify)
8579  format[format_index++] = '-';
8580  if (pad_zeros)
8581  format[format_index++] = '0';
8582  if (width_left && width_right) {
8583  int i = 0;
8584  // Only allow widths of up to 8 digits.
8585  // This also prevents overflowing the format buffer.
8586  while (i < 8 && width_left < width_right) {
8587  format[format_index++] = *width_left;
8588  width_left++;
8589  i++;
8590  }
8591  }
8592 
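  // Editorial example (not part of the original source): for the field
  // "%0.4{thread_num}" the flags parsed above are pad_zeros = true,
  // right_justify = true and width "4", so once the type character is
  // appended below the resulting snprintf format is "%04d"; without the '.'
  // it would be "%-04d" (left justified).
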
8593  // Parse a name (long or short)
8594  // Canonicalize the name into absolute_short_name
8595  found_valid_name = false;
8596  parse_long_name = (**ptr == '{');
8597  if (parse_long_name)
8598  (*ptr)++; // skip initial left brace
8599  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8600  sizeof(__kmp_affinity_format_table[0]);
8601  ++i) {
8602  char short_name = __kmp_affinity_format_table[i].short_name;
8603  const char *long_name = __kmp_affinity_format_table[i].long_name;
8604  char field_format = __kmp_affinity_format_table[i].field_format;
8605  if (parse_long_name) {
8606  size_t length = KMP_STRLEN(long_name);
8607  if (strncmp(*ptr, long_name, length) == 0) {
8608  found_valid_name = true;
8609  (*ptr) += length; // skip the long name
8610  }
8611  } else if (**ptr == short_name) {
8612  found_valid_name = true;
8613  (*ptr)++; // skip the short name
8614  }
8615  if (found_valid_name) {
8616  format[format_index++] = field_format;
8617  format[format_index++] = '\0';
8618  absolute_short_name = short_name;
8619  break;
8620  }
8621  }
8622  if (parse_long_name) {
8623  if (**ptr != '}') {
8624  absolute_short_name = 0;
8625  } else {
8626  (*ptr)++; // skip over the right brace
8627  }
8628  }
8629 
8630  // Attempt to fill the buffer with the requested
8631  // value using snprintf within __kmp_str_buf_print()
8632  switch (absolute_short_name) {
8633  case 't':
8634  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8635  break;
8636  case 'T':
8637  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8638  break;
8639  case 'L':
8640  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8641  break;
8642  case 'n':
8643  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8644  break;
8645  case 'H': {
8646  static const int BUFFER_SIZE = 256;
8647  char buf[BUFFER_SIZE];
8648  __kmp_expand_host_name(buf, BUFFER_SIZE);
8649  rc = __kmp_str_buf_print(field_buffer, format, buf);
8650  } break;
8651  case 'P':
8652  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8653  break;
8654  case 'i':
8655  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8656  break;
8657  case 'N':
8658  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8659  break;
8660  case 'a':
8661  field_value =
8662  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8663  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8664  break;
8665 #if KMP_AFFINITY_SUPPORTED
8666  case 'A': {
8667  kmp_str_buf_t buf;
8668  __kmp_str_buf_init(&buf);
8669  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8670  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8671  __kmp_str_buf_free(&buf);
8672  } break;
8673 #endif
8674  default:
8675  // According to the spec, if an implementation does not have info for a field
8676  // type, then "undefined" is printed
8677  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8678  // Skip the field
8679  if (parse_long_name) {
8680  SKIP_TOKEN(*ptr);
8681  if (**ptr == '}')
8682  (*ptr)++;
8683  } else {
8684  (*ptr)++;
8685  }
8686  }
8687 
8688  KMP_ASSERT(format_index <= FORMAT_SIZE);
8689  return rc;
8690 }
8691 
8692 /*
8693  * Returns the number of characters needed to hold the affinity string
8694  * (not including the terminating null byte).
8695  * The resulting string is printed to buffer, which the caller can then
8696  * handle afterwards.
8697  */
8698 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8699  kmp_str_buf_t *buffer) {
8700  const char *parse_ptr;
8701  size_t retval;
8702  const kmp_info_t *th;
8703  kmp_str_buf_t field;
8704 
8705  KMP_DEBUG_ASSERT(buffer);
8706  KMP_DEBUG_ASSERT(gtid >= 0);
8707 
8708  __kmp_str_buf_init(&field);
8709  __kmp_str_buf_clear(buffer);
8710 
8711  th = __kmp_threads[gtid];
8712  retval = 0;
8713 
8714  // If format is NULL or zero-length string, then we use
8715  // affinity-format-var ICV
8716  parse_ptr = format;
8717  if (parse_ptr == NULL || *parse_ptr == '\0') {
8718  parse_ptr = __kmp_affinity_format;
8719  }
8720  KMP_DEBUG_ASSERT(parse_ptr);
8721 
8722  while (*parse_ptr != '\0') {
8723  // Parse a field
8724  if (*parse_ptr == '%') {
8725  // Put field in the buffer
8726  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8727  __kmp_str_buf_catbuf(buffer, &field);
8728  retval += rc;
8729  } else {
8730  // Put literal character in buffer
8731  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8732  retval++;
8733  parse_ptr++;
8734  }
8735  }
8736  __kmp_str_buf_free(&field);
8737  return retval;
8738 }
8739 
8740 // Displays the affinity string to stdout
8741 void __kmp_aux_display_affinity(int gtid, const char *format) {
8742  kmp_str_buf_t buf;
8743  __kmp_str_buf_init(&buf);
8744  __kmp_aux_capture_affinity(gtid, format, &buf);
8745  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8746  __kmp_str_buf_free(&buf);
8747 }
8748 
8749 /* ------------------------------------------------------------------------ */
8750 
8751 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8752  int blocktime = arg; /* argument is in milliseconds */
8753 #if KMP_USE_MONITOR
8754  int bt_intervals;
8755 #endif
8756  kmp_int8 bt_set;
8757 
8758  __kmp_save_internal_controls(thread);
8759 
8760  /* Normalize and set blocktime for the teams */
8761  if (blocktime < KMP_MIN_BLOCKTIME)
8762  blocktime = KMP_MIN_BLOCKTIME;
8763  else if (blocktime > KMP_MAX_BLOCKTIME)
8764  blocktime = KMP_MAX_BLOCKTIME;
8765 
8766  set__blocktime_team(thread->th.th_team, tid, blocktime);
8767  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8768 
8769 #if KMP_USE_MONITOR
8770  /* Calculate and set blocktime intervals for the teams */
8771  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8772 
8773  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8774  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8775 #endif
8776 
8777  /* Record that blocktime has been explicitly set */
8778  bt_set = TRUE;
8779 
8780  set__bt_set_team(thread->th.th_team, tid, bt_set);
8781  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8782 #if KMP_USE_MONITOR
8783  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8784  "bt_intervals=%d, monitor_updates=%d\n",
8785  __kmp_gtid_from_tid(tid, thread->th.th_team),
8786  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8787  __kmp_monitor_wakeups));
8788 #else
8789  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8790  __kmp_gtid_from_tid(tid, thread->th.th_team),
8791  thread->th.th_team->t.t_id, tid, blocktime));
8792 #endif
8793 }
8794 
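/* Editorial example (not part of the original source): a sketch of the
   user-visible path into __kmp_aux_set_blocktime(), assuming the
   kmp_set_blocktime() extension declared in this library's omp.h.

     #include <omp.h>

     int main(void) {
       // Equivalent to KMP_BLOCKTIME=0: workers go to sleep right after a
       // region instead of spin-waiting for the block time.
       kmp_set_blocktime(0);
       #pragma omp parallel
       { } // parallel work would go here
       return 0;
     }
*/
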
8795 void __kmp_aux_set_defaults(char const *str, size_t len) {
8796  if (!__kmp_init_serial) {
8797  __kmp_serial_initialize();
8798  }
8799  __kmp_env_initialize(str);
8800 
8801  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8802  __kmp_env_print();
8803  }
8804 } // __kmp_aux_set_defaults
8805 
8806 /* ------------------------------------------------------------------------ */
8807 /* internal fast reduction routines */
8808 
8809 PACKED_REDUCTION_METHOD_T
8810 __kmp_determine_reduction_method(
8811  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8812  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8813  kmp_critical_name *lck) {
8814 
8815  // Default reduction method: critical construct ( lck != NULL, like in current
8816  // PAROPT )
8817  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8818  // can be selected by RTL
8819  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8820  // can be selected by RTL
8821  // Finally, it's up to the OpenMP RTL to make a decision on which method to
8822  // select among those generated by PAROPT.
8823 
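  /* Editorial example (not part of the original source): on an x86_64 Linux
     host where the compiler generated both flavors (reduce_data/reduce_func
     are non-NULL and loc carries KMP_IDENT_ATOMIC_REDUCE), the logic below
     selects atomic_reduce_block for teams of up to 4 threads (8 on MIC) and
     TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER for larger teams; a serialized
     team (team_size == 1) always gets empty_reduce_block. */
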
8824  PACKED_REDUCTION_METHOD_T retval;
8825 
8826  int team_size;
8827 
8828  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8829  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8830 
8831 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8832  (loc && \
8833  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8834 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8835 
8836  retval = critical_reduce_block;
8837 
8838  // Another way of getting the team size (with one dynamic dereference) is slower
8839  team_size = __kmp_get_team_num_threads(global_tid);
8840  if (team_size == 1) {
8841 
8842  retval = empty_reduce_block;
8843 
8844  } else {
8845 
8846  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8847 
8848 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8849  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8850 
8851 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8852  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8853 
8854  int teamsize_cutoff = 4;
8855 
8856 #if KMP_MIC_SUPPORTED
8857  if (__kmp_mic_type != non_mic) {
8858  teamsize_cutoff = 8;
8859  }
8860 #endif
8861  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8862  if (tree_available) {
8863  if (team_size <= teamsize_cutoff) {
8864  if (atomic_available) {
8865  retval = atomic_reduce_block;
8866  }
8867  } else {
8868  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8869  }
8870  } else if (atomic_available) {
8871  retval = atomic_reduce_block;
8872  }
8873 #else
8874 #error "Unknown or unsupported OS"
8875 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8876  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8877 
8878 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8879 
8880 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8881 
8882  // basic tuning
8883 
8884  if (atomic_available) {
8885  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8886  retval = atomic_reduce_block;
8887  }
8888  } // otherwise: use critical section
8889 
8890 #elif KMP_OS_DARWIN
8891 
8892  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8893  if (atomic_available && (num_vars <= 3)) {
8894  retval = atomic_reduce_block;
8895  } else if (tree_available) {
8896  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8897  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8898  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8899  }
8900  } // otherwise: use critical section
8901 
8902 #else
8903 #error "Unknown or unsupported OS"
8904 #endif
8905 
8906 #else
8907 #error "Unknown or unsupported architecture"
8908 #endif
8909  }
8910 
8911  // KMP_FORCE_REDUCTION
8912 
8913  // If the team is serialized (team_size == 1), ignore the forced reduction
8914  // method and stay with the unsynchronized method (empty_reduce_block)
8915  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8916  team_size != 1) {
8917 
8918  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8919 
8920  int atomic_available, tree_available;
8921 
8922  switch ((forced_retval = __kmp_force_reduction_method)) {
8923  case critical_reduce_block:
8924  KMP_ASSERT(lck); // lck should be != 0
8925  break;
8926 
8927  case atomic_reduce_block:
8928  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8929  if (!atomic_available) {
8930  KMP_WARNING(RedMethodNotSupported, "atomic");
8931  forced_retval = critical_reduce_block;
8932  }
8933  break;
8934 
8935  case tree_reduce_block:
8936  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8937  if (!tree_available) {
8938  KMP_WARNING(RedMethodNotSupported, "tree");
8939  forced_retval = critical_reduce_block;
8940  } else {
8941 #if KMP_FAST_REDUCTION_BARRIER
8942  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8943 #endif
8944  }
8945  break;
8946 
8947  default:
8948  KMP_ASSERT(0); // "unsupported method specified"
8949  }
8950 
8951  retval = forced_retval;
8952  }
8953 
8954  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8955 
8956 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8957 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8958 
8959  return (retval);
8960 }
8961 // This function is for testing the set/get/determine reduce method machinery
8962 kmp_int32 __kmp_get_reduce_method(void) {
8963  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8964 }
8965 
8966 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8967 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8968 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8969 
8970 // Hard pause shuts down the runtime completely. Resume happens naturally when
8971 // OpenMP is used subsequently.
8972 void __kmp_hard_pause() {
8973  __kmp_pause_status = kmp_hard_paused;
8974  __kmp_internal_end_thread(-1);
8975 }
8976 
8977 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8978 void __kmp_resume_if_soft_paused() {
8979  if (__kmp_pause_status == kmp_soft_paused) {
8980  __kmp_pause_status = kmp_not_paused;
8981 
8982  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8983  kmp_info_t *thread = __kmp_threads[gtid];
8984  if (thread) { // Wake it if sleeping
8985  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8986  thread);
8987  if (fl.is_sleeping())
8988  fl.resume(gtid);
8989  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8990  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8991  } else { // thread holds the lock and may sleep soon
8992  do { // until either the thread sleeps, or we can get the lock
8993  if (fl.is_sleeping()) {
8994  fl.resume(gtid);
8995  break;
8996  } else if (__kmp_try_suspend_mx(thread)) {
8997  __kmp_unlock_suspend_mx(thread);
8998  break;
8999  }
9000  } while (1);
9001  }
9002  }
9003  }
9004  }
9005 }
9006 
9007 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9008 // TODO: add warning messages
9009 int __kmp_pause_resource(kmp_pause_status_t level) {
9010  if (level == kmp_not_paused) { // requesting resume
9011  if (__kmp_pause_status == kmp_not_paused) {
9012  // error message about runtime not being paused, so can't resume
9013  return 1;
9014  } else {
9015  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9016  __kmp_pause_status == kmp_hard_paused);
9017  __kmp_pause_status = kmp_not_paused;
9018  return 0;
9019  }
9020  } else if (level == kmp_soft_paused) { // requesting soft pause
9021  if (__kmp_pause_status != kmp_not_paused) {
9022  // error message about already being paused
9023  return 1;
9024  } else {
9025  __kmp_soft_pause();
9026  return 0;
9027  }
9028  } else if (level == kmp_hard_paused) { // requesting hard pause
9029  if (__kmp_pause_status != kmp_not_paused) {
9030  // error message about already being paused
9031  return 1;
9032  } else {
9033  __kmp_hard_pause();
9034  return 0;
9035  }
9036  } else {
9037  // error message about invalid level
9038  return 1;
9039  }
9040 }
9041 
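/* Editorial example (not part of the original source): a user-level sketch of
   the OpenMP 5.0 pause API that reaches this routine via
   __kmpc_pause_resource.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel
       { } // warm up the runtime

       // A soft pause puts workers to sleep; a hard pause shuts the runtime
       // down until OpenMP is used again.
       if (omp_pause_resource_all(omp_pause_soft) != 0)
         printf("pause request was rejected\n");

       #pragma omp parallel // resumes the runtime transparently
       { }
       return 0;
     }
*/
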
9042 void __kmp_omp_display_env(int verbose) {
9043  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9044  if (__kmp_init_serial == 0)
9045  __kmp_do_serial_initialize();
9046  __kmp_display_env_impl(!verbose, verbose);
9047  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9048 }
9049 
9050 // The team size is changing, so distributed barrier must be modified
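// Editorial summary (derived from this routine and __kmp_add_threads_to_team):
// th_used_in_team acts as a small per-thread state machine:
//   0 = not part of the team, 1 = in use, 2 = transitioning out during a
//   resize (set here, cleared to 0 by the worker), 3 = transitioning back in
//   (set in __kmp_add_threads_to_team, promoted to 1 by the worker).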
9051 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9052  int new_nthreads) {
9053  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9054  bp_dist_bar);
9055  kmp_info_t **other_threads = team->t.t_threads;
9056 
9057  // We want all the workers to stop waiting on the barrier while we adjust the
9058  // size of the team.
9059  for (int f = 1; f < old_nthreads; ++f) {
9060  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9061  // Ignore threads that are already inactive or not present in the team
9062  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9063  // teams construct causes thread_limit to get passed in, and some of
9064  // those could be inactive; just ignore them
9065  continue;
9066  }
9067  // If the thread is still transitioning to the in_use state, wait for it
9068  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9069  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9070  KMP_CPU_PAUSE();
9071  }
9072  // The thread should be in_use now
9073  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9074  // Transition to unused state
9075  team->t.t_threads[f]->th.th_used_in_team.store(2);
9076  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9077  }
9078  // Release all the workers
9079  team->t.b->go_release();
9080 
9081  KMP_MFENCE();
9082 
9083  // Workers should see transition status 2 and move to 0; but may need to be
9084  // woken up first
9085  int count = old_nthreads - 1;
9086  while (count > 0) {
9087  count = old_nthreads - 1;
9088  for (int f = 1; f < old_nthreads; ++f) {
9089  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9090  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9091  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9092  void *, other_threads[f]->th.th_sleep_loc);
9093  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9094  }
9095  } else {
9096  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9097  count--;
9098  }
9099  }
9100  }
9101  // Now update the barrier size
9102  team->t.b->update_num_threads(new_nthreads);
9103  team->t.b->go_reset();
9104 }
9105 
9106 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9107  // Add the threads back to the team
9108  KMP_DEBUG_ASSERT(team);
9109  // Threads were paused and pointed at th_used_in_team temporarily during a
9110  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9111  // the thread that it should transition itself back into the team. Then, if
9112  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9113  // to wake it up.
9114  for (int f = 1; f < new_nthreads; ++f) {
9115  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9116  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9117  3);
9118  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9119  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9120  (kmp_flag_32<false, false> *)NULL);
9121  }
9122  }
9123  // The threads should be transitioning to the team; when they are done, they
9124  // should have set th_used_in_team to 1. This loop forces the primary thread to
9125  // wait until all threads have moved into the team and are waiting in the barrier.
9126  int count = new_nthreads - 1;
9127  while (count > 0) {
9128  count = new_nthreads - 1;
9129  for (int f = 1; f < new_nthreads; ++f) {
9130  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9131  count--;
9132  }
9133  }
9134  }
9135 }
9136 
9137 // Globals and functions for hidden helper task
9138 kmp_info_t **__kmp_hidden_helper_threads;
9139 kmp_info_t *__kmp_hidden_helper_main_thread;
9140 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9141 #if KMP_OS_LINUX
9142 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9143 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9144 #else
9145 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9146 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9147 #endif
9148 
9149 namespace {
9150 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9151 
9152 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9153  // This is an explicit synchronization of all hidden helper threads, in case
9154  // a regular thread pushes a hidden helper task to a hidden helper thread
9155  // that has not been awakened since being released by the main thread after
9156  // the team was created.
9157  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9158  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9159  __kmp_hidden_helper_threads_num)
9160  ;
9161 
9162  // If main thread, then wait for signal
9163  if (__kmpc_master(nullptr, *gtid)) {
9164  // First, unset the initial state and release the initial thread
9165  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9166  __kmp_hidden_helper_initz_release();
9167  __kmp_hidden_helper_main_thread_wait();
9168  // Now wake up all worker threads
9169  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9170  __kmp_hidden_helper_worker_thread_signal();
9171  }
9172  }
9173 }
9174 } // namespace
9175 
9176 void __kmp_hidden_helper_threads_initz_routine() {
9177  // Create a new root for hidden helper team/threads
9178  const int gtid = __kmp_register_root(TRUE);
9179  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9180  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9181  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9182  __kmp_hidden_helper_threads_num;
9183 
9184  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9185 
9186  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9187 
9188  // Set the initialization flag to FALSE
9189  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9190 
9191  __kmp_hidden_helper_threads_deinitz_release();
9192 }
9193 
9194 /* Nesting Mode:
9195  Set via KMP_NESTING_MODE, which takes an integer.
9196  Note: we skip duplicate topology levels, and skip levels with only
9197  one entity.
9198  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9199  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9200  in the topology, and initializes the number of threads at each of those
9201  levels to the number of entities at each level, respectively, below the
9202  entity at the parent level.
9203  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9204  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9205  the user to turn nesting on explicitly. This is an even more experimental
9206  variant of an already experimental feature, and it may change or go away in
9207  the future.
9208 */
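/* Example (illustrative, hypothetical machine): on a 2-socket system with
   8 cores per socket and 2 hardware threads per core, running
       KMP_NESTING_MODE=1 ./app
   would enable three nesting levels and initialize the per-level thread
   counts to 2 (sockets), 8 (cores below each socket), and 2 (threads below
   each core). As noted above, duplicate levels and levels with a single
   entity are skipped. */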
9209 
9210 // Allocate space to store nesting levels
9211 void __kmp_init_nesting_mode() {
9212  int levels = KMP_HW_LAST;
9213  __kmp_nesting_mode_nlevels = levels;
9214  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9215  for (int i = 0; i < levels; ++i)
9216  __kmp_nesting_nth_level[i] = 0;
9217  if (__kmp_nested_nth.size < levels) {
9218  __kmp_nested_nth.nth =
9219  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9220  __kmp_nested_nth.size = levels;
9221  }
9222 }
9223 
9224 // Set the number of threads for top levels of nesting; must be called after the topology is determined
9225 void __kmp_set_nesting_mode_threads() {
9226  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9227 
9228  if (__kmp_nesting_mode == 1)
9229  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9230  else if (__kmp_nesting_mode > 1)
9231  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9232 
9233  if (__kmp_topology) { // use topology info
9234  int loc, hw_level;
9235  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9236  loc < __kmp_nesting_mode_nlevels;
9237  loc++, hw_level++) {
9238  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9239  if (__kmp_nesting_nth_level[loc] == 1)
9240  loc--;
9241  }
9242  // Make sure all cores are used
9243  if (__kmp_nesting_mode > 1 && loc > 1) {
9244  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9245  int num_cores = __kmp_topology->get_count(core_level);
9246  int upper_levels = 1;
9247  for (int level = 0; level < loc - 1; ++level)
9248  upper_levels *= __kmp_nesting_nth_level[level];
9249  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9250  __kmp_nesting_nth_level[loc - 1] =
9251  num_cores / __kmp_nesting_nth_level[loc - 2];
9252  }
9253  __kmp_nesting_mode_nlevels = loc;
9254  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9255  } else { // no topology info available; provide a reasonable estimate
9256  if (__kmp_avail_proc >= 4) {
9257  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9258  __kmp_nesting_nth_level[1] = 2;
9259  __kmp_nesting_mode_nlevels = 2;
9260  } else {
9261  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9262  __kmp_nesting_mode_nlevels = 1;
9263  }
9264  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9265  }
9266  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9267  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9268  }
9269  set__nproc(thread, __kmp_nesting_nth_level[0]);
9270  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9271  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9272  if (get__max_active_levels(thread) > 1) {
9273  // if max active levels was set, use the same value for nesting mode levels
9274  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9275  }
9276  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9277  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9278 }
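// --- Worked example (illustrative, hypothetical topology). Suppose the
// detected topology reports per-level ratios {2, 1, 8, 2} (e.g., sockets, a
// level with a single entity, cores, threads). The loop above copies each
// ratio into __kmp_nesting_nth_level and backs up (loc--) over any level whose
// ratio is 1, so the result is {2, 8, 2} and __kmp_nesting_mode_nlevels == 3.
// For KMP_NESTING_MODE > 1, the "make sure all cores are used" fixup then
// enlarges only the last level, and only when the levels recorded so far would
// cover fewer cores than the machine actually has.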
9279 
9280 // Empty symbols to export (see exports_so.txt) when feature is disabled
9281 extern "C" {
9282 #if !KMP_STATS_ENABLED
9283 void __kmp_reset_stats() {}
9284 #endif
9285 #if !USE_DEBUGGER
9286 int __kmp_omp_debug_struct_info = FALSE;
9287 int __kmp_debugging = FALSE;
9288 #endif
9289 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9290 void __kmp_itt_fini_ittlib() {}
9291 void __kmp_itt_init_ittlib() {}
9292 #endif
9293 }
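// --- Illustrative sketch (not part of the upstream source). The extern "C"
// block above keeps the shared library's export list (exports_so.txt) valid in
// every configuration: when a feature is compiled out, an empty definition is
// still emitted for each exported symbol. The same pattern in isolation
// (hypothetical feature and symbol names):
#if 0
extern "C" {
#if !MYLIB_FEATURE_X_ENABLED
void mylib_feature_x_reset() {} // stub keeps the exported symbol defined
#endif
}
#endif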
9294 
9295 // end of file