LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 /* forward declaration */
25 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
26  kmp_info_t *this_thr);
27 static void __kmp_alloc_task_deque(kmp_info_t *thread,
28  kmp_thread_data_t *thread_data);
29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
30  kmp_task_team_t *task_team);
31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
32 
33 #ifdef BUILD_TIED_TASK_STACK
34 
35 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
36 // from top to bottom
37 //
38 // gtid: global thread identifier for thread containing stack
39 // thread_data: thread data for task team thread containing stack
40 // threshold: value above which the trace statement triggers
41 // location: string identifying call site of this function (for trace)
42 static void __kmp_trace_task_stack(kmp_int32 gtid,
43  kmp_thread_data_t *thread_data,
44  int threshold, char *location) {
45  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
46  kmp_taskdata_t **stack_top = task_stack->ts_top;
47  kmp_int32 entries = task_stack->ts_entries;
48  kmp_taskdata_t *tied_task;
49 
50  KA_TRACE(
51  threshold,
52  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
53  "first_block = %p, stack_top = %p \n",
54  location, gtid, entries, task_stack->ts_first_block, stack_top));
55 
56  KMP_DEBUG_ASSERT(stack_top != NULL);
57  KMP_DEBUG_ASSERT(entries > 0);
58 
59  while (entries != 0) {
60  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
61  // fix up ts_top if we need to pop from previous block
62  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
63  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
64 
65  stack_block = stack_block->sb_prev;
66  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
67  }
68 
69  // finish bookkeeping
70  stack_top--;
71  entries--;
72 
73  tied_task = *stack_top;
74 
75  KMP_DEBUG_ASSERT(tied_task != NULL);
76  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
77 
78  KA_TRACE(threshold,
79  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
80  "stack_top=%p, tied_task=%p\n",
81  location, gtid, entries, stack_top, tied_task));
82  }
83  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
84 
85  KA_TRACE(threshold,
86  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
87  location, gtid));
88 }
89 
90 // __kmp_init_task_stack: initialize the task stack for the first time
91 // after a thread_data structure is created.
92 // It should not be necessary to do this again (assuming the stack works).
93 //
94 // gtid: global thread identifier of calling thread
95 // thread_data: thread data for task team thread containing stack
96 static void __kmp_init_task_stack(kmp_int32 gtid,
97  kmp_thread_data_t *thread_data) {
98  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
99  kmp_stack_block_t *first_block;
100 
101  // set up the first block of the stack
102  first_block = &task_stack->ts_first_block;
103  task_stack->ts_top = (kmp_taskdata_t **)first_block;
104  memset((void *)first_block, '\0',
105  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
106 
107  // initialize the stack to be empty
108  task_stack->ts_entries = TASK_STACK_EMPTY;
109  first_block->sb_next = NULL;
110  first_block->sb_prev = NULL;
111 }
112 
113 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
114 //
115 // gtid: global thread identifier for calling thread
116 // thread_data: thread info for thread containing stack
117 static void __kmp_free_task_stack(kmp_int32 gtid,
118  kmp_thread_data_t *thread_data) {
119  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
120  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
121 
122  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
123  // free from the second block of the stack
124  while (stack_block != NULL) {
125  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
126 
127  stack_block->sb_next = NULL;
128  stack_block->sb_prev = NULL;
129  if (stack_block != &task_stack->ts_first_block) {
130  __kmp_thread_free(__kmp_threads[gtid],
131  stack_block); // free the block, if not the first
132  }
133  stack_block = next_block;
134  }
135  // initialize the stack to be empty
136  task_stack->ts_entries = 0;
137  task_stack->ts_top = NULL;
138 }
139 
140 // __kmp_push_task_stack: Push the tied task onto the task stack.
141 // Grow the stack if necessary by allocating another block.
142 //
143 // gtid: global thread identifier for calling thread
144 // thread: thread info for thread containing stack
145 // tied_task: the task to push on the stack
146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
147  kmp_taskdata_t *tied_task) {
148  // GEH - need to consider what to do if tt_threads_data not allocated yet
149  kmp_thread_data_t *thread_data =
150  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
151  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
152 
153  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
154  return; // Don't push anything on stack if team or team tasks are serialized
155  }
156 
157  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
158  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
159 
160  KA_TRACE(20,
161  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
162  gtid, thread, tied_task));
163  // Store entry
164  *(task_stack->ts_top) = tied_task;
165 
166  // Do bookkeeping for next push
167  task_stack->ts_top++;
168  task_stack->ts_entries++;
169 
170  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
171  // Find beginning of this task block
172  kmp_stack_block_t *stack_block =
173  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
174 
175  // Check if we already have a block
176  if (stack_block->sb_next !=
177  NULL) { // reset ts_top to beginning of next block
178  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
179  } else { // Alloc new block and link it up
180  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
181  thread, sizeof(kmp_stack_block_t));
182 
183  task_stack->ts_top = &new_block->sb_block[0];
184  stack_block->sb_next = new_block;
185  new_block->sb_prev = stack_block;
186  new_block->sb_next = NULL;
187 
188  KA_TRACE(
189  30,
190  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
191  gtid, tied_task, new_block));
192  }
193  }
194  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
195  tied_task));
196 }
197 
198 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
199 // the task, just check to make sure it matches the ending task passed in.
200 //
201 // gtid: global thread identifier for the calling thread
202 // thread: thread info structure containing stack
203 // tied_task: the task popped off the stack
204 // ending_task: the task that is ending (should match popped task)
205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
206  kmp_taskdata_t *ending_task) {
207  // GEH - need to consider what to do if tt_threads_data not allocated yet
208  kmp_thread_data_t *thread_data =
209  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
210  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
211  kmp_taskdata_t *tied_task;
212 
213  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
214  // Don't pop anything from stack if team or team tasks are serialized
215  return;
216  }
217 
218  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
219  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
220 
221  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
222  thread));
223 
224  // fix up ts_top if we need to pop from previous block
225  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
226  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
227 
228  stack_block = stack_block->sb_prev;
229  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
230  }
231 
232  // finish bookkeeping
233  task_stack->ts_top--;
234  task_stack->ts_entries--;
235 
236  tied_task = *(task_stack->ts_top);
237 
238  KMP_DEBUG_ASSERT(tied_task != NULL);
239  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
240  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
241 
242  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
243  tied_task));
244  return;
245 }
246 #endif /* BUILD_TIED_TASK_STACK */
247 
248 // returns 1 if new task is allowed to execute, 0 otherwise
249 // checks Task Scheduling constraint (if requested) and
250 // mutexinoutset dependencies if any
251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
252  const kmp_taskdata_t *tasknew,
253  const kmp_taskdata_t *taskcurr) {
254  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
255  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
256  // only descendant of all deferred tied tasks can be scheduled, checking
257  // the last one is enough, as it in turn is the descendant of all others
258  kmp_taskdata_t *current = taskcurr->td_last_tied;
259  KMP_DEBUG_ASSERT(current != NULL);
260  // check if the task is not suspended on barrier
261  if (current->td_flags.tasktype == TASK_EXPLICIT ||
262  current->td_taskwait_thread > 0) { // <= 0 on barrier
263  kmp_int32 level = current->td_level;
264  kmp_taskdata_t *parent = tasknew->td_parent;
265  while (parent != current && parent->td_level > level) {
266  // check generation up to the level of the current task
267  parent = parent->td_parent;
268  KMP_DEBUG_ASSERT(parent != NULL);
269  }
270  if (parent != current)
271  return false;
272  }
273  }
274  // Check mutexinoutset dependencies, acquire locks
275  kmp_depnode_t *node = tasknew->td_depnode;
276  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
277  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
278  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
279  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
280  continue;
281  // could not get the lock, release previous locks
282  for (int j = i - 1; j >= 0; --j)
283  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
284  return false;
285  }
286  // negative num_locks means all locks acquired successfully
287  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
288  }
289  return true;
290 }
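A minimal standalone sketch of the ancestor walk used by the Task Scheduling Constraint check above, assuming a hypothetical node_t with only the two fields the check needs (a parent pointer standing in for td_parent and a nesting level standing in for td_level); it is an illustration, not runtime code:

#include <stdbool.h>
#include <stddef.h>

typedef struct node {
  struct node *parent; // stands in for td_parent
  int level;           // stands in for td_level (root task has level 0)
} node_t;

// Returns true if "current" is an ancestor of "candidate". The walk stops as
// soon as it climbs above current's nesting level, because no shallower
// ancestor can be "current" itself.
static bool is_descendant_of(const node_t *candidate, const node_t *current) {
  const node_t *p = candidate->parent;
  while (p != NULL && p != current && p->level > current->level)
    p = p->parent;
  return p == current;
}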
291 
292 // __kmp_realloc_task_deque:
293 // Re-allocates a task deque for a particular thread, copies the content from
294 // the old deque and adjusts the necessary data structures relating to the
295 // deque. This operation must be done with the deque_lock being held
296 static void __kmp_realloc_task_deque(kmp_info_t *thread,
297  kmp_thread_data_t *thread_data) {
298  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
299  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
300  kmp_int32 new_size = 2 * size;
301 
302  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
303  "%d] for thread_data %p\n",
304  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
305 
306  kmp_taskdata_t **new_deque =
307  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
308 
309  int i, j;
310  for (i = thread_data->td.td_deque_head, j = 0; j < size;
311  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
312  new_deque[j] = thread_data->td.td_deque[i];
313 
314  __kmp_free(thread_data->td.td_deque);
315 
316  thread_data->td.td_deque_head = 0;
317  thread_data->td.td_deque_tail = size;
318  thread_data->td.td_deque = new_deque;
319  thread_data->td.td_deque_size = new_size;
320 }
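The reallocation above works because the deque size is always a power of two, so TASK_DEQUE_MASK is simply size - 1 and head/tail wrap-around is a single bitwise AND. A standalone sketch of the same doubling scheme for a generic ring buffer follows; ring_t and ring_grow are hypothetical names, not runtime types:

#include <stdlib.h>

typedef struct ring {
  void **buf;
  unsigned size;  // always a power of two
  unsigned head;  // index of the oldest element
  unsigned tail;  // index one past the newest element
  unsigned count; // number of stored elements
} ring_t;

static int ring_grow(ring_t *r) {
  unsigned new_size = 2 * r->size;
  void **new_buf = (void **)malloc(new_size * sizeof(void *));
  if (new_buf == NULL)
    return -1;
  // Copy elements in logical order; (size - 1) is the wrap mask because the
  // size is a power of two, mirroring TASK_DEQUE_MASK.
  for (unsigned i = r->head, j = 0; j < r->count;
       i = (i + 1) & (r->size - 1), j++)
    new_buf[j] = r->buf[i];
  free(r->buf);
  r->buf = new_buf;
  r->head = 0;
  r->tail = r->count; // tail points one past the last copied element
  r->size = new_size;
  return 0;
}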
321 
322 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
323  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
324  kmp_thread_data_t *thread_data = &l->td;
325  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
326  thread_data->td.td_deque_last_stolen = -1;
327  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
328  "for thread_data %p\n",
329  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
330  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
331  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
332  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
333  return l;
334 }
335 
336 // The function finds the deque of priority tasks with given priority, or
337 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
338 // Deques of non-default priority tasks are shared between all threads in the team,
339 // as opposed to per-thread deques of tasks with default priority.
340 // The function is called under the lock task_team->tt.tt_task_pri_lock.
341 static kmp_thread_data_t *
342 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
343  kmp_thread_data_t *thread_data;
344  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
345  if (lst->priority == pri) {
346  // Found queue of tasks with given priority.
347  thread_data = &lst->td;
348  } else if (lst->priority < pri) {
349  // All current priority queues contain tasks with lower priority.
350  // Allocate new one for given priority tasks.
351  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
352  thread_data = &list->td;
353  list->priority = pri;
354  list->next = lst;
355  task_team->tt.tt_task_pri_list = list;
356  } else { // task_team->tt.tt_task_pri_list->priority > pri
357  kmp_task_pri_t *next_queue = lst->next;
358  while (next_queue && next_queue->priority > pri) {
359  lst = next_queue;
360  next_queue = lst->next;
361  }
362  // lst->priority > pri && (next_queue == NULL || pri >= next_queue->priority)
363  if (next_queue == NULL) {
364  // No queue with pri priority, need to allocate new one.
365  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
366  thread_data = &list->td;
367  list->priority = pri;
368  list->next = NULL;
369  lst->next = list;
370  } else if (next_queue->priority == pri) {
371  // Found queue of tasks with given priority.
372  thread_data = &next_queue->td;
373  } else { // lst->priority > pri > next->priority
374  // insert the newly allocated queue between existing queues
375  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
376  thread_data = &list->td;
377  list->priority = pri;
378  list->next = next_queue;
379  lst->next = list;
380  }
381  }
382  return thread_data;
383 }
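The branching above is a find-or-insert into a singly-linked list kept sorted by descending priority. A generic sketch of that operation, using a hypothetical pri_node_t instead of kmp_task_pri_t and with allocation-failure handling omitted:

#include <stdlib.h>

typedef struct pri_node {
  int priority;
  struct pri_node *next;
  // the per-priority deque would live here
} pri_node_t;

static pri_node_t *find_or_insert(pri_node_t **head, int pri) {
  pri_node_t *cur = *head;
  if (cur == NULL || cur->priority < pri) {
    // Empty list or new highest priority: insert at the head.
    pri_node_t *n = (pri_node_t *)calloc(1, sizeof(*n));
    n->priority = pri;
    n->next = cur;
    *head = n;
    return n;
  }
  while (cur->next != NULL && cur->next->priority > pri)
    cur = cur->next;
  if (cur->priority == pri)
    return cur; // a deque for this priority already exists at the head
  if (cur->next != NULL && cur->next->priority == pri)
    return cur->next; // it exists further down the list
  // Insert between cur (higher priority) and cur->next (lower or NULL).
  pri_node_t *n = (pri_node_t *)calloc(1, sizeof(*n));
  n->priority = pri;
  n->next = cur->next;
  cur->next = n;
  return n;
}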
384 
385 // __kmp_push_priority_task: Add a task to the team's priority task deque
386 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
387  kmp_taskdata_t *taskdata,
388  kmp_task_team_t *task_team,
389  kmp_int32 pri) {
390  kmp_thread_data_t *thread_data = NULL;
391  KA_TRACE(20,
392  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
393  gtid, taskdata, pri));
394 
395  // Find task queue specific to priority value
396  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
397  if (UNLIKELY(lst == NULL)) {
398  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
399  if (task_team->tt.tt_task_pri_list == NULL) {
400  // List of queues is still empty, allocate one.
401  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
402  thread_data = &list->td;
403  list->priority = pri;
404  list->next = NULL;
405  task_team->tt.tt_task_pri_list = list;
406  } else {
407  // Another thread initialized a queue. Check if it fits and get thread_data.
408  thread_data = __kmp_get_priority_deque_data(task_team, pri);
409  }
410  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
411  } else {
412  if (lst->priority == pri) {
413  // Found queue of tasks with given priority.
414  thread_data = &lst->td;
415  } else {
416  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
417  thread_data = __kmp_get_priority_deque_data(task_team, pri);
418  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
419  }
420  }
421  KMP_DEBUG_ASSERT(thread_data);
422 
423  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
424  // Check if deque is full
425  if (TCR_4(thread_data->td.td_deque_ntasks) >=
426  TASK_DEQUE_SIZE(thread_data->td)) {
427  if (__kmp_enable_task_throttling &&
428  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
429  thread->th.th_current_task)) {
430  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
431  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
432  "TASK_NOT_PUSHED for task %p\n",
433  gtid, taskdata));
434  return TASK_NOT_PUSHED;
435  } else {
436  // expand deque to push the task which is not allowed to execute
437  __kmp_realloc_task_deque(thread, thread_data);
438  }
439  }
440  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
441  TASK_DEQUE_SIZE(thread_data->td));
442  // Push taskdata.
443  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
444  // Wrap index.
445  thread_data->td.td_deque_tail =
446  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
447  TCW_4(thread_data->td.td_deque_ntasks,
448  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
449  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
450  KMP_FSYNC_RELEASING(taskdata); // releasing child
451  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
452  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
453  gtid, taskdata, thread_data->td.td_deque_ntasks,
454  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
455  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
456  task_team->tt.tt_num_task_pri++; // atomic inc
457  return TASK_SUCCESSFULLY_PUSHED;
458 }
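Note the lock discipline above: tt_task_pri_list is read without the lock first, and tt_task_pri_lock is taken only when the list looks empty or does not start with the wanted priority, with a re-check under the lock because another thread may have raced ahead. The runtime uses its bootstrap locks and TCR/TCW macros for this; a minimal C11 sketch of the same check / lock / re-check idiom, with hypothetical names, is:

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

typedef struct resource { int payload; } resource_t;

static _Atomic(resource_t *) g_res = NULL;
static pthread_mutex_t g_res_lock = PTHREAD_MUTEX_INITIALIZER;

static resource_t *get_resource(void) {
  resource_t *r = atomic_load_explicit(&g_res, memory_order_acquire);
  if (r == NULL) { // unlocked fast-path check
    pthread_mutex_lock(&g_res_lock);
    r = atomic_load_explicit(&g_res, memory_order_relaxed);
    if (r == NULL) { // re-check under the lock: another thread may have won
      r = (resource_t *)calloc(1, sizeof(*r));
      atomic_store_explicit(&g_res, r, memory_order_release);
    }
    pthread_mutex_unlock(&g_res_lock);
  }
  return r;
}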
459 
460 // __kmp_push_task: Add a task to the thread's deque
461 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
462  kmp_info_t *thread = __kmp_threads[gtid];
463  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
464 
465  // If we encounter a hidden helper task, and the current thread is not a
466  // hidden helper thread, we have to give the task to any hidden helper thread
467  // starting from its shadow one.
468  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
469  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
470  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
471  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
472  // Signal the hidden helper threads.
473  __kmp_hidden_helper_worker_thread_signal();
474  return TASK_SUCCESSFULLY_PUSHED;
475  }
476 
477  kmp_task_team_t *task_team = thread->th.th_task_team;
478  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
479  kmp_thread_data_t *thread_data;
480 
481  KA_TRACE(20,
482  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
483 
484  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
485  // untied task needs to increment counter so that the task structure is not
486  // freed prematurely
487  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
488  KMP_DEBUG_USE_VAR(counter);
489  KA_TRACE(
490  20,
491  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
492  gtid, counter, taskdata));
493  }
494 
495  // The first check avoids building task_team thread data if serialized
496  if (UNLIKELY(taskdata->td_flags.task_serial)) {
497  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
498  "TASK_NOT_PUSHED for task %p\n",
499  gtid, taskdata));
500  return TASK_NOT_PUSHED;
501  }
502 
503  // Now that serialized tasks have returned, we can assume that we are not in
504  // immediate exec mode
505  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
506  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
507  __kmp_enable_tasking(task_team, thread);
508  }
509  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
510  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
511 
512  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
513  __kmp_max_task_priority > 0) {
514  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
515  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
516  }
517 
518  // Find tasking deque specific to encountering thread
519  thread_data = &task_team->tt.tt_threads_data[tid];
520 
521  // No lock needed since only owner can allocate. If the task is hidden_helper,
522  // we don't need it either because we have initialized the deque for hidden
523  // helper thread data.
524  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
525  __kmp_alloc_task_deque(thread, thread_data);
526  }
527 
528  int locked = 0;
529  // Check if deque is full
530  if (TCR_4(thread_data->td.td_deque_ntasks) >=
531  TASK_DEQUE_SIZE(thread_data->td)) {
532  if (__kmp_enable_task_throttling &&
533  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
534  thread->th.th_current_task)) {
535  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
536  "TASK_NOT_PUSHED for task %p\n",
537  gtid, taskdata));
538  return TASK_NOT_PUSHED;
539  } else {
540  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
541  locked = 1;
542  if (TCR_4(thread_data->td.td_deque_ntasks) >=
543  TASK_DEQUE_SIZE(thread_data->td)) {
544  // expand deque to push the task which is not allowed to execute
545  __kmp_realloc_task_deque(thread, thread_data);
546  }
547  }
548  }
549  // Lock the deque for the task push operation
550  if (!locked) {
551  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
552  // Need to recheck as we can get a proxy task from thread outside of OpenMP
553  if (TCR_4(thread_data->td.td_deque_ntasks) >=
554  TASK_DEQUE_SIZE(thread_data->td)) {
555  if (__kmp_enable_task_throttling &&
556  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
557  thread->th.th_current_task)) {
558  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
559  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
560  "returning TASK_NOT_PUSHED for task %p\n",
561  gtid, taskdata));
562  return TASK_NOT_PUSHED;
563  } else {
564  // expand deque to push the task which is not allowed to execute
565  __kmp_realloc_task_deque(thread, thread_data);
566  }
567  }
568  }
569  // Must have room since no thread but the calling thread can add tasks
570  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
571  TASK_DEQUE_SIZE(thread_data->td));
572 
573  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
574  taskdata; // Push taskdata
575  // Wrap index.
576  thread_data->td.td_deque_tail =
577  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
578  TCW_4(thread_data->td.td_deque_ntasks,
579  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
580  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
581  KMP_FSYNC_RELEASING(taskdata); // releasing child
582  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
583  "task=%p ntasks=%d head=%u tail=%u\n",
584  gtid, taskdata, thread_data->td.td_deque_ntasks,
585  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
586 
587  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
588 
589  return TASK_SUCCESSFULLY_PUSHED;
590 }
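At the user level, the priority branch above is reached through the priority clause: the value is clamped to __kmp_max_task_priority (set from OMP_MAX_TASK_PRIORITY), and only then is the task routed to the shared priority deques instead of the encountering thread's own deque. A small usage sketch, with a made-up process() workload:

#include <omp.h>
#include <stdio.h>

static void process(int chunk) { printf("chunk %d\n", chunk); }

void run(int nchunks) {
#pragma omp parallel
#pragma omp single
  {
    for (int i = 0; i < nchunks; ++i) {
      int pri = (i < 4) ? 10 : 0; // front chunks are more urgent (a hint only)
#pragma omp task priority(pri) firstprivate(i)
      process(i);
    }
  }
  // Priorities are honored only when OMP_MAX_TASK_PRIORITY is positive.
  printf("max task priority: %d\n", omp_get_max_task_priority());
}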
591 
592 // __kmp_pop_current_task_from_thread: restore the current task of the given
593 // thread when the team ends
594 //
595 // this_thr: thread structure to set current_task in.
596 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
597  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
598  "this_thread=%p, curtask=%p, "
599  "curtask_parent=%p\n",
600  0, this_thr, this_thr->th.th_current_task,
601  this_thr->th.th_current_task->td_parent));
602 
603  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
604 
605  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
606  "this_thread=%p, curtask=%p, "
607  "curtask_parent=%p\n",
608  0, this_thr, this_thr->th.th_current_task,
609  this_thr->th.th_current_task->td_parent));
610 }
611 
612 // __kmp_push_current_task_to_thread: set up current task in called thread for a
613 // new team
614 //
615 // this_thr: thread structure to set up
616 // team: team for implicit task data
617 // tid: thread within team to set up
618 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
619  int tid) {
620  // The current task of the thread is the parent of the newly created implicit
621  // tasks of the new team
622  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
623  "curtask=%p "
624  "parent_task=%p\n",
625  tid, this_thr, this_thr->th.th_current_task,
626  team->t.t_implicit_task_taskdata[tid].td_parent));
627 
628  KMP_DEBUG_ASSERT(this_thr != NULL);
629 
630  if (tid == 0) {
631  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
632  team->t.t_implicit_task_taskdata[0].td_parent =
633  this_thr->th.th_current_task;
634  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
635  }
636  } else {
637  team->t.t_implicit_task_taskdata[tid].td_parent =
638  team->t.t_implicit_task_taskdata[0].td_parent;
639  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
640  }
641 
642  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
643  "curtask=%p "
644  "parent_task=%p\n",
645  tid, this_thr, this_thr->th.th_current_task,
646  team->t.t_implicit_task_taskdata[tid].td_parent));
647 }
648 
649 // __kmp_task_start: bookkeeping for a task starting execution
650 //
651 // GTID: global thread id of calling thread
652 // task: task starting execution
653 // current_task: task suspending
654 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
655  kmp_taskdata_t *current_task) {
656  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
657  kmp_info_t *thread = __kmp_threads[gtid];
658 
659  KA_TRACE(10,
660  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
661  gtid, taskdata, current_task));
662 
663  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
664 
665  // mark currently executing task as suspended
666  // TODO: GEH - make sure root team implicit task is initialized properly.
667  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
668  current_task->td_flags.executing = 0;
669 
670 // Add task to stack if tied
671 #ifdef BUILD_TIED_TASK_STACK
672  if (taskdata->td_flags.tiedness == TASK_TIED) {
673  __kmp_push_task_stack(gtid, thread, taskdata);
674  }
675 #endif /* BUILD_TIED_TASK_STACK */
676 
677  // mark starting task as executing and as current task
678  thread->th.th_current_task = taskdata;
679 
680  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
681  taskdata->td_flags.tiedness == TASK_UNTIED);
682  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
683  taskdata->td_flags.tiedness == TASK_UNTIED);
684  taskdata->td_flags.started = 1;
685  taskdata->td_flags.executing = 1;
686  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
687  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
688 
689  // GEH TODO: shouldn't we pass some sort of location identifier here?
690  // APT: yes, we will pass location here.
691  // need to store current thread state (in a thread or taskdata structure)
692  // before setting work_state, otherwise wrong state is set after end of task
693 
694  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
695 
696  return;
697 }
698 
699 #if OMPT_SUPPORT
700 //------------------------------------------------------------------------------
701 // __ompt_task_init:
702 // Initialize OMPT fields maintained by a task. This will only be called after
703 // ompt_start_tool, so we already know whether ompt is enabled or not.
704 
705 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
706  // The calls to __ompt_task_init already have the ompt_enabled condition.
707  task->ompt_task_info.task_data.value = 0;
708  task->ompt_task_info.frame.exit_frame = ompt_data_none;
709  task->ompt_task_info.frame.enter_frame = ompt_data_none;
710  task->ompt_task_info.frame.exit_frame_flags =
711  ompt_frame_runtime | ompt_frame_framepointer;
712  task->ompt_task_info.frame.enter_frame_flags =
713  ompt_frame_runtime | ompt_frame_framepointer;
714 }
715 
716 // __ompt_task_start:
717 // Build and trigger task-begin event
718 static inline void __ompt_task_start(kmp_task_t *task,
719  kmp_taskdata_t *current_task,
720  kmp_int32 gtid) {
721  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
722  ompt_task_status_t status = ompt_task_switch;
723  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
724  status = ompt_task_yield;
725  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
726  }
727  /* let OMPT know that we're about to run this task */
728  if (ompt_enabled.ompt_callback_task_schedule) {
729  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
730  &(current_task->ompt_task_info.task_data), status,
731  &(taskdata->ompt_task_info.task_data));
732  }
733  taskdata->ompt_task_info.scheduling_parent = current_task;
734 }
735 
736 // __ompt_task_finish:
737 // Build and trigger final task-schedule event
738 static inline void __ompt_task_finish(kmp_task_t *task,
739  kmp_taskdata_t *resumed_task,
740  ompt_task_status_t status) {
741  if (ompt_enabled.ompt_callback_task_schedule) {
742  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
743  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
744  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
745  status = ompt_task_cancel;
746  }
747 
748  /* let OMPT know that we're returning to the callee task */
749  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
750  &(taskdata->ompt_task_info.task_data), status,
751  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
752  }
753 }
754 #endif
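The schedule events built above reach a first-party tool through the OMPT interface. A minimal sketch of a tool that subscribes to ompt_callback_task_schedule (the callback invoked from __ompt_task_start and __ompt_task_finish); only this one callback is registered and error handling is omitted:

#include <omp-tools.h>
#include <stdio.h>

static void on_task_schedule(ompt_data_t *prior_task_data,
                             ompt_task_status_t prior_task_status,
                             ompt_data_t *next_task_data) {
  (void)prior_task_data;
  (void)next_task_data;
  printf("task-schedule event, status=%d\n", (int)prior_task_status);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  (void)initial_device_num;
  (void)tool_data;
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_task_schedule, (ompt_callback_t)on_task_schedule);
  return 1; // nonzero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) { (void)tool_data; }

ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  (void)omp_version;
  (void)runtime_version;
  static ompt_start_tool_result_t result = {&tool_initialize, &tool_finalize,
                                            {0}};
  return &result;
}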
755 
756 template <bool ompt>
757 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
758  kmp_task_t *task,
759  void *frame_address,
760  void *return_address) {
761  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
762  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
763 
764  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
765  "current_task=%p\n",
766  gtid, loc_ref, taskdata, current_task));
767 
768  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
769  // untied task needs to increment counter so that the task structure is not
770  // freed prematurely
771  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
772  KMP_DEBUG_USE_VAR(counter);
773  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
774  "incremented for task %p\n",
775  gtid, counter, taskdata));
776  }
777 
778  taskdata->td_flags.task_serial =
779  1; // Execute this task immediately, not deferred.
780  __kmp_task_start(gtid, task, current_task);
781 
782 #if OMPT_SUPPORT
783  if (ompt) {
784  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
785  current_task->ompt_task_info.frame.enter_frame.ptr =
786  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
787  current_task->ompt_task_info.frame.enter_frame_flags =
788  taskdata->ompt_task_info.frame.exit_frame_flags =
789  ompt_frame_application | ompt_frame_framepointer;
790  }
791  if (ompt_enabled.ompt_callback_task_create) {
792  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
793  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
794  &(parent_info->task_data), &(parent_info->frame),
795  &(taskdata->ompt_task_info.task_data),
796  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
797  return_address);
798  }
799  __ompt_task_start(task, current_task, gtid);
800  }
801 #endif // OMPT_SUPPORT
802 
803  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
804  loc_ref, taskdata));
805 }
806 
807 #if OMPT_SUPPORT
808 OMPT_NOINLINE
809 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
810  kmp_task_t *task,
811  void *frame_address,
812  void *return_address) {
813  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
814  return_address);
815 }
816 #endif // OMPT_SUPPORT
817 
818 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
819 // execution
820 //
821 // loc_ref: source location information; points to beginning of task block.
822 // gtid: global thread number.
823 // task: task thunk for the started task.
824 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
825  kmp_task_t *task) {
826 #if OMPT_SUPPORT
827  if (UNLIKELY(ompt_enabled.enabled)) {
828  OMPT_STORE_RETURN_ADDRESS(gtid);
829  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
830  OMPT_GET_FRAME_ADDRESS(1),
831  OMPT_LOAD_RETURN_ADDRESS(gtid));
832  return;
833  }
834 #endif
835  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
836 }
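These _if0 entry points implement undeferred execution: when the if clause of a task construct evaluates to false, the encountering thread runs the task body immediately, bracketed by the begin_if0/complete_if0 calls, instead of deferring it to a deque. A small user-level illustration:

#include <omp.h>
#include <stdio.h>

void example(int n) {
#pragma omp parallel
#pragma omp single
  for (int i = 0; i < n; ++i) {
    // Every 100th task runs undeferred on the encountering thread; the rest
    // are deferred as usual.
#pragma omp task if (i % 100 != 0) firstprivate(i)
    printf("i=%d ran on thread %d\n", i, omp_get_thread_num());
  }
}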
837 
838 #ifdef TASK_UNUSED
839 // __kmpc_omp_task_begin: report that a given task has started execution
840 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
841 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
842  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
843 
844  KA_TRACE(
845  10,
846  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
847  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
848 
849  __kmp_task_start(gtid, task, current_task);
850 
851  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
852  loc_ref, KMP_TASK_TO_TASKDATA(task)));
853  return;
854 }
855 #endif // TASK_UNUSED
856 
857 // __kmp_free_task: free the current task space and the space for shareds
858 //
859 // gtid: Global thread ID of calling thread
860 // taskdata: task to free
861 // thread: thread data structure of caller
862 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
863  kmp_info_t *thread) {
864  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
865  taskdata));
866 
867  // Check to make sure all flags and counters have the correct values
868  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
869  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
870  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
871  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
872  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
873  taskdata->td_flags.task_serial == 1);
874  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
875  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
876  // Clear data to not be re-used later by mistake.
877  task->data1.destructors = NULL;
878  task->data2.priority = 0;
879 
880  taskdata->td_flags.freed = 1;
881 // deallocate the taskdata and shared variable blocks associated with this task
882 #if USE_FAST_MEMORY
883  __kmp_fast_free(thread, taskdata);
884 #else /* ! USE_FAST_MEMORY */
885  __kmp_thread_free(thread, taskdata);
886 #endif
887  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
888 }
889 
890 // __kmp_free_task_and_ancestors: free the current task and ancestors without
891 // children
892 //
893 // gtid: Global thread ID of calling thread
894 // taskdata: task to free
895 // thread: thread data structure of caller
896 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
897  kmp_taskdata_t *taskdata,
898  kmp_info_t *thread) {
899  // Proxy tasks must always be allowed to free their parents
900  // because they can be run in background even in serial mode.
901  kmp_int32 team_serial =
902  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
903  !taskdata->td_flags.proxy;
904  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
905 
906  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
907  KMP_DEBUG_ASSERT(children >= 0);
908 
909  // Now, go up the ancestor tree to see if any ancestors can now be freed.
910  while (children == 0) {
911  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
912 
913  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
914  "and freeing itself\n",
915  gtid, taskdata));
916 
917  // --- Deallocate my ancestor task ---
918  __kmp_free_task(gtid, taskdata, thread);
919 
920  taskdata = parent_taskdata;
921 
922  if (team_serial)
923  return;
924  // Stop checking ancestors at implicit task instead of walking up ancestor
925  // tree to avoid premature deallocation of ancestors.
926  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
927  if (taskdata->td_dephash) { // do we need to cleanup dephash?
928  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
929  kmp_tasking_flags_t flags_old = taskdata->td_flags;
930  if (children == 0 && flags_old.complete == 1) {
931  kmp_tasking_flags_t flags_new = flags_old;
932  flags_new.complete = 0;
933  if (KMP_COMPARE_AND_STORE_ACQ32(
934  RCAST(kmp_int32 *, &taskdata->td_flags),
935  *RCAST(kmp_int32 *, &flags_old),
936  *RCAST(kmp_int32 *, &flags_new))) {
937  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
938  "dephash of implicit task %p\n",
939  gtid, taskdata));
940  // cleanup dephash of finished implicit task
941  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
942  }
943  }
944  }
945  return;
946  }
947  // Predecrement simulated by "- 1" calculation
948  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
949  KMP_DEBUG_ASSERT(children >= 0);
950  }
951 
952  KA_TRACE(
953  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
954  "not freeing it yet\n",
955  gtid, taskdata, children));
956 }
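Here, as in __kmp_finish_implicit_task further below, the dephash of a finished implicit task must be reclaimed by exactly one thread: the 32-bit td_flags word is re-read and a compare-and-swap flips the complete bit, so a racing competitor fails the CAS and skips the cleanup. The same claim-by-CAS idiom in a standalone C11 sketch with a hypothetical flag layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define FLAG_COMPLETE (1u << 0) // hypothetical bit position

// Returns true for exactly one caller; only that caller performs the cleanup.
static bool try_claim_cleanup(_Atomic uint32_t *flags) {
  uint32_t old = atomic_load_explicit(flags, memory_order_acquire);
  while (old & FLAG_COMPLETE) {
    uint32_t desired = old & ~FLAG_COMPLETE; // clear "complete" to claim it
    if (atomic_compare_exchange_weak_explicit(flags, &old, desired,
                                              memory_order_acq_rel,
                                              memory_order_acquire))
      return true; // we won the race; safe to free the dephash now
    // "old" was refreshed by the failed CAS; re-evaluate and retry
  }
  return false; // another thread claimed it, or it was never complete
}

(The runtime version shown above makes a single CAS attempt instead of looping.)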
957 
958 // Only need to keep track of child task counts if any of the following:
959 // 1. team parallel and tasking not serialized;
960 // 2. it is a proxy or detachable or hidden helper task
961 // 3. the children counter of its parent task is greater than 0.
962 // The reason for the 3rd one is a serialized team that encountered a detached
963 // or hidden helper task T. In this case, the execution of T is still deferred,
964 // and it is also possible that a regular task depends on T; if we don't track
965 // the children, task synchronization will be broken.
966 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
967  kmp_tasking_flags_t flags = taskdata->td_flags;
968  bool ret = !(flags.team_serial || flags.tasking_ser);
969  ret = ret || flags.proxy == TASK_PROXY ||
970  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
971  ret = ret ||
972  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
973  return ret;
974 }
975 
976 // __kmp_task_finish: bookkeeping to do when a task finishes execution
977 //
978 // gtid: global thread ID for calling thread
979 // task: task to be finished
980 // resumed_task: task to be resumed. (may be NULL if task is serialized)
981 //
982 // template<ompt>: effectively ompt_enabled.enabled!=0
983 // the version with ompt=false is inlined, allowing all OMPT code to be
984 // optimized away in this case
985 template <bool ompt>
986 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
987  kmp_taskdata_t *resumed_task) {
988  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
989  kmp_info_t *thread = __kmp_threads[gtid];
990  kmp_task_team_t *task_team =
991  thread->th.th_task_team; // might be NULL for serial teams...
992 #if KMP_DEBUG
993  kmp_int32 children = 0;
994 #endif
995  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
996  "task %p\n",
997  gtid, taskdata, resumed_task));
998 
999  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1000 
1001 // Pop task from stack if tied
1002 #ifdef BUILD_TIED_TASK_STACK
1003  if (taskdata->td_flags.tiedness == TASK_TIED) {
1004  __kmp_pop_task_stack(gtid, thread, taskdata);
1005  }
1006 #endif /* BUILD_TIED_TASK_STACK */
1007 
1008  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1009  // untied task needs to check the counter so that the task structure is not
1010  // freed prematurely
1011  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1012  KA_TRACE(
1013  20,
1014  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1015  gtid, counter, taskdata));
1016  if (counter > 0) {
1017  // untied task is not done, to be continued possibly by other thread, do
1018  // not free it now
1019  if (resumed_task == NULL) {
1020  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1021  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1022  // task is the parent
1023  }
1024  thread->th.th_current_task = resumed_task; // restore current_task
1025  resumed_task->td_flags.executing = 1; // resume previous task
1026  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1027  "resuming task %p\n",
1028  gtid, taskdata, resumed_task));
1029  return;
1030  }
1031  }
1032 
1033  // bookkeeping for resuming task:
1034  // GEH - note tasking_ser => task_serial
1035  KMP_DEBUG_ASSERT(
1036  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1037  taskdata->td_flags.task_serial);
1038  if (taskdata->td_flags.task_serial) {
1039  if (resumed_task == NULL) {
1040  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1041  // task is the parent
1042  }
1043  } else {
1044  KMP_DEBUG_ASSERT(resumed_task !=
1045  NULL); // verify that resumed task is passed as argument
1046  }
1047 
1048  /* If the task's destructor thunk flag has been set, we need to invoke the
1049  destructor thunk that has been generated by the compiler. The code is
1050  placed here, since at this point other tasks might have been released
1051  hence overlapping the destructor invocations with some other work in the
1052  released tasks. The OpenMP spec is not specific on when the destructors
1053  are invoked, so we should be free to choose. */
1054  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1055  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1056  KMP_ASSERT(destr_thunk);
1057  destr_thunk(gtid, task);
1058  }
1059 
1060  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1061  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1062  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1063 
1064  bool detach = false;
1065  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1066  if (taskdata->td_allow_completion_event.type ==
1067  KMP_EVENT_ALLOW_COMPLETION) {
1068  // event hasn't been fulfilled yet. Try to detach task.
1069  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1070  if (taskdata->td_allow_completion_event.type ==
1071  KMP_EVENT_ALLOW_COMPLETION) {
1072  // task finished execution
1073  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1074  taskdata->td_flags.executing = 0; // suspend the finishing task
1075 
1076 #if OMPT_SUPPORT
1077  // For a detached task that has not completed, we switch back here;
1078  // omp_fulfill_event will signal completion later.
1079  // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1080  if (ompt)
1081  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1082 #endif
1083 
1084  // no access to taskdata after this point!
1085  // __kmp_fulfill_event might free taskdata at any time from now
1086 
1087  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1088  detach = true;
1089  }
1090  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1091  }
1092  }
1093 
1094  if (!detach) {
1095  taskdata->td_flags.complete = 1; // mark the task as completed
1096 
1097 #if OMPT_SUPPORT
1098  // This is not a detached task, we are done here
1099  if (ompt)
1100  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1101 #endif
1102  // TODO: What would be the balance between the conditions in the function
1103  // and an atomic operation?
1104  if (__kmp_track_children_task(taskdata)) {
1105  __kmp_release_deps(gtid, taskdata);
1106  // Predecrement simulated by "- 1" calculation
1107 #if KMP_DEBUG
1108  children = -1 +
1109 #endif
1110  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1111  KMP_DEBUG_ASSERT(children >= 0);
1112  if (taskdata->td_taskgroup)
1113  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1114  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1115  task_team->tt.tt_hidden_helper_task_encountered)) {
1116  // if we found proxy or hidden helper tasks there could exist a dependency
1117  // chain with the proxy task as origin
1118  __kmp_release_deps(gtid, taskdata);
1119  }
1120  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1121  // called. Otherwise, if a task is executed immediately from the
1122  // release_deps code, the flag will be reset to 1 again by this same
1123  // function
1124  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1125  taskdata->td_flags.executing = 0; // suspend the finishing task
1126  }
1127 
1128  KA_TRACE(
1129  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1130  gtid, taskdata, children));
1131 
1132  // Free this task and then ancestor tasks if they have no children.
1133  // Restore th_current_task first as suggested by John:
1134  // johnmc: if an asynchronous inquiry peers into the runtime system
1135  // it doesn't see the freed task as the current task.
1136  thread->th.th_current_task = resumed_task;
1137  if (!detach)
1138  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1139 
1140  // TODO: GEH - make sure root team implicit task is initialized properly.
1141  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1142  resumed_task->td_flags.executing = 1; // resume previous task
1143 
1144  KA_TRACE(
1145  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1146  gtid, taskdata, resumed_task));
1147 
1148  return;
1149 }
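The detachable branch above backs the detach clause: the task body may return while completion is deferred until omp_fulfill_event is called on the event handle, at which point the now-proxified task is finished through the proxy machinery. A usage sketch; start_async is a made-up stand-in for a real asynchronous API and simply invokes its callback inline so the example is self-contained:

#include <omp.h>
#include <stdlib.h>

typedef struct evbox { omp_event_handle_t ev; } evbox_t;

// Stand-in for an asynchronous API: here the completion callback runs inline.
static void start_async(void (*done)(void *), void *arg) { done(arg); }

static void on_done(void *arg) {
  evbox_t *box = (evbox_t *)arg;
  omp_fulfill_event(box->ev); // completes the detached task
  free(box);
}

void submit(void) {
#pragma omp parallel
#pragma omp single
  {
    omp_event_handle_t ev;
#pragma omp task detach(ev)
    {
      // The task body returns right away; the task stays incomplete until
      // on_done() fulfills the event.
      evbox_t *box = (evbox_t *)malloc(sizeof(*box));
      box->ev = ev; // the handle was stored in ev when the task was created
      start_async(on_done, box);
    }
#pragma omp taskwait // also waits until the event has been fulfilled
  }
}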
1150 
1151 template <bool ompt>
1152 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1153  kmp_int32 gtid,
1154  kmp_task_t *task) {
1155  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1156  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1157  KMP_DEBUG_ASSERT(gtid >= 0);
1158  // this routine will provide task to resume
1159  __kmp_task_finish<ompt>(gtid, task, NULL);
1160 
1161  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1162  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1163 
1164 #if OMPT_SUPPORT
1165  if (ompt) {
1166  ompt_frame_t *ompt_frame;
1167  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1168  ompt_frame->enter_frame = ompt_data_none;
1169  ompt_frame->enter_frame_flags =
1170  ompt_frame_runtime | ompt_frame_framepointer;
1171  }
1172 #endif
1173 
1174  return;
1175 }
1176 
1177 #if OMPT_SUPPORT
1178 OMPT_NOINLINE
1179 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1180  kmp_task_t *task) {
1181  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1182 }
1183 #endif // OMPT_SUPPORT
1184 
1185 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1186 //
1187 // loc_ref: source location information; points to end of task block.
1188 // gtid: global thread number.
1189 // task: task thunk for the completed task.
1190 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1191  kmp_task_t *task) {
1192 #if OMPT_SUPPORT
1193  if (UNLIKELY(ompt_enabled.enabled)) {
1194  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1195  return;
1196  }
1197 #endif
1198  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1199 }
1200 
1201 #ifdef TASK_UNUSED
1202 // __kmpc_omp_task_complete: report that a task has completed execution
1203 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1204 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1205  kmp_task_t *task) {
1206  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1207  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1208 
1209  __kmp_task_finish<false>(gtid, task,
1210  NULL); // Not sure how to find task to resume
1211 
1212  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1213  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1214  return;
1215 }
1216 #endif // TASK_UNUSED
1217 
1218 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1219 // task for a given thread
1220 //
1221 // loc_ref: reference to source location of parallel region
1222 // this_thr: thread data structure corresponding to implicit task
1223 // team: team for this_thr
1224 // tid: thread id of given thread within team
1225 // set_curr_task: TRUE if need to push current task to thread
1226 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1227 // have already been done elsewhere.
1228 // TODO: Get better loc_ref. Value passed in may be NULL
1229 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1230  kmp_team_t *team, int tid, int set_curr_task) {
1231  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1232 
1233  KF_TRACE(
1234  10,
1235  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1236  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1237 
1238  task->td_task_id = KMP_GEN_TASK_ID();
1239  task->td_team = team;
1240  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1241  // in debugger)
1242  task->td_ident = loc_ref;
1243  task->td_taskwait_ident = NULL;
1244  task->td_taskwait_counter = 0;
1245  task->td_taskwait_thread = 0;
1246 
1247  task->td_flags.tiedness = TASK_TIED;
1248  task->td_flags.tasktype = TASK_IMPLICIT;
1249  task->td_flags.proxy = TASK_FULL;
1250 
1251  // All implicit tasks are executed immediately, not deferred
1252  task->td_flags.task_serial = 1;
1253  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1254  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1255 
1256  task->td_flags.started = 1;
1257  task->td_flags.executing = 1;
1258  task->td_flags.complete = 0;
1259  task->td_flags.freed = 0;
1260 
1261  task->td_depnode = NULL;
1262  task->td_last_tied = task;
1263  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1264 
1265  if (set_curr_task) { // only do this init first time thread is created
1266  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1267  // Not used: don't need to deallocate implicit task
1268  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1269  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1270  task->td_dephash = NULL;
1271  __kmp_push_current_task_to_thread(this_thr, team, tid);
1272  } else {
1273  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1274  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1275  }
1276 
1277 #if OMPT_SUPPORT
1278  if (UNLIKELY(ompt_enabled.enabled))
1279  __ompt_task_init(task, tid);
1280 #endif
1281 
1282  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1283  team, task));
1284 }
1285 
1286 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1287 // at the end of parallel regions. Some resources are kept for reuse in the next
1288 // parallel region.
1289 //
1290 // thread: thread data structure corresponding to implicit task
1291 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1292  kmp_taskdata_t *task = thread->th.th_current_task;
1293  if (task->td_dephash) {
1294  int children;
1295  task->td_flags.complete = 1;
1296  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1297  kmp_tasking_flags_t flags_old = task->td_flags;
1298  if (children == 0 && flags_old.complete == 1) {
1299  kmp_tasking_flags_t flags_new = flags_old;
1300  flags_new.complete = 0;
1301  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1302  *RCAST(kmp_int32 *, &flags_old),
1303  *RCAST(kmp_int32 *, &flags_new))) {
1304  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1305  "dephash of implicit task %p\n",
1306  thread->th.th_info.ds.ds_gtid, task));
1307  __kmp_dephash_free_entries(thread, task->td_dephash);
1308  }
1309  }
1310  }
1311 }
1312 
1313 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1314 // when these tasks are destroyed
1315 //
1316 // thread: thread data structure corresponding to implicit task
1317 void __kmp_free_implicit_task(kmp_info_t *thread) {
1318  kmp_taskdata_t *task = thread->th.th_current_task;
1319  if (task && task->td_dephash) {
1320  __kmp_dephash_free(thread, task->td_dephash);
1321  task->td_dephash = NULL;
1322  }
1323 }
1324 
1325 // Round up a size to a multiple of val (a power of two): used to insert padding
1326 // between structures co-allocated using a single malloc() call
1327 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1328  if (size & (val - 1)) {
1329  size &= ~(val - 1);
1330  if (size <= KMP_SIZE_T_MAX - val) {
1331  size += val; // Round up if there is no overflow.
1332  }
1333  }
1334  return size;
1335 } // __kmp_round_up_to_val
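Since val is a power of two, val - 1 is a mask of the low-order bits; any size with one of those bits set is bumped to the next multiple of val, which is how shareds_offset gets pointer-aligned in __kmp_task_alloc below. A tiny standalone illustration (without the overflow guard the runtime keeps):

#include <assert.h>
#include <stddef.h>

static size_t round_up_to_val(size_t size, size_t val) {
  // val must be a power of two, so (val - 1) masks the low-order bits.
  if (size & (val - 1))
    size = (size & ~(val - 1)) + val;
  return size;
}

int main(void) {
  assert(round_up_to_val(53, 8) == 56); // 53 -> next multiple of 8
  assert(round_up_to_val(64, 8) == 64); // already aligned: unchanged
  assert(round_up_to_val(1, 16) == 16);
  return 0;
}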
1336 
1337 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1338 //
1339 // loc_ref: source location information
1340 // gtid: global thread number.
1341 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1342 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1343 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1344 // private vars accessed in task.
1345 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1346 // in task.
1347 // task_entry: Pointer to task code entry point generated by compiler.
1348 // returns: a pointer to the allocated kmp_task_t structure (task).
1349 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1350  kmp_tasking_flags_t *flags,
1351  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1352  kmp_routine_entry_t task_entry) {
1353  kmp_task_t *task;
1354  kmp_taskdata_t *taskdata;
1355  kmp_info_t *thread = __kmp_threads[gtid];
1356  kmp_team_t *team = thread->th.th_team;
1357  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1358  size_t shareds_offset;
1359 
1360  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1361  __kmp_middle_initialize();
1362 
1363  if (flags->hidden_helper) {
1364  if (__kmp_enable_hidden_helper) {
1365  if (!TCR_4(__kmp_init_hidden_helper))
1366  __kmp_hidden_helper_initialize();
1367  } else {
1368  // If the hidden helper task is not enabled, reset the flag to FALSE.
1369  flags->hidden_helper = FALSE;
1370  }
1371  }
1372 
1373  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1374  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1375  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1376  sizeof_shareds, task_entry));
1377 
1378  KMP_DEBUG_ASSERT(parent_task);
1379  if (parent_task->td_flags.final) {
1380  if (flags->merged_if0) {
1381  }
1382  flags->final = 1;
1383  }
1384 
1385  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1386  // Untied task encountered causes the TSC algorithm to check entire deque of
1387  // the victim thread. If no untied task encountered, then checking the head
1388  // of the deque should be enough.
1389  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1390  }
1391 
1392  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1393  // the tasking setup
1394  // when that happens is too late.
1395  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1396  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1397  if (flags->proxy == TASK_PROXY) {
1398  flags->tiedness = TASK_UNTIED;
1399  flags->merged_if0 = 1;
1400  }
1401  /* Are we running in a serialized parallel region or tskm_immediate_exec?
1402  We need tasking support enabled either way. */
1403  if ((thread->th.th_task_team) == NULL) {
1404  /* This should only happen if the team is serialized;
1405  set up a task team and propagate it to the thread */
1406  KMP_DEBUG_ASSERT(team->t.t_serialized);
1407  KA_TRACE(30,
1408  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1409  gtid));
1410  // 1 indicates setup the current team regardless of nthreads
1411  __kmp_task_team_setup(thread, team, 1);
1412  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1413  }
1414  kmp_task_team_t *task_team = thread->th.th_task_team;
1415 
1416  /* tasking must be enabled now as the task might not be pushed */
1417  if (!KMP_TASKING_ENABLED(task_team)) {
1418  KA_TRACE(
1419  30,
1420  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1421  __kmp_enable_tasking(task_team, thread);
1422  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1423  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1424  // No lock needed since only owner can allocate
1425  if (thread_data->td.td_deque == NULL) {
1426  __kmp_alloc_task_deque(thread, thread_data);
1427  }
1428  }
1429 
1430  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1431  task_team->tt.tt_found_proxy_tasks == FALSE)
1432  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1433  if (flags->hidden_helper &&
1434  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1435  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1436  }
1437 
1438  // Calculate shared structure offset including padding after kmp_task_t struct
1439  // to align pointers in shared struct
1440  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1441  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1442 
1443  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1444  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1445  shareds_offset));
1446  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1447  sizeof_shareds));
1448 
1449  // Avoid double allocation here by combining shareds with taskdata
1450 #if USE_FAST_MEMORY
1451  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1452  sizeof_shareds);
1453 #else /* ! USE_FAST_MEMORY */
1454  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1455  sizeof_shareds);
1456 #endif /* USE_FAST_MEMORY */
1457 
1458  task = KMP_TASKDATA_TO_TASK(taskdata);
1459 
1460 // Make sure task & taskdata are aligned appropriately
1461 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1462  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1463  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1464 #else
1465  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1466  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1467 #endif
1468  if (sizeof_shareds > 0) {
1469  // Avoid double allocation here by combining shareds with taskdata
1470  task->shareds = &((char *)taskdata)[shareds_offset];
1471  // Make sure shareds struct is aligned to pointer size
1472  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1473  0);
1474  } else {
1475  task->shareds = NULL;
1476  }
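/* The single allocation above yields one contiguous block: the internal task
   descriptor, the compiler-visible kmp_task_t (plus its private variables),
   and the shareds array. A minimal sketch of the resulting pointer
   relationships, assuming sizeof_shareds > 0:

     kmp_task_t *t = KMP_TASKDATA_TO_TASK(taskdata); // just past kmp_taskdata_t
     void *sh = &((char *)taskdata)[shareds_offset]; // == t->shareds
     // shareds_offset >= sizeof(kmp_taskdata_t) + sizeof_kmp_task_t, rounded
     // up to sizeof(void *) so the pointers stored in shareds stay aligned.

   Freeing taskdata therefore releases descriptor, task, and shareds at once. */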
1477  task->routine = task_entry;
1478  task->part_id = 0; // AC: Always start with 0 part id
1479 
1480  taskdata->td_task_id = KMP_GEN_TASK_ID();
1481  taskdata->td_team = thread->th.th_team;
1482  taskdata->td_alloc_thread = thread;
1483  taskdata->td_parent = parent_task;
1484  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1485  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1486  taskdata->td_ident = loc_ref;
1487  taskdata->td_taskwait_ident = NULL;
1488  taskdata->td_taskwait_counter = 0;
1489  taskdata->td_taskwait_thread = 0;
1490  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1491  // avoid copying icvs for proxy tasks
1492  if (flags->proxy == TASK_FULL)
1493  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1494 
1495  taskdata->td_flags = *flags;
1496  taskdata->td_task_team = thread->th.th_task_team;
1497  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1498  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1499  // If it is hidden helper task, we need to set the team and task team
1500  // correspondingly.
1501  if (flags->hidden_helper) {
1502  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1503  taskdata->td_team = shadow_thread->th.th_team;
1504  taskdata->td_task_team = shadow_thread->th.th_task_team;
1505  }
1506 
1507  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1508  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1509 
1510  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1511  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1512 
1513  // GEH - Note we serialize the task if the team is serialized to make sure
1514  // implicit parallel region tasks are not left until program termination to
1515  // execute. Also, it helps locality to execute immediately.
1516 
1517  taskdata->td_flags.task_serial =
1518  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1519  taskdata->td_flags.tasking_ser || flags->merged_if0);
1520 
1521  taskdata->td_flags.started = 0;
1522  taskdata->td_flags.executing = 0;
1523  taskdata->td_flags.complete = 0;
1524  taskdata->td_flags.freed = 0;
1525 
1526  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1527  // start at one because it counts the current task and its children
1528  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1529  taskdata->td_taskgroup =
1530  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1531  taskdata->td_dephash = NULL;
1532  taskdata->td_depnode = NULL;
1533  if (flags->tiedness == TASK_UNTIED)
1534  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1535  else
1536  taskdata->td_last_tied = taskdata;
1537  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1538 #if OMPT_SUPPORT
1539  if (UNLIKELY(ompt_enabled.enabled))
1540  __ompt_task_init(taskdata, gtid);
1541 #endif
1542  // TODO: What would be the balance between the conditions in the function and
1543  // an atomic operation?
1544  if (__kmp_track_children_task(taskdata)) {
1545  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1546  if (parent_task->td_taskgroup)
1547  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1548  // Only need to keep track of allocated child tasks for explicit tasks,
1549  // since implicit tasks are not deallocated
1550  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1551  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1552  }
1553  if (flags->hidden_helper) {
1554  taskdata->td_flags.task_serial = FALSE;
1555  // Increment the number of hidden helper tasks to be executed
1556  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1557  }
1558  }
1559 
1560  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1561  gtid, taskdata, taskdata->td_parent));
1562 
1563  return task;
1564 }
1565 
1566 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1567  kmp_int32 flags, size_t sizeof_kmp_task_t,
1568  size_t sizeof_shareds,
1569  kmp_routine_entry_t task_entry) {
1570  kmp_task_t *retval;
1571  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1572  __kmp_assert_valid_gtid(gtid);
1573  input_flags->native = FALSE;
1574  // __kmp_task_alloc() sets up all other runtime flags
1575  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1576  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1577  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1578  input_flags->proxy ? "proxy" : "",
1579  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1580  sizeof_shareds, task_entry));
1581 
1582  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1583  sizeof_shareds, task_entry);
1584 
1585  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1586 
1587  return retval;
1588 }
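/* For reference, a compiler lowering "#pragma omp task" pairs the entry point
   above with __kmpc_omp_task (defined later in this file). A minimal sketch;
   my_shareds, my_task_entry and emit_task are illustrative names, not code any
   particular compiler emits:

     struct my_shareds { int *x; };

     static kmp_int32 my_task_entry(kmp_int32 gtid, kmp_task_t *t) {
       struct my_shareds *sh = (struct my_shareds *)t->shareds;
       *sh->x += 1; // the outlined task body
       return 0;
     }

     static void emit_task(ident_t *loc, kmp_int32 gtid, int *x) {
       kmp_task_t *t = __kmpc_omp_task_alloc(
           loc, gtid, /* flags: tied */ 1, sizeof(kmp_task_t),
           sizeof(struct my_shareds), (kmp_routine_entry_t)my_task_entry);
       ((struct my_shareds *)t->shareds)->x = x;
       __kmpc_omp_task(loc, gtid, t);
     }
*/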
1589 
1590 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1591  kmp_int32 flags,
1592  size_t sizeof_kmp_task_t,
1593  size_t sizeof_shareds,
1594  kmp_routine_entry_t task_entry,
1595  kmp_int64 device_id) {
1596  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1597  // target tasks are untied, as defined in the specification
1598  input_flags.tiedness = TASK_UNTIED;
1599 
1600  if (__kmp_enable_hidden_helper)
1601  input_flags.hidden_helper = TRUE;
1602 
1603  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1604  sizeof_shareds, task_entry);
1605 }
1606 
1618 // __kmpc_omp_reg_task_with_affinity: record affinity information for a task.
1619 // The affinity hints in affin_list are currently ignored; the routine returns 0.
1620 kmp_int32
1621 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1622  kmp_task_t *new_task, kmp_int32 naffins,
1623  kmp_task_affinity_info_t *affin_list) {
1624  return 0;
1625 }
1626 
1627 // __kmp_invoke_task: invoke the specified task
1628 //
1629 // gtid: global thread ID of caller
1630 // task: the task to invoke
1631 // current_task: the task to resume after task invocation
1632 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1633  kmp_taskdata_t *current_task) {
1634  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1635  kmp_info_t *thread;
1636  int discard = 0 /* false */;
1637  KA_TRACE(
1638  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1639  gtid, taskdata, current_task));
1640  KMP_DEBUG_ASSERT(task);
1641  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1642  taskdata->td_flags.complete == 1)) {
1643  // This is a proxy task that was already completed but it needs to run
1644  // its bottom-half finish
1645  KA_TRACE(
1646  30,
1647  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1648  gtid, taskdata));
1649 
1650  __kmp_bottom_half_finish_proxy(gtid, task);
1651 
1652  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1653  "proxy task %p, resuming task %p\n",
1654  gtid, taskdata, current_task));
1655 
1656  return;
1657  }
1658 
1659 #if OMPT_SUPPORT
1660  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1661  // does not execute code.
1662  ompt_thread_info_t oldInfo;
1663  if (UNLIKELY(ompt_enabled.enabled)) {
1664  // Store the threads states and restore them after the task
1665  thread = __kmp_threads[gtid];
1666  oldInfo = thread->th.ompt_thread_info;
1667  thread->th.ompt_thread_info.wait_id = 0;
1668  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1669  ? ompt_state_work_serial
1670  : ompt_state_work_parallel;
1671  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1672  }
1673 #endif
1674 
1675  // Decrement the counter of hidden helper tasks to be executed
1676  if (taskdata->td_flags.hidden_helper) {
1677  // Hidden helper tasks can only be executed by hidden helper threads
1678  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1679  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1680  }
1681 
1682  // Proxy tasks are not handled by the runtime
1683  if (taskdata->td_flags.proxy != TASK_PROXY) {
1684  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1685  }
1686 
1687  // TODO: cancel tasks if the parallel region has also been cancelled
1688  // TODO: check if this sequence can be hoisted above __kmp_task_start
1689  // if cancellation has been enabled for this run ...
1690  if (UNLIKELY(__kmp_omp_cancellation)) {
1691  thread = __kmp_threads[gtid];
1692  kmp_team_t *this_team = thread->th.th_team;
1693  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1694  if ((taskgroup && taskgroup->cancel_request) ||
1695  (this_team->t.t_cancel_request == cancel_parallel)) {
1696 #if OMPT_SUPPORT && OMPT_OPTIONAL
1697  ompt_data_t *task_data;
1698  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1699  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1700  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1701  task_data,
1702  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1703  : ompt_cancel_parallel) |
1704  ompt_cancel_discarded_task,
1705  NULL);
1706  }
1707 #endif
1708  KMP_COUNT_BLOCK(TASK_cancelled);
1709  // this task belongs to a task group and we need to cancel it
1710  discard = 1 /* true */;
1711  }
1712  }
1713 
1714  // Invoke the task routine and pass in relevant data.
1715  // Thunks generated by gcc take a different argument list.
1716  if (!discard) {
1717  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1718  taskdata->td_last_tied = current_task->td_last_tied;
1719  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1720  }
1721 #if KMP_STATS_ENABLED
1722  KMP_COUNT_BLOCK(TASK_executed);
1723  switch (KMP_GET_THREAD_STATE()) {
1724  case FORK_JOIN_BARRIER:
1725  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1726  break;
1727  case PLAIN_BARRIER:
1728  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1729  break;
1730  case TASKYIELD:
1731  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1732  break;
1733  case TASKWAIT:
1734  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1735  break;
1736  case TASKGROUP:
1737  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1738  break;
1739  default:
1740  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1741  break;
1742  }
1743 #endif // KMP_STATS_ENABLED
1744 
1745 // OMPT task begin
1746 #if OMPT_SUPPORT
1747  if (UNLIKELY(ompt_enabled.enabled))
1748  __ompt_task_start(task, current_task, gtid);
1749 #endif
1750 
1751 #if OMPD_SUPPORT
1752  if (ompd_state & OMPD_ENABLE_BP)
1753  ompd_bp_task_begin();
1754 #endif
1755 
1756 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1757  kmp_uint64 cur_time;
1758  kmp_int32 kmp_itt_count_task =
1759  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1760  current_task->td_flags.tasktype == TASK_IMPLICIT;
1761  if (kmp_itt_count_task) {
1762  thread = __kmp_threads[gtid];
1763  // Time outer level explicit task on barrier for adjusting imbalance time
1764  if (thread->th.th_bar_arrive_time)
1765  cur_time = __itt_get_timestamp();
1766  else
1767  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1768  }
1769  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1770 #endif
1771 
1772  if (task->routine != NULL) {
1773 #ifdef KMP_GOMP_COMPAT
1774  if (taskdata->td_flags.native) {
1775  ((void (*)(void *))(*(task->routine)))(task->shareds);
1776  } else
1777 #endif /* KMP_GOMP_COMPAT */
1778  {
1779  (*(task->routine))(gtid, task);
1780  }
1781  }
1782  KMP_POP_PARTITIONED_TIMER();
1783 
1784 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1785  if (kmp_itt_count_task) {
1786  // Barrier imbalance - adjust arrive time with the task duration
1787  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1788  }
1789  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1790  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1791 #endif
1792  }
1793 
1794 #if OMPD_SUPPORT
1795  if (ompd_state & OMPD_ENABLE_BP)
1796  ompd_bp_task_end();
1797 #endif
1798 
1799  // Proxy tasks are not handled by the runtime
1800  if (taskdata->td_flags.proxy != TASK_PROXY) {
1801 #if OMPT_SUPPORT
1802  if (UNLIKELY(ompt_enabled.enabled)) {
1803  thread->th.ompt_thread_info = oldInfo;
1804  if (taskdata->td_flags.tiedness == TASK_TIED) {
1805  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1806  }
1807  __kmp_task_finish<true>(gtid, task, current_task);
1808  } else
1809 #endif
1810  __kmp_task_finish<false>(gtid, task, current_task);
1811  }
1812 
1813  KA_TRACE(
1814  30,
1815  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1816  gtid, taskdata, current_task));
1817  return;
1818 }
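/* The two invocation forms above correspond to the two entry-point
   conventions the runtime supports; roughly (a sketch of the signatures
   involved, not new declarations):

     kmp_int32 kmpc_entry(kmp_int32 gtid, kmp_task_t *task); // default path
     void gomp_thunk(void *shareds);                         // td_flags.native set

   which is why the native (GOMP-compat) branch passes task->shareds directly
   while the default branch passes (gtid, task). */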
1819 
1820 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1821 //
1822 // loc_ref: location of original task pragma (ignored)
1823 // gtid: Global Thread ID of encountering thread
1824 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1825 // Returns:
1826 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1827 // be resumed later.
1828 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1829 // resumed later.
1830 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1831  kmp_task_t *new_task) {
1832  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1833 
1834  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1835  loc_ref, new_taskdata));
1836 
1837 #if OMPT_SUPPORT
1838  kmp_taskdata_t *parent;
1839  if (UNLIKELY(ompt_enabled.enabled)) {
1840  parent = new_taskdata->td_parent;
1841  if (ompt_enabled.ompt_callback_task_create) {
1842  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1843  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1844  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1845  OMPT_GET_RETURN_ADDRESS(0));
1846  }
1847  }
1848 #endif
1849 
1850  /* Should we execute the new task or queue it? For now, let's just always try
1851  to queue it. If the queue fills up, then we'll execute it. */
1852 
1853  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1854  { // Execute this task immediately
1855  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1856  new_taskdata->td_flags.task_serial = 1;
1857  __kmp_invoke_task(gtid, new_task, current_task);
1858  }
1859 
1860  KA_TRACE(
1861  10,
1862  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1863  "loc=%p task=%p\n",
1864  gtid, loc_ref, new_taskdata));
1865 
1866 #if OMPT_SUPPORT
1867  if (UNLIKELY(ompt_enabled.enabled)) {
1868  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1869  }
1870 #endif
1871  return TASK_CURRENT_NOT_QUEUED;
1872 }
1873 
1874 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1875 //
1876 // gtid: Global Thread ID of encountering thread
1877 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1878 // serialize_immediate: if TRUE then if the task is executed immediately its
1879 // execution will be serialized
1880 // Returns:
1881 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1882 // be resumed later.
1883 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1884 // resumed later.
1885 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1886  bool serialize_immediate) {
1887  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1888 
1889  /* Should we execute the new task or queue it? For now, let's just always try
1890  to queue it. If the queue fills up, then we'll execute it. */
1891  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1892  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1893  { // Execute this task immediately
1894  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1895  if (serialize_immediate)
1896  new_taskdata->td_flags.task_serial = 1;
1897  __kmp_invoke_task(gtid, new_task, current_task);
1898  }
1899 
1900  return TASK_CURRENT_NOT_QUEUED;
1901 }
1902 
1903 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1904 // non-thread-switchable task from the parent thread only!
1905 //
1906 // loc_ref: location of original task pragma (ignored)
1907 // gtid: Global Thread ID of encountering thread
1908 // new_task: non-thread-switchable task thunk allocated by
1909 // __kmp_omp_task_alloc()
1910 // Returns:
1911 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1912 // be resumed later.
1913 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1914 // resumed later.
1915 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1916  kmp_task_t *new_task) {
1917  kmp_int32 res;
1918  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1919 
1920 #if KMP_DEBUG || OMPT_SUPPORT
1921  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1922 #endif
1923  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1924  new_taskdata));
1925  __kmp_assert_valid_gtid(gtid);
1926 
1927 #if OMPT_SUPPORT
1928  kmp_taskdata_t *parent = NULL;
1929  if (UNLIKELY(ompt_enabled.enabled)) {
1930  if (!new_taskdata->td_flags.started) {
1931  OMPT_STORE_RETURN_ADDRESS(gtid);
1932  parent = new_taskdata->td_parent;
1933  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1934  parent->ompt_task_info.frame.enter_frame.ptr =
1935  OMPT_GET_FRAME_ADDRESS(0);
1936  }
1937  if (ompt_enabled.ompt_callback_task_create) {
1938  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1939  &(parent->ompt_task_info.task_data),
1940  &(parent->ompt_task_info.frame),
1941  &(new_taskdata->ompt_task_info.task_data),
1942  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1943  OMPT_LOAD_RETURN_ADDRESS(gtid));
1944  }
1945  } else {
1946  // We are scheduling the continuation of an UNTIED task.
1947  // Scheduling back to the parent task.
1948  __ompt_task_finish(new_task,
1949  new_taskdata->ompt_task_info.scheduling_parent,
1950  ompt_task_switch);
1951  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1952  }
1953  }
1954 #endif
1955 
1956  res = __kmp_omp_task(gtid, new_task, true);
1957 
1958  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1959  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1960  gtid, loc_ref, new_taskdata));
1961 #if OMPT_SUPPORT
1962  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1963  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1964  }
1965 #endif
1966  return res;
1967 }
1968 
1969 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1970 // a taskloop task with the correct OMPT return address
1971 //
1972 // loc_ref: location of original task pragma (ignored)
1973 // gtid: Global Thread ID of encountering thread
1974 // new_task: non-thread-switchable task thunk allocated by
1975 // __kmp_omp_task_alloc()
1976 // codeptr_ra: return address for OMPT callback
1977 // Returns:
1978 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1979 // be resumed later.
1980 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1981 // resumed later.
1982 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1983  kmp_task_t *new_task, void *codeptr_ra) {
1984  kmp_int32 res;
1985  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1986 
1987 #if KMP_DEBUG || OMPT_SUPPORT
1988  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1989 #endif
1990  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1991  new_taskdata));
1992 
1993 #if OMPT_SUPPORT
1994  kmp_taskdata_t *parent = NULL;
1995  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1996  parent = new_taskdata->td_parent;
1997  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1998  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1999  if (ompt_enabled.ompt_callback_task_create) {
2000  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2001  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2002  &(new_taskdata->ompt_task_info.task_data),
2003  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2004  codeptr_ra);
2005  }
2006  }
2007 #endif
2008 
2009  res = __kmp_omp_task(gtid, new_task, true);
2010 
2011  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2012  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2013  gtid, loc_ref, new_taskdata));
2014 #if OMPT_SUPPORT
2015  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2016  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2017  }
2018 #endif
2019  return res;
2020 }
2021 
2022 template <bool ompt>
2023 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2024  void *frame_address,
2025  void *return_address) {
2026  kmp_taskdata_t *taskdata = nullptr;
2027  kmp_info_t *thread;
2028  int thread_finished = FALSE;
2029  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2030 
2031  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2032  KMP_DEBUG_ASSERT(gtid >= 0);
2033 
2034  if (__kmp_tasking_mode != tskm_immediate_exec) {
2035  thread = __kmp_threads[gtid];
2036  taskdata = thread->th.th_current_task;
2037 
2038 #if OMPT_SUPPORT && OMPT_OPTIONAL
2039  ompt_data_t *my_task_data;
2040  ompt_data_t *my_parallel_data;
2041 
2042  if (ompt) {
2043  my_task_data = &(taskdata->ompt_task_info.task_data);
2044  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2045 
2046  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2047 
2048  if (ompt_enabled.ompt_callback_sync_region) {
2049  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2050  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2051  my_task_data, return_address);
2052  }
2053 
2054  if (ompt_enabled.ompt_callback_sync_region_wait) {
2055  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2056  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2057  my_task_data, return_address);
2058  }
2059  }
2060 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2061 
2062 // Debugger: The taskwait is active. Store the location and the thread that
2063 // encountered the taskwait.
2064 #if USE_ITT_BUILD
2065 // Note: These values are used by ITT events as well.
2066 #endif /* USE_ITT_BUILD */
2067  taskdata->td_taskwait_counter += 1;
2068  taskdata->td_taskwait_ident = loc_ref;
2069  taskdata->td_taskwait_thread = gtid + 1;
2070 
2071 #if USE_ITT_BUILD
2072  void *itt_sync_obj = NULL;
2073 #if USE_ITT_NOTIFY
2074  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2075 #endif /* USE_ITT_NOTIFY */
2076 #endif /* USE_ITT_BUILD */
2077 
2078  bool must_wait =
2079  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2080 
2081  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2082  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2083  // If hidden helper tasks have been encountered, we must enable waiting here.
2084  must_wait =
2085  must_wait ||
2086  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2087  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2088 
2089  if (must_wait) {
2090  kmp_flag_32<false, false> flag(
2091  RCAST(std::atomic<kmp_uint32> *,
2092  &(taskdata->td_incomplete_child_tasks)),
2093  0U);
2094  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2095  flag.execute_tasks(thread, gtid, FALSE,
2096  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2097  __kmp_task_stealing_constraint);
2098  }
2099  }
2100 #if USE_ITT_BUILD
2101  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2102  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2103 #endif /* USE_ITT_BUILD */
2104 
2105  // Debugger: The taskwait is completed. Location remains, but thread is
2106  // negated.
2107  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2108 
2109 #if OMPT_SUPPORT && OMPT_OPTIONAL
2110  if (ompt) {
2111  if (ompt_enabled.ompt_callback_sync_region_wait) {
2112  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2113  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2114  my_task_data, return_address);
2115  }
2116  if (ompt_enabled.ompt_callback_sync_region) {
2117  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2118  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2119  my_task_data, return_address);
2120  }
2121  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2122  }
2123 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2124 
2125  }
2126 
2127  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2128  "returning TASK_CURRENT_NOT_QUEUED\n",
2129  gtid, taskdata));
2130 
2131  return TASK_CURRENT_NOT_QUEUED;
2132 }
2133 
2134 #if OMPT_SUPPORT && OMPT_OPTIONAL
2135 OMPT_NOINLINE
2136 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2137  void *frame_address,
2138  void *return_address) {
2139  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2140  return_address);
2141 }
2142 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2143 
2144 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2145 // complete
2146 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2147 #if OMPT_SUPPORT && OMPT_OPTIONAL
2148  if (UNLIKELY(ompt_enabled.enabled)) {
2149  OMPT_STORE_RETURN_ADDRESS(gtid);
2150  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2151  OMPT_LOAD_RETURN_ADDRESS(gtid));
2152  }
2153 #endif
2154  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2155 }
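/* In user code, "#pragma omp taskwait" lowers to a single call to the entry
   point above. A minimal sketch, with loc assumed to be a compiler-provided
   source-location descriptor:

     static void wait_for_children(ident_t *loc, kmp_int32 gtid) {
       // Executes other tasks while waiting until every task generated by
       // the current task has completed.
       __kmpc_omp_taskwait(loc, gtid);
     }
*/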
2156 
2157 // __kmpc_omp_taskyield: switch to a different task
2158 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2159  kmp_taskdata_t *taskdata = NULL;
2160  kmp_info_t *thread;
2161  int thread_finished = FALSE;
2162 
2163  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2164  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2165 
2166  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2167  gtid, loc_ref, end_part));
2168  __kmp_assert_valid_gtid(gtid);
2169 
2170  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2171  thread = __kmp_threads[gtid];
2172  taskdata = thread->th.th_current_task;
2173 // Should we model this as a task wait or not?
2174 // Debugger: The taskwait is active. Store the location and the thread that
2175 // encountered the taskwait.
2176 #if USE_ITT_BUILD
2177 // Note: These values are used by ITT events as well.
2178 #endif /* USE_ITT_BUILD */
2179  taskdata->td_taskwait_counter += 1;
2180  taskdata->td_taskwait_ident = loc_ref;
2181  taskdata->td_taskwait_thread = gtid + 1;
2182 
2183 #if USE_ITT_BUILD
2184  void *itt_sync_obj = NULL;
2185 #if USE_ITT_NOTIFY
2186  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2187 #endif /* USE_ITT_NOTIFY */
2188 #endif /* USE_ITT_BUILD */
2189  if (!taskdata->td_flags.team_serial) {
2190  kmp_task_team_t *task_team = thread->th.th_task_team;
2191  if (task_team != NULL) {
2192  if (KMP_TASKING_ENABLED(task_team)) {
2193 #if OMPT_SUPPORT
2194  if (UNLIKELY(ompt_enabled.enabled))
2195  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2196 #endif
2197  __kmp_execute_tasks_32(
2198  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2199  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2200  __kmp_task_stealing_constraint);
2201 #if OMPT_SUPPORT
2202  if (UNLIKELY(ompt_enabled.enabled))
2203  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2204 #endif
2205  }
2206  }
2207  }
2208 #if USE_ITT_BUILD
2209  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2210 #endif /* USE_ITT_BUILD */
2211 
2212  // Debugger: The taskwait is completed. Location remains, but thread is
2213  // negated.
2214  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2215  }
2216 
2217  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2218  "returning TASK_CURRENT_NOT_QUEUED\n",
2219  gtid, taskdata));
2220 
2221  return TASK_CURRENT_NOT_QUEUED;
2222 }
2223 
2224 // Task Reduction implementation
2225 //
2226 // Note: the initial implementation did not take into account the possibility
2227 // of specifying omp_orig for the initializer of a UDR (user defined reduction).
2228 // The corrected implementation takes the omp_orig object into account.
2229 // The compiler is free to use the old implementation if omp_orig is not specified.
2230 
2239 typedef struct kmp_taskred_flags {
2241  unsigned lazy_priv : 1;
2242  unsigned reserved31 : 31;
2243 } kmp_taskred_flags_t;
2244 
2248 typedef struct kmp_task_red_input {
2249  void *reduce_shar;
2250  size_t reduce_size;
2251  // three compiler-generated routines (init, fini are optional):
2252  void *reduce_init;
2253  void *reduce_fini;
2254  void *reduce_comb;
2255  kmp_taskred_flags_t flags;
2256 } kmp_task_red_input_t;
2257 
2261 typedef struct kmp_taskred_data {
2262  void *reduce_shar;
2263  size_t reduce_size;
2264  kmp_taskred_flags_t flags;
2265  void *reduce_priv;
2266  void *reduce_pend;
2267  // three compiler-generated routines (init, fini are optional):
2268  void *reduce_comb;
2269  void *reduce_init;
2270  void *reduce_fini;
2271  void *reduce_orig;
2272 } kmp_taskred_data_t;
2273 
2279 typedef struct kmp_taskred_input {
2280  void *reduce_shar;
2281  void *reduce_orig;
2282  size_t reduce_size;
2283  // three compiler-generated routines (init, fini are optional):
2284  void *reduce_init;
2285  void *reduce_fini;
2286  void *reduce_comb;
2287  kmp_taskred_flags_t flags;
2288 } kmp_taskred_input_t;
2289 
2293 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2294 template <>
2295 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2296  kmp_task_red_input_t &src) {
2297  item.reduce_orig = NULL;
2298 }
2299 template <>
2300 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2301  kmp_taskred_input_t &src) {
2302  if (src.reduce_orig != NULL) {
2303  item.reduce_orig = src.reduce_orig;
2304  } else {
2305  item.reduce_orig = src.reduce_shar;
2306  } // non-NULL reduce_orig means new interface used
2307 }
2308 
2309 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2310 template <>
2311 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2312  size_t offset) {
2313  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2314 }
2315 template <>
2316 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2317  size_t offset) {
2318  ((void (*)(void *, void *))item.reduce_init)(
2319  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2320 }
2321 
2322 template <typename T>
2323 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2324  __kmp_assert_valid_gtid(gtid);
2325  kmp_info_t *thread = __kmp_threads[gtid];
2326  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2327  kmp_uint32 nth = thread->th.th_team_nproc;
2328  kmp_taskred_data_t *arr;
2329 
2330  // check input data just in case
2331  KMP_ASSERT(tg != NULL);
2332  KMP_ASSERT(data != NULL);
2333  KMP_ASSERT(num > 0);
2334  if (nth == 1) {
2335  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2336  gtid, tg));
2337  return (void *)tg;
2338  }
2339  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2340  gtid, tg, num));
2341  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2342  thread, num * sizeof(kmp_taskred_data_t));
2343  for (int i = 0; i < num; ++i) {
2344  size_t size = data[i].reduce_size - 1;
2345  // round the size up to cache line per thread-specific item
2346  size += CACHE_LINE - size % CACHE_LINE;
2347  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2348  arr[i].reduce_shar = data[i].reduce_shar;
2349  arr[i].reduce_size = size;
2350  arr[i].flags = data[i].flags;
2351  arr[i].reduce_comb = data[i].reduce_comb;
2352  arr[i].reduce_init = data[i].reduce_init;
2353  arr[i].reduce_fini = data[i].reduce_fini;
2354  __kmp_assign_orig<T>(arr[i], data[i]);
2355  if (!arr[i].flags.lazy_priv) {
2356  // allocate cache-line aligned block and fill it with zeros
2357  arr[i].reduce_priv = __kmp_allocate(nth * size);
2358  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2359  if (arr[i].reduce_init != NULL) {
2360  // initialize all thread-specific items
2361  for (size_t j = 0; j < nth; ++j) {
2362  __kmp_call_init<T>(arr[i], j * size);
2363  }
2364  }
2365  } else {
2366  // only allocate space for pointers now,
2367  // objects will be lazily allocated/initialized if/when requested
2368  // note that __kmp_allocate zeroes the allocated memory
2369  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2370  }
2371  }
2372  tg->reduce_data = (void *)arr;
2373  tg->reduce_num_data = num;
2374  return (void *)tg;
2375 }
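/* For the non-lazy case above, reduce_priv is one block holding nth
   cache-line-rounded copies, so a thread's copy is reached by plain pointer
   arithmetic; a minimal sketch, with tid the thread's id within the team:

     kmp_taskred_data_t *item = &arr[i];
     void *my_copy = (char *)item->reduce_priv + tid * item->reduce_size;
     // item->reduce_pend == (char *)item->reduce_priv + nth * item->reduce_size
     // marks the end of the block for the range check in get_th_data below.
*/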
2376 
2391 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2392  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2393 }
2394 
2407 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2408  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2409 }
2410 
2411 // Copy task reduction data (except for shared pointers).
2412 template <typename T>
2413 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2414  kmp_taskgroup_t *tg, void *reduce_data) {
2415  kmp_taskred_data_t *arr;
2416  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2417  " from data %p\n",
2418  thr, tg, reduce_data));
2419  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2420  thr, num * sizeof(kmp_taskred_data_t));
2421  // threads will share private copies, thunk routines, sizes, flags, etc.:
2422  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2423  for (int i = 0; i < num; ++i) {
2424  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2425  }
2426  tg->reduce_data = (void *)arr;
2427  tg->reduce_num_data = num;
2428 }
2429 
2439 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2440  __kmp_assert_valid_gtid(gtid);
2441  kmp_info_t *thread = __kmp_threads[gtid];
2442  kmp_int32 nth = thread->th.th_team_nproc;
2443  if (nth == 1)
2444  return data; // nothing to do
2445 
2446  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2447  if (tg == NULL)
2448  tg = thread->th.th_current_task->td_taskgroup;
2449  KMP_ASSERT(tg != NULL);
2450  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2451  kmp_int32 num = tg->reduce_num_data;
2452  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2453 
2454  KMP_ASSERT(data != NULL);
2455  while (tg != NULL) {
2456  for (int i = 0; i < num; ++i) {
2457  if (!arr[i].flags.lazy_priv) {
2458  if (data == arr[i].reduce_shar ||
2459  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2460  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2461  } else {
2462  // check shared location first
2463  void **p_priv = (void **)(arr[i].reduce_priv);
2464  if (data == arr[i].reduce_shar)
2465  goto found;
2466  // check if we got some thread-specific location as a parameter
2467  for (int j = 0; j < nth; ++j)
2468  if (data == p_priv[j])
2469  goto found;
2470  continue; // not found, continue search
2471  found:
2472  if (p_priv[tid] == NULL) {
2473  // allocate thread specific object lazily
2474  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2475  if (arr[i].reduce_init != NULL) {
2476  if (arr[i].reduce_orig != NULL) { // new interface
2477  ((void (*)(void *, void *))arr[i].reduce_init)(
2478  p_priv[tid], arr[i].reduce_orig);
2479  } else { // old interface (single parameter)
2480  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2481  }
2482  }
2483  }
2484  return p_priv[tid];
2485  }
2486  }
2487  tg = tg->parent;
2488  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2489  num = tg->reduce_num_data;
2490  }
2491  KMP_ASSERT2(0, "Unknown task reduction item");
2492  return NULL; // ERROR, this line never executed
2493 }
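/* A task body generated for "in_reduction(+: x)" would typically fetch its
   private copy through the routine above before accumulating. A minimal
   sketch; red_task_entry and the shareds layout are illustrative only:

     static kmp_int32 red_task_entry(kmp_int32 gtid, kmp_task_t *t) {
       int *x_shared = *(int **)t->shareds; // hypothetical shareds layout
       int *x_priv = (int *)__kmpc_task_reduction_get_th_data(gtid, NULL, x_shared);
       *x_priv += 1; // update only the thread-specific copy
       return 0;
     }

   Passing NULL for tskgrp makes the lookup start from the current task's
   innermost taskgroup, as handled at the top of the routine. */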
2494 
2495 // Finalize task reduction.
2496 // Called from __kmpc_end_taskgroup()
2497 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2498  kmp_int32 nth = th->th.th_team_nproc;
2499  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2500  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2501  kmp_int32 num = tg->reduce_num_data;
2502  for (int i = 0; i < num; ++i) {
2503  void *sh_data = arr[i].reduce_shar;
2504  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2505  void (*f_comb)(void *, void *) =
2506  (void (*)(void *, void *))(arr[i].reduce_comb);
2507  if (!arr[i].flags.lazy_priv) {
2508  void *pr_data = arr[i].reduce_priv;
2509  size_t size = arr[i].reduce_size;
2510  for (int j = 0; j < nth; ++j) {
2511  void *priv_data = (char *)pr_data + j * size;
2512  f_comb(sh_data, priv_data); // combine results
2513  if (f_fini)
2514  f_fini(priv_data); // finalize if needed
2515  }
2516  } else {
2517  void **pr_data = (void **)(arr[i].reduce_priv);
2518  for (int j = 0; j < nth; ++j) {
2519  if (pr_data[j] != NULL) {
2520  f_comb(sh_data, pr_data[j]); // combine results
2521  if (f_fini)
2522  f_fini(pr_data[j]); // finalize if needed
2523  __kmp_free(pr_data[j]);
2524  }
2525  }
2526  }
2527  __kmp_free(arr[i].reduce_priv);
2528  }
2529  __kmp_thread_free(th, arr);
2530  tg->reduce_data = NULL;
2531  tg->reduce_num_data = 0;
2532 }
2533 
2534 // Cleanup task reduction data for parallel or worksharing,
2535 // do not touch task private data other threads still working with.
2536 // Called from __kmpc_end_taskgroup()
2537 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2538  __kmp_thread_free(th, tg->reduce_data);
2539  tg->reduce_data = NULL;
2540  tg->reduce_num_data = 0;
2541 }
2542 
2543 template <typename T>
2544 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2545  int num, T *data) {
2546  __kmp_assert_valid_gtid(gtid);
2547  kmp_info_t *thr = __kmp_threads[gtid];
2548  kmp_int32 nth = thr->th.th_team_nproc;
2549  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2550  if (nth == 1) {
2551  KA_TRACE(10,
2552  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2553  gtid, thr->th.th_current_task->td_taskgroup));
2554  return (void *)thr->th.th_current_task->td_taskgroup;
2555  }
2556  kmp_team_t *team = thr->th.th_team;
2557  void *reduce_data;
2558  kmp_taskgroup_t *tg;
2559  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2560  if (reduce_data == NULL &&
2561  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2562  (void *)1)) {
2563  // single thread enters this block to initialize common reduction data
2564  KMP_DEBUG_ASSERT(reduce_data == NULL);
2565  // first initialize own data, then make a copy other threads can use
2566  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2567  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2568  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2569  // fini counters should be 0 at this point
2570  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2571  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2572  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2573  } else {
2574  while (
2575  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2576  (void *)1) { // wait for task reduction initialization
2577  KMP_CPU_PAUSE();
2578  }
2579  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2580  tg = thr->th.th_current_task->td_taskgroup;
2581  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2582  }
2583  return tg;
2584 }
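/* The (void *)1 value above is a "being initialized" sentinel: exactly one
   thread wins the compare-and-store on t_tg_reduce_data[is_ws] and later
   publishes the real pointer with a release store, while the other threads
   spin until an acquire load stops returning the sentinel. In outline:

     // winner:  NULL --CAS--> (void *)1 ... fill reduce_data ... ST_REL(real ptr)
     // others:  while (LD_ACQ(&t_tg_reduce_data[is_ws]) == (void *)1) KMP_CPU_PAUSE();

   so any observed value greater than (void *)1 is a fully initialized array
   of kmp_taskred_data_t that can be copied via __kmp_task_reduction_init_copy. */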
2585 
2602 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2603  int num, void *data) {
2604  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2605  (kmp_task_red_input_t *)data);
2606 }
2607 
2622 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2623  void *data) {
2624  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2625  (kmp_taskred_input_t *)data);
2626 }
2627 
2636 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2637  __kmpc_end_taskgroup(loc, gtid);
2638 }
2639 
2640 // __kmpc_taskgroup: Start a new taskgroup
2641 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2642  __kmp_assert_valid_gtid(gtid);
2643  kmp_info_t *thread = __kmp_threads[gtid];
2644  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2645  kmp_taskgroup_t *tg_new =
2646  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2647  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2648  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2649  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2650  tg_new->parent = taskdata->td_taskgroup;
2651  tg_new->reduce_data = NULL;
2652  tg_new->reduce_num_data = 0;
2653  tg_new->gomp_data = NULL;
2654  taskdata->td_taskgroup = tg_new;
2655 
2656 #if OMPT_SUPPORT && OMPT_OPTIONAL
2657  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2658  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2659  if (!codeptr)
2660  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2661  kmp_team_t *team = thread->th.th_team;
2662  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2663  // FIXME: I think this is wrong for lwt!
2664  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2665 
2666  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2667  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2668  &(my_task_data), codeptr);
2669  }
2670 #endif
2671 }
2672 
2673 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2674 // and its descendants are complete
2675 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2676  __kmp_assert_valid_gtid(gtid);
2677  kmp_info_t *thread = __kmp_threads[gtid];
2678  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2679  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2680  int thread_finished = FALSE;
2681 
2682 #if OMPT_SUPPORT && OMPT_OPTIONAL
2683  kmp_team_t *team;
2684  ompt_data_t my_task_data;
2685  ompt_data_t my_parallel_data;
2686  void *codeptr = nullptr;
2687  if (UNLIKELY(ompt_enabled.enabled)) {
2688  team = thread->th.th_team;
2689  my_task_data = taskdata->ompt_task_info.task_data;
2690  // FIXME: I think this is wrong for lwt!
2691  my_parallel_data = team->t.ompt_team_info.parallel_data;
2692  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2693  if (!codeptr)
2694  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2695  }
2696 #endif
2697 
2698  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2699  KMP_DEBUG_ASSERT(taskgroup != NULL);
2700  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2701 
2702  if (__kmp_tasking_mode != tskm_immediate_exec) {
2703  // mark task as waiting not on a barrier
2704  taskdata->td_taskwait_counter += 1;
2705  taskdata->td_taskwait_ident = loc;
2706  taskdata->td_taskwait_thread = gtid + 1;
2707 #if USE_ITT_BUILD
2708  // For ITT the taskgroup wait is similar to taskwait until we need to
2709  // distinguish them
2710  void *itt_sync_obj = NULL;
2711 #if USE_ITT_NOTIFY
2712  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2713 #endif /* USE_ITT_NOTIFY */
2714 #endif /* USE_ITT_BUILD */
2715 
2716 #if OMPT_SUPPORT && OMPT_OPTIONAL
2717  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2718  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2719  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2720  &(my_task_data), codeptr);
2721  }
2722 #endif
2723 
2724  if (!taskdata->td_flags.team_serial ||
2725  (thread->th.th_task_team != NULL &&
2726  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2727  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2728  kmp_flag_32<false, false> flag(
2729  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2730  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2731  flag.execute_tasks(thread, gtid, FALSE,
2732  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2733  __kmp_task_stealing_constraint);
2734  }
2735  }
2736  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2737 
2738 #if OMPT_SUPPORT && OMPT_OPTIONAL
2739  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2740  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2741  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2742  &(my_task_data), codeptr);
2743  }
2744 #endif
2745 
2746 #if USE_ITT_BUILD
2747  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2748  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2749 #endif /* USE_ITT_BUILD */
2750  }
2751  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2752 
2753  if (taskgroup->reduce_data != NULL &&
2754  !taskgroup->gomp_data) { // need to reduce?
2755  int cnt;
2756  void *reduce_data;
2757  kmp_team_t *t = thread->th.th_team;
2758  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2759  // check if <priv> data of the first reduction variable is shared for the team
2760  void *priv0 = arr[0].reduce_priv;
2761  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2762  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2763  // finishing task reduction on parallel
2764  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2765  if (cnt == thread->th.th_team_nproc - 1) {
2766  // we are the last thread passing __kmpc_reduction_modifier_fini()
2767  // finalize task reduction:
2768  __kmp_task_reduction_fini(thread, taskgroup);
2769  // cleanup fields in the team structure:
2770  // TODO: is relaxed store enough here (whole barrier should follow)?
2771  __kmp_thread_free(thread, reduce_data);
2772  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2773  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2774  } else {
2775  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2776  // so do not finalize reduction, just clean own copy of the data
2777  __kmp_task_reduction_clean(thread, taskgroup);
2778  }
2779  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2780  NULL &&
2781  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2782  // finishing task reduction on worksharing
2783  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2784  if (cnt == thread->th.th_team_nproc - 1) {
2785  // we are the last thread passing __kmpc_reduction_modifier_fini()
2786  __kmp_task_reduction_fini(thread, taskgroup);
2787  // cleanup fields in team structure:
2788  // TODO: is relaxed store enough here (whole barrier should follow)?
2789  __kmp_thread_free(thread, reduce_data);
2790  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2791  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2792  } else {
2793  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2794  // so do not finalize reduction, just clean own copy of the data
2795  __kmp_task_reduction_clean(thread, taskgroup);
2796  }
2797  } else {
2798  // finishing task reduction on taskgroup
2799  __kmp_task_reduction_fini(thread, taskgroup);
2800  }
2801  }
2802  // Restore parent taskgroup for the current task
2803  taskdata->td_taskgroup = taskgroup->parent;
2804  __kmp_thread_free(thread, taskgroup);
2805 
2806  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2807  gtid, taskdata));
2808 
2809 #if OMPT_SUPPORT && OMPT_OPTIONAL
2810  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2811  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2812  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2813  &(my_task_data), codeptr);
2814  }
2815 #endif
2816 }
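/* Taken together, "#pragma omp taskgroup task_reduction(+: x)" brackets task
   creation with the two entry points above plus a reduction-init call. A
   minimal sketch; the kmp_taskred_input_t setup is abbreviated and the names
   are illustrative:

     static void taskgroup_with_reduction(ident_t *loc, kmp_int32 gtid,
                                          kmp_taskred_input_t *in) {
       __kmpc_taskgroup(loc, gtid);               // push a new taskgroup
       (void)__kmpc_taskred_init(gtid, 1, in);    // register one reduction item
       // ... create tasks whose bodies call __kmpc_task_reduction_get_th_data ...
       __kmpc_end_taskgroup(loc, gtid);           // wait, then combine via reduce_comb
     }
*/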
2817 
2818 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2819  kmp_task_team_t *task_team,
2820  kmp_int32 is_constrained) {
2821  kmp_task_t *task = NULL;
2822  kmp_taskdata_t *taskdata;
2823  kmp_taskdata_t *current;
2824  kmp_thread_data_t *thread_data;
2825  int ntasks = task_team->tt.tt_num_task_pri;
2826  if (ntasks == 0) {
2827  KA_TRACE(
2828  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2829  return NULL;
2830  }
2831  do {
2832  // decrement num_tasks to "reserve" one task to get for execution
2833  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2834  ntasks - 1))
2835  break;
2836  } while (ntasks > 0);
2837  if (ntasks == 0) {
2838  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2839  __kmp_get_gtid()));
2840  return NULL;
2841  }
2842  // We got a "ticket" to get a "reserved" priority task
2843  int deque_ntasks;
2844  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2845  do {
2846  KMP_ASSERT(list != NULL);
2847  thread_data = &list->td;
2848  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2849  deque_ntasks = thread_data->td.td_deque_ntasks;
2850  if (deque_ntasks == 0) {
2851  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2852  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2853  __kmp_get_gtid(), thread_data));
2854  list = list->next;
2855  }
2856  } while (deque_ntasks == 0);
2857  KMP_DEBUG_ASSERT(deque_ntasks);
2858  int target = thread_data->td.td_deque_head;
2859  current = __kmp_threads[gtid]->th.th_current_task;
2860  taskdata = thread_data->td.td_deque[target];
2861  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2862  // Bump head pointer and Wrap.
2863  thread_data->td.td_deque_head =
2864  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2865  } else {
2866  if (!task_team->tt.tt_untied_task_encountered) {
2867  // The TSC does not allow stealing the victim task
2868  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2869  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2870  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2871  gtid, thread_data, task_team, deque_ntasks, target,
2872  thread_data->td.td_deque_tail));
2873  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2874  return NULL;
2875  }
2876  int i;
2877  // walk through the deque trying to steal any task
2878  taskdata = NULL;
2879  for (i = 1; i < deque_ntasks; ++i) {
2880  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2881  taskdata = thread_data->td.td_deque[target];
2882  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2883  break; // found task to execute
2884  } else {
2885  taskdata = NULL;
2886  }
2887  }
2888  if (taskdata == NULL) {
2889  // No appropriate candidate found to execute
2890  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2891  KA_TRACE(
2892  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2893  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2894  gtid, thread_data, task_team, deque_ntasks,
2895  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2896  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2897  return NULL;
2898  }
2899  int prev = target;
2900  for (i = i + 1; i < deque_ntasks; ++i) {
2901  // shift remaining tasks in the deque left by 1
2902  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2903  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2904  prev = target;
2905  }
2906  KMP_DEBUG_ASSERT(
2907  thread_data->td.td_deque_tail ==
2908  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2909  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2910  }
2911  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2912  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2913  task = KMP_TASKDATA_TO_TASK(taskdata);
2914  return task;
2915 }
2916 
2917 // __kmp_remove_my_task: remove a task from my own deque
2918 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2919  kmp_task_team_t *task_team,
2920  kmp_int32 is_constrained) {
2921  kmp_task_t *task;
2922  kmp_taskdata_t *taskdata;
2923  kmp_thread_data_t *thread_data;
2924  kmp_uint32 tail;
2925 
2926  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2927  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2928  NULL); // Caller should check this condition
2929 
2930  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2931 
2932  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2933  gtid, thread_data->td.td_deque_ntasks,
2934  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2935 
2936  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2937  KA_TRACE(10,
2938  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2939  "ntasks=%d head=%u tail=%u\n",
2940  gtid, thread_data->td.td_deque_ntasks,
2941  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2942  return NULL;
2943  }
2944 
2945  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2946 
2947  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2948  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2949  KA_TRACE(10,
2950  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2951  "ntasks=%d head=%u tail=%u\n",
2952  gtid, thread_data->td.td_deque_ntasks,
2953  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2954  return NULL;
2955  }
2956 
2957  tail = (thread_data->td.td_deque_tail - 1) &
2958  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2959  taskdata = thread_data->td.td_deque[tail];
2960 
2961  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2962  thread->th.th_current_task)) {
2963  // The task scheduling constraint (TSC) does not allow taking the tail task
2964  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2965  KA_TRACE(10,
2966  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2967  "ntasks=%d head=%u tail=%u\n",
2968  gtid, thread_data->td.td_deque_ntasks,
2969  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2970  return NULL;
2971  }
2972 
2973  thread_data->td.td_deque_tail = tail;
2974  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2975 
2976  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2977 
2978  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2979  "ntasks=%d head=%u tail=%u\n",
2980  gtid, taskdata, thread_data->td.td_deque_ntasks,
2981  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2982 
2983  task = KMP_TASKDATA_TO_TASK(taskdata);
2984  return task;
2985 }
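// A minimal sketch of the deque discipline used above (illustrative only, not
// part of the runtime; assumes a capacity of 8 so TASK_DEQUE_MASK(td) == 7):
// the per-thread deque is a circular buffer whose capacity is a power of two,
// so index arithmetic is a bitwise AND rather than a modulo. The owner pushes
// and pops at the tail (LIFO), while thieves take from the head (FIFO):
//
//   tail = (tail - 1) & 7; // owner pop in __kmp_remove_my_task: 0 wraps to 7
//   head = (head + 1) & 7; // thief take in __kmp_steal_task: 7 wraps to 0
//
// td_deque_ntasks is tracked separately, so an empty deque and a full one
// (head == tail in both cases) can be told apart without wasting a slot.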
2986 
2987 // __kmp_steal_task: remove a task from another thread's deque
2988 // Assumes that the calling thread has already checked the existence of
2989 // task_team thread_data before calling this routine.
2990 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2991  kmp_task_team_t *task_team,
2992  std::atomic<kmp_int32> *unfinished_threads,
2993  int *thread_finished,
2994  kmp_int32 is_constrained) {
2995  kmp_task_t *task;
2996  kmp_taskdata_t *taskdata;
2997  kmp_taskdata_t *current;
2998  kmp_thread_data_t *victim_td, *threads_data;
2999  kmp_int32 target;
3000  kmp_int32 victim_tid;
3001 
3002  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3003 
3004  threads_data = task_team->tt.tt_threads_data;
3005  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3006 
3007  victim_tid = victim_thr->th.th_info.ds.ds_tid;
3008  victim_td = &threads_data[victim_tid];
3009 
3010  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3011  "task_team=%p ntasks=%d head=%u tail=%u\n",
3012  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3013  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3014  victim_td->td.td_deque_tail));
3015 
3016  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3017  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3018  "task_team=%p ntasks=%d head=%u tail=%u\n",
3019  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3020  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3021  victim_td->td.td_deque_tail));
3022  return NULL;
3023  }
3024 
3025  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3026 
3027  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3028  // Check again after we acquire the lock
3029  if (ntasks == 0) {
3030  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3031  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3032  "task_team=%p ntasks=%d head=%u tail=%u\n",
3033  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3034  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3035  return NULL;
3036  }
3037 
3038  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3039  current = __kmp_threads[gtid]->th.th_current_task;
3040  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3041  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3042  // Bump head pointer and Wrap.
3043  victim_td->td.td_deque_head =
3044  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3045  } else {
3046  if (!task_team->tt.tt_untied_task_encountered) {
3047  // The TSC does not allow stealing the victim's task
3048  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3049  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3050  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3051  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3052  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3053  return NULL;
3054  }
3055  int i;
3056  // walk through victim's deque trying to steal any task
3057  target = victim_td->td.td_deque_head;
3058  taskdata = NULL;
3059  for (i = 1; i < ntasks; ++i) {
3060  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3061  taskdata = victim_td->td.td_deque[target];
3062  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3063  break; // found victim task
3064  } else {
3065  taskdata = NULL;
3066  }
3067  }
3068  if (taskdata == NULL) {
3069  // No appropriate candidate to steal found
3070  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3071  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3072  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3073  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3074  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3075  return NULL;
3076  }
3077  int prev = target;
3078  for (i = i + 1; i < ntasks; ++i) {
3079  // shift remaining tasks in the deque left by 1
3080  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3081  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3082  prev = target;
3083  }
3084  KMP_DEBUG_ASSERT(
3085  victim_td->td.td_deque_tail ==
3086  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3087  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3088  }
3089  if (*thread_finished) {
3090  // We need to un-mark this victim as a finished victim. This must be done
3091  // before releasing the lock, or else other threads (starting with the
3092  // primary thread victim) might be prematurely released from the barrier!!!
3093 #if KMP_DEBUG
3094  kmp_int32 count =
3095 #endif
3096  KMP_ATOMIC_INC(unfinished_threads);
3097  KA_TRACE(
3098  20,
3099  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3100  gtid, count + 1, task_team));
3101  *thread_finished = FALSE;
3102  }
3103  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3104 
3105  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3106 
3107  KMP_COUNT_BLOCK(TASK_stolen);
3108  KA_TRACE(10,
3109  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3110  "task_team=%p ntasks=%d head=%u tail=%u\n",
3111  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3112  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3113 
3114  task = KMP_TASKDATA_TO_TASK(taskdata);
3115  return task;
3116 }
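// Illustrative walk-through of the compaction path above (hypothetical slot
// numbers; __kmp_get_priority_task uses the same scheme): when the head task
// is blocked by the task scheduling constraint but untied tasks were
// encountered, a task is removed from the middle of the deque and the later
// entries are shifted left by one, e.g. with head=2, tail=6 and a match at
// slot 4:
//
//   before:  [2]=t2 [3]=t3 [4]=t4* [5]=t5     head=2 tail=6
//   shift :  deque[4] = deque[5]              (the "shift remaining" loop)
//   after :  [2]=t2 [3]=t3 [4]=t5             head=2 tail=5 (tail -= 1, wrapped)
//
// so the relative order of the remaining tasks is preserved.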
3117 
3118 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3119 // condition is satisfied (return true) or there are none left (return false).
3120 //
3121 // final_spin is TRUE if this is the spin at the release barrier.
3122 // thread_finished indicates whether the thread is finished executing all
3123 // the tasks it has on its deque, and is at the release barrier.
3124 // spinner is the location on which to spin.
3125 // spinner == NULL means only execute a single task and return.
3126 // checker is the value to check to terminate the spin.
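// A rough sketch of the selection order implemented below (simplified;
// sleeping-victim wake-ups, ITT hooks and barrier bookkeeping omitted):
//
//   while (work may remain) {
//     task = task_team->tt.tt_num_task_pri
//                ? __kmp_get_priority_task(gtid, task_team, is_constrained)
//                : NULL;                      // priority deques first
//     if (!task && use_own_tasks)
//       task = __kmp_remove_my_task(...);     // then my own deque
//     if (!task && nthreads > 1)
//       task = __kmp_steal_task(victim, ...); // then steal from a victim
//     if (!task)
//       break;                                // fall through to final-spin checks
//     __kmp_invoke_task(gtid, task, current_task);
//   }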
3127 template <class C>
3128 static inline int __kmp_execute_tasks_template(
3129  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3130  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3131  kmp_int32 is_constrained) {
3132  kmp_task_team_t *task_team = thread->th.th_task_team;
3133  kmp_thread_data_t *threads_data;
3134  kmp_task_t *task;
3135  kmp_info_t *other_thread;
3136  kmp_taskdata_t *current_task = thread->th.th_current_task;
3137  std::atomic<kmp_int32> *unfinished_threads;
3138  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3139  tid = thread->th.th_info.ds.ds_tid;
3140 
3141  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3142  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3143 
3144  if (task_team == NULL || current_task == NULL)
3145  return FALSE;
3146 
3147  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3148  "*thread_finished=%d\n",
3149  gtid, final_spin, *thread_finished));
3150 
3151  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3152  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3153 
3154  KMP_DEBUG_ASSERT(threads_data != NULL);
3155 
3156  nthreads = task_team->tt.tt_nproc;
3157  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3158  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
3159  task_team->tt.tt_hidden_helper_task_encountered);
3160  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3161 
3162  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3163  // getting tasks from target constructs
3164  while (1) { // Inner loop to find a task and execute it
3165  task = NULL;
3166  if (task_team->tt.tt_num_task_pri) { // get priority task first
3167  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3168  }
3169  if (task == NULL && use_own_tasks) { // check own queue next
3170  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3171  }
3172  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3173  int asleep = 1;
3174  use_own_tasks = 0;
3175  // Try to steal from the last place I stole from successfully.
3176  if (victim_tid == -2) { // haven't stolen anything yet
3177  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3178  if (victim_tid !=
3179  -1) // if we have a last stolen from victim, get the thread
3180  other_thread = threads_data[victim_tid].td.td_thr;
3181  }
3182  if (victim_tid != -1) { // found last victim
3183  asleep = 0;
3184  } else if (!new_victim) { // no recent steals and we haven't already
3185  // used a new victim; select a random thread
3186  do { // Find a different thread to steal work from.
3187  // Pick a random thread. Initial plan was to cycle through all the
3188  // threads, and only return if we tried to steal from every thread,
3189  // and failed. Arch says that's not such a great idea.
3190  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3191  if (victim_tid >= tid) {
3192  ++victim_tid; // Adjusts random distribution to exclude self
3193  }
3194  // Found a potential victim
3195  other_thread = threads_data[victim_tid].td.td_thr;
3196  // There is a slight chance that __kmp_enable_tasking() did not wake
3197  // up all threads waiting at the barrier. If victim is sleeping,
3198  // then wake it up. Since we were going to pay the cache miss
3199  // penalty for referencing another thread's kmp_info_t struct
3200  // anyway,
3201  // the check shouldn't cost too much performance at this point. In
3202  // extra barrier mode, threads do not sleep at the separate tasking
3203  // barrier, so this isn't a problem.
3204  asleep = 0;
3205  if ((__kmp_tasking_mode == tskm_task_teams) &&
3206  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3207  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3208  NULL)) {
3209  asleep = 1;
3210  __kmp_null_resume_wrapper(other_thread);
3211  // A sleeping thread should not have any tasks on its queue.
3212  // There is a slight possibility that it resumes, steals a task
3213  // from another thread, which spawns more tasks, all in the time
3214  // that it takes this thread to check => don't write an assertion
3215  // that the victim's queue is empty. Try stealing from a
3216  // different thread.
3217  }
3218  } while (asleep);
3219  }
3220 
3221  if (!asleep) {
3222  // We have a victim to try to steal from
3223  task = __kmp_steal_task(other_thread, gtid, task_team,
3224  unfinished_threads, thread_finished,
3225  is_constrained);
3226  }
3227  if (task != NULL) { // set last stolen to victim
3228  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3229  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3230  // The pre-refactored code did not try more than 1 successful new
3231  // victim, unless the last one generated more local tasks;
3232  // new_victim keeps track of this
3233  new_victim = 1;
3234  }
3235  } else { // No tasks found; unset last_stolen
3236  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3237  victim_tid = -2; // no successful victim found
3238  }
3239  }
3240 
3241  if (task == NULL)
3242  break; // break out of tasking loop
3243 
3244 // Found a task; execute it
3245 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3246  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3247  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3248  // get the object reliably
3249  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3250  }
3251  __kmp_itt_task_starting(itt_sync_obj);
3252  }
3253 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3254  __kmp_invoke_task(gtid, task, current_task);
3255 #if USE_ITT_BUILD
3256  if (itt_sync_obj != NULL)
3257  __kmp_itt_task_finished(itt_sync_obj);
3258 #endif /* USE_ITT_BUILD */
3259  // If this thread is only partway through the barrier and the condition is
3260  // met, then return now, so that the barrier gather/release pattern can
3261  // proceed. If this thread is in the last spin loop in the barrier,
3262  // waiting to be released, we know that the termination condition will not
3263  // be satisfied, so don't waste any cycles checking it.
3264  if (flag == NULL || (!final_spin && flag->done_check())) {
3265  KA_TRACE(
3266  15,
3267  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3268  gtid));
3269  return TRUE;
3270  }
3271  if (thread->th.th_task_team == NULL) {
3272  break;
3273  }
3274  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3275  // If execution of a stolen task results in more tasks being placed on our
3276  // run queue, reset use_own_tasks
3277  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3278  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3279  "other tasks, restart\n",
3280  gtid));
3281  use_own_tasks = 1;
3282  new_victim = 0;
3283  }
3284  }
3285 
3286  // The task source has been exhausted. If in final spin loop of barrier,
3287  // check if termination condition is satisfied. The work queue may be empty
3288  // but there might be proxy tasks still executing.
3289  if (final_spin &&
3290  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3291  // First, decrement the #unfinished threads, if that has not already been
3292  // done. This decrement might be to the spin location, and result in the
3293  // termination condition being satisfied.
3294  if (!*thread_finished) {
3295 #if KMP_DEBUG
3296  kmp_int32 count = -1 +
3297 #endif
3298  KMP_ATOMIC_DEC(unfinished_threads);
3299  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3300  "unfinished_threads to %d task_team=%p\n",
3301  gtid, count, task_team));
3302  *thread_finished = TRUE;
3303  }
3304 
3305  // It is now unsafe to reference thread->th.th_team !!!
3306  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3307  // thread to pass through the barrier, where it might reset each thread's
3308  // th.th_team field for the next parallel region. If we can steal more
3309  // work, we know that this has not happened yet.
3310  if (flag != NULL && flag->done_check()) {
3311  KA_TRACE(
3312  15,
3313  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3314  gtid));
3315  return TRUE;
3316  }
3317  }
3318 
3319  // If this thread's task team is NULL, primary thread has recognized that
3320  // there are no more tasks; bail out
3321  if (thread->th.th_task_team == NULL) {
3322  KA_TRACE(15,
3323  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3324  return FALSE;
3325  }
3326 
3327  // Check the flag again to see whether it is already done, to avoid being
3328  // trapped in an infinite loop when an if0 task depends on a hidden helper
3329  // task outside any parallel region. Detached tasks are not affected in this
3330  // case because the only thread executing this function has to execute the
3331  // proxy task, so that case follows another code path with the same check.
3332  if (flag == NULL || (!final_spin && flag->done_check())) {
3333  KA_TRACE(15,
3334  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3335  gtid));
3336  return TRUE;
3337  }
3338 
3339  // We could be getting tasks from target constructs; if this is the only
3340  // thread, keep trying to execute tasks from own queue
3341  if (nthreads == 1 &&
3342  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3343  use_own_tasks = 1;
3344  else {
3345  KA_TRACE(15,
3346  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3347  return FALSE;
3348  }
3349  }
3350 }
3351 
3352 template <bool C, bool S>
3353 int __kmp_execute_tasks_32(
3354  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3355  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3356  kmp_int32 is_constrained) {
3357  return __kmp_execute_tasks_template(
3358  thread, gtid, flag, final_spin,
3359  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3360 }
3361 
3362 template <bool C, bool S>
3363 int __kmp_execute_tasks_64(
3364  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3365  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3366  kmp_int32 is_constrained) {
3367  return __kmp_execute_tasks_template(
3368  thread, gtid, flag, final_spin,
3369  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3370 }
3371 
3372 template <bool C, bool S>
3373 int __kmp_atomic_execute_tasks_64(
3374  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3375  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3376  kmp_int32 is_constrained) {
3377  return __kmp_execute_tasks_template(
3378  thread, gtid, flag, final_spin,
3379  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3380 }
3381 
3382 int __kmp_execute_tasks_oncore(
3383  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3384  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3385  kmp_int32 is_constrained) {
3386  return __kmp_execute_tasks_template(
3387  thread, gtid, flag, final_spin,
3388  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3389 }
3390 
3391 template int
3392 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3393  kmp_flag_32<false, false> *, int,
3394  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3395 
3396 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3397  kmp_flag_64<false, true> *,
3398  int,
3399  int *USE_ITT_BUILD_ARG(void *),
3400  kmp_int32);
3401 
3402 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3403  kmp_flag_64<true, false> *,
3404  int,
3405  int *USE_ITT_BUILD_ARG(void *),
3406  kmp_int32);
3407 
3408 template int __kmp_atomic_execute_tasks_64<false, true>(
3409  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3410  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3411 
3412 template int __kmp_atomic_execute_tasks_64<true, false>(
3413  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3414  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3415 
3416 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3417 // next barrier so they can assist in executing enqueued tasks.
3418 // First thread in allocates the task team atomically.
3419 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3420  kmp_info_t *this_thr) {
3421  kmp_thread_data_t *threads_data;
3422  int nthreads, i, is_init_thread;
3423 
3424  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3425  __kmp_gtid_from_thread(this_thr)));
3426 
3427  KMP_DEBUG_ASSERT(task_team != NULL);
3428  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3429 
3430  nthreads = task_team->tt.tt_nproc;
3431  KMP_DEBUG_ASSERT(nthreads > 0);
3432  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3433 
3434  // Allocate or increase the size of threads_data if necessary
3435  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3436 
3437  if (!is_init_thread) {
3438  // Some other thread already set up the array.
3439  KA_TRACE(
3440  20,
3441  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3442  __kmp_gtid_from_thread(this_thr)));
3443  return;
3444  }
3445  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3446  KMP_DEBUG_ASSERT(threads_data != NULL);
3447 
3448  if (__kmp_tasking_mode == tskm_task_teams &&
3449  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3450  // Release any threads sleeping at the barrier, so that they can steal
3451  // tasks and execute them. In extra barrier mode, threads do not sleep
3452  // at the separate tasking barrier, so this isn't a problem.
3453  for (i = 0; i < nthreads; i++) {
3454  void *sleep_loc;
3455  kmp_info_t *thread = threads_data[i].td.td_thr;
3456 
3457  if (i == this_thr->th.th_info.ds.ds_tid) {
3458  continue;
3459  }
3460  // Since we haven't locked the thread's suspend mutex lock at this
3461  // point, there is a small window where a thread might be putting
3462  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3463  // To work around this, __kmp_execute_tasks_template() periodically checks
3464  // to see if other threads are sleeping (using the same random mechanism that
3465  // is used for task stealing) and awakens them if they are.
3466  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3467  NULL) {
3468  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3469  __kmp_gtid_from_thread(this_thr),
3470  __kmp_gtid_from_thread(thread)));
3471  __kmp_null_resume_wrapper(thread);
3472  } else {
3473  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3474  __kmp_gtid_from_thread(this_thr),
3475  __kmp_gtid_from_thread(thread)));
3476  }
3477  }
3478  }
3479 
3480  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3481  __kmp_gtid_from_thread(this_thr)));
3482 }
3483 
3484 /* // TODO: Check the comment consistency
3485  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
3486  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
3487  * After a child thread checks into a barrier and calls __kmp_release() from
3488  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3489  * longer assume that the kmp_team_t structure is intact (at any moment, the
3490  * primary thread may exit the barrier code and free the team data structure,
3491  * and return the threads to the thread pool).
3492  *
3493  * This does not work with the tasking code, as the thread is still
3494  * expected to participate in the execution of any tasks that may have been
3495  * spawned by a member of the team, and the thread still needs access to
3496  * each thread in the team, so that it can steal work from it.
3497  *
3498  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3499  * counting mechanism, and is allocated by the primary thread before calling
3500  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3501  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3502  * of the kmp_task_team_t structs for consecutive barriers can overlap
3503  * (and will, unless the primary thread is the last thread to exit the barrier
3504  * release phase, which is not typical). The existence of such a struct is
3505  * useful outside the context of tasking.
3506  *
3507  * We currently use the existence of the threads array as an indicator that
3508  * tasks were spawned since the last barrier. If the structure is to be
3509  * useful outside the context of tasking, then this will have to change, but
3510  * not setting the field minimizes the performance impact of tasking on
3511  * barriers, when no explicit tasks were spawned (pushed, actually).
3512  */
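// Illustrative summary of how this plays out below (simplified): each team
// keeps two task-team slots, t_task_team[0] and t_task_team[1], and each
// thread's th_task_state (0 or 1) selects the slot it is currently using.
// __kmp_task_team_setup() makes sure both slots are allocated, and
// __kmp_task_team_sync() flips the parity after every barrier release:
//
//   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
//   // barrier N drains slot p while barrier N+1 is already set up in slot 1-p
//
// so the task team for the previous barrier can finish draining while the
// next one is being used.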
3513 
3514 static kmp_task_team_t *__kmp_free_task_teams =
3515  NULL; // Free list for task_team data structures
3516 // Lock for task team data structures
3517 kmp_bootstrap_lock_t __kmp_task_team_lock =
3518  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3519 
3520 // __kmp_alloc_task_deque:
3521 // Allocates a task deque for a particular thread, and initializes the necessary
3522 // data structures relating to the deque. This only happens once per thread
3523 // per task team since task teams are recycled. No lock is needed during
3524 // allocation since each thread allocates its own deque.
3525 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3526  kmp_thread_data_t *thread_data) {
3527  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3528  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3529 
3530  // Initialize last stolen task field to "none"
3531  thread_data->td.td_deque_last_stolen = -1;
3532 
3533  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3534  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3535  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3536 
3537  KE_TRACE(
3538  10,
3539  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3540  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3541  // Allocate space for task deque, and zero the deque
3542  // Cannot use __kmp_thread_calloc() because threads not around for
3543  // kmp_reap_task_team( ).
3544  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3545  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3546  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3547 }
3548 
3549 // __kmp_free_task_deque:
3550 // Deallocates a task deque for a particular thread. Happens at library
3551 // deallocation so don't need to reset all thread data fields.
3552 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3553  if (thread_data->td.td_deque != NULL) {
3554  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3555  TCW_4(thread_data->td.td_deque_ntasks, 0);
3556  __kmp_free(thread_data->td.td_deque);
3557  thread_data->td.td_deque = NULL;
3558  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3559  }
3560 
3561 #ifdef BUILD_TIED_TASK_STACK
3562  // GEH: Figure out what to do here for td_susp_tied_tasks
3563  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3564  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3565  }
3566 #endif // BUILD_TIED_TASK_STACK
3567 }
3568 
3569 // __kmp_realloc_task_threads_data:
3570 // Allocates a threads_data array for a task team, either by allocating an
3571 // initial array or enlarging an existing array. Only the first thread to get
3572 // the lock allocs or enlarges the array and re-initializes the array elements.
3573 // That thread returns "TRUE", the rest return "FALSE".
3574 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3575 // The current size is given by task_team -> tt.tt_max_threads.
3576 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3577  kmp_task_team_t *task_team) {
3578  kmp_thread_data_t **threads_data_p;
3579  kmp_int32 nthreads, maxthreads;
3580  int is_init_thread = FALSE;
3581 
3582  if (TCR_4(task_team->tt.tt_found_tasks)) {
3583  // Already reallocated and initialized.
3584  return FALSE;
3585  }
3586 
3587  threads_data_p = &task_team->tt.tt_threads_data;
3588  nthreads = task_team->tt.tt_nproc;
3589  maxthreads = task_team->tt.tt_max_threads;
3590 
3591  // All threads must lock when they encounter the first task of the implicit
3592  // task region to make sure threads_data fields are (re)initialized before
3593  // used.
3594  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3595 
3596  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3597  // first thread to enable tasking
3598  kmp_team_t *team = thread->th.th_team;
3599  int i;
3600 
3601  is_init_thread = TRUE;
3602  if (maxthreads < nthreads) {
3603 
3604  if (*threads_data_p != NULL) {
3605  kmp_thread_data_t *old_data = *threads_data_p;
3606  kmp_thread_data_t *new_data = NULL;
3607 
3608  KE_TRACE(
3609  10,
3610  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3611  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3612  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3613  // Reallocate threads_data to have more elements than current array
3614  // Cannot use __kmp_thread_realloc() because threads not around for
3615  // kmp_reap_task_team( ). Note all new array entries are initialized
3616  // to zero by __kmp_allocate().
3617  new_data = (kmp_thread_data_t *)__kmp_allocate(
3618  nthreads * sizeof(kmp_thread_data_t));
3619  // copy old data to new data
3620  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3621  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3622 
3623 #ifdef BUILD_TIED_TASK_STACK
3624  // GEH: Figure out if this is the right thing to do
3625  for (i = maxthreads; i < nthreads; i++) {
3626  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3627  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3628  }
3629 #endif // BUILD_TIED_TASK_STACK
3630  // Install the new data and free the old data
3631  (*threads_data_p) = new_data;
3632  __kmp_free(old_data);
3633  } else {
3634  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3635  "threads data for task_team %p, size = %d\n",
3636  __kmp_gtid_from_thread(thread), task_team, nthreads));
3637  // Make the initial allocate for threads_data array, and zero entries
3638  // Cannot use __kmp_thread_calloc() because threads not around for
3639  // kmp_reap_task_team( ).
3640  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3641  nthreads * sizeof(kmp_thread_data_t));
3642 #ifdef BUILD_TIED_TASK_STACK
3643  // GEH: Figure out if this is the right thing to do
3644  for (i = 0; i < nthreads; i++) {
3645  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3646  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3647  }
3648 #endif // BUILD_TIED_TASK_STACK
3649  }
3650  task_team->tt.tt_max_threads = nthreads;
3651  } else {
3652  // If array has (more than) enough elements, go ahead and use it
3653  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3654  }
3655 
3656  // initialize threads_data pointers back to thread_info structures
3657  for (i = 0; i < nthreads; i++) {
3658  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3659  thread_data->td.td_thr = team->t.t_threads[i];
3660 
3661  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3662  // The last stolen field survives across teams / barriers, and the number
3663  // of threads may have changed. It's possible (likely?) that a new
3664  // parallel region will exhibit the same behavior as the previous region.
3665  thread_data->td.td_deque_last_stolen = -1;
3666  }
3667  }
3668 
3669  KMP_MB();
3670  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3671  }
3672 
3673  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3674  return is_init_thread;
3675 }
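// Worked example of the grow-only resize above (hypothetical thread counts):
// if a team previously ran with 4 threads and now runs with 8, tt_max_threads
// goes from 4 to 8:
//
//   new_data = __kmp_allocate(8 * sizeof(kmp_thread_data_t)); // zeroed
//   KMP_MEMCPY_S(new_data, ..., old_data, 4 * sizeof(kmp_thread_data_t));
//   // entries 0..3 keep their existing deques; entries 4..7 start out zeroed
//
// A later 4-thread region finds maxthreads >= nthreads and simply reuses the
// first 4 entries of the 8-element array.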
3676 
3677 // __kmp_free_task_threads_data:
3678 // Deallocates a threads_data array for a task team, including any attached
3679 // tasking deques. Only occurs at library shutdown.
3680 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3681  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3682  if (task_team->tt.tt_threads_data != NULL) {
3683  int i;
3684  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3685  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3686  }
3687  __kmp_free(task_team->tt.tt_threads_data);
3688  task_team->tt.tt_threads_data = NULL;
3689  }
3690  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3691 }
3692 
3693 // __kmp_free_task_pri_list:
3694 // Deallocates tasking deques used for priority tasks.
3695 // Only occurs at library shutdown.
3696 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3697  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3698  if (task_team->tt.tt_task_pri_list != NULL) {
3699  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3700  while (list != NULL) {
3701  kmp_task_pri_t *next = list->next;
3702  __kmp_free_task_deque(&list->td);
3703  __kmp_free(list);
3704  list = next;
3705  }
3706  task_team->tt.tt_task_pri_list = NULL;
3707  }
3708  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3709 }
3710 
3711 // __kmp_allocate_task_team:
3712 // Allocates a task team associated with a specific team, taking it from
3713 // the global task team free list if possible. Also initializes data
3714 // structures.
3715 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3716  kmp_team_t *team) {
3717  kmp_task_team_t *task_team = NULL;
3718  int nthreads;
3719 
3720  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3721  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3722 
3723  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3724  // Take a task team from the task team pool
3725  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3726  if (__kmp_free_task_teams != NULL) {
3727  task_team = __kmp_free_task_teams;
3728  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3729  task_team->tt.tt_next = NULL;
3730  }
3731  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3732  }
3733 
3734  if (task_team == NULL) {
3735  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3736  "task team for team %p\n",
3737  __kmp_gtid_from_thread(thread), team));
3738  // Allocate a new task team if one is not available. Cannot use
3739  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3740  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3741  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3742  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3743 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3744  // suppress race conditions detection on synchronization flags in debug mode
3745  // this helps to analyze library internals eliminating false positives
3746  __itt_suppress_mark_range(
3747  __itt_suppress_range, __itt_suppress_threading_errors,
3748  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3749  __itt_suppress_mark_range(__itt_suppress_range,
3750  __itt_suppress_threading_errors,
3751  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3752  sizeof(task_team->tt.tt_active));
3753 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3754  // Note: __kmp_allocate zeroes the returned memory, otherwise we would need:
3755  // task_team->tt.tt_threads_data = NULL;
3756  // task_team->tt.tt_max_threads = 0;
3757  // task_team->tt.tt_next = NULL;
3758  }
3759 
3760  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3761  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3762  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3763  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3764 
3765  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3766  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3767  TCW_4(task_team->tt.tt_active, TRUE);
3768 
3769  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3770  "unfinished_threads init'd to %d\n",
3771  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3772  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3773  return task_team;
3774 }
3775 
3776 // __kmp_free_task_team:
3777 // Frees the task team associated with a specific thread, and adds it
3778 // to the global task team free list.
3779 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3780  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3781  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3782 
3783  // Put task team back on free list
3784  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3785 
3786  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3787  task_team->tt.tt_next = __kmp_free_task_teams;
3788  TCW_PTR(__kmp_free_task_teams, task_team);
3789 
3790  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3791 }
3792 
3793 // __kmp_reap_task_teams:
3794 // Free all the task teams on the task team free list.
3795 // Should only be done during library shutdown.
3796 // Cannot do anything that needs a thread structure or gtid since they are
3797 // already gone.
3798 void __kmp_reap_task_teams(void) {
3799  kmp_task_team_t *task_team;
3800 
3801  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3802  // Free all task_teams on the free list
3803  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3804  while ((task_team = __kmp_free_task_teams) != NULL) {
3805  __kmp_free_task_teams = task_team->tt.tt_next;
3806  task_team->tt.tt_next = NULL;
3807 
3808  // Free threads_data if necessary
3809  if (task_team->tt.tt_threads_data != NULL) {
3810  __kmp_free_task_threads_data(task_team);
3811  }
3812  if (task_team->tt.tt_task_pri_list != NULL) {
3813  __kmp_free_task_pri_list(task_team);
3814  }
3815  __kmp_free(task_team);
3816  }
3817  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3818  }
3819 }
3820 
3821 // __kmp_wait_to_unref_task_teams:
3822 // Some threads could still be in the fork barrier release code, possibly
3823 // trying to steal tasks. Wait for each thread to unreference its task team.
3824 void __kmp_wait_to_unref_task_teams(void) {
3825  kmp_info_t *thread;
3826  kmp_uint32 spins;
3827  kmp_uint64 time;
3828  int done;
3829 
3830  KMP_INIT_YIELD(spins);
3831  KMP_INIT_BACKOFF(time);
3832 
3833  for (;;) {
3834  done = TRUE;
3835 
3836  // TODO: GEH - this may be wrong because some sync would be necessary
3837  // in case threads are added to the pool during the traversal. Need to
3838  // verify that the lock for the thread pool is held when calling this routine.
3839  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3840  thread = thread->th.th_next_pool) {
3841 #if KMP_OS_WINDOWS
3842  DWORD exit_val;
3843 #endif
3844  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3845  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3846  __kmp_gtid_from_thread(thread)));
3847  continue;
3848  }
3849 #if KMP_OS_WINDOWS
3850  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3851  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3852  thread->th.th_task_team = NULL;
3853  continue;
3854  }
3855 #endif
3856 
3857  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3858 
3859  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3860  "unreference task_team\n",
3861  __kmp_gtid_from_thread(thread)));
3862 
3863  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3864  void *sleep_loc;
3865  // If the thread is sleeping, awaken it.
3866  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3867  NULL) {
3868  KA_TRACE(
3869  10,
3870  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3871  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3872  __kmp_null_resume_wrapper(thread);
3873  }
3874  }
3875  }
3876  if (done) {
3877  break;
3878  }
3879 
3880  // If oversubscribed or have waited a bit, yield.
3881  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3882  }
3883 }
3884 
3885 // __kmp_task_team_setup: Create a task_team for the current team, but use
3886 // an already created, unused one if it already exists.
3887 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3888  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3889 
3890  // If this task_team hasn't been created yet, allocate it. It will be used in
3891  // the region after the next.
3892  // If it exists, it is the current task team and shouldn't be touched yet as
3893  // it may still be in use.
3894  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3895  (always || team->t.t_nproc > 1)) {
3896  team->t.t_task_team[this_thr->th.th_task_state] =
3897  __kmp_allocate_task_team(this_thr, team);
3898  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3899  " for team %d at parity=%d\n",
3900  __kmp_gtid_from_thread(this_thr),
3901  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3902  this_thr->th.th_task_state));
3903  }
3904 
3905  // After threads exit the release, they will call sync, and then point to this
3906  // other task_team; make sure it is allocated and properly initialized. As
3907  // threads spin in the barrier release phase, they will continue to use the
3908  // previous task_team struct(above), until they receive the signal to stop
3909  // checking for tasks (they can't safely reference the kmp_team_t struct,
3910  // which could be reallocated by the primary thread). No task teams are formed
3911  // for serialized teams.
3912  if (team->t.t_nproc > 1) {
3913  int other_team = 1 - this_thr->th.th_task_state;
3914  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3915  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3916  team->t.t_task_team[other_team] =
3917  __kmp_allocate_task_team(this_thr, team);
3918  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3919  "task_team %p for team %d at parity=%d\n",
3920  __kmp_gtid_from_thread(this_thr),
3921  team->t.t_task_team[other_team], team->t.t_id, other_team));
3922  } else { // Leave the old task team struct in place for the upcoming region;
3923  // adjust as needed
3924  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3925  if (!task_team->tt.tt_active ||
3926  team->t.t_nproc != task_team->tt.tt_nproc) {
3927  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3928  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3929  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3930  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3931  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3932  team->t.t_nproc);
3933  TCW_4(task_team->tt.tt_active, TRUE);
3934  }
3935  // if team size has changed, the first thread to enable tasking will
3936  // realloc threads_data if necessary
3937  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3938  "%p for team %d at parity=%d\n",
3939  __kmp_gtid_from_thread(this_thr),
3940  team->t.t_task_team[other_team], team->t.t_id, other_team));
3941  }
3942  }
3943 
3944  // For a regular thread, task enabling should be called when the task is
3945  // going to be pushed to a deque. However, for the hidden helper thread, we
3946  // need it ahead of time so that some operations can be performed without
3947  // race conditions.
3948  if (this_thr == __kmp_hidden_helper_main_thread) {
3949  for (int i = 0; i < 2; ++i) {
3950  kmp_task_team_t *task_team = team->t.t_task_team[i];
3951  if (KMP_TASKING_ENABLED(task_team)) {
3952  continue;
3953  }
3954  __kmp_enable_tasking(task_team, this_thr);
3955  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3956  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3957  if (thread_data->td.td_deque == NULL) {
3958  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3959  }
3960  }
3961  }
3962  }
3963 }
3964 
3965 // __kmp_task_team_sync: Propagation of task team data from team to threads
3966 // which happens just after the release phase of a team barrier. This may be
3967 // called by any thread, but only for teams with # threads > 1.
3968 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3969  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3970 
3971  // Toggle the th_task_state field, to switch which task_team this thread
3972  // refers to
3973  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3974 
3975  // It is now safe to propagate the task team pointer from the team struct to
3976  // the current thread.
3977  TCW_PTR(this_thr->th.th_task_team,
3978  team->t.t_task_team[this_thr->th.th_task_state]);
3979  KA_TRACE(20,
3980  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3981  "%p from Team #%d (parity=%d)\n",
3982  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3983  team->t.t_id, this_thr->th.th_task_state));
3984 }
3985 
3986 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
3987 // barrier gather phase. Only called by primary thread if #threads in team > 1
3988 // or if proxy tasks were created.
3989 //
3990 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3991 // by passing in 0 optionally as the last argument. When wait is zero, primary
3992 // thread does not wait for unfinished_threads to reach 0.
3993 void __kmp_task_team_wait(
3994  kmp_info_t *this_thr,
3995  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3996  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3997 
3998  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3999  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4000 
4001  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4002  if (wait) {
4003  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4004  "(for unfinished_threads to reach 0) on task_team = %p\n",
4005  __kmp_gtid_from_thread(this_thr), task_team));
4006  // Worker threads may have dropped through to release phase, but could
4007  // still be executing tasks. Wait here for tasks to complete. To avoid
4008  // memory contention, only primary thread checks termination condition.
4009  kmp_flag_32<false, false> flag(
4010  RCAST(std::atomic<kmp_uint32> *,
4011  &task_team->tt.tt_unfinished_threads),
4012  0U);
4013  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4014  }
4015  // Deactivate the old task team, so that the worker threads will stop
4016  // referencing it while spinning.
4017  KA_TRACE(
4018  20,
4019  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4020  "setting active to false, setting local and team's pointer to NULL\n",
4021  __kmp_gtid_from_thread(this_thr), task_team));
4022  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
4023  task_team->tt.tt_found_proxy_tasks == TRUE ||
4024  task_team->tt.tt_hidden_helper_task_encountered == TRUE);
4025  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4026  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4027  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4028  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4029  KMP_MB();
4030 
4031  TCW_PTR(this_thr->th.th_task_team, NULL);
4032  }
4033 }
4034 
4035 // __kmp_tasking_barrier:
4036 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4037 // Internal function to execute all tasks prior to a regular barrier or a join
4038 // barrier. It is a full barrier itself, which unfortunately turns regular
4039 // barriers into double barriers and join barriers into 1 1/2 barriers.
4040 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4041  std::atomic<kmp_uint32> *spin = RCAST(
4042  std::atomic<kmp_uint32> *,
4043  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4044  int flag = FALSE;
4045  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4046 
4047 #if USE_ITT_BUILD
4048  KMP_FSYNC_SPIN_INIT(spin, NULL);
4049 #endif /* USE_ITT_BUILD */
4050  kmp_flag_32<false, false> spin_flag(spin, 0U);
4051  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4052  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4053 #if USE_ITT_BUILD
4054  // TODO: What about itt_sync_obj??
4055  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4056 #endif /* USE_ITT_BUILD */
4057 
4058  if (TCR_4(__kmp_global.g.g_done)) {
4059  if (__kmp_global.g.g_abort)
4060  __kmp_abort_thread();
4061  break;
4062  }
4063  KMP_YIELD(TRUE);
4064  }
4065 #if USE_ITT_BUILD
4066  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4067 #endif /* USE_ITT_BUILD */
4068 }
4069 
4070 // __kmp_give_task puts a task into a given thread queue if:
4071 // - the queue for that thread was created
4072 // - there's space in that queue
4073 // Because of this, __kmp_push_task needs to check if there's space after
4074 // getting the lock
4075 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4076  kmp_int32 pass) {
4077  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4078  kmp_task_team_t *task_team = taskdata->td_task_team;
4079 
4080  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4081  taskdata, tid));
4082 
4083  // If task_team is NULL something went really bad...
4084  KMP_DEBUG_ASSERT(task_team != NULL);
4085 
4086  bool result = false;
4087  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4088 
4089  if (thread_data->td.td_deque == NULL) {
4090  // There's no queue in this thread, go find another one
4091  // We're guaranteed that at least one thread has a queue
4092  KA_TRACE(30,
4093  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4094  tid, taskdata));
4095  return result;
4096  }
4097 
4098  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4099  TASK_DEQUE_SIZE(thread_data->td)) {
4100  KA_TRACE(
4101  30,
4102  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4103  taskdata, tid));
4104 
4105  // if this deque is bigger than the pass ratio give a chance to another
4106  // thread
4107  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4108  return result;
4109 
4110  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4111  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4112  TASK_DEQUE_SIZE(thread_data->td)) {
4113  // expand deque to push the task which is not allowed to execute
4114  __kmp_realloc_task_deque(thread, thread_data);
4115  }
4116 
4117  } else {
4118 
4119  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4120 
4121  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4122  TASK_DEQUE_SIZE(thread_data->td)) {
4123  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4124  "thread %d.\n",
4125  taskdata, tid));
4126 
4127  // if this deque is bigger than the pass ratio give a chance to another
4128  // thread
4129  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4130  goto release_and_exit;
4131 
4132  __kmp_realloc_task_deque(thread, thread_data);
4133  }
4134  }
4135 
4136  // lock is held here, and there is space in the deque
4137 
4138  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4139  // Wrap index.
4140  thread_data->td.td_deque_tail =
4141  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4142  TCW_4(thread_data->td.td_deque_ntasks,
4143  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4144 
4145  result = true;
4146  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4147  taskdata, tid));
4148 
4149 release_and_exit:
4150  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4151 
4152  return result;
4153 }
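// Illustrative walk-through of the pass-ratio heuristic above (assuming a
// hypothetical INITIAL_TASK_DEQUE_SIZE of 256): __kmpc_give_task() below
// doubles `pass` after each full sweep over the team, so the willingness to
// grow a full deque rises over time:
//
//   pass = 1: a full 256-entry deque has ratio 256/256 = 1 >= 1, so try the
//             next thread instead of growing
//   pass = 2: ratio 1 < 2, so take the lock and let __kmp_realloc_task_deque()
//             enlarge the deque
//   pass = 4: even a deque already grown to twice the initial size (ratio 2)
//             may now be enlarged, and so on
//
// Tasks are therefore spread across threads first, and deques are enlarged
// only after a whole pass found no room anywhere.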
4154 
4155 #define PROXY_TASK_FLAG 0x40000000
4156 /* The finish of the proxy tasks is divided into two pieces:
4157  - the top half is the one that can be done from a thread outside the team
4158  - the bottom half must be run from a thread within the team
4159 
4160  In order to run the bottom half, the task gets queued back into one of the
4161  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4162  is decremented, the threads can leave the barriers. So, the bottom half needs
4163  to be queued before the counter is decremented. The top half is therefore
4164  divided into two parts:
4165  - things that can be run before queuing the bottom half
4166  - things that must be run after queuing the bottom half
4167 
4168  This creates a second race, as the bottom half can free the task before the
4169  second top half is executed. To avoid this, we use the
4170  td_incomplete_child_tasks counter of the proxy task to synchronize the top
4171  and bottom half. */
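// Illustrative ordering sketch (simplified; this is the out-of-team path used
// by __kmpc_proxy_task_completed_ooo() below):
//
//   __kmp_first_top_half_finish_proxy(td);  // mark complete, set PROXY_TASK_FLAG
//   __kmpc_give_task(ptask);                // queue the bottom half to a team thread
//   __kmp_second_top_half_finish_proxy(td); // dec parent's counter, clear the flag
//   ...
//   __kmp_bottom_half_finish_proxy(gtid, ptask); // spins until PROXY_TASK_FLAG is
//                                                // clear, then releases dependences
//                                                // and frees the task
//
// The flag bit in td_incomplete_child_tasks is what keeps the bottom half from
// freeing the task while the second top half may still be running.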
4172 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4173  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4174  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4175  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4176  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4177 
4178  taskdata->td_flags.complete = 1; // mark the task as completed
4179 
4180  if (taskdata->td_taskgroup)
4181  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4182 
4183  // Create an imaginary child for this task so the bottom half cannot
4184  // release the task before we have completed the second top half
4185  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4186 }
4187 
4188 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4189 #if KMP_DEBUG
4190  kmp_int32 children = 0;
4191  // Predecrement simulated by "- 1" calculation
4192  children = -1 +
4193 #endif
4194  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4195  KMP_DEBUG_ASSERT(children >= 0);
4196 
4197  // Remove the imaginary child
4198  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4199 }
4200 
4201 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4202  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4203  kmp_info_t *thread = __kmp_threads[gtid];
4204 
4205  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4206  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4207  1); // top half must run before bottom half
4208 
4209  // We need to wait to make sure the top half is finished
4210  // Spinning here should be ok as this should happen quickly
4211  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4212  PROXY_TASK_FLAG) > 0)
4213  ;
4214 
4215  __kmp_release_deps(gtid, taskdata);
4216  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4217 }
4218 
4227 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4228  KMP_DEBUG_ASSERT(ptask != NULL);
4229  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4230  KA_TRACE(
4231  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4232  gtid, taskdata));
4233  __kmp_assert_valid_gtid(gtid);
4234  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4235 
4236  __kmp_first_top_half_finish_proxy(taskdata);
4237  __kmp_second_top_half_finish_proxy(taskdata);
4238  __kmp_bottom_half_finish_proxy(gtid, ptask);
4239 
4240  KA_TRACE(10,
4241  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4242  gtid, taskdata));
4243 }
4244 
4245 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4246  KMP_DEBUG_ASSERT(ptask != NULL);
4247  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4248 
4249  // Enqueue task to complete bottom half completion from a thread within the
4250  // corresponding team
4251  kmp_team_t *team = taskdata->td_team;
4252  kmp_int32 nthreads = team->t.t_nproc;
4253  kmp_info_t *thread;
4254 
4255  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4256  // but we cannot use __kmp_get_random here
4257  kmp_int32 start_k = start % nthreads;
4258  kmp_int32 pass = 1;
4259  kmp_int32 k = start_k;
4260 
4261  do {
4262  // For now we're just linearly trying to find a thread
4263  thread = team->t.t_threads[k];
4264  k = (k + 1) % nthreads;
4265 
4266  // we did a full pass through all the threads
4267  if (k == start_k)
4268  pass = pass << 1;
4269 
4270  } while (!__kmp_give_task(thread, k, ptask, pass));
4271 }
4272 
4280 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4281  KMP_DEBUG_ASSERT(ptask != NULL);
4282  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4283 
4284  KA_TRACE(
4285  10,
4286  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4287  taskdata));
4288 
4289  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4290 
4291  __kmp_first_top_half_finish_proxy(taskdata);
4292 
4293  __kmpc_give_task(ptask);
4294 
4295  __kmp_second_top_half_finish_proxy(taskdata);
4296 
4297  KA_TRACE(
4298  10,
4299  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4300  taskdata));
4301 }
4302 
4303 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4304  kmp_task_t *task) {
4305  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4306  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4307  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4308  td->td_allow_completion_event.ed.task = task;
4309  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4310  }
4311  return &td->td_allow_completion_event;
4312 }
4313 
4314 void __kmp_fulfill_event(kmp_event_t *event) {
4315  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4316  kmp_task_t *ptask = event->ed.task;
4317  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4318  bool detached = false;
4319  int gtid = __kmp_get_gtid();
4320 
4321  // The associated task might have completed or could be completing at this
4322  // point.
4323  // We need to take the lock to avoid races
4324  __kmp_acquire_tas_lock(&event->lock, gtid);
4325  if (taskdata->td_flags.proxy == TASK_PROXY) {
4326  detached = true;
4327  } else {
4328 #if OMPT_SUPPORT
4329  // The OMPT event must occur under mutual exclusion,
4330  // otherwise the tool might access ptask after free
4331  if (UNLIKELY(ompt_enabled.enabled))
4332  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4333 #endif
4334  }
4335  event->type = KMP_EVENT_UNINITIALIZED;
4336  __kmp_release_tas_lock(&event->lock, gtid);
4337 
4338  if (detached) {
4339 #if OMPT_SUPPORT
4340  // We free ptask afterwards and know the task is finished,
4341  // so locking is not necessary
4342  if (UNLIKELY(ompt_enabled.enabled))
4343  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4344 #endif
4345  // If the task detached, complete the proxy task
4346  if (gtid >= 0) {
4347  kmp_team_t *team = taskdata->td_team;
4348  kmp_info_t *thread = __kmp_get_thread();
4349  if (thread->th.th_team == team) {
4350  __kmpc_proxy_task_completed(gtid, ptask);
4351  return;
4352  }
4353  }
4354 
4355  // fallback
4356  __kmpc_proxy_task_completed_ooo(ptask);
4357  }
4358  }
4359 }
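
// --- Illustrative usage sketch, not part of kmp_tasking.cpp ---
// The user-facing path that ends up in the two routines above: the compiler
// lowers detach(ev) to __kmpc_task_allow_completion_event, and the
// omp_fulfill_event() call made by the asynchronous engine reaches
// __kmp_fulfill_event. async_begin() is a hypothetical placeholder whose
// completion callback is expected to call omp_fulfill_event(ev) from
// whichever thread finishes the work.
#include <omp.h>

extern void async_begin(omp_event_handle_t ev);

void detach_example(void) {
#pragma omp parallel
#pragma omp single
  {
    omp_event_handle_t ev;
#pragma omp task detach(ev)
    async_begin(ev); // task body returns immediately; completion is deferred
#pragma omp taskwait // blocks until omp_fulfill_event(ev) has been called
  }
}
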
4360 
4361 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4362 // for taskloop
4363 //
4364 // thread: allocating thread
4365 // task_src: pointer to source task to be duplicated
4366 // returns: a pointer to the allocated kmp_task_t structure (task).
4367 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
4368  kmp_task_t *task;
4369  kmp_taskdata_t *taskdata;
4370  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4371  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4372  size_t shareds_offset;
4373  size_t task_size;
4374 
4375  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4376  task_src));
4377  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4378  TASK_FULL); // it should not be proxy task
4379  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4380  task_size = taskdata_src->td_size_alloc;
4381 
4382  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4383  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4384  task_size));
4385 #if USE_FAST_MEMORY
4386  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4387 #else
4388  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4389 #endif /* USE_FAST_MEMORY */
4390  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4391 
4392  task = KMP_TASKDATA_TO_TASK(taskdata);
4393 
4394  // Initialize new task (only specific fields not affected by memcpy)
4395  taskdata->td_task_id = KMP_GEN_TASK_ID();
4396  if (task->shareds != NULL) { // need to set up the shareds pointer
4397  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4398  task->shareds = &((char *)taskdata)[shareds_offset];
4399  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4400  0);
4401  }
4402  taskdata->td_alloc_thread = thread;
4403  taskdata->td_parent = parent_task;
4404  // task inherits the taskgroup from the parent task
4405  taskdata->td_taskgroup = parent_task->td_taskgroup;
4406  // tied task needs to initialize the td_last_tied at creation,
4407  // untied one does this when it is scheduled for execution
4408  if (taskdata->td_flags.tiedness == TASK_TIED)
4409  taskdata->td_last_tied = taskdata;
4410 
4411  // Only need to keep track of child task counts if team parallel and tasking
4412  // not serialized
4413  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4414  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4415  if (parent_task->td_taskgroup)
4416  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4417  // Only need to keep track of allocated child tasks for explicit tasks
4418  // since implicit ones are not deallocated
4419  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4420  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4421  }
4422 
4423  KA_TRACE(20,
4424  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4425  thread, taskdata, taskdata->td_parent));
4426 #if OMPT_SUPPORT
4427  if (UNLIKELY(ompt_enabled.enabled))
4428  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4429 #endif
4430  return task;
4431 }
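
// --- Illustrative sketch, not part of kmp_tasking.cpp ---
// Why the shareds fix-up above is needed: after the block memcpy, the copied
// shareds pointer still points into the *source* allocation, so it is rebased
// by keeping the byte offset and applying it to the new block. blob_t is a
// hypothetical stand-in for the contiguous taskdata+task+shareds layout.
#include <cassert>
#include <cstdlib>
#include <cstring>

struct blob_t {
  void *shareds; // interior self-pointer
  char payload[64];
};

int main() {
  blob_t *src = (blob_t *)std::malloc(sizeof(blob_t));
  src->shareds = &src->payload[16]; // points inside the same allocation

  blob_t *dst = (blob_t *)std::malloc(sizeof(blob_t));
  std::memcpy(dst, src, sizeof(blob_t));
  assert(dst->shareds == &src->payload[16]); // still aims at the source block

  // Rebase exactly as __kmp_task_dup_alloc does for task->shareds.
  std::size_t shareds_offset = (char *)src->shareds - (char *)src;
  dst->shareds = (char *)dst + shareds_offset;
  assert(dst->shareds == &dst->payload[16]);

  std::free(src);
  std::free(dst);
  return 0;
}
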
4432 
4433 // Routine optionally generated by the compiler for setting the lastprivate flag
4434 // and calling needed constructors for private/firstprivate objects
4435 // (used to form taskloop tasks from pattern task)
4436 // Parameters: dest task, src task, lastprivate flag.
4437 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4438 
4439 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4440 
4441 // class to encapsulate manipulating loop bounds in a taskloop task.
4442 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4443 // the loop bound variables.
4444 class kmp_taskloop_bounds_t {
4445  kmp_task_t *task;
4446  const kmp_taskdata_t *taskdata;
4447  size_t lower_offset;
4448  size_t upper_offset;
4449 
4450 public:
4451  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4452  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4453  lower_offset((char *)lb - (char *)task),
4454  upper_offset((char *)ub - (char *)task) {
4455  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4456  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4457  }
4458  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4459  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4460  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4461  size_t get_lower_offset() const { return lower_offset; }
4462  size_t get_upper_offset() const { return upper_offset; }
4463  kmp_uint64 get_lb() const {
4464  kmp_int64 retval;
4465 #if defined(KMP_GOMP_COMPAT)
4466  // Intel task just returns the lower bound normally
4467  if (!taskdata->td_flags.native) {
4468  retval = *(kmp_int64 *)((char *)task + lower_offset);
4469  } else {
4470  // GOMP task has to take into account the sizeof(long)
4471  if (taskdata->td_size_loop_bounds == 4) {
4472  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4473  retval = (kmp_int64)*lb;
4474  } else {
4475  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4476  retval = (kmp_int64)*lb;
4477  }
4478  }
4479 #else
4480  (void)taskdata;
4481  retval = *(kmp_int64 *)((char *)task + lower_offset);
4482 #endif // defined(KMP_GOMP_COMPAT)
4483  return retval;
4484  }
4485  kmp_uint64 get_ub() const {
4486  kmp_int64 retval;
4487 #if defined(KMP_GOMP_COMPAT)
4488  // Intel task just returns the upper bound normally
4489  if (!taskdata->td_flags.native) {
4490  retval = *(kmp_int64 *)((char *)task + upper_offset);
4491  } else {
4492  // GOMP task has to take into account the sizeof(long)
4493  if (taskdata->td_size_loop_bounds == 4) {
4494  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4495  retval = (kmp_int64)*ub;
4496  } else {
4497  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4498  retval = (kmp_int64)*ub;
4499  }
4500  }
4501 #else
4502  retval = *(kmp_int64 *)((char *)task + upper_offset);
4503 #endif // defined(KMP_GOMP_COMPAT)
4504  return retval;
4505  }
4506  void set_lb(kmp_uint64 lb) {
4507 #if defined(KMP_GOMP_COMPAT)
4508  // Intel task just sets the lower bound normally
4509  if (!taskdata->td_flags.native) {
4510  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4511  } else {
4512  // GOMP task has to take into account the sizeof(long)
4513  if (taskdata->td_size_loop_bounds == 4) {
4514  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4515  *lower = (kmp_uint32)lb;
4516  } else {
4517  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4518  *lower = (kmp_uint64)lb;
4519  }
4520  }
4521 #else
4522  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4523 #endif // defined(KMP_GOMP_COMPAT)
4524  }
4525  void set_ub(kmp_uint64 ub) {
4526 #if defined(KMP_GOMP_COMPAT)
4527  // Intel task just sets the upper bound normally
4528  if (!taskdata->td_flags.native) {
4529  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4530  } else {
4531  // GOMP task has to take into account the sizeof(long)
4532  if (taskdata->td_size_loop_bounds == 4) {
4533  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4534  *upper = (kmp_uint32)ub;
4535  } else {
4536  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4537  *upper = (kmp_uint64)ub;
4538  }
4539  }
4540 #else
4541  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4542 #endif // defined(KMP_GOMP_COMPAT)
4543  }
4544 };
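
// --- Illustrative sketch, not part of kmp_tasking.cpp ---
// What the accessor class above hides: a native (Intel-style) task carries
// 64-bit bounds inside the task object, while a GOMP-compat task keeps them
// as 'long' values in shareds[0] (lb) and shareds[1] (ub), so the load width
// depends on sizeof(long). Both structs are hypothetical simplified layouts.
#include <cstdint>
#include <cstdio>

struct native_task_t {
  std::uint64_t lb, ub; // 64-bit bounds embedded in the task
};

struct gomp_task_t {
  long *shareds; // bounds live in shareds[0] and shareds[1]
};

static std::uint64_t get_ub_native(const native_task_t *t) { return t->ub; }

static std::uint64_t get_ub_gomp(const gomp_task_t *t) {
  // Width-dependent load, mirroring the td_size_loop_bounds check above.
  if (sizeof(long) == 4)
    return (std::uint64_t)(std::uint32_t)t->shareds[1];
  return (std::uint64_t)t->shareds[1];
}

int main() {
  long bounds[2] = {0, 99};
  native_task_t nt{0, 99};
  gomp_task_t gt{bounds};
  std::printf("native ub = %llu, gomp ub = %llu\n",
              (unsigned long long)get_ub_native(&nt),
              (unsigned long long)get_ub_gomp(&gt));
  return 0;
}
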
4545 
4546 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4547 //
4548 // loc Source location information
4549 // gtid Global thread ID
4550 // task Pattern task, exposes the loop iteration range
4551 // lb Pointer to loop lower bound in task structure
4552 // ub Pointer to loop upper bound in task structure
4553 // st Loop stride
4554 // ub_glob Global upper bound (used for lastprivate check)
4555 // num_tasks Number of tasks to execute
4556 // grainsize Number of loop iterations per task
4557 // extras Number of chunks with grainsize+1 iterations
4558 // last_chunk Reduction of grainsize for last task
4559 // tc Iterations count
4560 // task_dup Tasks duplication routine
4561 // codeptr_ra Return address for OMPT events
4562 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4563  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4564  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4565  kmp_uint64 grainsize, kmp_uint64 extras,
4566  kmp_int64 last_chunk, kmp_uint64 tc,
4567 #if OMPT_SUPPORT
4568  void *codeptr_ra,
4569 #endif
4570  void *task_dup) {
4571  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4572  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4573  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4574  // compiler provides global bounds here
4575  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4576  kmp_uint64 lower = task_bounds.get_lb();
4577  kmp_uint64 upper = task_bounds.get_ub();
4578  kmp_uint64 i;
4579  kmp_info_t *thread = __kmp_threads[gtid];
4580  kmp_taskdata_t *current_task = thread->th.th_current_task;
4581  kmp_task_t *next_task;
4582  kmp_int32 lastpriv = 0;
4583 
4584  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4585  (last_chunk < 0 ? last_chunk : extras));
4586  KMP_DEBUG_ASSERT(num_tasks > extras);
4587  KMP_DEBUG_ASSERT(num_tasks > 0);
4588  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4589  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4590  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4591  ub_glob, st, task_dup));
4592 
4593  // Launch num_tasks tasks, assign grainsize iterations to each task
4594  for (i = 0; i < num_tasks; ++i) {
4595  kmp_uint64 chunk_minus_1;
4596  if (extras == 0) {
4597  chunk_minus_1 = grainsize - 1;
4598  } else {
4599  chunk_minus_1 = grainsize;
4600  --extras; // first extras iterations get bigger chunk (grainsize+1)
4601  }
4602  upper = lower + st * chunk_minus_1;
4603  if (upper > *ub) {
4604  upper = *ub;
4605  }
4606  if (i == num_tasks - 1) {
4607  // schedule the last task, set lastprivate flag if needed
4608  if (st == 1) { // most common case
4609  KMP_DEBUG_ASSERT(upper == *ub);
4610  if (upper == ub_glob)
4611  lastpriv = 1;
4612  } else if (st > 0) { // positive loop stride
4613  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4614  if ((kmp_uint64)st > ub_glob - upper)
4615  lastpriv = 1;
4616  } else { // negative loop stride
4617  KMP_DEBUG_ASSERT(upper + st < *ub);
4618  if (upper - ub_glob < (kmp_uint64)(-st))
4619  lastpriv = 1;
4620  }
4621  }
4622  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4623  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4624  kmp_taskloop_bounds_t next_task_bounds =
4625  kmp_taskloop_bounds_t(next_task, task_bounds);
4626 
4627  // adjust task-specific bounds
4628  next_task_bounds.set_lb(lower);
4629  if (next_taskdata->td_flags.native) {
4630  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4631  } else {
4632  next_task_bounds.set_ub(upper);
4633  }
4634  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4635  // etc.
4636  ptask_dup(next_task, task, lastpriv);
4637  KA_TRACE(40,
4638  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4639  "upper %lld stride %lld, (offsets %p %p)\n",
4640  gtid, i, next_task, lower, upper, st,
4641  next_task_bounds.get_lower_offset(),
4642  next_task_bounds.get_upper_offset()));
4643 #if OMPT_SUPPORT
4644  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4645  codeptr_ra); // schedule new task
4646 #else
4647  __kmp_omp_task(gtid, next_task, true); // schedule new task
4648 #endif
4649  lower = upper + st; // adjust lower bound for the next iteration
4650  }
4651  // free the pattern task and exit
4652  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4653  // do not execute the pattern task, just do internal bookkeeping
4654  __kmp_task_finish<false>(gtid, task, current_task);
4655 }
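
// --- Illustrative sketch, not part of kmp_tasking.cpp ---
// Reproduces just the chunk-size arithmetic of the launch loop above for a
// concrete case: tc = 10 iterations split into num_tasks = 3 with
// grainsize = 3 and extras = 1 yields chunks of 4, 3, 3 (the first 'extras'
// tasks get one extra iteration).
#include <cstdio>

int main() {
  unsigned long long tc = 10, num_tasks = 3, grainsize = 3, extras = 1;
  unsigned long long lower = 0, assigned = 0;
  for (unsigned long long i = 0; i < num_tasks; ++i) {
    unsigned long long chunk = grainsize + (extras ? 1 : 0);
    if (extras)
      --extras; // first 'extras' tasks get grainsize+1 iterations
    unsigned long long upper = lower + chunk - 1;
    std::printf("task %llu: iterations [%llu, %llu] (%llu)\n", i, lower, upper,
                chunk);
    assigned += chunk;
    lower = upper + 1;
  }
  std::printf("assigned %llu of %llu iterations\n", assigned, tc); // all 10
  return 0;
}
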
4656 
4657 // Structure to keep taskloop parameters for auxiliary task
4658 // kept in the shareds of the task structure.
4659 typedef struct __taskloop_params {
4660  kmp_task_t *task;
4661  kmp_uint64 *lb;
4662  kmp_uint64 *ub;
4663  void *task_dup;
4664  kmp_int64 st;
4665  kmp_uint64 ub_glob;
4666  kmp_uint64 num_tasks;
4667  kmp_uint64 grainsize;
4668  kmp_uint64 extras;
4669  kmp_int64 last_chunk;
4670  kmp_uint64 tc;
4671  kmp_uint64 num_t_min;
4672 #if OMPT_SUPPORT
4673  void *codeptr_ra;
4674 #endif
4675 } __taskloop_params_t;
4676 
4677 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4678  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4679  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4680  kmp_uint64,
4681 #if OMPT_SUPPORT
4682  void *,
4683 #endif
4684  void *);
4685 
4686 // Execute part of the taskloop submitted as a task.
4687 int __kmp_taskloop_task(int gtid, void *ptask) {
4688  __taskloop_params_t *p =
4689  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4690  kmp_task_t *task = p->task;
4691  kmp_uint64 *lb = p->lb;
4692  kmp_uint64 *ub = p->ub;
4693  void *task_dup = p->task_dup;
4694  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4695  kmp_int64 st = p->st;
4696  kmp_uint64 ub_glob = p->ub_glob;
4697  kmp_uint64 num_tasks = p->num_tasks;
4698  kmp_uint64 grainsize = p->grainsize;
4699  kmp_uint64 extras = p->extras;
4700  kmp_int64 last_chunk = p->last_chunk;
4701  kmp_uint64 tc = p->tc;
4702  kmp_uint64 num_t_min = p->num_t_min;
4703 #if OMPT_SUPPORT
4704  void *codeptr_ra = p->codeptr_ra;
4705 #endif
4706 #if KMP_DEBUG
4707  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4708  KMP_DEBUG_ASSERT(task != NULL);
4709  KA_TRACE(20,
4710  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4711  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4712  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4713  st, task_dup));
4714 #endif
4715  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4716  if (num_tasks > num_t_min)
4717  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4718  grainsize, extras, last_chunk, tc, num_t_min,
4719 #if OMPT_SUPPORT
4720  codeptr_ra,
4721 #endif
4722  task_dup);
4723  else
4724  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4725  grainsize, extras, last_chunk, tc,
4726 #if OMPT_SUPPORT
4727  codeptr_ra,
4728 #endif
4729  task_dup);
4730 
4731  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4732  return 0;
4733 }
4734 
4735 // Schedule part of the taskloop as a task,
4736 // execute the rest of the taskloop.
4737 //
4738 // loc Source location information
4739 // gtid Global thread ID
4740 // task Pattern task, exposes the loop iteration range
4741 // lb Pointer to loop lower bound in task structure
4742 // ub Pointer to loop upper bound in task structure
4743 // st Loop stride
4744 // ub_glob Global upper bound (used for lastprivate check)
4745 // num_tasks Number of tasks to execute
4746 // grainsize Number of loop iterations per task
4747 // extras Number of chunks with grainsize+1 iterations
4748 // last_chunk Reduction of grainsize for last task
4749 // tc Iterations count
4750 // num_t_min Threshold to launch tasks recursively
4751 // task_dup Tasks duplication routine
4752 // codeptr_ra Return address for OMPT events
4753 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4754  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4755  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4756  kmp_uint64 grainsize, kmp_uint64 extras,
4757  kmp_int64 last_chunk, kmp_uint64 tc,
4758  kmp_uint64 num_t_min,
4759 #if OMPT_SUPPORT
4760  void *codeptr_ra,
4761 #endif
4762  void *task_dup) {
4763  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4764  KMP_DEBUG_ASSERT(task != NULL);
4765  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4766  KA_TRACE(20,
4767  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4768  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4769  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4770  st, task_dup));
4771  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4772  kmp_uint64 lower = *lb;
4773  kmp_info_t *thread = __kmp_threads[gtid];
4774  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4775  kmp_task_t *next_task;
4776  size_t lower_offset =
4777  (char *)lb - (char *)task; // remember offset of lb in the task structure
4778  size_t upper_offset =
4779  (char *)ub - (char *)task; // remember offset of ub in the task structure
4780 
4781  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4782  (last_chunk < 0 ? last_chunk : extras));
4783  KMP_DEBUG_ASSERT(num_tasks > extras);
4784  KMP_DEBUG_ASSERT(num_tasks > 0);
4785 
4786  // split the loop in two halves
4787  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4788  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4789  kmp_uint64 gr_size0 = grainsize;
4790  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4791  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4792  if (last_chunk < 0) {
4793  ext0 = ext1 = 0;
4794  last_chunk1 = last_chunk;
4795  tc0 = grainsize * n_tsk0;
4796  tc1 = tc - tc0;
4797  } else if (n_tsk0 <= extras) {
4798  gr_size0++; // integrate extras into grainsize
4799  ext0 = 0; // no extra iters in 1st half
4800  ext1 = extras - n_tsk0; // remaining extras
4801  tc0 = gr_size0 * n_tsk0;
4802  tc1 = tc - tc0;
4803  } else { // n_tsk0 > extras
4804  ext1 = 0; // no extra iters in 2nd half
4805  ext0 = extras;
4806  tc1 = grainsize * n_tsk1;
4807  tc0 = tc - tc1;
4808  }
4809  ub0 = lower + st * (tc0 - 1);
4810  lb1 = ub0 + st;
4811 
4812  // create pattern task for 2nd half of the loop
4813  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4814  // adjust lower bound (upper bound is not changed) for the 2nd half
4815  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4816  if (ptask_dup != NULL) // construct firstprivates, etc.
4817  ptask_dup(next_task, task, 0);
4818  *ub = ub0; // adjust upper bound for the 1st half
4819 
4820  // create auxiliary task for 2nd half of the loop
4821  // make sure new task has same parent task as the pattern task
4822  kmp_taskdata_t *current_task = thread->th.th_current_task;
4823  thread->th.th_current_task = taskdata->td_parent;
4824  kmp_task_t *new_task =
4825  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4826  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4827  // restore current task
4828  thread->th.th_current_task = current_task;
4829  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4830  p->task = next_task;
4831  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4832  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4833  p->task_dup = task_dup;
4834  p->st = st;
4835  p->ub_glob = ub_glob;
4836  p->num_tasks = n_tsk1;
4837  p->grainsize = grainsize;
4838  p->extras = ext1;
4839  p->last_chunk = last_chunk1;
4840  p->tc = tc1;
4841  p->num_t_min = num_t_min;
4842 #if OMPT_SUPPORT
4843  p->codeptr_ra = codeptr_ra;
4844 #endif
4845 
4846 #if OMPT_SUPPORT
4847  // schedule new task with correct return address for OMPT events
4848  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4849 #else
4850  __kmp_omp_task(gtid, new_task, true); // schedule new task
4851 #endif
4852 
4853  // execute the 1st half of current subrange
4854  if (n_tsk0 > num_t_min)
4855  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4856  ext0, last_chunk0, tc0, num_t_min,
4857 #if OMPT_SUPPORT
4858  codeptr_ra,
4859 #endif
4860  task_dup);
4861  else
4862  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4863  gr_size0, ext0, last_chunk0, tc0,
4864 #if OMPT_SUPPORT
4865  codeptr_ra,
4866 #endif
4867  task_dup);
4868 
4869  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4870 }
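
// --- Illustrative sketch, not part of kmp_tasking.cpp ---
// Mirrors the splitting arithmetic above for one concrete case:
// num_tasks = 7, grainsize = 3, extras = 2, tc = 23 (no strict last_chunk).
// Both halves preserve the invariant tc == num_tasks * grainsize + extras.
#include <cassert>
#include <cstdio>

int main() {
  unsigned long long num_tasks = 7, grainsize = 3, extras = 2, tc = 23;
  unsigned long long n_tsk0 = num_tasks >> 1;     // 3 tasks executed locally
  unsigned long long n_tsk1 = num_tasks - n_tsk0; // 4 tasks deferred as a task
  unsigned long long gr_size0 = grainsize, ext0, ext1, tc0, tc1;
  if (n_tsk0 <= extras) {
    gr_size0++; // extras folded into the first half's grainsize
    ext0 = 0;
    ext1 = extras - n_tsk0;
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else {                    // taken here: 3 > 2
    ext1 = 0;
    ext0 = extras;
    tc1 = grainsize * n_tsk1; // 12 iterations deferred
    tc0 = tc - tc1;           // 11 iterations executed locally
  }
  assert(tc0 == n_tsk0 * gr_size0 + ext0 && tc1 == n_tsk1 * grainsize + ext1);
  std::printf("half0: %llu tasks, tc %llu, extras %llu; "
              "half1: %llu tasks, tc %llu, extras %llu\n",
              n_tsk0, tc0, ext0, n_tsk1, tc1, ext1);
  return 0;
}
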
4871 
4872 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4873  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4874  int nogroup, int sched, kmp_uint64 grainsize,
4875  int modifier, void *task_dup) {
4876  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4877  KMP_DEBUG_ASSERT(task != NULL);
4878  if (nogroup == 0) {
4879 #if OMPT_SUPPORT && OMPT_OPTIONAL
4880  OMPT_STORE_RETURN_ADDRESS(gtid);
4881 #endif
4882  __kmpc_taskgroup(loc, gtid);
4883  }
4884 
4885  // =========================================================================
4886  // calculate loop parameters
4887  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4888  kmp_uint64 tc;
4889  // compiler provides global bounds here
4890  kmp_uint64 lower = task_bounds.get_lb();
4891  kmp_uint64 upper = task_bounds.get_ub();
4892  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4893  kmp_uint64 num_tasks = 0, extras = 0;
4894  kmp_int64 last_chunk =
4895  0; // reduce grainsize of last task by last_chunk in strict mode
4896  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4897  kmp_info_t *thread = __kmp_threads[gtid];
4898  kmp_taskdata_t *current_task = thread->th.th_current_task;
4899 
4900  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4901  "grain %llu(%d, %d), dup %p\n",
4902  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4903  task_dup));
4904 
4905  // compute trip count
4906  if (st == 1) { // most common case
4907  tc = upper - lower + 1;
4908  } else if (st < 0) {
4909  tc = (lower - upper) / (-st) + 1;
4910  } else { // st > 0
4911  tc = (upper - lower) / st + 1;
4912  }
4913  if (tc == 0) {
4914  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4915  // free the pattern task and exit
4916  __kmp_task_start(gtid, task, current_task);
4917  // do not execute anything for zero-trip loop
4918  __kmp_task_finish<false>(gtid, task, current_task);
4919  return;
4920  }
4921 
4922 #if OMPT_SUPPORT && OMPT_OPTIONAL
4923  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4924  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4925  if (ompt_enabled.ompt_callback_work) {
4926  ompt_callbacks.ompt_callback(ompt_callback_work)(
4927  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4928  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4929  }
4930 #endif
4931 
4932  if (num_tasks_min == 0)
4933  // TODO: can we choose a better default heuristic?
4934  num_tasks_min =
4935  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4936 
4937  // compute num_tasks/grainsize based on the input provided
4938  switch (sched) {
4939  case 0: // no schedule clause specified, we can choose the default
4940  // let's try to schedule (team_size*10) tasks
4941  grainsize = thread->th.th_team_nproc * 10;
4942  KMP_FALLTHROUGH();
4943  case 2: // num_tasks provided
4944  if (grainsize > tc) {
4945  num_tasks = tc; // requested num_tasks too big; adjust values
4946  grainsize = 1;
4947  extras = 0;
4948  } else {
4949  num_tasks = grainsize;
4950  grainsize = tc / num_tasks;
4951  extras = tc % num_tasks;
4952  }
4953  break;
4954  case 1: // grainsize provided
4955  if (grainsize > tc) {
4956  num_tasks = 1;
4957  grainsize = tc; // requested grainsize too big; adjust values
4958  extras = 0;
4959  } else {
4960  if (modifier) {
4961  num_tasks = (tc + grainsize - 1) / grainsize;
4962  last_chunk = tc - (num_tasks * grainsize);
4963  extras = 0;
4964  } else {
4965  num_tasks = tc / grainsize;
4966  // adjust grainsize for balanced distribution of iterations
4967  grainsize = tc / num_tasks;
4968  extras = tc % num_tasks;
4969  }
4970  }
4971  break;
4972  default:
4973  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4974  }
4975 
4976  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4977  (last_chunk < 0 ? last_chunk : extras));
4978  KMP_DEBUG_ASSERT(num_tasks > extras);
4979  KMP_DEBUG_ASSERT(num_tasks > 0);
4980  // =========================================================================
4981 
4982  // Check the if-clause value first
4983  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
4984  if (if_val == 0) { // if(0) specified, mark task as serial
4985  taskdata->td_flags.task_serial = 1;
4986  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4987  // always start serial tasks linearly
4988  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4989  grainsize, extras, last_chunk, tc,
4990 #if OMPT_SUPPORT
4991  OMPT_GET_RETURN_ADDRESS(0),
4992 #endif
4993  task_dup);
4994  // !taskdata->td_flags.native => currently force linear spawning of tasks
4995  // for GOMP_taskloop
4996  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4997  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4998  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
4999  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5000  last_chunk));
5001  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5002  grainsize, extras, last_chunk, tc, num_tasks_min,
5003 #if OMPT_SUPPORT
5004  OMPT_GET_RETURN_ADDRESS(0),
5005 #endif
5006  task_dup);
5007  } else {
5008  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5009  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5010  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5011  last_chunk));
5012  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5013  grainsize, extras, last_chunk, tc,
5014 #if OMPT_SUPPORT
5015  OMPT_GET_RETURN_ADDRESS(0),
5016 #endif
5017  task_dup);
5018  }
5019 
5020 #if OMPT_SUPPORT && OMPT_OPTIONAL
5021  if (ompt_enabled.ompt_callback_work) {
5022  ompt_callbacks.ompt_callback(ompt_callback_work)(
5023  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5024  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5025  }
5026 #endif
5027 
5028  if (nogroup == 0) {
5029 #if OMPT_SUPPORT && OMPT_OPTIONAL
5030  OMPT_STORE_RETURN_ADDRESS(gtid);
5031 #endif
5032  __kmpc_end_taskgroup(loc, gtid);
5033  }
5034  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5035 }
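
// --- Illustrative sketch, not part of kmp_tasking.cpp ---
// The grainsize branch above (sched == 1) for tc = 10 and grainsize = 4:
// without the strict modifier the iterations are rebalanced (2 tasks of 5);
// with it the grainsize is honored and the last task is shortened
// (3 tasks of 4, 4, 2, i.e. last_chunk = -2).
#include <cstdio>

static void grainsize_case(unsigned long long tc, unsigned long long grainsize,
                           int modifier) {
  unsigned long long num_tasks, extras = 0;
  long long last_chunk = 0;
  if (modifier) { // strict: keep grainsize, shorten only the last task
    num_tasks = (tc + grainsize - 1) / grainsize;
    last_chunk = (long long)tc - (long long)(num_tasks * grainsize);
  } else { // default: rebalance iterations across the tasks
    num_tasks = tc / grainsize;
    grainsize = tc / num_tasks;
    extras = tc % num_tasks;
  }
  std::printf("modifier=%d: %llu tasks, grainsize %llu, extras %llu, "
              "last_chunk %lld\n",
              modifier, num_tasks, grainsize, extras, last_chunk);
}

int main() {
  grainsize_case(10, 4, /*strict=*/0); // 2 tasks, grainsize 5, extras 0
  grainsize_case(10, 4, /*strict=*/1); // 3 tasks, grainsize 4, last_chunk -2
  return 0;
}
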
5036 
5053 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5054  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5055  int sched, kmp_uint64 grainsize, void *task_dup) {
5056  __kmp_assert_valid_gtid(gtid);
5057  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5058  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5059  0, task_dup);
5060  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5061 }
5062 
5080 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5081  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5082  int nogroup, int sched, kmp_uint64 grainsize,
5083  int modifier, void *task_dup) {
5084  __kmp_assert_valid_gtid(gtid);
5085  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5086  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5087  modifier, task_dup);
5088  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5089 }
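
// --- Illustrative usage sketch, not part of kmp_tasking.cpp ---
// User code of the kind that reaches these entry points: __kmpc_taskloop
// passes modifier 0, while the OpenMP 5.1 grainsize(strict: ...) and
// num_tasks(strict: ...) forms are expected to arrive through
// __kmpc_taskloop_5 with a nonzero modifier (handled as strict mode in
// __kmp_taskloop above).
void saxpy_taskloop(int n, float a, const float *x, float *y) {
#pragma omp parallel
#pragma omp single
#pragma omp taskloop grainsize(1024)
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}
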