1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2    and a lowering pass for OpenACC device directives.
3 
4    Copyright (C) 2005-2022 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
56 #include "convert.h"
57 #include "opts.h"
58 
59 /* Describe the OpenACC looping structure of a function.  The entire
60    function is held in a 'NULL' loop.  */
61 
62 struct oacc_loop
63 {
64   oacc_loop *parent; /* Containing loop.  */
65 
66   oacc_loop *child; /* First inner loop.  */
67 
68   oacc_loop *sibling; /* Next loop within same parent.  */
69 
70   location_t loc; /* Location of the loop start.  */
71 
72   gcall *marker; /* Initial head marker.  */
73 
74   gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
75   gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */
76 
77   tree routine;  /* Pseudo-loop enclosing a routine.  */
78 
79   unsigned mask;   /* Partitioning mask.  */
80   unsigned e_mask; /* Partitioning of element loops (when tiling).  */
81   unsigned inner;  /* Partitioning of inner loops.  */
82   unsigned flags;  /* Partitioning flags.  */
83   vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
84   tree chunk_size; /* Chunk size.  */
85   gcall *head_end; /* Final marker of head sequence.  */
86 };
87 
88 /* Holds offload tables with decls.  */
89 vec<tree, va_gc> *offload_funcs, *offload_vars;
90 
91 /* Return level at which oacc routine may spawn a partitioned loop, or
92    -1 if it is not a routine (i.e. is an offload fn).  */
93 
94 int
oacc_fn_attrib_level(tree attr)95 oacc_fn_attrib_level (tree attr)
96 {
97   tree pos = TREE_VALUE (attr);
98 
99   if (!TREE_PURPOSE (pos))
100     return -1;
101 
102   int ix = 0;
103   for (ix = 0; ix != GOMP_DIM_MAX;
104        ix++, pos = TREE_CHAIN (pos))
105     if (!integer_zerop (TREE_PURPOSE (pos)))
106       break;
107 
108   return ix;
109 }
110 
111 /* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
112    adds their addresses and sizes to constructor-vector V_CTOR.  */
113 
114 static void
add_decls_addresses_to_decl_constructor(vec<tree,va_gc> * v_decls,vec<constructor_elt,va_gc> * v_ctor)115 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
116                                                    vec<constructor_elt, va_gc> *v_ctor)
117 {
118   unsigned len = vec_safe_length (v_decls);
119   for (unsigned i = 0; i < len; i++)
120     {
121       tree it = (*v_decls)[i];
122       bool is_var = VAR_P (it);
123       bool is_link_var
124           = is_var
125 #ifdef ACCEL_COMPILER
126             && DECL_HAS_VALUE_EXPR_P (it)
127 #endif
128             && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
129 
130       /* See also omp_finish_file and output_offload_tables in lto-cgraph.cc.  */
131       if (!in_lto_p && !symtab_node::get (it))
132           continue;
133 
134       tree size = NULL_TREE;
135       if (is_var)
136           size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
137 
138       tree addr;
139       if (!is_link_var)
140           addr = build_fold_addr_expr (it);
141       else
142           {
143 #ifdef ACCEL_COMPILER
144             /* For "omp declare target link" vars add address of the pointer to
145                the target table, instead of address of the var.  */
146             tree value_expr = DECL_VALUE_EXPR (it);
147             tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
148             varpool_node::finalize_decl (link_ptr_decl);
149             addr = build_fold_addr_expr (link_ptr_decl);
150 #else
151             addr = build_fold_addr_expr (it);
152 #endif
153 
154             /* Most significant bit of the size marks "omp declare target link"
155                vars in host and target tables.  */
156             unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
157             isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
158                                   * BITS_PER_UNIT - 1);
159             size = wide_int_to_tree (const_ptr_type_node, isize);
160           }
161 
162       CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
163       if (is_var)
164           CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
165     }
166 }
167 
168 /* Return true if DECL is a function for which its references should be
169    analyzed.  */
170 
171 static bool
omp_declare_target_fn_p(tree decl)172 omp_declare_target_fn_p (tree decl)
173 {
174   return (TREE_CODE (decl) == FUNCTION_DECL
175             && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
176             && !lookup_attribute ("omp declare target host",
177                                         DECL_ATTRIBUTES (decl))
178             && (!flag_openacc
179                 || oacc_get_fn_attrib (decl) == NULL_TREE));
180 }
181 
182 /* Return true if DECL Is a variable for which its initializer references
183    should be analyzed.  */
184 
185 static bool
omp_declare_target_var_p(tree decl)186 omp_declare_target_var_p (tree decl)
187 {
188   return (VAR_P (decl)
189             && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
190             && !lookup_attribute ("omp declare target link",
191                                         DECL_ATTRIBUTES (decl)));
192 }
193 
194 /* Helper function for omp_discover_implicit_declare_target, called through
195    walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
196    declare target to.  */
197 
198 static tree
omp_discover_declare_target_tgt_fn_r(tree * tp,int * walk_subtrees,void * data)199 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
200 {
201   if (TREE_CODE (*tp) == CALL_EXPR
202       && CALL_EXPR_FN (*tp)
203       && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
204       && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
205       && lookup_attribute ("omp declare variant base",
206                                  DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
207                                                                         0))))
208     {
209       tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
210       for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
211           {
212             attr = lookup_attribute ("omp declare variant base", attr);
213             if (attr == NULL_TREE)
214               break;
215             tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
216             if (TREE_CODE (purpose) == FUNCTION_DECL)
217               omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
218           }
219     }
220   else if (TREE_CODE (*tp) == FUNCTION_DECL)
221     {
222       tree decl = *tp;
223       tree id = get_identifier ("omp declare target");
224       symtab_node *node = symtab_node::get (*tp);
225       if (node != NULL)
226           {
227             while (node->alias_target
228                      && TREE_CODE (node->alias_target) == FUNCTION_DECL)
229               {
230                 if (!omp_declare_target_fn_p (node->decl)
231                       && !lookup_attribute ("omp declare target host",
232                                                   DECL_ATTRIBUTES (node->decl)))
233                     {
234                       node->offloadable = 1;
235                       DECL_ATTRIBUTES (node->decl)
236                         = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
237                     }
238                 node = symtab_node::get (node->alias_target);
239               }
240             symtab_node *new_node = node->ultimate_alias_target ();
241             decl = new_node->decl;
242             while (node != new_node)
243               {
244                 if (!omp_declare_target_fn_p (node->decl)
245                       && !lookup_attribute ("omp declare target host",
246                                                   DECL_ATTRIBUTES (node->decl)))
247                     {
248                       node->offloadable = 1;
249                       DECL_ATTRIBUTES (node->decl)
250                         = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
251                     }
252                 gcc_assert (node->alias && node->analyzed);
253                 node = node->get_alias_target ();
254               }
255             node->offloadable = 1;
256             if (ENABLE_OFFLOADING)
257               g->have_offload = true;
258           }
259       if (omp_declare_target_fn_p (decl)
260             || lookup_attribute ("omp declare target host",
261                                      DECL_ATTRIBUTES (decl)))
262           return NULL_TREE;
263 
264       if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
265           ((vec<tree> *) data)->safe_push (decl);
266       DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
267                                                     DECL_ATTRIBUTES (decl));
268     }
269   else if (TYPE_P (*tp))
270     *walk_subtrees = 0;
271   /* else if (TREE_CODE (*tp) == OMP_TARGET)
272        {
273            if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
274              if (OMP_DEVICE_ANCESTOR (dev))
275                *walk_subtrees = 0;
276        } */
277   return NULL_TREE;
278 }
279 
280 /* Similarly, but ignore references outside of OMP_TARGET regions.  */
281 
282 static tree
omp_discover_declare_target_fn_r(tree * tp,int * walk_subtrees,void * data)283 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
284 {
285   if (TREE_CODE (*tp) == OMP_TARGET)
286     {
287       /* And not OMP_DEVICE_ANCESTOR.  */
288       walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
289                                             omp_discover_declare_target_tgt_fn_r,
290                                             data);
291       *walk_subtrees = 0;
292     }
293   else if (TYPE_P (*tp))
294     *walk_subtrees = 0;
295   return NULL_TREE;
296 }
297 
298 /* Helper function for omp_discover_implicit_declare_target, called through
299    walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
300    declare target to.  */
301 
302 static tree
omp_discover_declare_target_var_r(tree * tp,int * walk_subtrees,void * data)303 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
304 {
305   if (TREE_CODE (*tp) == FUNCTION_DECL)
306     return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
307   else if (VAR_P (*tp)
308              && is_global_var (*tp)
309              && !omp_declare_target_var_p (*tp))
310     {
311       tree id = get_identifier ("omp declare target");
312       if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
313           {
314             error_at (DECL_SOURCE_LOCATION (*tp),
315                         "%qD specified both in declare target %<link%> and "
316                         "implicitly in %<to%> clauses", *tp);
317             DECL_ATTRIBUTES (*tp)
318               = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
319           }
320       if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
321           ((vec<tree> *) data)->safe_push (*tp);
322       DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
323       symtab_node *node = symtab_node::get (*tp);
324       if (node != NULL && !node->offloadable)
325           {
326             node->offloadable = 1;
327             if (ENABLE_OFFLOADING)
328               {
329                 g->have_offload = true;
330                 if (is_a <varpool_node *> (node))
331                     vec_safe_push (offload_vars, node->decl);
332               }
333           }
334     }
335   else if (TYPE_P (*tp))
336     *walk_subtrees = 0;
337   return NULL_TREE;
338 }
339 
340 /* Perform the OpenMP implicit declare target to discovery.  */
341 
342 void
omp_discover_implicit_declare_target(void)343 omp_discover_implicit_declare_target (void)
344 {
345   cgraph_node *node;
346   varpool_node *vnode;
347   auto_vec<tree> worklist;
348 
349   FOR_EACH_DEFINED_FUNCTION (node)
350     if (DECL_SAVED_TREE (node->decl))
351       {
352           struct cgraph_node *cgn;
353         if (omp_declare_target_fn_p (node->decl))
354             worklist.safe_push (node->decl);
355           else if (DECL_STRUCT_FUNCTION (node->decl)
356                      && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
357             worklist.safe_push (node->decl);
358           for (cgn = first_nested_function (node);
359                cgn; cgn = next_nested_function (cgn))
360             if (omp_declare_target_fn_p (cgn->decl))
361               worklist.safe_push (cgn->decl);
362             else if (DECL_STRUCT_FUNCTION (cgn->decl)
363                        && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
364               worklist.safe_push (cgn->decl);
365       }
366   FOR_EACH_VARIABLE (vnode)
367     if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
368           && omp_declare_target_var_p (vnode->decl))
369       worklist.safe_push (vnode->decl);
370   while (!worklist.is_empty ())
371     {
372       tree decl = worklist.pop ();
373       if (VAR_P (decl))
374           walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
375                                               omp_discover_declare_target_var_r,
376                                               &worklist);
377       else if (omp_declare_target_fn_p (decl))
378           walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
379                                               omp_discover_declare_target_tgt_fn_r,
380                                               &worklist);
381       else
382           walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
383                                               omp_discover_declare_target_fn_r,
384                                               &worklist);
385     }
386 
387   lang_hooks.decls.omp_finish_decl_inits ();
388 }
389 
390 
391 /* Create new symbols containing (address, size) pairs for global variables,
392    marked with "omp declare target" attribute, as well as addresses for the
393    functions, which are outlined offloading regions.  */
394 void
omp_finish_file(void)395 omp_finish_file (void)
396 {
397   unsigned num_funcs = vec_safe_length (offload_funcs);
398   unsigned num_vars = vec_safe_length (offload_vars);
399 
400   if (num_funcs == 0 && num_vars == 0)
401     return;
402 
403   if (targetm_common.have_named_sections)
404     {
405       vec<constructor_elt, va_gc> *v_f, *v_v;
406       vec_alloc (v_f, num_funcs);
407       vec_alloc (v_v, num_vars * 2);
408 
409       add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
410       add_decls_addresses_to_decl_constructor (offload_vars, v_v);
411 
412       tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
413                                                                 vec_safe_length (v_v));
414       tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
415                                                                  num_funcs);
416       SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
417       SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
418       tree ctor_v = build_constructor (vars_decl_type, v_v);
419       tree ctor_f = build_constructor (funcs_decl_type, v_f);
420       TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
421       TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
422       tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
423                                             get_identifier (".offload_func_table"),
424                                             funcs_decl_type);
425       tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
426                                            get_identifier (".offload_var_table"),
427                                            vars_decl_type);
428       TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
429       /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
430            otherwise a joint table in a binary will contain padding between
431            tables from multiple object files.  */
432       DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
433       SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
434       SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
435       DECL_INITIAL (funcs_decl) = ctor_f;
436       DECL_INITIAL (vars_decl) = ctor_v;
437       set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
438       set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
439 
440       varpool_node::finalize_decl (vars_decl);
441       varpool_node::finalize_decl (funcs_decl);
442     }
443   else
444     {
445       for (unsigned i = 0; i < num_funcs; i++)
446           {
447             tree it = (*offload_funcs)[i];
448             /* See also add_decls_addresses_to_decl_constructor
449                and output_offload_tables in lto-cgraph.cc.  */
450             if (!in_lto_p && !symtab_node::get (it))
451               continue;
452             targetm.record_offload_symbol (it);
453           }
454       for (unsigned i = 0; i < num_vars; i++)
455           {
456             tree it = (*offload_vars)[i];
457             if (!in_lto_p && !symtab_node::get (it))
458               continue;
459 #ifdef ACCEL_COMPILER
460             if (DECL_HAS_VALUE_EXPR_P (it)
461                 && lookup_attribute ("omp declare target link",
462                                            DECL_ATTRIBUTES (it)))
463               {
464                 tree value_expr = DECL_VALUE_EXPR (it);
465                 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
466                 targetm.record_offload_symbol (link_ptr_decl);
467                 varpool_node::finalize_decl (link_ptr_decl);
468               }
469             else
470 #endif
471               targetm.record_offload_symbol (it);
472           }
473     }
474 }
475 
476 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
477    axis DIM.  Return a tmp var holding the result.  */
478 
479 static tree
oacc_dim_call(bool pos,int dim,gimple_seq * seq)480 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
481 {
482   tree arg = build_int_cst (unsigned_type_node, dim);
483   tree size = create_tmp_var (integer_type_node);
484   enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
485   gimple *call = gimple_build_call_internal (fn, 1, arg);
486 
487   gimple_call_set_lhs (call, size);
488   gimple_seq_add_stmt (seq, call);
489 
490   return size;
491 }
492 
493 /* Find the number of threads (POS = false), or thread number (POS =
494    true) for an OpenACC region partitioned as MASK.  Setup code
495    required for the calculation is added to SEQ.  */
496 
497 static tree
oacc_thread_numbers(bool pos,int mask,gimple_seq * seq)498 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
499 {
500   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
501   unsigned ix;
502 
503   /* Start at gang level, and examine relevant dimension indices.  */
504   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
505     if (GOMP_DIM_MASK (ix) & mask)
506       {
507           if (res)
508             {
509               /* We had an outer index, so scale that by the size of
510                  this dimension.  */
511               tree n = oacc_dim_call (false, ix, seq);
512               res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
513             }
514           if (pos)
515             {
516               /* Determine index in this dimension.  */
517               tree id = oacc_dim_call (true, ix, seq);
518               if (res)
519                 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
520               else
521                 res = id;
522             }
523       }
524 
525   if (res == NULL_TREE)
526     res = integer_zero_node;
527 
528   return res;
529 }
530 
531 /* Transform IFN_GOACC_LOOP calls to actual code.  See
532    expand_oacc_for for where these are generated.  At the vector
533    level, we stride loops, such that each member of a warp will
534    operate on adjacent iterations.  At the worker and gang level,
535    each gang/warp executes a set of contiguous iterations.  Chunking
536    can override this such that each iteration engine executes a
537    contiguous chunk, and then moves on to stride to the next chunk.  */
538 
539 static void
oacc_xform_loop(gcall * call)540 oacc_xform_loop (gcall *call)
541 {
542   gimple_stmt_iterator gsi = gsi_for_stmt (call);
543   enum ifn_goacc_loop_kind code
544     = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
545   tree dir = gimple_call_arg (call, 1);
546   tree range = gimple_call_arg (call, 2);
547   tree step = gimple_call_arg (call, 3);
548   tree chunk_size = NULL_TREE;
549   unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
550   tree lhs = gimple_call_lhs (call);
551   tree type = NULL_TREE;
552   tree diff_type = TREE_TYPE (range);
553   tree r = NULL_TREE;
554   gimple_seq seq = NULL;
555   bool chunking = false, striding = true;
556   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
557   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
558 
559   /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
560   if (!lhs)
561     {
562       gsi_replace_with_seq (&gsi, seq, true);
563       return;
564     }
565 
566   type = TREE_TYPE (lhs);
567 
568 #ifdef ACCEL_COMPILER
569   chunk_size = gimple_call_arg (call, 4);
570   if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
571       || integer_zerop (chunk_size))   /* Default (also static).  */
572     {
573       /* If we're at the gang level, we want each to execute a
574            contiguous run of iterations.  Otherwise we want each element
575            to stride.  */
576       striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
577       chunking = false;
578     }
579   else
580     {
581       /* Chunk of size 1 is striding.  */
582       striding = integer_onep (chunk_size);
583       chunking = !striding;
584     }
585 #endif
586 
587   /* striding=true, chunking=true
588        -> invalid.
589      striding=true, chunking=false
590        -> chunks=1
591      striding=false,chunking=true
592        -> chunks=ceil (range/(chunksize*threads*step))
593      striding=false,chunking=false
594        -> chunk_size=ceil(range/(threads*step)),chunks=1  */
595   push_gimplify_context (true);
596 
597   switch (code)
598     {
599     default: gcc_unreachable ();
600 
601     case IFN_GOACC_LOOP_CHUNKS:
602       if (!chunking)
603           r = build_int_cst (type, 1);
604       else
605           {
606             /* chunk_max
607                = (range - dir) / (chunks * step * num_threads) + dir  */
608             tree per = oacc_thread_numbers (false, mask, &seq);
609             per = fold_convert (type, per);
610             chunk_size = fold_convert (type, chunk_size);
611             per = fold_build2 (MULT_EXPR, type, per, chunk_size);
612             per = fold_build2 (MULT_EXPR, type, per, step);
613             r = build2 (MINUS_EXPR, type, range, dir);
614             r = build2 (PLUS_EXPR, type, r, per);
615             r = build2 (TRUNC_DIV_EXPR, type, r, per);
616           }
617       break;
618 
619     case IFN_GOACC_LOOP_STEP:
620       {
621           /* If striding, step by the entire compute volume, otherwise
622              step by the inner volume.  */
623           unsigned volume = striding ? mask : inner_mask;
624 
625           r = oacc_thread_numbers (false, volume, &seq);
626           r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
627       }
628       break;
629 
630     case IFN_GOACC_LOOP_OFFSET:
631       /* Enable vectorization on non-SIMT targets.  */
632       if (!targetm.simt.vf
633             && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
634             /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
635                the loop.  */
636             && (flag_tree_loop_vectorize
637                 || !OPTION_SET_P (flag_tree_loop_vectorize)))
638           {
639             basic_block bb = gsi_bb (gsi);
640             class loop *parent = bb->loop_father;
641             class loop *body = parent->inner;
642 
643             parent->force_vectorize = true;
644             parent->safelen = INT_MAX;
645 
646             /* "Chunking loops" may have inner loops.  */
647             if (parent->inner)
648               {
649                 body->force_vectorize = true;
650                 body->safelen = INT_MAX;
651               }
652 
653             cfun->has_force_vectorize_loops = true;
654           }
655       if (striding)
656           {
657             r = oacc_thread_numbers (true, mask, &seq);
658             r = fold_convert (diff_type, r);
659           }
660       else
661           {
662             tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
663             tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
664             tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
665                                              inner_size, outer_size);
666 
667             volume = fold_convert (diff_type, volume);
668             if (chunking)
669               chunk_size = fold_convert (diff_type, chunk_size);
670             else
671               {
672                 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
673 
674                 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
675                 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
676                 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
677               }
678 
679             tree span = build2 (MULT_EXPR, diff_type, chunk_size,
680                                     fold_convert (diff_type, inner_size));
681             r = oacc_thread_numbers (true, outer_mask, &seq);
682             r = fold_convert (diff_type, r);
683             r = build2 (MULT_EXPR, diff_type, r, span);
684 
685             tree inner = oacc_thread_numbers (true, inner_mask, &seq);
686             inner = fold_convert (diff_type, inner);
687             r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
688 
689             if (chunking)
690               {
691                 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
692                 tree per
693                     = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
694                 per = build2 (MULT_EXPR, diff_type, per, chunk);
695 
696                 r = build2 (PLUS_EXPR, diff_type, r, per);
697               }
698           }
699       r = fold_build2 (MULT_EXPR, diff_type, r, step);
700       if (type != diff_type)
701           r = fold_convert (type, r);
702       break;
703 
704     case IFN_GOACC_LOOP_BOUND:
705       if (striding)
706           r = range;
707       else
708           {
709             tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
710             tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
711             tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
712                                              inner_size, outer_size);
713 
714             volume = fold_convert (diff_type, volume);
715             if (chunking)
716               chunk_size = fold_convert (diff_type, chunk_size);
717             else
718               {
719                 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
720 
721                 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
722                 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
723                 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
724               }
725 
726             tree span = build2 (MULT_EXPR, diff_type, chunk_size,
727                                     fold_convert (diff_type, inner_size));
728 
729             r = fold_build2 (MULT_EXPR, diff_type, span, step);
730 
731             tree offset = gimple_call_arg (call, 6);
732             r = build2 (PLUS_EXPR, diff_type, r,
733                           fold_convert (diff_type, offset));
734             r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
735                           diff_type, r, range);
736           }
737       if (diff_type != type)
738           r = fold_convert (type, r);
739       break;
740     }
741 
742   gimplify_assign (lhs, r, &seq);
743 
744   pop_gimplify_context (NULL);
745 
746   gsi_replace_with_seq (&gsi, seq, true);
747 }
748 
749 /* Transform a GOACC_TILE call.  Determines the element loop span for
750    the specified loop of the nest.  This is 1 if we're not tiling.
751 
752    GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */
753 
754 static void
oacc_xform_tile(gcall * call)755 oacc_xform_tile (gcall *call)
756 {
757   gimple_stmt_iterator gsi = gsi_for_stmt (call);
758   unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
759   /* Inner loops have higher loop_nos.  */
760   unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
761   tree tile_size = gimple_call_arg (call, 2);
762   unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
763   tree lhs = gimple_call_lhs (call);
764   tree type = TREE_TYPE (lhs);
765   gimple_seq seq = NULL;
766   tree span = build_int_cst (type, 1);
767 
768   gcc_assert (!(e_mask
769                     & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
770                         | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
771   push_gimplify_context (!seen_error ());
772 
773 #ifndef ACCEL_COMPILER
774   /* Partitioning disabled on host compilers.  */
775   e_mask = 0;
776 #endif
777   if (!e_mask)
778     /* Not paritioning.  */
779     span = integer_one_node;
780   else if (!integer_zerop (tile_size))
781     /* User explicitly specified size.  */
782     span = tile_size;
783   else
784     {
785       /* Pick a size based on the paritioning of the element loop and
786            the number of loop nests.  */
787       tree first_size = NULL_TREE;
788       tree second_size = NULL_TREE;
789 
790       if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
791           first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
792       if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
793           second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
794 
795       if (!first_size)
796           {
797             first_size = second_size;
798             second_size = NULL_TREE;
799           }
800 
801       if (loop_no + 1 == collapse)
802           {
803             span = first_size;
804             if (!loop_no && second_size)
805               span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
806                                         span, second_size);
807           }
808       else if (loop_no + 2 == collapse)
809           span = second_size;
810       else
811           span = NULL_TREE;
812 
813       if (!span)
814           /* There's no obvious element size for this loop.  Options
815              are 1, first_size or some non-unity constant (32 is my
816              favourite).   We should gather some statistics.  */
817           span = first_size;
818     }
819 
820   span = fold_convert (type, span);
821   gimplify_assign (lhs, span, &seq);
822 
823   pop_gimplify_context (NULL);
824 
825   gsi_replace_with_seq (&gsi, seq, true);
826 }
827 
/* Default and minimum size of each partitioning axis, indexed by
   GOMP_DIM_*.  Both tables are initialized by
   oacc_parse_default_dims.  */
829 
830 static int oacc_default_dims[GOMP_DIM_MAX];
831 static int oacc_min_dims[GOMP_DIM_MAX];
832 
833 int
oacc_get_default_dim(int dim)834 oacc_get_default_dim (int dim)
835 {
836   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
837   return oacc_default_dims[dim];
838 }
839 
840 int
oacc_get_min_dim(int dim)841 oacc_get_min_dim (int dim)
842 {
843   gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
844   return oacc_min_dims[dim];
845 }
846 
847 /* Parse the default dimension parameter.  This is a set of
848    :-separated optional compute dimensions.  Each specified dimension
849    is a positive integer.  When device type support is added, it is
850    planned to be a comma separated list of such compute dimensions,
851    with all but the first prefixed by the colon-terminated device
852    type.  */
853 
854 static void
oacc_parse_default_dims(const char * dims)855 oacc_parse_default_dims (const char *dims)
856 {
857   int ix;
858 
859   for (ix = GOMP_DIM_MAX; ix--;)
860     {
861       oacc_default_dims[ix] = -1;
862       oacc_min_dims[ix] = 1;
863     }
864 
865 #ifndef ACCEL_COMPILER
866   /* Cannot be overridden on the host.  */
867   dims = NULL;
868 #endif
869   if (dims)
870     {
871       const char *pos = dims;
872 
873       for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
874           {
875             if (ix)
876               {
877                 if (*pos != ':')
878                     goto malformed;
879                 pos++;
880               }
881 
882             if (*pos != ':')
883               {
884                 long val;
885                 const char *eptr;
886 
887                 errno = 0;
888                 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
889                 if (errno || val <= 0 || (int) val != val)
890                     goto malformed;
891                 pos = eptr;
892                 oacc_default_dims[ix] = (int) val;
893               }
894           }
895       if (*pos)
896           {
897           malformed:
898             error_at (UNKNOWN_LOCATION,
899                         "%<-fopenacc-dim%> operand is malformed at %qs", pos);
900           }
901     }
902 
903   /* Allow the backend to validate the dimensions.  */
904   targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
905   targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
906 }
907 
908 /* Validate and update the dimensions for offloaded FN.  ATTRS is the
909    raw attribute.  DIMS is an array of dimensions, which is filled in.
910    LEVEL is the partitioning level of a routine, or -1 for an offload
911    region itself.  USED is the mask of partitioned execution in the
912    function.  */
913 
914 static void
oacc_validate_dims(tree fn,tree attrs,int * dims,int level,unsigned used)915 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
916 {
917   tree purpose[GOMP_DIM_MAX];
918   unsigned ix;
919   tree pos = TREE_VALUE (attrs);
920 
921   /* Make sure the attribute creator attached the dimension
922      information.  */
923   gcc_assert (pos);
924 
925   for (ix = 0; ix != GOMP_DIM_MAX; ix++)
926     {
927       purpose[ix] = TREE_PURPOSE (pos);
928       tree val = TREE_VALUE (pos);
929       dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
930       pos = TREE_CHAIN (pos);
931     }
932 
933   bool check = true;
934 #ifdef ACCEL_COMPILER
935   check = false;
936 #endif
937   if (check
938       && warn_openacc_parallelism
939       && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
940     {
941       static char const *const axes[] =
942       /* Must be kept in sync with GOMP_DIM enumeration.  */
943           { "gang", "worker", "vector" };
944       for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
945           if (dims[ix] < 0)
946             ; /* Defaulting axis.  */
947           else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
948             /* There is partitioned execution, but the user requested a
949                dimension size of 1.  They're probably confused.  */
950             warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
951                           "region contains %s partitioned code but"
952                           " is not %s partitioned", axes[ix], axes[ix]);
953           else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
954             /* The dimension is explicitly partitioned to non-unity, but
955                no use is made within the region.  */
956             warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
957                           "region is %s partitioned but"
958                           " does not contain %s partitioned code",
959                           axes[ix], axes[ix]);
960     }
961 
962   bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
963 
964   /* Default anything left to 1 or a partitioned default.  */
965   for (ix = 0; ix != GOMP_DIM_MAX; ix++)
966     if (dims[ix] < 0)
967       {
968           /* The OpenACC spec says 'If the [num_gangs] clause is not
969              specified, an implementation-defined default will be used;
970              the default may depend on the code within the construct.'
971              (2.5.6).  Thus an implementation is free to choose
972              non-unity default for a parallel region that doesn't have
973              any gang-partitioned loops.  However, it appears that there
974              is a sufficient body of user code that expects non-gang
975              partitioned regions to not execute in gang-redundant mode.
976              So we (a) don't warn about the non-portability and (b) pick
977              the minimum permissible dimension size when there is no
978              partitioned execution.  Otherwise we pick the global
979              default for the dimension, which the user can control.  The
980              same wording and logic applies to num_workers and
981              vector_length, however the worker- or vector- single
982              execution doesn't have the same impact as gang-redundant
983              execution.  (If the minimum gang-level partioning is not 1,
984              the target is probably too confusing.)  */
985           dims[ix] = (used & GOMP_DIM_MASK (ix)
986                         ? oacc_default_dims[ix] : oacc_min_dims[ix]);
987           changed = true;
988       }
989 
990   if (changed)
991     {
992       /* Replace the attribute with new values.  */
993       pos = NULL_TREE;
994       for (ix = GOMP_DIM_MAX; ix--;)
995           pos = tree_cons (purpose[ix],
996                                build_int_cst (integer_type_node, dims[ix]), pos);
997       oacc_replace_fn_attrib (fn, pos);
998     }
999 }
1000 
1001 /* Create an empty OpenACC loop structure at LOC.  */
1002 
1003 static oacc_loop *
new_oacc_loop_raw(oacc_loop * parent,location_t loc)1004 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1005 {
1006   oacc_loop *loop = XCNEW (oacc_loop);
1007 
1008   loop->parent = parent;
1009 
1010   if (parent)
1011     {
1012       loop->sibling = parent->child;
1013       parent->child = loop;
1014     }
1015 
1016   loop->loc = loc;
1017   return loop;
1018 }
1019 
1020 /* Create an outermost, dummy OpenACC loop for offloaded function
1021    DECL.  */
1022 
1023 static oacc_loop *
new_oacc_loop_outer(tree decl)1024 new_oacc_loop_outer (tree decl)
1025 {
1026   return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
1027 }
1028 
1029 /* Start a new OpenACC loop  structure beginning at head marker HEAD.
1030    Link into PARENT loop.  Return the new loop.  */
1031 
1032 static oacc_loop *
new_oacc_loop(oacc_loop * parent,gcall * marker)1033 new_oacc_loop (oacc_loop *parent, gcall *marker)
1034 {
1035   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1036 
1037   loop->marker = marker;
1038 
1039   /* TODO: This is where device_type flattening would occur for the loop
1040      flags.  */
1041 
1042   loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1043 
1044   tree chunk_size = integer_zero_node;
1045   if (loop->flags & OLF_GANG_STATIC)
1046     chunk_size = gimple_call_arg (marker, 4);
1047   loop->chunk_size = chunk_size;
1048 
1049   return loop;
1050 }
1051 
1052 /* Create a dummy loop encompassing a call to a openACC routine.
1053    Extract the routine's partitioning requirements.  */
1054 
1055 static void
new_oacc_loop_routine(oacc_loop * parent,gcall * call,tree decl,tree attrs)1056 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1057 {
1058   oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1059   int level = oacc_fn_attrib_level (attrs);
1060 
1061   gcc_assert (level >= 0);
1062 
1063   loop->marker = call;
1064   loop->routine = decl;
1065   loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1066                     ^ (GOMP_DIM_MASK (level) - 1));
1067 }
1068 
1069 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1070    Return the parent loop.  */
1071 
1072 static oacc_loop *
finish_oacc_loop(oacc_loop * loop)1073 finish_oacc_loop (oacc_loop *loop)
1074 {
1075   /* If the loop has been collapsed, don't partition it.  */
1076   if (loop->ifns.is_empty ())
1077     loop->mask = loop->flags = 0;
1078   return loop->parent;
1079 }
1080 
1081 /* Free all OpenACC loop structures within LOOP (inclusive).  */
1082 
1083 static void
free_oacc_loop(oacc_loop * loop)1084 free_oacc_loop (oacc_loop *loop)
1085 {
1086   if (loop->sibling)
1087     free_oacc_loop (loop->sibling);
1088   if (loop->child)
1089     free_oacc_loop (loop->child);
1090 
1091   loop->ifns.release ();
1092   free (loop);
1093 }
1094 
1095 /* Dump out the OpenACC loop head or tail beginning at FROM.  */
1096 
1097 static void
dump_oacc_loop_part(FILE * file,gcall * from,int depth,const char * title,int level)1098 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
1099                          const char *title, int level)
1100 {
1101   enum ifn_unique_kind kind
1102     = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1103 
1104   fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1105   for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1106     {
1107       gimple *stmt = gsi_stmt (gsi);
1108 
1109       if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1110           {
1111             enum ifn_unique_kind k
1112               = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1113                  (gimple_call_arg (stmt, 0)));
1114 
1115             if (k == kind && stmt != from)
1116               break;
1117           }
1118       print_gimple_stmt (file, stmt, depth * 2 + 2);
1119 
1120       gsi_next (&gsi);
1121       while (gsi_end_p (gsi))
1122           gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1123     }
1124 }
1125 
1126 /* Dump OpenACC loop LOOP, its children, and its siblings.  */
1127 
1128 static void
dump_oacc_loop(FILE * file,oacc_loop * loop,int depth)1129 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1130 {
1131   int ix;
1132 
1133   fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1134              loop->flags, loop->mask,
1135              LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1136 
1137   if (loop->marker)
1138     print_gimple_stmt (file, loop->marker, depth * 2);
1139 
1140   if (loop->routine)
1141     fprintf (file, "%*sRoutine %s:%u:%s\n",
1142                depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1143                DECL_SOURCE_LINE (loop->routine),
1144                IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1145 
1146   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1147     if (loop->heads[ix])
1148       dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1149   for (ix = GOMP_DIM_MAX; ix--;)
1150     if (loop->tails[ix])
1151       dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1152 
1153   if (loop->child)
1154     dump_oacc_loop (file, loop->child, depth + 1);
1155   if (loop->sibling)
1156     dump_oacc_loop (file, loop->sibling, depth);
1157 }
1158 
1159 void debug_oacc_loop (oacc_loop *);
1160 
1161 /* Dump loops to stderr.  */
1162 
1163 DEBUG_FUNCTION void
debug_oacc_loop(oacc_loop * loop)1164 debug_oacc_loop (oacc_loop *loop)
1165 {
1166   dump_oacc_loop (stderr, loop, 0);
1167 }
1168 
1169 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1170    siblings.  */
1171 
1172 static void
inform_oacc_loop(const oacc_loop * loop)1173 inform_oacc_loop (const oacc_loop *loop)
1174 {
1175   const char *gang
1176     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1177   const char *worker
1178     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1179   const char *vector
1180     = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1181   const char *seq = loop->mask == 0 ? " seq" : "";
1182   const dump_user_location_t loc
1183     = dump_user_location_t::from_location_t (loop->loc);
1184   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1185                        "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1186                        vector, seq);
1187 
1188   if (loop->child)
1189     inform_oacc_loop (loop->child);
1190   if (loop->sibling)
1191     inform_oacc_loop (loop->sibling);
1192 }
1193 
1194 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1195    structures as we go.  By construction these loops are properly
1196    nested.  */
1197 
1198 static void
oacc_loop_discover_walk(oacc_loop * loop,basic_block bb)1199 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
1200 {
1201   int marker = 0;
1202   int remaining = 0;
1203 
1204   if (bb->flags & BB_VISITED)
1205     return;
1206 
1207  follow:
1208   bb->flags |= BB_VISITED;
1209 
1210   /* Scan for loop markers.  */
1211   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1212        gsi_next (&gsi))
1213     {
1214       gimple *stmt = gsi_stmt (gsi);
1215 
1216       if (!is_gimple_call (stmt))
1217           continue;
1218 
1219       gcall *call = as_a <gcall *> (stmt);
1220 
1221       /* If this is a routine, make a dummy loop for it.  */
1222       if (tree decl = gimple_call_fndecl (call))
1223           if (tree attrs = oacc_get_fn_attrib (decl))
1224             {
1225               gcc_assert (!marker);
1226               new_oacc_loop_routine (loop, call, decl, attrs);
1227             }
1228 
1229       if (!gimple_call_internal_p (call))
1230           continue;
1231 
1232       switch (gimple_call_internal_fn (call))
1233           {
1234           default:
1235             break;
1236 
1237           case IFN_GOACC_LOOP:
1238           case IFN_GOACC_TILE:
1239             /* Record the abstraction function, so we can manipulate it
1240                later.  */
1241             loop->ifns.safe_push (call);
1242             break;
1243 
1244           case IFN_UNIQUE:
1245             enum ifn_unique_kind kind
1246               = (enum ifn_unique_kind) (TREE_INT_CST_LOW
1247                                               (gimple_call_arg (call, 0)));
1248             if (kind == IFN_UNIQUE_OACC_HEAD_MARK
1249                 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
1250               {
1251                 if (gimple_call_num_args (call) == 2)
1252                     {
1253                       gcc_assert (marker && !remaining);
1254                       marker = 0;
1255                       if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1256                         loop = finish_oacc_loop (loop);
1257                       else
1258                         loop->head_end = call;
1259                     }
1260                 else
1261                     {
1262                       int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1263 
1264                       if (!marker)
1265                         {
1266                           if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1267                               loop = new_oacc_loop (loop, call);
1268                           remaining = count;
1269                         }
1270                       gcc_assert (count == remaining);
1271                       if (remaining)
1272                         {
1273                           remaining--;
1274                           if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1275                               loop->heads[marker] = call;
1276                           else
1277                               loop->tails[remaining] = call;
1278                         }
1279                       marker++;
1280                     }
1281               }
1282           }
1283     }
1284   if (remaining || marker)
1285     {
1286       bb = single_succ (bb);
1287       gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1288       goto follow;
1289     }
1290 
1291   /* Walk successor blocks.  */
1292   edge e;
1293   edge_iterator ei;
1294 
1295   FOR_EACH_EDGE (e, ei, bb->succs)
1296     oacc_loop_discover_walk (loop, e->dest);
1297 }
1298 
1299 /* LOOP is the first sibling.  Reverse the order in place and return
1300    the new first sibling.  Recurse to child loops.  */
1301 
1302 static oacc_loop *
oacc_loop_sibling_nreverse(oacc_loop * loop)1303 oacc_loop_sibling_nreverse (oacc_loop *loop)
1304 {
1305   oacc_loop *last = NULL;
1306   do
1307     {
1308       if (loop->child)
1309           loop->child = oacc_loop_sibling_nreverse (loop->child);
1310 
1311       oacc_loop *next = loop->sibling;
1312       loop->sibling = last;
1313       last = loop;
1314       loop = next;
1315     }
1316   while (loop);
1317 
1318   return last;
1319 }
1320 
1321 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1322    the current function.  */
1323 
1324 static oacc_loop *
oacc_loop_discovery()1325 oacc_loop_discovery ()
1326 {
1327   /* Clear basic block flags, in particular BB_VISITED which we're going to use
1328      in the following.  */
1329   clear_bb_flags ();
1330 
1331   oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1332   oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1333 
1334   /* The siblings were constructed in reverse order, reverse them so
1335      that diagnostics come out in an unsurprising order.  */
1336   top = oacc_loop_sibling_nreverse (top);
1337 
1338   return top;
1339 }
1340 
1341 /* Transform the abstract internal function markers starting at FROM
1342    to be for partitioning level LEVEL.  Stop when we meet another HEAD
1343    or TAIL  marker.  */
1344 
1345 static void
oacc_loop_xform_head_tail(gcall * from,int level)1346 oacc_loop_xform_head_tail (gcall *from, int level)
1347 {
1348   enum ifn_unique_kind kind
1349     = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1350   tree replacement = build_int_cst (unsigned_type_node, level);
1351 
1352   for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1353     {
1354       gimple *stmt = gsi_stmt (gsi);
1355 
1356       if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1357           {
1358             enum ifn_unique_kind k
1359               = ((enum ifn_unique_kind)
1360                  TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1361 
1362             if (k == IFN_UNIQUE_OACC_FORK
1363                 || k == IFN_UNIQUE_OACC_JOIN
1364                 || k == IFN_UNIQUE_OACC_PRIVATE)
1365               *gimple_call_arg_ptr (stmt, 2) = replacement;
1366             else if (k == kind && stmt != from)
1367               break;
1368           }
1369       else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1370           *gimple_call_arg_ptr (stmt, 3) = replacement;
1371       update_stmt (stmt);
1372 
1373       gsi_next (&gsi);
1374       while (gsi_end_p (gsi))
1375           gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1376     }
1377 }
1378 
1379 /* Process the discovered OpenACC loops, setting the correct
1380    partitioning level etc.  */
1381 
1382 static void
oacc_loop_process(oacc_loop * loop,int fn_level)1383 oacc_loop_process (oacc_loop *loop, int fn_level)
1384 {
1385   if (loop->child)
1386     oacc_loop_process (loop->child, fn_level);
1387 
1388   if (loop->mask && !loop->routine)
1389     {
1390       int ix;
1391       tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1392       tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1393       tree chunk_arg = loop->chunk_size;
1394       gcall *call;
1395 
1396       for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1397           {
1398             switch (gimple_call_internal_fn (call))
1399               {
1400               case IFN_GOACC_LOOP:
1401                 {
1402                     bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1403                     gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1404                     if (!is_e)
1405                       gimple_call_set_arg (call, 4, chunk_arg);
1406                 }
1407                 break;
1408 
1409               case IFN_GOACC_TILE:
1410                 gimple_call_set_arg (call, 3, mask_arg);
1411                 gimple_call_set_arg (call, 4, e_mask_arg);
1412                 break;
1413 
1414               default:
1415                 gcc_unreachable ();
1416               }
1417             update_stmt (call);
1418           }
1419 
1420       unsigned dim = GOMP_DIM_GANG;
1421       unsigned mask = loop->mask | loop->e_mask;
1422       for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1423           {
1424             while (!(GOMP_DIM_MASK (dim) & mask))
1425               dim++;
1426 
1427             oacc_loop_xform_head_tail (loop->heads[ix], dim);
1428             oacc_loop_xform_head_tail (loop->tails[ix], dim);
1429 
1430             mask ^= GOMP_DIM_MASK (dim);
1431           }
1432     }
1433 
1434   if (loop->sibling)
1435     oacc_loop_process (loop->sibling, fn_level);
1436 
1437 
1438   /* OpenACC 2.6, 2.9.11. "reduction clause" places a restriction such that
1439      "The 'reduction' clause may not be specified on an orphaned 'loop'
1440      construct with the 'gang' clause, or on an orphaned 'loop' construct that
1441      will generate gang parallelism in a procedure that is compiled with the
1442      'routine gang' clause."  */
1443   if (fn_level == GOMP_DIM_GANG
1444       && (loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1445       && (loop->flags & OLF_REDUCTION))
1446     error_at (loop->loc,
1447                 "gang reduction on an orphan loop");
1448 }
1449 
1450 /* Walk the OpenACC loop heirarchy checking and assigning the
1451    programmer-specified partitionings.  OUTER_MASK is the partitioning
1452    this loop is contained within.  Return mask of partitioning
1453    encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
1454    bit.  */
1455 
1456 static unsigned
oacc_loop_fixed_partitions(oacc_loop * loop,unsigned outer_mask)1457 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1458 {
1459   unsigned this_mask = loop->mask;
1460   unsigned mask_all = 0;
1461   bool noisy = true;
1462 
1463 #ifdef ACCEL_COMPILER
1464   /* When device_type is supported, we want the device compiler to be
1465      noisy, if the loop parameters are device_type-specific.  */
1466   noisy = false;
1467 #endif
1468 
1469   if (!loop->routine)
1470     {
1471       bool auto_par = (loop->flags & OLF_AUTO) != 0;
1472       bool seq_par = (loop->flags & OLF_SEQ) != 0;
1473       bool tiling = (loop->flags & OLF_TILE) != 0;
1474 
1475       this_mask = ((loop->flags >> OLF_DIM_BASE)
1476                        & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1477 
1478       /* Apply auto partitioning if this is a non-partitioned regular
1479            loop, or (no more than) single axis tiled loop.  */
1480       bool maybe_auto
1481           = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1482 
1483       if ((this_mask != 0) + auto_par + seq_par > 1)
1484           {
1485             if (noisy)
1486               error_at (loop->loc,
1487                           seq_par
1488                           ? G_("%<seq%> overrides other OpenACC loop specifiers")
1489                           : G_("%<auto%> conflicts with other OpenACC loop "
1490                                  "specifiers"));
1491             maybe_auto = false;
1492             loop->flags &= ~OLF_AUTO;
1493             if (seq_par)
1494               {
1495                 loop->flags
1496                     &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1497                 this_mask = 0;
1498               }
1499           }
1500 
1501       if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1502           {
1503             loop->flags |= OLF_AUTO;
1504             mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1505           }
1506     }
1507 
1508   if (this_mask & outer_mask)
1509     {
1510       const oacc_loop *outer;
1511       for (outer = loop->parent; outer; outer = outer->parent)
1512           if ((outer->mask | outer->e_mask) & this_mask)
1513             break;
1514 
1515       if (noisy)
1516           {
1517             if (outer)
1518               {
1519                 error_at (loop->loc,
1520                               loop->routine
1521                               ? G_("routine call uses same OpenACC parallelism"
1522                                    " as containing loop")
1523                               : G_("inner loop uses same OpenACC parallelism"
1524                                    " as containing loop"));
1525                 inform (outer->loc, "containing loop here");
1526               }
1527             else
1528               error_at (loop->loc,
1529                           loop->routine
1530                           ? G_("routine call uses OpenACC parallelism disallowed"
1531                                  " by containing routine")
1532                           : G_("loop uses OpenACC parallelism disallowed"
1533                                  " by containing routine"));
1534 
1535             if (loop->routine)
1536               inform (DECL_SOURCE_LOCATION (loop->routine),
1537                         "routine %qD declared here", loop->routine);
1538           }
1539       this_mask &= ~outer_mask;
1540     }
1541   else
1542     {
1543       unsigned outermost = least_bit_hwi (this_mask);
1544 
1545       if (outermost && outermost <= outer_mask)
1546           {
1547             if (noisy)
1548               {
1549                 error_at (loop->loc,
1550                               "incorrectly nested OpenACC loop parallelism");
1551 
1552                 const oacc_loop *outer;
1553                 for (outer = loop->parent;
1554                        outer->flags && outer->flags < outermost;
1555                        outer = outer->parent)
1556                     continue;
1557                 inform (outer->loc, "containing loop here");
1558               }
1559 
1560             this_mask &= ~outermost;
1561           }
1562     }
1563 
1564   mask_all |= this_mask;
1565 
1566   if (loop->flags & OLF_TILE)
1567     {
1568       /* When tiling, vector goes to the element loop, and failing
1569            that we put worker there.  The std doesn't contemplate
1570            specifying all three.  We choose to put worker and vector on
1571            the element loops in that case.  */
1572       unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1573       if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1574           this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1575 
1576       loop->e_mask = this_e_mask;
1577       this_mask ^= this_e_mask;
1578     }
1579 
1580   loop->mask = this_mask;
1581 
1582   if (dump_file)
1583     fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1584                LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1585                loop->mask, loop->e_mask);
1586 
1587   if (loop->child)
1588     {
1589       unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1590       loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1591       mask_all |= loop->inner;
1592     }
1593 
1594   if (loop->sibling)
1595     mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1596 
1597   return mask_all;
1598 }
1599 
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1601    OUTER_MASK is the partitioning this loop is contained within.
1602    OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1603    Return the cumulative partitioning used by this loop, siblings and
1604    children.  */
1605 
1606 static unsigned
oacc_loop_auto_partitions(oacc_loop * loop,unsigned outer_mask,bool outer_assign)1607 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1608                                  bool outer_assign)
1609 {
1610   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1611   bool noisy = true;
1612   bool tiling = loop->flags & OLF_TILE;
1613 
1614 #ifdef ACCEL_COMPILER
1615   /* When device_type is supported, we want the device compiler to be
1616      noisy, if the loop parameters are device_type-specific.  */
1617   noisy = false;
1618 #endif
1619 
1620   if (assign && (!outer_assign || loop->inner))
1621     {
1622       /* Allocate outermost and non-innermost loops at the outermost
1623            non-innermost available level.  */
1624       unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1625 
1626       /* Find the first outermost available partition. */
1627       while (this_mask <= outer_mask)
1628           this_mask <<= 1;
1629 
1630       /* Grab two axes if tiling, and we've not assigned anything  */
1631       if (tiling && !(loop->mask | loop->e_mask))
1632           this_mask |= this_mask << 1;
1633 
1634       /* Prohibit the innermost partitioning at the moment.  */
1635       this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1636 
1637       /* Don't use any dimension explicitly claimed by an inner loop. */
1638       this_mask &= ~loop->inner;
1639 
1640       if (tiling && !loop->e_mask)
1641           {
1642             /* If we got two axes, allocate the inner one to the element
1643                loop.  */
1644             loop->e_mask = this_mask & (this_mask << 1);
1645             this_mask ^= loop->e_mask;
1646           }
1647 
1648       loop->mask |= this_mask;
1649     }
1650 
1651   if (loop->child)
1652     {
1653       unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1654       loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1655                                                          outer_assign | assign);
1656     }
1657 
1658   if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1659     {
1660       /* Allocate the loop at the innermost available level.  Note
1661            that we do this even if we already assigned this loop the
1662            outermost available level above.  That way we'll partition
1663            this along 2 axes, if they are available.  */
1664       unsigned this_mask = 0;
1665 
1666       /* Determine the outermost partitioning used within this loop.  */
1667       this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1668       this_mask = least_bit_hwi (this_mask);
1669 
1670       /* Pick the partitioning just inside that one.  */
1671       this_mask >>= 1;
1672 
1673       /* And avoid picking one use by an outer loop.  */
1674       this_mask &= ~outer_mask;
1675 
1676       /* If tiling and we failed completely above, grab the next one
1677            too.  Making sure it doesn't hit an outer loop.  */
1678       if (tiling)
1679           {
1680             this_mask &= ~(loop->e_mask | loop->mask);
1681             unsigned tile_mask = ((this_mask >> 1)
1682                                         & ~(outer_mask | loop->e_mask | loop->mask));
1683 
1684             if (tile_mask || loop->mask)
1685               {
1686                 loop->e_mask |= this_mask;
1687                 this_mask = tile_mask;
1688               }
1689             if (!loop->e_mask && noisy)
1690               warning_at (loop->loc, 0,
1691                               "insufficient partitioning available"
1692                               " to parallelize element loop");
1693           }
1694 
1695       loop->mask |= this_mask;
1696       if (!loop->mask && noisy)
1697           warning_at (loop->loc, 0,
1698                         tiling
1699                         ? G_("insufficient partitioning available"
1700                                " to parallelize tile loop")
1701                         : G_("insufficient partitioning available"
1702                                " to parallelize loop"));
1703     }
1704 
1705   if (assign && dump_file)
1706     fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1707                LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1708                loop->mask, loop->e_mask);
1709 
1710   unsigned inner_mask = 0;
1711 
1712   if (loop->sibling)
1713     inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1714                                                        outer_mask, outer_assign);
1715 
1716   inner_mask |= loop->inner | loop->mask | loop->e_mask;
1717 
1718   return inner_mask;
1719 }
1720 
1721 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1722    axes.  Return mask of partitioning.  */
1723 
1724 static unsigned
oacc_loop_partition(oacc_loop * loop,unsigned outer_mask)1725 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1726 {
1727   unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1728 
1729   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1730     {
1731       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1732       mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1733     }
1734   return mask_all;
1735 }
1736 
1737 /* Default fork/join early expander.  Delete the function calls if
1738    there is no RTL expander.  */
1739 
1740 bool
default_goacc_fork_join(gcall * ARG_UNUSED (call),const int * ARG_UNUSED (dims),bool is_fork)1741 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1742                                const int *ARG_UNUSED (dims), bool is_fork)
1743 {
1744   if (is_fork)
1745     return targetm.have_oacc_fork ();
1746   else
1747     return targetm.have_oacc_join ();
1748 }
1749 
1750 /* Default goacc.reduction early expander.
1751 
1752    LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1753    If RES_PTR is not integer-zerop:
1754        SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1755        TEARDOWN - emit '*RES_PTR = VAR'
1756    If LHS is not NULL
1757        emit 'LHS = VAR'   */
1758 
1759 void
default_goacc_reduction(gcall * call)1760 default_goacc_reduction (gcall *call)
1761 {
1762   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1763   gimple_stmt_iterator gsi = gsi_for_stmt (call);
1764   tree lhs = gimple_call_lhs (call);
1765   tree var = gimple_call_arg (call, 2);
1766   gimple_seq seq = NULL;
1767 
1768   if (code == IFN_GOACC_REDUCTION_SETUP
1769       || code == IFN_GOACC_REDUCTION_TEARDOWN)
1770     {
1771       /* Setup and Teardown need to copy from/to the receiver object,
1772            if there is one.  */
1773       tree ref_to_res = gimple_call_arg (call, 1);
1774 
1775       if (!integer_zerop (ref_to_res))
1776           {
1777             tree dst = build_simple_mem_ref (ref_to_res);
1778             tree src = var;
1779 
1780             if (code == IFN_GOACC_REDUCTION_SETUP)
1781               {
1782                 src = dst;
1783                 dst = lhs;
1784                 lhs = NULL;
1785               }
1786             gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1787           }
1788     }
1789 
1790   /* Copy VAR to LHS, if there is an LHS.  */
1791   if (lhs)
1792     gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1793 
1794   gsi_replace_with_seq (&gsi, seq, true);
1795 }
1796 
1797 struct var_decl_rewrite_info
1798 {
1799   gimple *stmt;
1800   hash_map<tree, tree> *adjusted_vars;
1801   bool avoid_pointer_conversion;
1802   bool modified;
1803 };
1804 
1805 /* Helper function for execute_oacc_device_lower.  Rewrite VAR_DECLs (by
1806    themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
1807    the var_decl_rewrite_info pointed to via DATA.  Used as part of coercing
1808    gang-private variables in OpenACC offload regions to reside in GPU shared
1809    memory.  */
1810 
1811 static tree
oacc_rewrite_var_decl(tree * tp,int * walk_subtrees,void * data)1812 oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
1813 {
1814   walk_stmt_info *wi = (walk_stmt_info *) data;
1815   var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;
1816 
1817   if (TREE_CODE (*tp) == ADDR_EXPR)
1818     {
1819       tree arg = TREE_OPERAND (*tp, 0);
1820       tree *new_arg = info->adjusted_vars->get (arg);
1821 
1822       if (new_arg)
1823           {
1824             if (info->avoid_pointer_conversion)
1825               {
1826                 *tp = build_fold_addr_expr (*new_arg);
1827                 info->modified = true;
1828                 *walk_subtrees = 0;
1829               }
1830             else
1831               {
1832                 gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
1833                 tree repl = build_fold_addr_expr (*new_arg);
1834                 gimple *stmt1
1835                     = gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
1836                 tree conv = convert_to_pointer (TREE_TYPE (*tp),
1837                                                         gimple_assign_lhs (stmt1));
1838                 gimple *stmt2
1839                     = gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
1840                 gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
1841                 gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
1842                 *tp = gimple_assign_lhs (stmt2);
1843                 info->modified = true;
1844                 *walk_subtrees = 0;
1845               }
1846           }
1847     }
1848   else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
1849     {
1850       tree *base = &TREE_OPERAND (*tp, 0);
1851 
1852       while (TREE_CODE (*base) == COMPONENT_REF
1853                || TREE_CODE (*base) == ARRAY_REF)
1854           base = &TREE_OPERAND (*base, 0);
1855 
1856       if (TREE_CODE (*base) != VAR_DECL)
1857           return NULL;
1858 
1859       tree *new_decl = info->adjusted_vars->get (*base);
1860       if (!new_decl)
1861           return NULL;
1862 
1863       int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
1864       tree field = TREE_OPERAND (*tp, 1);
1865 
1866       /* Adjust the type of the field.  */
1867       int field_quals = TYPE_QUALS (TREE_TYPE (field));
1868       if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
1869           {
1870             tree *field_type = &TREE_TYPE (field);
1871             while (TREE_CODE (*field_type) == ARRAY_TYPE)
1872               field_type = &TREE_TYPE (*field_type);
1873             field_quals |= base_quals;
1874             *field_type = build_qualified_type (*field_type, field_quals);
1875           }
1876 
1877       /* Adjust the type of the component ref itself.  */
1878       tree comp_type = TREE_TYPE (*tp);
1879       int comp_quals = TYPE_QUALS (comp_type);
1880       if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
1881           {
1882             comp_quals |= base_quals;
1883             TREE_TYPE (*tp)
1884               = build_qualified_type (comp_type, comp_quals);
1885           }
1886 
1887       *base = *new_decl;
1888       info->modified = true;
1889     }
1890   else if (TREE_CODE (*tp) == VAR_DECL)
1891     {
1892       tree *new_decl = info->adjusted_vars->get (*tp);
1893       if (new_decl)
1894           {
1895             *tp = *new_decl;
1896             info->modified = true;
1897           }
1898     }
1899 
1900   return NULL_TREE;
1901 }
1902 
1903 /* Return TRUE if CALL is a call to a builtin atomic/sync operation.  */
1904 
1905 static bool
is_sync_builtin_call(gcall * call)1906 is_sync_builtin_call (gcall *call)
1907 {
1908   tree callee = gimple_call_fndecl (call);
1909 
1910   if (callee != NULL_TREE
1911       && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
1912     switch (DECL_FUNCTION_CODE (callee))
1913       {
1914 #undef DEF_SYNC_BUILTIN
1915 #define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
1916 #include "sync-builtins.def"
1917 #undef DEF_SYNC_BUILTIN
1918           return true;
1919 
1920       default:
1921           ;
1922       }
1923 
1924   return false;
1925 }
1926 
1927 /* Main entry point for oacc transformations which run on the device
1928    compiler after LTO, so we know what the target device is at this
1929    point (including the host fallback).  */
1930 
1931 static unsigned int
execute_oacc_loop_designation()1932 execute_oacc_loop_designation ()
1933 {
1934   tree attrs = oacc_get_fn_attrib (current_function_decl);
1935 
1936   if (!attrs)
1937     /* Not an offloaded function.  */
1938     return 0;
1939 
1940   /* Parse the default dim argument exactly once.  */
1941   if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1942     {
1943       oacc_parse_default_dims (flag_openacc_dims);
1944       flag_openacc_dims = (char *)&flag_openacc_dims;
1945     }
1946 
1947   bool is_oacc_parallel
1948     = (lookup_attribute ("oacc parallel",
1949                                DECL_ATTRIBUTES (current_function_decl)) != NULL);
1950   bool is_oacc_kernels
1951     = (lookup_attribute ("oacc kernels",
1952                                DECL_ATTRIBUTES (current_function_decl)) != NULL);
1953   bool is_oacc_serial
1954     = (lookup_attribute ("oacc serial",
1955                                DECL_ATTRIBUTES (current_function_decl)) != NULL);
1956   bool is_oacc_parallel_kernels_parallelized
1957     = (lookup_attribute ("oacc parallel_kernels_parallelized",
1958                                DECL_ATTRIBUTES (current_function_decl)) != NULL);
1959   bool is_oacc_parallel_kernels_gang_single
1960     = (lookup_attribute ("oacc parallel_kernels_gang_single",
1961                                DECL_ATTRIBUTES (current_function_decl)) != NULL);
1962   int fn_level = oacc_fn_attrib_level (attrs);
1963   bool is_oacc_routine = (fn_level >= 0);
1964   gcc_checking_assert (is_oacc_parallel
1965                            + is_oacc_kernels
1966                            + is_oacc_serial
1967                            + is_oacc_parallel_kernels_parallelized
1968                            + is_oacc_parallel_kernels_gang_single
1969                            + is_oacc_routine
1970                            == 1);
1971 
1972   bool is_oacc_kernels_parallelized
1973     = (lookup_attribute ("oacc kernels parallelized",
1974                                DECL_ATTRIBUTES (current_function_decl)) != NULL);
1975   if (is_oacc_kernels_parallelized)
1976     gcc_checking_assert (is_oacc_kernels);
1977 
1978   if (dump_file)
1979     {
1980       if (is_oacc_parallel)
1981           fprintf (dump_file, "Function is OpenACC parallel offload\n");
1982       else if (is_oacc_kernels)
1983           fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1984                      (is_oacc_kernels_parallelized
1985                       ? "parallelized" : "unparallelized"));
1986       else if (is_oacc_serial)
1987           fprintf (dump_file, "Function is OpenACC serial offload\n");
1988       else if (is_oacc_parallel_kernels_parallelized)
1989           fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1990                      "parallel_kernels_parallelized");
1991       else if (is_oacc_parallel_kernels_gang_single)
1992           fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1993                      "parallel_kernels_gang_single");
1994       else if (is_oacc_routine)
1995           fprintf (dump_file, "Function is OpenACC routine level %d\n",
1996                      fn_level);
1997       else
1998           gcc_unreachable ();
1999     }
2000 
2001   /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
2002      it's a convenient place, so...  */
2003   if (is_oacc_routine)
2004     {
2005       tree attr = lookup_attribute ("omp declare target",
2006                                             DECL_ATTRIBUTES (current_function_decl));
2007       gcc_checking_assert (attr);
2008       tree clauses = TREE_VALUE (attr);
2009       gcc_checking_assert (clauses);
2010 
2011       /* Should this OpenACC routine be discarded?  */
2012       bool discard = false;
2013 
2014       tree clause_nohost = omp_find_clause (clauses, OMP_CLAUSE_NOHOST);
2015       if (dump_file)
2016           fprintf (dump_file,
2017                      "OpenACC routine '%s' %s '%s' clause.\n",
2018                      lang_hooks.decl_printable_name (current_function_decl, 2),
2019                      clause_nohost ? "has" : "doesn't have",
2020                      omp_clause_code_name[OMP_CLAUSE_NOHOST]);
2021       /* Host compiler, 'nohost' clause?  */
2022 #ifndef ACCEL_COMPILER
2023       if (clause_nohost)
2024           discard = true;
2025 #endif
2026 
2027       if (dump_file)
2028           fprintf (dump_file,
2029                      "OpenACC routine '%s' %sdiscarded.\n",
2030                      lang_hooks.decl_printable_name (current_function_decl, 2),
2031                      discard ? "" : "not ");
2032       if (discard)
2033           {
2034             TREE_ASM_WRITTEN (current_function_decl) = 1;
2035             return TODO_discard_function;
2036           }
2037     }
2038 
2039   /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2040      kernels, so remove the parallelism dimensions function attributes
2041      potentially set earlier on.  */
2042   if (is_oacc_kernels && !is_oacc_kernels_parallelized)
2043     {
2044       oacc_set_fn_attrib (current_function_decl, NULL, NULL);
2045       attrs = oacc_get_fn_attrib (current_function_decl);
2046     }
2047 
2048   /* Discover, partition and process the loops.  */
2049   oacc_loop *loops = oacc_loop_discovery ();
2050 
2051   unsigned outer_mask = 0;
2052   if (is_oacc_routine)
2053     outer_mask = GOMP_DIM_MASK (fn_level) - 1;
2054   unsigned used_mask = oacc_loop_partition (loops, outer_mask);
2055   /* OpenACC kernels constructs are special: they currently don't use the
2056      generic oacc_loop infrastructure and attribute/dimension processing.  */
2057   if (is_oacc_kernels && is_oacc_kernels_parallelized)
2058     {
2059       /* Parallelized OpenACC kernels constructs use gang parallelism.  See
2060            also tree-parloops.cc:create_parallel_loop.  */
2061       used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
2062     }
2063 
2064   int dims[GOMP_DIM_MAX];
2065   oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
2066 
2067   if (dump_file)
2068     {
2069       const char *comma = "Compute dimensions [";
2070       for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
2071           fprintf (dump_file, "%s%d", comma, dims[ix]);
2072       fprintf (dump_file, "]\n");
2073     }
2074 
2075   /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2076      a single gang only.  */
2077   if (is_oacc_parallel_kernels_gang_single)
2078     gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);
2079 
2080   oacc_loop_process (loops, fn_level);
2081   if (dump_file)
2082     {
2083       fprintf (dump_file, "OpenACC loops\n");
2084       dump_oacc_loop (dump_file, loops, 0);
2085       fprintf (dump_file, "\n");
2086     }
2087   if (dump_enabled_p ())
2088     {
2089       oacc_loop *l = loops;
2090       /* OpenACC kernels constructs are special: they currently don't use the
2091            generic oacc_loop infrastructure.  */
2092       if (is_oacc_kernels)
2093           {
2094             /* Create a fake oacc_loop for diagnostic purposes.  */
2095             l = new_oacc_loop_raw (NULL,
2096                                          DECL_SOURCE_LOCATION (current_function_decl));
2097             l->mask = used_mask;
2098           }
2099       else
2100           {
2101             /* Skip the outermost, dummy OpenACC loop  */
2102             l = l->child;
2103           }
2104       if (l)
2105           inform_oacc_loop (l);
2106       if (is_oacc_kernels)
2107           free_oacc_loop (l);
2108     }
2109 
2110   free_oacc_loop (loops);
2111 
2112   return 0;
2113 }
2114 
2115 static unsigned int
execute_oacc_device_lower()2116 execute_oacc_device_lower ()
2117 {
2118   tree attrs = oacc_get_fn_attrib (current_function_decl);
2119 
2120   if (!attrs)
2121     /* Not an offloaded function.  */
2122     return 0;
2123 
2124   int dims[GOMP_DIM_MAX];
2125   for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
2126     dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
2127 
2128   hash_map<tree, tree> adjusted_vars;
2129 
2130   /* Now lower internal loop functions to target-specific code
2131      sequences.  */
2132   basic_block bb;
2133   FOR_ALL_BB_FN (bb, cfun)
2134     for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
2135       {
2136           gimple *stmt = gsi_stmt (gsi);
2137           if (!is_gimple_call (stmt))
2138             {
2139               gsi_next (&gsi);
2140               continue;
2141             }
2142 
2143           gcall *call = as_a <gcall *> (stmt);
2144           if (!gimple_call_internal_p (call))
2145             {
2146               gsi_next (&gsi);
2147               continue;
2148             }
2149 
2150           /* Rewind to allow rescan.  */
2151           gsi_prev (&gsi);
2152           bool rescan = false, remove = false;
2153           enum  internal_fn ifn_code = gimple_call_internal_fn (call);
2154 
2155           switch (ifn_code)
2156             {
2157             default: break;
2158 
2159             case IFN_GOACC_TILE:
2160               oacc_xform_tile (call);
2161               rescan = true;
2162               break;
2163 
2164             case IFN_GOACC_LOOP:
2165               oacc_xform_loop (call);
2166               rescan = true;
2167               break;
2168 
2169             case IFN_GOACC_REDUCTION:
2170               /* Mark the function for SSA renaming.  */
2171               mark_virtual_operands_for_renaming (cfun);
2172 
2173               /* If the level is -1, this ended up being an unused
2174                  axis.  Handle as a default.  */
2175               if (integer_minus_onep (gimple_call_arg (call, 3)))
2176                 default_goacc_reduction (call);
2177               else
2178                 targetm.goacc.reduction (call);
2179               rescan = true;
2180               break;
2181 
2182             case IFN_UNIQUE:
2183               {
2184                 enum ifn_unique_kind kind
2185                     = ((enum ifn_unique_kind)
2186                        TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
2187 
2188                 switch (kind)
2189                     {
2190                     default:
2191                       break;
2192 
2193                     case IFN_UNIQUE_OACC_FORK:
2194                     case IFN_UNIQUE_OACC_JOIN:
2195                       if (integer_minus_onep (gimple_call_arg (call, 2)))
2196                         remove = true;
2197                       else if (!targetm.goacc.fork_join
2198                                  (call, dims, kind == IFN_UNIQUE_OACC_FORK))
2199                         remove = true;
2200                       break;
2201 
2202                     case IFN_UNIQUE_OACC_HEAD_MARK:
2203                     case IFN_UNIQUE_OACC_TAIL_MARK:
2204                       remove = true;
2205                       break;
2206 
2207                     case IFN_UNIQUE_OACC_PRIVATE:
2208                       {
2209                         dump_flags_t l_dump_flags
2210                           = get_openacc_privatization_dump_flags ();
2211 
2212                         location_t loc = gimple_location (stmt);
2213                         if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
2214                           loc = DECL_SOURCE_LOCATION (current_function_decl);
2215                         const dump_user_location_t d_u_loc
2216                           = dump_user_location_t::from_location_t (loc);
2217 
2218                         HOST_WIDE_INT level
2219                           = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
2220                         gcc_checking_assert (level == -1
2221                                                    || (level >= 0
2222                                                        && level < GOMP_DIM_MAX));
2223                         for (unsigned i = 3;
2224                                i < gimple_call_num_args (call);
2225                                i++)
2226                           {
2227                               static char const *const axes[] =
2228                               /* Must be kept in sync with GOMP_DIM enumeration.  */
2229                                 { "gang", "worker", "vector" };
2230 
2231                               tree arg = gimple_call_arg (call, i);
2232                               gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
2233                               tree decl = TREE_OPERAND (arg, 0);
2234                               if (dump_enabled_p ())
2235 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2236 #if __GNUC__ >= 10
2237 # pragma GCC diagnostic push
2238 # pragma GCC diagnostic ignored "-Wformat"
2239 #endif
2240                                 dump_printf_loc (l_dump_flags, d_u_loc,
2241                                                      "variable %<%T%> ought to be"
2242                                                      " adjusted for OpenACC"
2243                                                      " privatization level: %qs\n",
2244                                                      decl,
2245                                                      (level == -1
2246                                                       ? "UNKNOWN" : axes[level]));
2247 #if __GNUC__ >= 10
2248 # pragma GCC diagnostic pop
2249 #endif
2250                               bool adjusted;
2251                               if (level == -1)
2252                                 adjusted = false;
2253                               else if (!targetm.goacc.adjust_private_decl)
2254                                 adjusted = false;
2255                               else if (level == GOMP_DIM_VECTOR)
2256                                 {
2257                                   /* That's the default behavior.  */
2258                                   adjusted = true;
2259                                 }
2260                               else
2261                                 {
2262                                   tree oldtype = TREE_TYPE (decl);
2263                                   tree newdecl
2264                                     = targetm.goacc.adjust_private_decl (loc, decl,
2265                                                                                    level);
2266                                   adjusted = (TREE_TYPE (newdecl) != oldtype
2267                                                   || newdecl != decl);
2268                                   if (adjusted)
2269                                     adjusted_vars.put (decl, newdecl);
2270                                 }
2271                               if (adjusted
2272                                   && dump_enabled_p ())
2273 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2274 #if __GNUC__ >= 10
2275 # pragma GCC diagnostic push
2276 # pragma GCC diagnostic ignored "-Wformat"
2277 #endif
2278                                 dump_printf_loc (l_dump_flags, d_u_loc,
2279                                                      "variable %<%T%> adjusted for"
2280                                                      " OpenACC privatization level:"
2281                                                      " %qs\n",
2282                                                      decl, axes[level]);
2283 #if __GNUC__ >= 10
2284 # pragma GCC diagnostic pop
2285 #endif
2286                           }
2287                         remove = true;
2288                       }
2289                       break;
2290                     }
2291                 break;
2292               }
2293             }
2294 
2295           if (gsi_end_p (gsi))
2296             /* We rewound past the beginning of the BB.  */
2297             gsi = gsi_start_bb (bb);
2298           else
2299             /* Undo the rewind.  */
2300             gsi_next (&gsi);
2301 
2302           if (remove)
2303             {
2304               if (gimple_vdef (call))
2305                 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
2306               if (gimple_call_lhs (call))
2307                 {
2308                     /* Propagate the data dependency var.  */
2309                     gimple *ass = gimple_build_assign (gimple_call_lhs (call),
2310                                                                gimple_call_arg (call, 1));
2311                     gsi_replace (&gsi, ass,  false);
2312                 }
2313               else
2314                 gsi_remove (&gsi, true);
2315             }
2316           else if (!rescan)
2317             /* If not rescanning, advance over the call.  */
2318             gsi_next (&gsi);
2319       }
2320 
2321   /* Regarding the OpenACC privatization level, we're currently only looking at
2322      making the gang-private level work.  Regarding that, we have the following
2323      configurations:
2324 
2325        - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
2326            particular, change 'TREE_TYPE', etc.) and there is no
2327            'targetm.goacc.expand_var_decl'.
2328 
2329        - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
2330            marker and then 'targetm.goacc.expand_var_decl' does the work.
2331 
2332      Eventually (in particular, for worker-private level?), both
2333      'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
2334      may need to do things, but that's currently not meant to be addressed, and
2335      thus not fully worked out and implemented, and thus untested.  Hence,
2336      'assert' what currently is implemented/tested, only.  */
2337 
2338   if (targetm.goacc.expand_var_decl)
2339     gcc_assert (adjusted_vars.is_empty ());
2340 
2341   /* Make adjustments to gang-private local variables if required by the
2342      target, e.g. forcing them into a particular address space.  Afterwards,
2343      ADDR_EXPR nodes which have adjusted variables as their argument need to
2344      be modified in one of two ways:
2345 
2346        1. They can be recreated, making a pointer to the variable in the new
2347             address space, or
2348 
2349        2. The address of the variable in the new address space can be taken,
2350             converted to the default (original) address space, and the result of
2351             that conversion subsituted in place of the original ADDR_EXPR node.
2352 
2353      Which of these is done depends on the gimple statement being processed.
2354      At present atomic operations and inline asms use (1), and everything else
2355      uses (2).  At least on AMD GCN, there are atomic operations that work
2356      directly in the LDS address space.
2357 
2358      COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
2359      the new decl, adjusting types of appropriate tree nodes as necessary.  */
2360 
2361   if (targetm.goacc.adjust_private_decl
2362       && !adjusted_vars.is_empty ())
2363     {
2364       FOR_ALL_BB_FN (bb, cfun)
2365           for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
2366                !gsi_end_p (gsi);
2367                gsi_next (&gsi))
2368             {
2369               gimple *stmt = gsi_stmt (gsi);
2370               walk_stmt_info wi;
2371               var_decl_rewrite_info info;
2372 
2373               info.avoid_pointer_conversion
2374                 = (is_gimple_call (stmt)
2375                      && is_sync_builtin_call (as_a <gcall *> (stmt)))
2376                     || gimple_code (stmt) == GIMPLE_ASM;
2377               info.stmt = stmt;
2378               info.modified = false;
2379               info.adjusted_vars = &adjusted_vars;
2380 
2381               memset (&wi, 0, sizeof (wi));
2382               wi.info = &info;
2383 
2384               walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);
2385 
2386               if (info.modified)
2387                 update_stmt (stmt);
2388             }
2389     }
2390 
2391   return 0;
2392 }
2393 
2394 /* Default launch dimension validator.  Force everything to 1.  A
2395    backend that wants to provide larger dimensions must override this
2396    hook.  */
2397 
2398 bool
default_goacc_validate_dims(tree ARG_UNUSED (decl),int * dims,int ARG_UNUSED (fn_level),unsigned ARG_UNUSED (used))2399 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2400                                    int ARG_UNUSED (fn_level),
2401                                    unsigned ARG_UNUSED (used))
2402 {
2403   bool changed = false;
2404 
2405   for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2406     {
2407       if (dims[ix] != 1)
2408           {
2409             dims[ix] = 1;
2410             changed = true;
2411           }
2412     }
2413 
2414   return changed;
2415 }
2416 
2417 /* Default dimension bound is unknown on accelerator and 1 on host.  */
2418 
2419 int
default_goacc_dim_limit(int ARG_UNUSED (axis))2420 default_goacc_dim_limit (int ARG_UNUSED (axis))
2421 {
2422 #ifdef ACCEL_COMPILER
2423   return 0;
2424 #else
2425   return 1;
2426 #endif
2427 }
2428 
2429 namespace {
2430 
2431 const pass_data pass_data_oacc_loop_designation =
2432 {
2433   GIMPLE_PASS, /* type */
2434   "oaccloops", /* name */
2435   OPTGROUP_OMP, /* optinfo_flags */
2436   TV_NONE, /* tv_id */
2437   PROP_cfg, /* properties_required */
2438   0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
2439   0, /* properties_destroyed */
2440   0, /* todo_flags_start */
2441   TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2442 };
2443 
2444 class pass_oacc_loop_designation : public gimple_opt_pass
2445 {
2446 public:
pass_oacc_loop_designation(gcc::context * ctxt)2447   pass_oacc_loop_designation (gcc::context *ctxt)
2448     : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
2449   {}
2450 
2451   /* opt_pass methods: */
gate(function *)2452   virtual bool gate (function *) { return flag_openacc; };
2453 
execute(function *)2454   virtual unsigned int execute (function *)
2455     {
2456       return execute_oacc_loop_designation ();
2457     }
2458 
2459 }; // class pass_oacc_loop_designation
2460 
2461 const pass_data pass_data_oacc_device_lower =
2462 {
2463   GIMPLE_PASS, /* type */
2464   "oaccdevlow", /* name */
2465   OPTGROUP_OMP, /* optinfo_flags */
2466   TV_NONE, /* tv_id */
2467   PROP_cfg, /* properties_required */
2468   0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
2469   0, /* properties_destroyed */
2470   0, /* todo_flags_start */
2471   TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
2472 };
2473 
2474 class pass_oacc_device_lower : public gimple_opt_pass
2475 {
2476 public:
pass_oacc_device_lower(gcc::context * ctxt)2477   pass_oacc_device_lower (gcc::context *ctxt)
2478     : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
2479   {}
2480 
2481   /* opt_pass methods: */
gate(function *)2482   virtual bool gate (function *) { return flag_openacc; };
2483 
execute(function *)2484   virtual unsigned int execute (function *)
2485     {
2486       return execute_oacc_device_lower ();
2487     }
2488 
2489 }; // class pass_oacc_device_lower
2490 
2491 } // anon namespace
2492 
2493 gimple_opt_pass *
make_pass_oacc_loop_designation(gcc::context * ctxt)2494 make_pass_oacc_loop_designation (gcc::context *ctxt)
2495 {
2496   return new pass_oacc_loop_designation (ctxt);
2497 }
2498 
2499 gimple_opt_pass *
make_pass_oacc_device_lower(gcc::context * ctxt)2500 make_pass_oacc_device_lower (gcc::context *ctxt)
2501 {
2502   return new pass_oacc_device_lower (ctxt);
2503 }
2504 
2505 
2506 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2507    GOMP_SIMT_ENTER call identifying the privatized variables, which are
2508    turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2509    Set *REGIMPLIFY to true, except if no privatized variables were seen.  */
2510 
2511 static void
ompdevlow_adjust_simt_enter(gimple_stmt_iterator * gsi,bool * regimplify)2512 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
2513 {
2514   gimple *alloc_stmt = gsi_stmt (*gsi);
2515   tree simtrec = gimple_call_lhs (alloc_stmt);
2516   tree simduid = gimple_call_arg (alloc_stmt, 0);
2517   gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
2518   gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
2519   tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
2520   TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
2521   TREE_ADDRESSABLE (rectype) = 1;
2522   TREE_TYPE (simtrec) = build_pointer_type (rectype);
2523   for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
2524     {
2525       tree *argp = gimple_call_arg_ptr (enter_stmt, i);
2526       if (*argp == null_pointer_node)
2527           continue;
2528       gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
2529                       && VAR_P (TREE_OPERAND (*argp, 0)));
2530       tree var = TREE_OPERAND (*argp, 0);
2531 
2532       tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
2533                                      DECL_NAME (var), TREE_TYPE (var));
2534       SET_DECL_ALIGN (field, DECL_ALIGN (var));
2535       DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2536       TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2537 
2538       insert_field_into_struct (rectype, field);
2539 
2540       tree t = build_simple_mem_ref (simtrec);
2541       t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
2542       TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2543       SET_DECL_VALUE_EXPR (var, t);
2544       DECL_HAS_VALUE_EXPR_P (var) = 1;
2545       *regimplify = true;
2546     }
2547   layout_type (rectype);
2548   tree size = TYPE_SIZE_UNIT (rectype);
2549   tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
2550 
2551   alloc_stmt
2552     = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2553   gimple_call_set_lhs (alloc_stmt, simtrec);
2554   gsi_replace (gsi, alloc_stmt, false);
2555   gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2556   enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
2557   gsi_replace (&enter_gsi, enter_stmt, false);
2558 
2559   use_operand_p use;
2560   gimple *exit_stmt;
2561   if (single_imm_use (simtrec, &use, &exit_stmt))
2562     {
2563       gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2564       gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2565       tree clobber = build_clobber (rectype);
2566       exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2567       gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2568     }
2569   else
2570     gcc_checking_assert (has_zero_uses (simtrec));
2571 }
2572 
2573 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables.  */
2574 
2575 static tree
find_simtpriv_var_op(tree * tp,int * walk_subtrees,void *)2576 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2577 {
2578   tree t = *tp;
2579 
2580   if (VAR_P (t)
2581       && DECL_HAS_VALUE_EXPR_P (t)
2582       && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2583     {
2584       *walk_subtrees = 0;
2585       return t;
2586     }
2587   return NULL_TREE;
2588 }
2589 
2590 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2591    VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2592    LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
2593    internal functions on non-SIMT targets, and likewise some SIMD internal
2594    functions on SIMT targets.  */
2595 
2596 static unsigned int
execute_omp_device_lower()2597 execute_omp_device_lower ()
2598 {
2599   int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
2600   bool regimplify = false;
2601   basic_block bb;
2602   gimple_stmt_iterator gsi;
2603   bool calls_declare_variant_alt
2604     = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
2605   FOR_EACH_BB_FN (bb, cfun)
2606     for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2607       {
2608           gimple *stmt = gsi_stmt (gsi);
2609           if (!is_gimple_call (stmt))
2610             continue;
2611           if (!gimple_call_internal_p (stmt))
2612             {
2613               if (calls_declare_variant_alt)
2614                 if (tree fndecl = gimple_call_fndecl (stmt))
2615                     {
2616                       tree new_fndecl = omp_resolve_declare_variant (fndecl);
2617                       if (new_fndecl != fndecl)
2618                         {
2619                           gimple_call_set_fndecl (stmt, new_fndecl);
2620                           update_stmt (stmt);
2621                         }
2622                     }
2623               continue;
2624             }
2625           tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
2626           tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
2627           switch (gimple_call_internal_fn (stmt))
2628             {
2629             case IFN_GOMP_USE_SIMT:
2630               rhs = vf == 1 ? integer_zero_node : integer_one_node;
2631               break;
2632             case IFN_GOMP_SIMT_ENTER:
2633               rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2634               goto simtreg_enter_exit;
2635             case IFN_GOMP_SIMT_ENTER_ALLOC:
2636               if (vf != 1)
2637                 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
2638               rhs = vf == 1 ? null_pointer_node : NULL_TREE;
2639               goto simtreg_enter_exit;
2640             case IFN_GOMP_SIMT_EXIT:
2641             simtreg_enter_exit:
2642               if (vf != 1)
2643                 continue;
2644               unlink_stmt_vdef (stmt);
2645               break;
2646             case IFN_GOMP_SIMT_LANE:
2647             case IFN_GOMP_SIMT_LAST_LANE:
2648               rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
2649               break;
2650             case IFN_GOMP_SIMT_VF:
2651               rhs = build_int_cst (type, vf);
2652               break;
2653             case IFN_GOMP_SIMT_ORDERED_PRED:
2654               rhs = vf == 1 ? integer_zero_node : NULL_TREE;
2655               if (rhs || !lhs)
2656                 unlink_stmt_vdef (stmt);
2657               break;
2658             case IFN_GOMP_SIMT_VOTE_ANY:
2659             case IFN_GOMP_SIMT_XCHG_BFLY:
2660             case IFN_GOMP_SIMT_XCHG_IDX:
2661               rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2662               break;
2663             case IFN_GOMP_SIMD_LANE:
2664             case IFN_GOMP_SIMD_LAST_LANE:
2665               rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
2666               break;
2667             case IFN_GOMP_SIMD_VF:
2668               rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
2669               break;
2670             default:
2671               continue;
2672             }
2673           if (lhs && !rhs)
2674             continue;
2675           stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
2676           gsi_replace (&gsi, stmt, false);
2677       }
2678   if (regimplify)
2679     FOR_EACH_BB_REVERSE_FN (bb, cfun)
2680       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2681           if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2682             {
2683               if (gimple_clobber_p (gsi_stmt (gsi)))
2684                 gsi_remove (&gsi, true);
2685               else
2686                 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2687             }
2688   if (vf != 1)
2689     cfun->has_force_vectorize_loops = false;
2690   return 0;
2691 }
2692 
2693 namespace {
2694 
2695 const pass_data pass_data_omp_device_lower =
2696 {
2697   GIMPLE_PASS, /* type */
2698   "ompdevlow", /* name */
2699   OPTGROUP_OMP, /* optinfo_flags */
2700   TV_NONE, /* tv_id */
2701   PROP_cfg, /* properties_required */
2702   PROP_gimple_lomp_dev, /* properties_provided */
2703   0, /* properties_destroyed */
2704   0, /* todo_flags_start */
2705   TODO_update_ssa, /* todo_flags_finish */
2706 };
2707 
2708 class pass_omp_device_lower : public gimple_opt_pass
2709 {
2710 public:
pass_omp_device_lower(gcc::context * ctxt)2711   pass_omp_device_lower (gcc::context *ctxt)
2712     : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2713   {}
2714 
2715   /* opt_pass methods: */
gate(function * fun)2716   virtual bool gate (function *fun)
2717     {
2718       return (!(fun->curr_properties & PROP_gimple_lomp_dev)
2719                 || (flag_openmp
2720                       && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
2721     }
execute(function *)2722   virtual unsigned int execute (function *)
2723     {
2724       return execute_omp_device_lower ();
2725     }
2726 
2727 }; // class pass_expand_omp_ssa
2728 
2729 } // anon namespace
2730 
2731 gimple_opt_pass *
make_pass_omp_device_lower(gcc::context * ctxt)2732 make_pass_omp_device_lower (gcc::context *ctxt)
2733 {
2734   return new pass_omp_device_lower (ctxt);
2735 }
2736 
2737 /* "omp declare target link" handling pass.  */
2738 
2739 namespace {
2740 
2741 const pass_data pass_data_omp_target_link =
2742 {
2743   GIMPLE_PASS,                          /* type */
2744   "omptargetlink",            /* name */
2745   OPTGROUP_OMP,                         /* optinfo_flags */
2746   TV_NONE,                              /* tv_id */
2747   PROP_ssa,                             /* properties_required */
2748   0,                                    /* properties_provided */
2749   0,                                    /* properties_destroyed */
2750   0,                                    /* todo_flags_start */
2751   TODO_update_ssa,            /* todo_flags_finish */
2752 };
2753 
2754 class pass_omp_target_link : public gimple_opt_pass
2755 {
2756 public:
pass_omp_target_link(gcc::context * ctxt)2757   pass_omp_target_link (gcc::context *ctxt)
2758     : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2759   {}
2760 
2761   /* opt_pass methods: */
gate(function * fun)2762   virtual bool gate (function *fun)
2763     {
2764 #ifdef ACCEL_COMPILER
2765       return offloading_function_p (fun->decl);
2766 #else
2767       (void) fun;
2768       return false;
2769 #endif
2770     }
2771 
2772   virtual unsigned execute (function *);
2773 };
2774 
2775 /* Callback for walk_gimple_stmt used to scan for link var operands.  */
2776 
2777 static tree
find_link_var_op(tree * tp,int * walk_subtrees,void *)2778 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2779 {
2780   tree t = *tp;
2781 
2782   if (VAR_P (t)
2783       && DECL_HAS_VALUE_EXPR_P (t)
2784       && is_global_var (t)
2785       && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2786     {
2787       *walk_subtrees = 0;
2788       return t;
2789     }
2790 
2791   return NULL_TREE;
2792 }
2793 
2794 unsigned
execute(function * fun)2795 pass_omp_target_link::execute (function *fun)
2796 {
2797   basic_block bb;
2798   FOR_EACH_BB_FN (bb, fun)
2799     {
2800       gimple_stmt_iterator gsi;
2801       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2802           {
2803             if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
2804               {
2805                 /* Nullify the second argument of __builtin_GOMP_target_ext.  */
2806                 gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
2807                 update_stmt (gsi_stmt (gsi));
2808               }
2809             if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2810               gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2811           }
2812     }
2813 
2814   return 0;
2815 }
2816 
2817 } // anon namespace
2818 
2819 gimple_opt_pass *
make_pass_omp_target_link(gcc::context * ctxt)2820 make_pass_omp_target_link (gcc::context *ctxt)
2821 {
2822   return new pass_omp_target_link (ctxt);
2823 }
2824