1 /* Decompose OpenACC 'kernels' constructs into parts, a sequence of compute
2    constructs
3 
4    Copyright (C) 2020-2022 Free Software Foundation, Inc.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "langhooks.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "cgraph.h"
32 #include "fold-const.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimple-walk.h"
36 #include "gomp-constants.h"
37 #include "omp-general.h"
38 #include "diagnostic-core.h"
39 
40 
41 /* This preprocessing pass is run immediately before lower_omp.  It decomposes
42    OpenACC 'kernels' constructs into parts, a sequence of compute constructs.
43 
44    The translation is as follows:
45      - The entire 'kernels' region is turned into a 'data' region with clauses
46        taken from the 'kernels' region.  New 'create' clauses are added for all
47        variables declared at the top level in the kernels region.
48      - Any loop nests annotated with an OpenACC 'loop' directive are wrapped in
49        a new compute construct.
50            - 'loop' directives without an explicit 'independent' or 'seq' clause
51              get an 'auto' clause added; other clauses are preserved on the loop
52              or moved to the new surrounding compute construct, as applicable.
53      - Any sequences of other code (non-loops, non-OpenACC 'loop's) are wrapped
54        in new "gang-single" compute construct: 'worker'/'vector' parallelism is
55        preserved, but 'num_gangs (1)' is enforced.
56      - Both points above only apply at the topmost level in the region, that
57        is, the transformation does not introduce new compute constructs inside
58        nested statement bodies.  In particular, this means that a
59        gang-parallelizable loop inside an 'if' statement is made "gang-single".
60      - In order to make the host wait only once for the whole region instead
61        of once per device kernel launch, the new compute constructs are
62        annotated 'async'.  Unless the original 'kernels' construct already was
63        marked 'async', the entire region ends with a 'wait' directive.  If the
64        original 'kernels' construct was marked 'async', the synthesized 'async'
65        clauses use the original 'kernels' construct's 'async' argument
66        (possibly implicit).
67 */
68 
69 
70 /*TODO Things are conceptually wrong here: 'loop' clauses may be hidden behind
71   'device_type', so we have to defer a lot of processing until we're in the
72   offloading compilation.  "Fortunately", GCC doesn't support the OpenACC
73   'device_type' clause yet, so we get away that.  */
74 
75 
76 /* Helper function for decompose_kernels_region_body.  If STMT contains a
77    "top-level" OMP_FOR statement, returns a pointer to that statement;
78    returns NULL otherwise.
79 
80    A "top-level" OMP_FOR statement is one that is possibly accompanied by
81    small snippets of setup code.  Specifically, this function accepts an
82    OMP_FOR possibly wrapped in a singleton bind and a singleton try
83    statement to allow for a local loop variable, but not an OMP_FOR
84    statement nested in any other constructs.  Alternatively, it accepts a
85    non-singleton bind containing only assignments and then an OMP_FOR
86    statement at the very end.  The former style can be generated by the C
87    frontend, the latter by the Fortran frontend.  */
88 
89 static gimple *
top_level_omp_for_in_stmt(gimple * stmt)90 top_level_omp_for_in_stmt (gimple *stmt)
91 {
92   if (gimple_code (stmt) == GIMPLE_OMP_FOR)
93     return stmt;
94 
95   if (gimple_code (stmt) == GIMPLE_BIND)
96     {
97       gimple_seq body = gimple_bind_body (as_a <gbind *> (stmt));
98       if (gimple_seq_singleton_p (body))
99           {
100             /* Accept an OMP_FOR statement, or a try statement containing only
101                a single OMP_FOR.  */
102             gimple *maybe_for_or_try = gimple_seq_first_stmt (body);
103             if (gimple_code (maybe_for_or_try) == GIMPLE_OMP_FOR)
104               return maybe_for_or_try;
105             else if (gimple_code (maybe_for_or_try) == GIMPLE_TRY)
106               {
107                 gimple_seq try_body = gimple_try_eval (maybe_for_or_try);
108                 if (!gimple_seq_singleton_p (try_body))
109                     return NULL;
110                 gimple *maybe_omp_for_stmt = gimple_seq_first_stmt (try_body);
111                 if (gimple_code (maybe_omp_for_stmt) == GIMPLE_OMP_FOR)
112                     return maybe_omp_for_stmt;
113               }
114           }
115       else
116           {
117             gimple_stmt_iterator gsi;
118             /* Accept only a block of optional assignments followed by an
119                OMP_FOR at the end.  No other kinds of statements allowed.  */
120             for (gsi = gsi_start (body); !gsi_end_p (gsi); gsi_next (&gsi))
121               {
122                 gimple *body_stmt = gsi_stmt (gsi);
123                 if (gimple_code (body_stmt) == GIMPLE_ASSIGN)
124                     continue;
125                 else if (gimple_code (body_stmt) == GIMPLE_OMP_FOR
126                            && gsi_one_before_end_p (gsi))
127                     return body_stmt;
128                 else
129                     return NULL;
130               }
131           }
132     }
133 
134   return NULL;
135 }
136 
137 /* Helper for adjust_region_code: evaluate the statement at GSI_P.  */
138 
139 static tree
adjust_region_code_walk_stmt_fn(gimple_stmt_iterator * gsi_p,bool * handled_ops_p,struct walk_stmt_info * wi)140 adjust_region_code_walk_stmt_fn (gimple_stmt_iterator *gsi_p,
141                                          bool *handled_ops_p,
142                                          struct walk_stmt_info *wi)
143 {
144   int *region_code = (int *) wi->info;
145 
146   gimple *stmt = gsi_stmt (*gsi_p);
147   switch (gimple_code (stmt))
148     {
149     case GIMPLE_OMP_FOR:
150       {
151           tree clauses = gimple_omp_for_clauses (stmt);
152           if (omp_find_clause (clauses, OMP_CLAUSE_INDEPENDENT))
153             {
154               /* Explicit 'independent' clause.  */
155               /* Keep going; recurse into loop body.  */
156               break;
157             }
158           else if (omp_find_clause (clauses, OMP_CLAUSE_SEQ))
159             {
160               /* Explicit 'seq' clause.  */
161               /* We'll "parallelize" if at some level a loop construct has been
162                  marked up by the user as unparallelizable ('seq' clause; we'll
163                  respect that in the later processing).  Given that the user has
164                  explicitly marked it up, this loop construct cannot be
165                  performance-critical, and in this case it's also fine to
166                  "parallelize" instead of "gang-single", because any outer or
167                  inner loops may still exploit the available parallelism.  */
168               /* Keep going; recurse into loop body.  */
169               break;
170             }
171           else
172             {
173               /* Explicit or implicit 'auto' clause.  */
174               /* The user would like this loop analyzed ('auto' clause) and
175                  typically parallelized, but we don't have available yet the
176                  compiler logic to analyze this, so can't parallelize it here, so
177                  we'd very likely be running into a performance problem if we
178                  were to execute this unparallelized, thus forward the whole loop
179                  nest to 'parloops'.  */
180               *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
181               /* Terminate: final decision for this region.  */
182               *handled_ops_p = true;
183               return integer_zero_node;
184             }
185           gcc_unreachable ();
186       }
187 
188     case GIMPLE_COND:
189     case GIMPLE_GOTO:
190     case GIMPLE_SWITCH:
191     case GIMPLE_ASM:
192     case GIMPLE_TRANSACTION:
193     case GIMPLE_RETURN:
194       /* Statement that might constitute some looping/control flow pattern.  */
195       /* The user would like this code analyzed (implicit inside a 'kernels'
196            region) and typically parallelized, but we don't have available yet
197            the compiler logic to analyze this, so can't parallelize it here, so
198            we'd very likely be running into a performance problem if we were to
199            execute this unparallelized, thus forward the whole thing to
200            'parloops'.  */
201       *region_code = GF_OMP_TARGET_KIND_OACC_KERNELS;
202       /* Terminate: final decision for this region.  */
203       *handled_ops_p = true;
204       return integer_zero_node;
205 
206     default:
207       /* Keep going.  */
208       break;
209     }
210 
211   return NULL;
212 }
213 
214 /* Adjust the REGION_CODE for the region in GS.  */
215 
216 static void
adjust_region_code(gimple_seq gs,int * region_code)217 adjust_region_code (gimple_seq gs, int *region_code)
218 {
219   struct walk_stmt_info wi;
220   memset (&wi, 0, sizeof (wi));
221   wi.info = region_code;
222   walk_gimple_seq (gs, adjust_region_code_walk_stmt_fn, NULL, &wi);
223 }
224 
225 /* Helper function for make_loops_gang_single for walking the tree.  If the
226    statement indicated by GSI_P is an OpenACC for loop with a gang clause,
227    issue a warning and remove the clause.  */
228 
229 static tree
visit_loops_in_gang_single_region(gimple_stmt_iterator * gsi_p,bool * handled_ops_p,struct walk_stmt_info *)230 visit_loops_in_gang_single_region (gimple_stmt_iterator *gsi_p,
231                                            bool *handled_ops_p,
232                                            struct walk_stmt_info *)
233 {
234   *handled_ops_p = false;
235 
236   gimple *stmt = gsi_stmt (*gsi_p);
237   switch (gimple_code (stmt))
238     {
239     case GIMPLE_OMP_FOR:
240       /*TODO Given the current 'adjust_region_code' algorithm, this is
241           actually...  */
242 #if 0
243       gcc_unreachable ();
244 #else
245       /* ..., but due to bugs (PR100400), we may actually come here.
246            Reliably catch this, regardless of checking level.  */
247       internal_error ("PR100400");
248 #endif
249 
250       {
251           tree clauses = gimple_omp_for_clauses (stmt);
252           tree prev_clause = NULL;
253           for (tree clause = clauses; clause; clause = OMP_CLAUSE_CHAIN (clause))
254             {
255               if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_GANG)
256                 {
257                     /* It makes no sense to have a 'gang' clause in a "gang-single"
258                        region, so warn and remove it.  */
259                     warning_at (gimple_location (stmt), 0,
260                                   "conditionally executed loop in %<kernels%> region"
261                                   " will be executed by a single gang;"
262                                   " ignoring %<gang%> clause");
263                     if (prev_clause != NULL)
264                       OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause);
265                     else
266                       clauses = OMP_CLAUSE_CHAIN (clause);
267 
268                     break;
269                 }
270               prev_clause = clause;
271             }
272           gimple_omp_for_set_clauses (stmt, clauses);
273       }
274       /* No need to recurse into nested statements; no loop nested inside
275            this loop can be gang-partitioned.  */
276       sorry ("%<gang%> loop in %<gang-single%> region");
277       *handled_ops_p = true;
278       break;
279 
280     default:
281       break;
282     }
283 
284   return NULL;
285 }
286 
287 /* Visit all nested OpenACC loops in the sequence indicated by GS.  This
288    statement is expected to be inside a gang-single region.  Issue a warning
289    for any loops inside it that have gang clauses and remove the clauses.  */
290 
291 static void
make_loops_gang_single(gimple_seq gs)292 make_loops_gang_single (gimple_seq gs)
293 {
294   struct walk_stmt_info wi;
295   memset (&wi, 0, sizeof (wi));
296   walk_gimple_seq (gs, visit_loops_in_gang_single_region, NULL, &wi);
297 }
298 
299 /* Construct a "gang-single" compute construct at LOC containing the STMTS.
300    Annotate with CLAUSES, which must not contain a 'num_gangs' clause, and an
301    additional 'num_gangs (1)' clause to force "gang-single" execution.  */
302 
303 static gimple *
make_region_seq(location_t loc,gimple_seq stmts,tree num_gangs_clause,tree num_workers_clause,tree vector_length_clause,tree clauses)304 make_region_seq (location_t loc, gimple_seq stmts,
305                      tree num_gangs_clause,
306                      tree num_workers_clause,
307                      tree vector_length_clause,
308                      tree clauses)
309 {
310   /* This correctly unshares the entire clause chain rooted here.  */
311   clauses = unshare_expr (clauses);
312 
313   dump_user_location_t loc_stmts_first = gimple_seq_first (stmts);
314 
315   /* Figure out the region code for this region.  */
316   /* Optimistic default: assume "setup code", no looping; thus not
317      performance-critical.  */
318   int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE;
319   adjust_region_code (stmts, &region_code);
320 
321   if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE)
322     {
323       if (dump_enabled_p ())
324           /*TODO MSG_MISSED_OPTIMIZATION? */
325           dump_printf_loc (MSG_NOTE, loc_stmts_first,
326                                "beginning %<gang-single%> part"
327                                " in OpenACC %<kernels%> region\n");
328 
329       /* Synthesize a 'num_gangs (1)' clause.  */
330       tree gang_single_clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
331       OMP_CLAUSE_OPERAND (gang_single_clause, 0) = integer_one_node;
332       OMP_CLAUSE_CHAIN (gang_single_clause) = clauses;
333       clauses = gang_single_clause;
334 
335       /* Remove and issue warnings about gang clauses on any OpenACC
336            loops nested inside this sequentially executed statement.  */
337       make_loops_gang_single (stmts);
338     }
339   else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
340     {
341       if (dump_enabled_p ())
342           dump_printf_loc (MSG_NOTE, loc_stmts_first,
343                                "beginning %<parloops%> part"
344                                " in OpenACC %<kernels%> region\n");
345 
346       /* As we're transforming a 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another
347            'GF_OMP_TARGET_KIND_OACC_KERNELS', this isn't doing any of the clauses
348            mangling that 'make_region_loop_nest' is doing.  */
349       /* Re-assemble the clauses stripped off earlier.  */
350       if (num_gangs_clause != NULL)
351           {
352             tree c = unshare_expr (num_gangs_clause);
353             OMP_CLAUSE_CHAIN (c) = clauses;
354             clauses = c;
355           }
356       if (num_workers_clause != NULL)
357           {
358             tree c = unshare_expr (num_workers_clause);
359             OMP_CLAUSE_CHAIN (c) = clauses;
360             clauses = c;
361           }
362       if (vector_length_clause != NULL)
363           {
364             tree c = unshare_expr (vector_length_clause);
365             OMP_CLAUSE_CHAIN (c) = clauses;
366             clauses = c;
367           }
368     }
369   else
370     gcc_unreachable ();
371 
372   /* Build the gang-single region.  */
373   gimple *single_region = gimple_build_omp_target (NULL, region_code, clauses);
374   gimple_set_location (single_region, loc);
375   gbind *single_body = gimple_build_bind (NULL, stmts, make_node (BLOCK));
376   gimple_omp_set_body (single_region, single_body);
377 
378   return single_region;
379 }
380 
381 /* Helper function for make_region_loop_nest.  Adds a 'num_gangs'
382    ('num_workers', 'vector_length') clause to the given CLAUSES, either the one
383    from the parent compute construct (PARENT_CLAUSE) or a new one based on the
384    loop's own LOOP_CLAUSE ('gang (num: N)' or similar for 'worker' or 'vector'
385    clauses) with the given CLAUSE_CODE.  Does nothing if neither PARENT_CLAUSE
386    nor LOOP_CLAUSE exist.  Returns the new clauses.  */
387 
388 static tree
add_parent_or_loop_num_clause(tree parent_clause,tree loop_clause,omp_clause_code clause_code,tree clauses)389 add_parent_or_loop_num_clause (tree parent_clause, tree loop_clause,
390                                      omp_clause_code clause_code, tree clauses)
391 {
392   if (parent_clause != NULL)
393     {
394       tree num_clause = unshare_expr (parent_clause);
395       OMP_CLAUSE_CHAIN (num_clause) = clauses;
396       clauses = num_clause;
397     }
398   else if (loop_clause != NULL)
399     {
400       /* The kernels region does not have a 'num_gangs' clause, but the loop
401            itself had a 'gang (num: N)' clause.  Honor it by adding a
402            'num_gangs (N)' clause on the compute construct.  */
403       tree num = OMP_CLAUSE_OPERAND (loop_clause, 0);
404       tree new_num_clause
405           = build_omp_clause (OMP_CLAUSE_LOCATION (loop_clause), clause_code);
406       OMP_CLAUSE_OPERAND (new_num_clause, 0) = num;
407       OMP_CLAUSE_CHAIN (new_num_clause) = clauses;
408       clauses = new_num_clause;
409     }
410   return clauses;
411 }
412 
413 /* Helper for make_region_loop_nest, looking for 'worker (num: N)' or 'vector
414    (length: N)' clauses in nested loops.  Removes the argument, transferring it
415    to the enclosing compute construct (via WI->INFO).  If arguments within the
416    same loop nest conflict, emits a warning.
417 
418    This function also decides whether to add an 'auto' clause on each of these
419    nested loops.  */
420 
421 struct adjust_nested_loop_clauses_wi_info
422 {
423   tree *loop_gang_clause_ptr;
424   tree *loop_worker_clause_ptr;
425   tree *loop_vector_clause_ptr;
426 };
427 
428 static tree
adjust_nested_loop_clauses(gimple_stmt_iterator * gsi_p,bool *,struct walk_stmt_info * wi)429 adjust_nested_loop_clauses (gimple_stmt_iterator *gsi_p, bool *,
430                                   struct walk_stmt_info *wi)
431 {
432   struct adjust_nested_loop_clauses_wi_info *wi_info
433     = (struct adjust_nested_loop_clauses_wi_info *) wi->info;
434   gimple *stmt = gsi_stmt (*gsi_p);
435 
436   if (gimple_code (stmt) == GIMPLE_OMP_FOR)
437     {
438       bool add_auto_clause = true;
439       tree loop_clauses = gimple_omp_for_clauses (stmt);
440       tree loop_clause = loop_clauses;
441       for (; loop_clause; loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
442           {
443             tree *outer_clause_ptr = NULL;
444             switch (OMP_CLAUSE_CODE (loop_clause))
445               {
446               case OMP_CLAUSE_GANG:
447                 outer_clause_ptr = wi_info->loop_gang_clause_ptr;
448                 break;
449               case OMP_CLAUSE_WORKER:
450                 outer_clause_ptr = wi_info->loop_worker_clause_ptr;
451                 break;
452               case OMP_CLAUSE_VECTOR:
453                 outer_clause_ptr = wi_info->loop_vector_clause_ptr;
454                 break;
455               case OMP_CLAUSE_SEQ:
456               case OMP_CLAUSE_INDEPENDENT:
457               case OMP_CLAUSE_AUTO:
458                 add_auto_clause = false;
459               default:
460                 break;
461               }
462             if (outer_clause_ptr != NULL)
463               {
464                 if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL
465                       && *outer_clause_ptr == NULL)
466                     {
467                       /* Transfer the clause to the enclosing compute construct and
468                          remove the numerical argument from the 'loop'.  */
469                       *outer_clause_ptr = unshare_expr (loop_clause);
470                       OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
471                     }
472                 else if (OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL &&
473                            OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0) != NULL)
474                     {
475                       /* See if both of these are the same constant.  If they
476                          aren't, emit a warning.  */
477                       tree old_op = OMP_CLAUSE_OPERAND (*outer_clause_ptr, 0);
478                       tree new_op = OMP_CLAUSE_OPERAND (loop_clause, 0);
479                       if (!(cst_and_fits_in_hwi (old_op) &&
480                               cst_and_fits_in_hwi (new_op) &&
481                               int_cst_value (old_op) == int_cst_value (new_op)))
482                         {
483                           const char *clause_name
484                               = omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
485                           error_at (gimple_location (stmt),
486                                         "cannot honor conflicting %qs clause",
487                                         clause_name);
488                           inform (OMP_CLAUSE_LOCATION (*outer_clause_ptr),
489                                     "location of the previous clause"
490                                     " in the same loop nest");
491                         }
492                       OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
493                     }
494               }
495           }
496       if (add_auto_clause)
497           {
498             tree auto_clause
499               = build_omp_clause (gimple_location (stmt), OMP_CLAUSE_AUTO);
500             OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
501             gimple_omp_for_set_clauses (stmt, auto_clause);
502           }
503     }
504 
505   return NULL;
506 }
507 
508 /* Helper for make_region_loop_nest.  Transform OpenACC 'kernels'/'loop'
509    construct clauses into OpenACC 'parallel'/'loop' construct ones.  */
510 
511 static tree
transform_kernels_loop_clauses(gimple * omp_for,tree num_gangs_clause,tree num_workers_clause,tree vector_length_clause,tree clauses)512 transform_kernels_loop_clauses (gimple *omp_for,
513                                         tree num_gangs_clause,
514                                         tree num_workers_clause,
515                                         tree vector_length_clause,
516                                         tree clauses)
517 {
518   /* If this loop in a kernels region does not have an explicit 'seq',
519      'independent', or 'auto' clause, we must give it an explicit 'auto'
520      clause.
521      We also check for 'gang (num: N)' clauses.  These must not appear in
522      kernels regions that have their own 'num_gangs' clause.  Otherwise, they
523      must be converted and put on the region; similarly for 'worker' and
524      'vector' clauses.  */
525   bool add_auto_clause = true;
526   tree loop_gang_clause = NULL, loop_worker_clause = NULL,
527        loop_vector_clause = NULL;
528   tree loop_clauses = gimple_omp_for_clauses (omp_for);
529   for (tree loop_clause = loop_clauses;
530        loop_clause;
531        loop_clause = OMP_CLAUSE_CHAIN (loop_clause))
532     {
533       bool found_num_clause = false;
534       tree *clause_ptr, clause_to_check;
535       switch (OMP_CLAUSE_CODE (loop_clause))
536           {
537           case OMP_CLAUSE_GANG:
538             found_num_clause = true;
539             clause_ptr = &loop_gang_clause;
540             clause_to_check = num_gangs_clause;
541             break;
542           case OMP_CLAUSE_WORKER:
543             found_num_clause = true;
544             clause_ptr = &loop_worker_clause;
545             clause_to_check = num_workers_clause;
546             break;
547           case OMP_CLAUSE_VECTOR:
548             found_num_clause = true;
549             clause_ptr = &loop_vector_clause;
550             clause_to_check = vector_length_clause;
551             break;
552           case OMP_CLAUSE_INDEPENDENT:
553           case OMP_CLAUSE_SEQ:
554           case OMP_CLAUSE_AUTO:
555             add_auto_clause = false;
556           default:
557             break;
558           }
559       if (found_num_clause && OMP_CLAUSE_OPERAND (loop_clause, 0) != NULL)
560           {
561             if (clause_to_check)
562               {
563                 const char *clause_name
564                     = omp_clause_code_name[OMP_CLAUSE_CODE (loop_clause)];
565                 const char *parent_clause_name
566                     = omp_clause_code_name[OMP_CLAUSE_CODE (clause_to_check)];
567                 error_at (OMP_CLAUSE_LOCATION (loop_clause),
568                               "argument not permitted on %qs clause"
569                               " in OpenACC %<kernels%> region with a %qs clause",
570                               clause_name, parent_clause_name);
571                 inform (OMP_CLAUSE_LOCATION (clause_to_check),
572                           "location of OpenACC %<kernels%>");
573               }
574             /* Copy the 'gang (N)'/'worker (N)'/'vector (N)' clause to the
575                enclosing compute construct.  */
576             *clause_ptr = unshare_expr (loop_clause);
577             OMP_CLAUSE_CHAIN (*clause_ptr) = NULL;
578             /* Leave a 'gang'/'worker'/'vector' clause on the 'loop', but without
579                argument.  */
580             OMP_CLAUSE_OPERAND (loop_clause, 0) = NULL;
581           }
582     }
583   if (add_auto_clause)
584     {
585       tree auto_clause = build_omp_clause (gimple_location (omp_for),
586                                                      OMP_CLAUSE_AUTO);
587       OMP_CLAUSE_CHAIN (auto_clause) = loop_clauses;
588       loop_clauses = auto_clause;
589     }
590   gimple_omp_for_set_clauses (omp_for, loop_clauses);
591   /* We must also recurse into the loop; it might contain nested loops having
592      their own 'worker (num: W)' or 'vector (length: V)' clauses.  Turn these
593      into 'worker'/'vector' clauses on the compute construct.  */
594   struct walk_stmt_info wi;
595   memset (&wi, 0, sizeof (wi));
596   struct adjust_nested_loop_clauses_wi_info wi_info;
597   wi_info.loop_gang_clause_ptr = &loop_gang_clause;
598   wi_info.loop_worker_clause_ptr = &loop_worker_clause;
599   wi_info.loop_vector_clause_ptr = &loop_vector_clause;
600   wi.info = &wi_info;
601   gimple *body = gimple_omp_body (omp_for);
602   walk_gimple_seq (body, adjust_nested_loop_clauses, NULL, &wi);
603   /* Check if there were conflicting numbers of workers or vector length.  */
604   if (loop_gang_clause != NULL &&
605       OMP_CLAUSE_OPERAND (loop_gang_clause, 0) == NULL)
606     loop_gang_clause = NULL;
607   if (loop_worker_clause != NULL &&
608       OMP_CLAUSE_OPERAND (loop_worker_clause, 0) == NULL)
609     loop_worker_clause = NULL;
610   if (loop_vector_clause != NULL &&
611       OMP_CLAUSE_OPERAND (loop_vector_clause, 0) == NULL)
612     vector_length_clause = NULL;
613 
614   /* If the kernels region had 'num_gangs', 'num_worker', 'vector_length'
615      clauses, add these to this new compute construct.  */
616   clauses
617     = add_parent_or_loop_num_clause (num_gangs_clause, loop_gang_clause,
618                                              OMP_CLAUSE_NUM_GANGS, clauses);
619   clauses
620     = add_parent_or_loop_num_clause (num_workers_clause, loop_worker_clause,
621                                              OMP_CLAUSE_NUM_WORKERS, clauses);
622   clauses
623     = add_parent_or_loop_num_clause (vector_length_clause, loop_vector_clause,
624                                              OMP_CLAUSE_VECTOR_LENGTH, clauses);
625 
626   return clauses;
627 }
628 
629 /* Construct a possibly gang-parallel compute construct containing the STMT,
630    which must be identical to, or a bind containing, the loop OMP_FOR.
631 
632    The NUM_GANGS_CLAUSE, NUM_WORKERS_CLAUSE, and VECTOR_LENGTH_CLAUSE are
633    optional clauses from the original kernels region and must not be contained
634    in the other CLAUSES. The newly created compute construct is annotated with
635    the optional NUM_GANGS_CLAUSE as well as the other CLAUSES.  If there is no
636    NUM_GANGS_CLAUSE but the loop has a 'gang (num: N)' clause, that is
637    converted to a 'num_gangs (N)' clause on the new compute construct, and
638    similarly for 'worker' and 'vector' clauses.
639 
640    The outermost loop gets an 'auto' clause unless there already is an
641    'seq'/'independent'/'auto' clause.  Nested loops inside OMP_FOR are treated
642    similarly by the adjust_nested_loop_clauses function.  */
643 
644 static gimple *
make_region_loop_nest(gimple * omp_for,gimple_seq stmts,tree num_gangs_clause,tree num_workers_clause,tree vector_length_clause,tree clauses)645 make_region_loop_nest (gimple *omp_for, gimple_seq stmts,
646                            tree num_gangs_clause,
647                            tree num_workers_clause,
648                            tree vector_length_clause,
649                            tree clauses)
650 {
651   /* This correctly unshares the entire clause chain rooted here.  */
652   clauses = unshare_expr (clauses);
653 
654   /* Figure out the region code for this region.  */
655   /* Optimistic default: assume that the loop nest is parallelizable
656      (essentially, no GIMPLE_OMP_FOR with (explicit or implicit) 'auto' clause,
657      and no un-annotated loops).  */
658   int region_code = GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED;
659   adjust_region_code (stmts, &region_code);
660 
661   if (region_code == GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED)
662     {
663       if (dump_enabled_p ())
664           /* This is not MSG_OPTIMIZED_LOCATIONS, as we're just doing what the
665              user asked us to.  */
666           dump_printf_loc (MSG_NOTE, omp_for,
667                                "parallelized loop nest"
668                                " in OpenACC %<kernels%> region\n");
669 
670       clauses = transform_kernels_loop_clauses (omp_for,
671                                                             num_gangs_clause,
672                                                             num_workers_clause,
673                                                             vector_length_clause,
674                                                             clauses);
675     }
676   else if (region_code == GF_OMP_TARGET_KIND_OACC_KERNELS)
677     {
678       if (dump_enabled_p ())
679           dump_printf_loc (MSG_NOTE, omp_for,
680                                "forwarded loop nest"
681                                " in OpenACC %<kernels%> region"
682                                " to %<parloops%> for analysis\n");
683 
684       /* We're transforming one 'GF_OMP_TARGET_KIND_OACC_KERNELS' into another
685            'GF_OMP_TARGET_KIND_OACC_KERNELS', so don't have to
686            'transform_kernels_loop_clauses'.  */
687       /* Re-assemble the clauses stripped off earlier.  */
688       clauses
689           = add_parent_or_loop_num_clause (num_gangs_clause, NULL,
690                                                    OMP_CLAUSE_NUM_GANGS, clauses);
691       clauses
692           = add_parent_or_loop_num_clause (num_workers_clause, NULL,
693                                                    OMP_CLAUSE_NUM_WORKERS, clauses);
694       clauses
695           = add_parent_or_loop_num_clause (vector_length_clause, NULL,
696                                                    OMP_CLAUSE_VECTOR_LENGTH, clauses);
697     }
698   else
699     gcc_unreachable ();
700 
701   gimple *parallel_body_bind
702     = gimple_build_bind (NULL, stmts, make_node (BLOCK));
703   gimple *parallel_region
704     = gimple_build_omp_target (parallel_body_bind, region_code, clauses);
705   gimple_set_location (parallel_region, gimple_location (omp_for));
706 
707   return parallel_region;
708 }
709 
710 /* Eliminate any binds directly inside BIND by adding their statements to
711    BIND (i.e., modifying it in place), excluding binds that hold only an
712    OMP_FOR loop and associated setup/cleanup code.  Recurse into binds but
713    not other statements.  Return a chain of the local variables of eliminated
714    binds, i.e., the local variables found in nested binds.  If
715    INCLUDE_TOPLEVEL_VARS is true, this also includes the variables belonging
716    to BIND itself. */
717 
718 static tree
flatten_binds(gbind * bind,bool include_toplevel_vars=false)719 flatten_binds (gbind *bind, bool include_toplevel_vars = false)
720 {
721   tree vars = NULL, last_var = NULL;
722 
723   if (include_toplevel_vars)
724     {
725       vars = gimple_bind_vars (bind);
726       last_var = vars;
727     }
728 
729   gimple_seq new_body = NULL;
730   gimple_seq body_sequence = gimple_bind_body (bind);
731   gimple_stmt_iterator gsi, gsi_n;
732   for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n)
733     {
734       /* Advance the iterator here because otherwise it would be invalidated
735            by moving statements below.  */
736       gsi_n = gsi;
737       gsi_next (&gsi_n);
738 
739       gimple *stmt = gsi_stmt (gsi);
740       /* Flatten bind statements, except the ones that contain only an
741            OpenACC for loop.  */
742       if (gimple_code (stmt) == GIMPLE_BIND
743             && !top_level_omp_for_in_stmt (stmt))
744           {
745             gbind *inner_bind = as_a <gbind *> (stmt);
746             /* Flatten recursively, and collect all variables.  */
747             tree inner_vars = flatten_binds (inner_bind, true);
748             gimple_seq inner_sequence = gimple_bind_body (inner_bind);
749             if (flag_checking)
750               {
751                 for (gimple_stmt_iterator inner_gsi = gsi_start (inner_sequence);
752                        !gsi_end_p (inner_gsi);
753                        gsi_next (&inner_gsi))
754                     {
755                       gimple *inner_stmt = gsi_stmt (inner_gsi);
756                       gcc_assert (gimple_code (inner_stmt) != GIMPLE_BIND
757                                     || top_level_omp_for_in_stmt (inner_stmt));
758                     }
759               }
760             gimple_seq_add_seq (&new_body, inner_sequence);
761             /* Find the last variable; we will append others to it.  */
762             while (last_var != NULL && TREE_CHAIN (last_var) != NULL)
763               last_var = TREE_CHAIN (last_var);
764             if (last_var != NULL)
765               {
766                 TREE_CHAIN (last_var) = inner_vars;
767                 last_var = inner_vars;
768               }
769             else
770               {
771                 vars = inner_vars;
772                 last_var = vars;
773               }
774           }
775       else
776           gimple_seq_add_stmt (&new_body, stmt);
777     }
778 
779   /* Put the possibly transformed body back into the bind.  */
780   gimple_bind_set_body (bind, new_body);
781   return vars;
782 }
783 
784 /* Helper function for places where we construct data regions.  Wraps the BODY
785    inside a try-finally construct at LOC that calls __builtin_GOACC_data_end
786    in its cleanup block.  Returns this try statement.  */
787 
788 static gimple *
make_data_region_try_statement(location_t loc,gimple * body)789 make_data_region_try_statement (location_t loc, gimple *body)
790 {
791   tree data_end_fn = builtin_decl_explicit (BUILT_IN_GOACC_DATA_END);
792   gimple *call = gimple_build_call (data_end_fn, 0);
793   gimple_seq cleanup = NULL;
794   gimple_seq_add_stmt (&cleanup, call);
795   gimple *try_stmt = gimple_build_try (body, cleanup, GIMPLE_TRY_FINALLY);
796   gimple_set_location (body, loc);
797   return try_stmt;
798 }
799 
800 /* If INNER_BIND_VARS holds variables, build an OpenACC data region with
801    location LOC containing BODY and having 'create (var)' clauses for each
802    variable (as a side effect, such variables also get TREE_ADDRESSABLE set).
803    If INNER_CLEANUP is present, add a try-finally statement with
804    this cleanup code in the finally block.  Return the new data region, or
805    the original BODY if no data region was needed.  */
806 
807 static gimple *
maybe_build_inner_data_region(location_t loc,gimple * body,tree inner_bind_vars,gimple * inner_cleanup)808 maybe_build_inner_data_region (location_t loc, gimple *body,
809                                      tree inner_bind_vars, gimple *inner_cleanup)
810 {
811   /* Is this an instantiation of a template?  (In this case, we don't care what
812      the generic decl is - just whether the function decl has one.)  */
813   bool generic_inst_p
814     = (lang_hooks.decls.get_generic_function_decl (current_function_decl)
815        != NULL);
816 
817   /* Build data 'create (var)' clauses for these local variables.
818      Below we will add these to a data region enclosing the entire body
819      of the decomposed kernels region.  */
820   tree prev_mapped_var = NULL, next = NULL, artificial_vars = NULL,
821        inner_data_clauses = NULL;
822   for (tree v = inner_bind_vars; v; v = next)
823     {
824       next = TREE_CHAIN (v);
825       if (DECL_ARTIFICIAL (v)
826             || TREE_CODE (v) == CONST_DECL
827             || generic_inst_p)
828           {
829             /* If this is an artificial temporary, it need not be mapped.  We
830                move its declaration into the bind inside the data region.
831                Also avoid mapping variables if we are inside a template
832                instantiation; the code does not contain all the copies to
833                temporaries that would make this legal.  */
834             TREE_CHAIN (v) = artificial_vars;
835             artificial_vars = v;
836             if (prev_mapped_var != NULL)
837               TREE_CHAIN (prev_mapped_var) = next;
838             else
839               inner_bind_vars = next;
840           }
841       else
842           {
843             /* Otherwise, build the map clause.  */
844             tree new_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
845             OMP_CLAUSE_SET_MAP_KIND (new_clause, GOMP_MAP_ALLOC);
846             OMP_CLAUSE_DECL (new_clause) = v;
847             OMP_CLAUSE_SIZE (new_clause) = DECL_SIZE_UNIT (v);
848             OMP_CLAUSE_CHAIN (new_clause) = inner_data_clauses;
849             inner_data_clauses = new_clause;
850 
851             prev_mapped_var = v;
852 
853             /* See <https://gcc.gnu.org/PR100280>.  */
854             if (!TREE_ADDRESSABLE (v))
855               {
856                 /* Request that OMP lowering make 'v' addressable.  */
857                 OMP_CLAUSE_MAP_DECL_MAKE_ADDRESSABLE (new_clause) = 1;
858 
859                 if (dump_enabled_p ())
860                     {
861                       const dump_user_location_t d_u_loc
862                         = dump_user_location_t::from_location_t (loc);
863                       /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
864 #if __GNUC__ >= 10
865 # pragma GCC diagnostic push
866 # pragma GCC diagnostic ignored "-Wformat"
867 #endif
868                       dump_printf_loc (MSG_NOTE, d_u_loc,
869                                            "OpenACC %<kernels%> decomposition:"
870                                            " variable %<%T%> declared in block"
871                                            " requested to be made addressable\n",
872                                            v);
873 #if __GNUC__ >= 10
874 # pragma GCC diagnostic pop
875 #endif
876                     }
877               }
878           }
879     }
880 
881   if (artificial_vars)
882     body = gimple_build_bind (artificial_vars, body, make_node (BLOCK));
883 
884   /* If we determined above that there are variables that need to be created
885      on the device, construct a data region for them and wrap the body
886      inside that.  */
887   if (inner_data_clauses != NULL)
888     {
889       gcc_assert (inner_bind_vars != NULL);
890       gimple *inner_data_region
891           = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
892                                            inner_data_clauses);
893       gimple_set_location (inner_data_region, loc);
894       /* Make sure __builtin_GOACC_data_end is called at the end.  */
895       gimple *try_stmt = make_data_region_try_statement (loc, body);
896       gimple_omp_set_body (inner_data_region, try_stmt);
897       gimple *bind_body;
898       if (inner_cleanup != NULL)
899           /* Clobber all the inner variables that need to be clobbered.  */
900           bind_body = gimple_build_try (inner_data_region, inner_cleanup,
901                                               GIMPLE_TRY_FINALLY);
902       else
903           bind_body = inner_data_region;
904       body = gimple_build_bind (inner_bind_vars, bind_body, make_node (BLOCK));
905     }
906 
907   return body;
908 }
909 
910 static void
add_wait(location_t loc,gimple_seq * region_body)911 add_wait (location_t loc, gimple_seq *region_body)
912 {
913   /* A "#pragma acc wait" is just a call GOACC_wait (acc_async_sync, 0).  */
914   tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
915   tree sync_arg = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
916   gimple *wait_call = gimple_build_call (wait_fn, 2,
917                                                    sync_arg, integer_zero_node);
918   gimple_set_location (wait_call, loc);
919   gimple_seq_add_stmt (region_body, wait_call);
920 }
921 
922 /* Helper function of decompose_kernels_region_body.  The statements in
923    REGION_BODY are expected to be decomposed parts; add an 'async' clause to
924    each.  Also add a 'wait' directive at the end of the sequence.  */
925 
926 static void
add_async_clauses_and_wait(location_t loc,gimple_seq * region_body)927 add_async_clauses_and_wait (location_t loc, gimple_seq *region_body)
928 {
929   tree default_async_queue
930     = build_int_cst (integer_type_node, GOMP_ASYNC_NOVAL);
931   for (gimple_stmt_iterator gsi = gsi_start (*region_body);
932        !gsi_end_p (gsi);
933        gsi_next (&gsi))
934     {
935       gimple *stmt = gsi_stmt (gsi);
936       tree target_clauses = gimple_omp_target_clauses (stmt);
937       tree new_async_clause = build_omp_clause (loc, OMP_CLAUSE_ASYNC);
938       OMP_CLAUSE_OPERAND (new_async_clause, 0) = default_async_queue;
939       OMP_CLAUSE_CHAIN (new_async_clause) = target_clauses;
940       target_clauses = new_async_clause;
941       gimple_omp_target_set_clauses (as_a <gomp_target *> (stmt),
942                                              target_clauses);
943     }
944   add_wait (loc, region_body);
945 }
946 
947 /* Auxiliary analysis of the body of a kernels region, to determine for each
948    OpenACC loop whether it is control-dependent (i.e., not necessarily
949    executed every time the kernels region is entered) or not.
950    We say that a loop is control-dependent if there is some cond, switch, or
951    goto statement that jumps over it, forwards or backwards.  For example,
952    if the loop is controlled by an if statement, then a jump to the true
953    block, the false block, or from one of those blocks to the control flow
954    join point will necessarily jump over the loop.
955    This analysis implements an ad-hoc union-find data structure classifying
956    statements into "control-flow regions" as follows: Most statements are in
957    the same region as their predecessor, except that each OpenACC loop is in
958    a region of its own, and each OpenACC loop's successor starts a new
959    region.  We then unite the regions of any statements linked by jumps,
960    placing any cond, switch, or goto statement in the same region as its
961    target label(s).
962    In the end, control dependence of OpenACC loops can be determined by
963    comparing their immediate predecessor and successor statements' regions.
964    A jump crosses the loop if and only if the predecessor and successor are
965    in the same region.  (If there is no predecessor or successor, the loop
966    is executed unconditionally.)
967    The methods in this class identify statements by their index in the
968    kernels region's body.  */
969 
970 class control_flow_regions
971 {
972   public:
973     /* Initialize an instance and pre-compute the control-flow region
974        information for the statement sequence SEQ.  */
975     control_flow_regions (gimple_seq seq);
976 
977     /* Return true if the statement with the given index IDX in the analyzed
978        statement sequence is an unconditionally executed OpenACC loop.  */
979     bool is_unconditional_oacc_for_loop (size_t idx);
980 
981   private:
982     /* Find the region representative for the statement identified by index
983        STMT_IDX.  */
984     size_t find_rep (size_t stmt_idx);
985 
986     /* Union the regions containing the statements represented by
987        representatives A and B.  */
988     void union_reps (size_t a, size_t b);
989 
990     /* Helper for the constructor.  Performs the actual computation of the
991        control-flow regions in the statement sequence SEQ.  */
992     void compute_regions (gimple_seq seq);
993 
994     /* The mapping from statement indices to region representatives.  */
995     vec <size_t> representatives;
996 
997     /* A cache mapping statement indices to a flag indicating whether the
998        statement is a top level OpenACC for loop.  */
999     vec <bool> omp_for_loops;
1000 };
1001 
control_flow_regions(gimple_seq seq)1002 control_flow_regions::control_flow_regions (gimple_seq seq)
1003 {
1004   representatives.create (1);
1005   omp_for_loops.create (1);
1006   compute_regions (seq);
1007 }
1008 
1009 bool
is_unconditional_oacc_for_loop(size_t idx)1010 control_flow_regions::is_unconditional_oacc_for_loop (size_t idx)
1011 {
1012   if (idx == 0 || idx == representatives.length () - 1)
1013     /* The first or last statement in the kernels region.  This means that
1014        there is no room before or after it for a jump or a label.  Thus
1015        there cannot be a jump across it, so it is unconditional.  */
1016     return true;
1017   /* Otherwise, the loop is unconditional if the statements before and after
1018      it are in different control flow regions.  Scan forward and backward,
1019      skipping over neighboring OpenACC for loops, to find these preceding
1020      statements.  */
1021   size_t prev_index = idx - 1;
1022   while (prev_index > 0 && omp_for_loops [prev_index] == true)
1023     prev_index--;
1024   /* If all preceding statements are also OpenACC loops, all of these are
1025      unconditional.  */
1026   if (prev_index == 0)
1027     return true;
1028   size_t succ_index = idx + 1;
1029   while (succ_index < omp_for_loops.length ()
1030            && omp_for_loops [succ_index] == true)
1031     succ_index++;
1032   /* If all following statements are also OpenACC loops, all of these are
1033      unconditional.  */
1034   if (succ_index == omp_for_loops.length ())
1035     return true;
1036   return (find_rep (prev_index) != find_rep (succ_index));
1037 }
1038 
1039 size_t
find_rep(size_t stmt_idx)1040 control_flow_regions::find_rep (size_t stmt_idx)
1041 {
1042   size_t rep = stmt_idx, aux = stmt_idx;
1043   /* Find the root representative of this statement.  */
1044   while (representatives[rep] != rep)
1045     rep = representatives[rep];
1046   /* Compress the path from the original statement to the representative.  */
1047   while (representatives[aux] != rep)
1048     {
1049       size_t tmp = representatives[aux];
1050       representatives[aux] = rep;
1051       aux = tmp;
1052     }
1053   return rep;
1054 }
1055 
1056 void
union_reps(size_t a,size_t b)1057 control_flow_regions::union_reps (size_t a, size_t b)
1058 {
1059   a = find_rep (a);
1060   b = find_rep (b);
1061   representatives[b] = a;
1062 }
1063 
1064 void
compute_regions(gimple_seq seq)1065 control_flow_regions::compute_regions (gimple_seq seq)
1066 {
1067   hash_map <gimple *, size_t> control_flow_reps;
1068   hash_map <tree, size_t> label_reps;
1069   size_t current_region = 0, idx = 0;
1070 
1071   /* In a first pass, assign an initial region to each statement.  Except in
1072      the case of OpenACC loops, each statement simply gets the same region
1073      representative as its predecessor.  */
1074   for (gimple_stmt_iterator gsi = gsi_start (seq);
1075        !gsi_end_p (gsi);
1076        gsi_next (&gsi))
1077     {
1078       gimple *stmt = gsi_stmt (gsi);
1079       gimple *omp_for = top_level_omp_for_in_stmt (stmt);
1080       omp_for_loops.safe_push (omp_for != NULL);
1081       if (omp_for != NULL)
1082           {
1083             /* Assign a new region to this loop and to its successor.  */
1084             current_region = idx;
1085             representatives.safe_push (current_region);
1086             current_region++;
1087           }
1088       else
1089           {
1090             representatives.safe_push (current_region);
1091             /* Remember any jumps and labels for the second pass below.  */
1092             if (gimple_code (stmt) == GIMPLE_COND
1093                 || gimple_code (stmt) == GIMPLE_SWITCH
1094                 || gimple_code (stmt) == GIMPLE_GOTO)
1095               control_flow_reps.put (stmt, current_region);
1096             else if (gimple_code (stmt) == GIMPLE_LABEL)
1097               label_reps.put (gimple_label_label (as_a <glabel *> (stmt)),
1098                                   current_region);
1099           }
1100       idx++;
1101     }
1102   gcc_assert (representatives.length () == omp_for_loops.length ());
1103 
1104   /* Revisit all the control flow statements and union the region of each
1105      cond, switch, or goto statement with the target labels' regions.  */
1106   for (hash_map <gimple *, size_t>::iterator it = control_flow_reps.begin ();
1107        it != control_flow_reps.end ();
1108        ++it)
1109     {
1110       gimple *stmt = (*it).first;
1111       size_t stmt_rep = (*it).second;
1112       switch (gimple_code (stmt))
1113           {
1114             tree label;
1115             unsigned int n;
1116 
1117           case GIMPLE_COND:
1118             label = gimple_cond_true_label (as_a <gcond *> (stmt));
1119             union_reps (stmt_rep, *label_reps.get (label));
1120             label = gimple_cond_false_label (as_a <gcond *> (stmt));
1121             union_reps (stmt_rep, *label_reps.get (label));
1122             break;
1123 
1124           case GIMPLE_SWITCH:
1125             n = gimple_switch_num_labels (as_a <gswitch *> (stmt));
1126             for (unsigned int i = 0; i < n; i++)
1127               {
1128                 tree switch_case
1129                     = gimple_switch_label (as_a <gswitch *> (stmt), i);
1130                 label = CASE_LABEL (switch_case);
1131                 union_reps (stmt_rep, *label_reps.get (label));
1132               }
1133             break;
1134 
1135           case GIMPLE_GOTO:
1136             label = gimple_goto_dest (stmt);
1137             union_reps (stmt_rep, *label_reps.get (label));
1138             break;
1139 
1140           default:
1141             gcc_unreachable ();
1142           }
1143     }
1144 }
1145 
1146 /* Decompose the body of the KERNELS_REGION, which was originally annotated
1147    with the KERNELS_CLAUSES, into a series of compute constructs.  */
1148 
1149 static gimple *
decompose_kernels_region_body(gimple * kernels_region,tree kernels_clauses)1150 decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
1151 {
1152   location_t loc = gimple_location (kernels_region);
1153 
1154   /* The kernels clauses will be propagated to the child clauses unmodified,
1155      except that the 'num_gangs', 'num_workers', and 'vector_length' clauses
1156      will only be added to loop regions.  The other regions are "gang-single"
1157      and get an explicit 'num_gangs (1)' clause.  So separate out the
1158      'num_gangs', 'num_workers', and 'vector_length' clauses here.
1159      Also check for the presence of an 'async' clause but do not remove it from
1160      the 'kernels' clauses.  */
1161   tree num_gangs_clause = NULL, num_workers_clause = NULL,
1162        vector_length_clause = NULL;
1163   tree async_clause = NULL;
1164   tree prev_clause = NULL, next_clause = NULL;
1165   tree parallel_clauses = kernels_clauses;
1166   for (tree c = parallel_clauses; c; c = next_clause)
1167     {
1168       /* Preserve this here, as we might NULL it later.  */
1169       next_clause = OMP_CLAUSE_CHAIN (c);
1170 
1171       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_GANGS
1172             || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_NUM_WORKERS
1173             || OMP_CLAUSE_CODE (c) == OMP_CLAUSE_VECTOR_LENGTH)
1174           {
1175             /* Cut this clause out of the chain.  */
1176             if (prev_clause != NULL)
1177               OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (c);
1178             else
1179               kernels_clauses = OMP_CLAUSE_CHAIN (c);
1180             OMP_CLAUSE_CHAIN (c) = NULL;
1181             switch (OMP_CLAUSE_CODE (c))
1182               {
1183               case OMP_CLAUSE_NUM_GANGS:
1184                 num_gangs_clause = c;
1185                 break;
1186               case OMP_CLAUSE_NUM_WORKERS:
1187                 num_workers_clause = c;
1188                 break;
1189               case OMP_CLAUSE_VECTOR_LENGTH:
1190                 vector_length_clause = c;
1191                 break;
1192               default:
1193                 gcc_unreachable ();
1194               }
1195           }
1196       else
1197           prev_clause = c;
1198       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
1199           async_clause = c;
1200     }
1201 
1202   gimple *kernels_body = gimple_omp_body (kernels_region);
1203   gbind *kernels_bind = as_a <gbind *> (kernels_body);
1204 
1205   /* The body of the region may contain other nested binds declaring inner
1206      local variables.  Collapse all these binds into one to ensure that we
1207      have a single sequence of statements to iterate over; also, collect all
1208      inner variables.  */
1209   tree inner_bind_vars = flatten_binds (kernels_bind);
1210   gimple_seq body_sequence = gimple_bind_body (kernels_bind);
1211 
1212   /* All these inner variables will get allocated on the device (below, by
1213      calling maybe_build_inner_data_region).  Here we create 'present'
1214      clauses for them and add these clauses to the list of clauses to be
1215      attached to each inner compute construct.  */
1216   tree present_clauses = kernels_clauses;
1217   for (tree var = inner_bind_vars; var; var = TREE_CHAIN (var))
1218     {
1219       if (!DECL_ARTIFICIAL (var) && TREE_CODE (var) != CONST_DECL)
1220           {
1221             tree present_clause = build_omp_clause (loc, OMP_CLAUSE_MAP);
1222             OMP_CLAUSE_SET_MAP_KIND (present_clause, GOMP_MAP_FORCE_PRESENT);
1223             OMP_CLAUSE_DECL (present_clause) = var;
1224             OMP_CLAUSE_SIZE (present_clause) = DECL_SIZE_UNIT (var);
1225             OMP_CLAUSE_CHAIN (present_clause) = present_clauses;
1226             present_clauses = present_clause;
1227           }
1228     }
1229   kernels_clauses = present_clauses;
1230 
1231   /* In addition to nested binds, the "real" body of the region may be
1232      nested inside a try-finally block.  Find its cleanup block, which
1233      contains code to clobber the local variables that must be clobbered.  */
1234   gimple *inner_cleanup = NULL;
1235   if (body_sequence != NULL && gimple_code (body_sequence) == GIMPLE_TRY)
1236     {
1237       if (gimple_seq_singleton_p (body_sequence))
1238           {
1239             /* The try statement is the only thing inside the bind.  */
1240             inner_cleanup = gimple_try_cleanup (body_sequence);
1241             body_sequence = gimple_try_eval (body_sequence);
1242           }
1243       else
1244           {
1245             /* The bind's body starts with a try statement, but it is followed
1246                by other things.  */
1247             gimple_stmt_iterator gsi = gsi_start (body_sequence);
1248             gimple *try_stmt = gsi_stmt (gsi);
1249             inner_cleanup = gimple_try_cleanup (try_stmt);
1250             gimple *try_body = gimple_try_eval (try_stmt);
1251 
1252             gsi_remove (&gsi, false);
1253             /* Now gsi indicates the sequence of statements after the try
1254                statement in the bind.  Append the statement in the try body and
1255                the trailing statements from gsi.  */
1256             gsi_insert_seq_before (&gsi, try_body, GSI_CONTINUE_LINKING);
1257             body_sequence = gsi_stmt (gsi);
1258           }
1259     }
1260 
1261   /* This sequence will collect all the top-level statements in the body of
1262      the data region we are about to construct.  */
1263   gimple_seq region_body = NULL;
1264   /* This sequence will collect consecutive statements to be put into a
1265      gang-single region.  */
1266   gimple_seq gang_single_seq = NULL;
1267   /* Flag recording whether the gang_single_seq only contains copies to
1268      local variables.  These may be loop setup code that should not be
1269      separated from the loop.  */
1270   bool only_simple_assignments = true;
1271 
1272   /* Precompute the control flow region information to determine whether an
1273      OpenACC loop is executed conditionally or unconditionally.  */
1274   control_flow_regions cf_regions (body_sequence);
1275 
1276   /* Iterate over the statements in the kernels region's body.  */
1277   size_t idx = 0;
1278   gimple_stmt_iterator gsi, gsi_n;
1279   for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n, idx++)
1280     {
1281       /* Advance the iterator here because otherwise it would be invalidated
1282            by moving statements below.  */
1283       gsi_n = gsi;
1284       gsi_next (&gsi_n);
1285 
1286       gimple *stmt = gsi_stmt (gsi);
1287       if (gimple_code (stmt) == GIMPLE_DEBUG)
1288           {
1289             if (flag_compare_debug_opt || flag_compare_debug)
1290               /* Let the usual '-fcompare-debug' analysis bail out, as
1291                  necessary.  */
1292               ;
1293             else
1294               sorry_at (loc, "%qs not yet supported",
1295                           gimple_code_name[gimple_code (stmt)]);
1296           }
1297       gimple *omp_for = top_level_omp_for_in_stmt (stmt);
1298       bool is_unconditional_oacc_for_loop = false;
1299       if (omp_for != NULL)
1300           is_unconditional_oacc_for_loop
1301             = cf_regions.is_unconditional_oacc_for_loop (idx);
1302       if (omp_for != NULL
1303             && is_unconditional_oacc_for_loop)
1304           {
1305             /* This is an OMP for statement, put it into a separate region.
1306                But first, construct a gang-single region containing any
1307                complex sequential statements we may have seen.  */
1308             if (gang_single_seq != NULL && !only_simple_assignments)
1309               {
1310                 gimple *single_region
1311                     = make_region_seq (loc, gang_single_seq,
1312                                            num_gangs_clause,
1313                                            num_workers_clause,
1314                                            vector_length_clause,
1315                                            kernels_clauses);
1316                 gimple_seq_add_stmt (&region_body, single_region);
1317               }
1318             else if (gang_single_seq != NULL && only_simple_assignments)
1319               {
1320                 /* There is a sequence of sequential statements preceding this
1321                      loop, but they are all simple assignments.  This is
1322                      probably setup code for the loop; in particular, Fortran DO
1323                      loops are preceded by code to copy the loop limit variable
1324                      to a temporary.  Group this code together with the loop
1325                      itself.  */
1326                 gimple_seq_add_stmt (&gang_single_seq, stmt);
1327                 stmt = gimple_build_bind (NULL, gang_single_seq,
1328                                                   make_node (BLOCK));
1329               }
1330             gang_single_seq = NULL;
1331             only_simple_assignments = true;
1332 
1333             gimple_seq parallel_seq = NULL;
1334             gimple_seq_add_stmt (&parallel_seq, stmt);
1335             gimple *parallel_region
1336               = make_region_loop_nest (omp_for, parallel_seq,
1337                                              num_gangs_clause,
1338                                              num_workers_clause,
1339                                              vector_length_clause,
1340                                              kernels_clauses);
1341             gimple_seq_add_stmt (&region_body, parallel_region);
1342           }
1343       else
1344           {
1345             if (omp_for != NULL)
1346               {
1347                 gcc_checking_assert (!is_unconditional_oacc_for_loop);
1348                 if (dump_enabled_p ())
1349                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, omp_for,
1350                                          "unparallelized loop nest"
1351                                          " in OpenACC %<kernels%> region:"
1352                                          " it's executed conditionally\n");
1353               }
1354 
1355             /* This is not an unconditional OMP for statement, so it will be
1356                put into a gang-single region.  */
1357             gimple_seq_add_stmt (&gang_single_seq, stmt);
1358             /* Is this a simple assignment? We call it simple if it is an
1359                assignment to an artificial local variable.  This captures
1360                Fortran loop setup code computing loop bounds and offsets.  */
1361             bool is_simple_assignment
1362               = (gimple_code (stmt) == GIMPLE_ASSIGN
1363                  && TREE_CODE (gimple_assign_lhs (stmt)) == VAR_DECL
1364                  && DECL_ARTIFICIAL (gimple_assign_lhs (stmt)));
1365             if (!is_simple_assignment)
1366               only_simple_assignments = false;
1367           }
1368     }
1369 
1370   /* If we did not emit a new region, and are not going to emit one now
1371      (that is, the original region was empty), prepare to emit a dummy so as
1372      to preserve the original construct, which other processing (at least
1373      test cases) depend on.  */
1374   if (region_body == NULL && gang_single_seq == NULL)
1375     {
1376       gimple *stmt = gimple_build_nop ();
1377       gimple_set_location (stmt, loc);
1378       gimple_seq_add_stmt (&gang_single_seq, stmt);
1379     }
1380 
1381   /* Gather up any remaining gang-single statements.  */
1382   if (gang_single_seq != NULL)
1383     {
1384       gimple *single_region
1385           = make_region_seq (loc, gang_single_seq,
1386                                  num_gangs_clause,
1387                                  num_workers_clause,
1388                                  vector_length_clause,
1389                                  kernels_clauses);
1390       gimple_seq_add_stmt (&region_body, single_region);
1391     }
1392 
1393   /* We want to launch these kernels asynchronously.  If the original
1394      kernels region had an async clause, this is done automatically because
1395      that async clause was copied to the individual regions we created.
1396      Otherwise, add an async clause to each newly created region, as well as
1397      a wait directive at the end.  */
1398   if (async_clause == NULL)
1399     add_async_clauses_and_wait (loc, &region_body);
1400   else
1401     /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
1402        region, then target memory will get unmapped at the point the data
1403        region ends, even if the inner asynchronous parallels have not yet
1404        completed.  For kernels marked "async", we might want to use "enter data
1405        async(...)" and "exit data async(...)" instead, or asynchronous data
1406        regions (see also <https://gcc.gnu.org/PR97390>
1407        "[OpenACC] 'async' clause on 'data' construct",
1408        which is to share the same implementation).
1409        For now, insert a (synchronous) wait at the end of the block.  */
1410     add_wait (loc, &region_body);
1411 
1412   tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
1413   gimple *body = gimple_build_bind (kernels_locals, region_body,
1414                                             make_node (BLOCK));
1415 
1416   /* If we found variables declared in nested scopes, build a data region to
1417      map them to the device.  */
1418   body = maybe_build_inner_data_region (loc, body, inner_bind_vars,
1419                                                   inner_cleanup);
1420 
1421   return body;
1422 }
1423 
1424 /* Decompose one OpenACC 'kernels' construct into an OpenACC 'data' construct
1425    containing the original OpenACC 'kernels' construct's region cut up into a
1426    sequence of compute constructs.  */
1427 
1428 static gimple *
omp_oacc_kernels_decompose_1(gimple * kernels_stmt)1429 omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
1430 {
1431   gcc_checking_assert (gimple_omp_target_kind (kernels_stmt)
1432                            == GF_OMP_TARGET_KIND_OACC_KERNELS);
1433   location_t loc = gimple_location (kernels_stmt);
1434 
1435   /* Collect the data clauses of the OpenACC 'kernels' directive and create a
1436      new OpenACC 'data' construct with those clauses.  */
1437   tree kernels_clauses = gimple_omp_target_clauses (kernels_stmt);
1438   tree data_clauses = NULL;
1439   for (tree c = kernels_clauses; c; c = OMP_CLAUSE_CHAIN (c))
1440     {
1441       /* Certain clauses are copied to the enclosing OpenACC 'data'.  Other
1442            clauses remain on the OpenACC 'kernels'.  */
1443       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP)
1444           {
1445             tree decl = OMP_CLAUSE_DECL (c);
1446             HOST_WIDE_INT map_kind = OMP_CLAUSE_MAP_KIND (c);
1447             switch (map_kind)
1448               {
1449               default:
1450                 if (map_kind == GOMP_MAP_ALLOC
1451                       && integer_zerop (OMP_CLAUSE_SIZE (c)))
1452                     /* ??? This is an alloc clause for mapping a pointer whose
1453                        target is already mapped.  We leave these on the inner
1454                        compute constructs because moving them to the outer data
1455                        region causes runtime errors.  */
1456                     break;
1457 
1458                 /* For non-artificial variables, and for non-declaration
1459                      expressions like A[0:n], copy the clause to the data
1460                      region.  */
1461                 if ((DECL_P (decl) && !DECL_ARTIFICIAL (decl))
1462                       || !DECL_P (decl))
1463                     {
1464                       tree new_clause = build_omp_clause (OMP_CLAUSE_LOCATION (c),
1465                                                                   OMP_CLAUSE_MAP);
1466                       OMP_CLAUSE_SET_MAP_KIND (new_clause, map_kind);
1467                       /* This must be unshared here to avoid "incorrect sharing
1468                          of tree nodes" errors from verify_gimple.  */
1469                       OMP_CLAUSE_DECL (new_clause) = unshare_expr (decl);
1470                       OMP_CLAUSE_SIZE (new_clause) = OMP_CLAUSE_SIZE (c);
1471                       OMP_CLAUSE_CHAIN (new_clause) = data_clauses;
1472                       data_clauses = new_clause;
1473 
1474                       /* Now that this data is mapped, turn the data clause on the
1475                          inner OpenACC 'kernels' into a 'present' clause.  */
1476                       OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_FORCE_PRESENT);
1477 
1478                       /* See <https://gcc.gnu.org/PR100280>,
1479                          <https://gcc.gnu.org/PR104086>.  */
1480                       if (DECL_P (decl)
1481                           && !TREE_ADDRESSABLE (decl))
1482                         {
1483                           /* Request that OMP lowering make 'decl' addressable.  */
1484                           OMP_CLAUSE_MAP_DECL_MAKE_ADDRESSABLE (new_clause) = 1;
1485 
1486                           if (dump_enabled_p ())
1487                               {
1488                                 location_t loc = OMP_CLAUSE_LOCATION (new_clause);
1489                                 const dump_user_location_t d_u_loc
1490                                   = dump_user_location_t::from_location_t (loc);
1491                                 /* PR100695 "Format decoder, quoting in 'dump_printf'
1492                                    etc." */
1493 #if __GNUC__ >= 10
1494 # pragma GCC diagnostic push
1495 # pragma GCC diagnostic ignored "-Wformat"
1496 #endif
1497                                 dump_printf_loc
1498                                   (MSG_NOTE, d_u_loc,
1499                                    "OpenACC %<kernels%> decomposition:"
1500                                    " variable %<%T%> in %qs clause"
1501                                    " requested to be made addressable\n",
1502                                    decl,
1503                                    user_omp_clause_code_name (new_clause, true));
1504 #if __GNUC__ >= 10
1505 # pragma GCC diagnostic pop
1506 #endif
1507                               }
1508                         }
1509                     }
1510                 break;
1511 
1512               case GOMP_MAP_POINTER:
1513               case GOMP_MAP_TO_PSET:
1514               case GOMP_MAP_FIRSTPRIVATE_POINTER:
1515               case GOMP_MAP_FIRSTPRIVATE_REFERENCE:
1516                 /* ??? Copying these map kinds leads to internal compiler
1517                      errors in later passes.  */
1518                 break;
1519               }
1520           }
1521       else if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_IF)
1522           {
1523             /* If there is an 'if' clause, it must be duplicated to the
1524                enclosing data region.  Temporarily remove the if clause's
1525                chain to avoid copying it.  */
1526             tree saved_chain = OMP_CLAUSE_CHAIN (c);
1527             OMP_CLAUSE_CHAIN (c) = NULL;
1528             tree new_if_clause = unshare_expr (c);
1529             OMP_CLAUSE_CHAIN (c) = saved_chain;
1530             OMP_CLAUSE_CHAIN (new_if_clause) = data_clauses;
1531             data_clauses = new_if_clause;
1532           }
1533     }
1534   /* Restore the original order of the clauses.  */
1535   data_clauses = nreverse (data_clauses);
1536 
1537   gimple *data_region
1538     = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_DATA_KERNELS,
1539                                      data_clauses);
1540   gimple_set_location (data_region, loc);
1541 
1542   /* Transform the body of the kernels region into a sequence of compute
1543      constructs.  */
1544   gimple *body = decompose_kernels_region_body (kernels_stmt,
1545                                                             kernels_clauses);
1546 
1547   /* Put the transformed pieces together.  The entire body of the region is
1548      wrapped in a try-finally statement that calls __builtin_GOACC_data_end
1549      for cleanup.  */
1550   gimple *try_stmt = make_data_region_try_statement (loc, body);
1551   gimple_omp_set_body (data_region, try_stmt);
1552 
1553   return data_region;
1554 }
1555 
1556 
1557 /* Decompose OpenACC 'kernels' constructs in the current function.  */
1558 
1559 static tree
omp_oacc_kernels_decompose_callback_stmt(gimple_stmt_iterator * gsi_p,bool * handled_ops_p,struct walk_stmt_info *)1560 omp_oacc_kernels_decompose_callback_stmt (gimple_stmt_iterator *gsi_p,
1561                                                     bool *handled_ops_p,
1562                                                     struct walk_stmt_info *)
1563 {
1564   gimple *stmt = gsi_stmt (*gsi_p);
1565 
1566   if ((gimple_code (stmt) == GIMPLE_OMP_TARGET)
1567       && gimple_omp_target_kind (stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
1568     {
1569       gimple *stmt_new = omp_oacc_kernels_decompose_1 (stmt);
1570       gsi_replace (gsi_p, stmt_new, false);
1571       *handled_ops_p = true;
1572     }
1573   else
1574     *handled_ops_p = false;
1575 
1576   return NULL;
1577 }
1578 
1579 static unsigned int
omp_oacc_kernels_decompose(void)1580 omp_oacc_kernels_decompose (void)
1581 {
1582   gimple_seq body = gimple_body (current_function_decl);
1583 
1584   struct walk_stmt_info wi;
1585   memset (&wi, 0, sizeof (wi));
1586   walk_gimple_seq_mod (&body, omp_oacc_kernels_decompose_callback_stmt, NULL,
1587                            &wi);
1588 
1589   gimple_set_body (current_function_decl, body);
1590 
1591   return 0;
1592 }
1593 
1594 
1595 namespace {
1596 
1597 const pass_data pass_data_omp_oacc_kernels_decompose =
1598 {
1599   GIMPLE_PASS, /* type */
1600   "omp_oacc_kernels_decompose", /* name */
1601   OPTGROUP_OMP, /* optinfo_flags */
1602   TV_NONE, /* tv_id */
1603   PROP_gimple_any, /* properties_required */
1604   0, /* properties_provided */
1605   0, /* properties_destroyed */
1606   0, /* todo_flags_start */
1607   0, /* todo_flags_finish */
1608 };
1609 
1610 class pass_omp_oacc_kernels_decompose : public gimple_opt_pass
1611 {
1612 public:
pass_omp_oacc_kernels_decompose(gcc::context * ctxt)1613   pass_omp_oacc_kernels_decompose (gcc::context *ctxt)
1614     : gimple_opt_pass (pass_data_omp_oacc_kernels_decompose, ctxt)
1615   {}
1616 
1617   /* opt_pass methods: */
gate(function *)1618   virtual bool gate (function *)
1619   {
1620     return (flag_openacc
1621               && param_openacc_kernels == OPENACC_KERNELS_DECOMPOSE);
1622   }
execute(function *)1623   virtual unsigned int execute (function *)
1624   {
1625     return omp_oacc_kernels_decompose ();
1626   }
1627 
1628 }; // class pass_omp_oacc_kernels_decompose
1629 
1630 } // anon namespace
1631 
1632 gimple_opt_pass *
make_pass_omp_oacc_kernels_decompose(gcc::context * ctxt)1633 make_pass_omp_oacc_kernels_decompose (gcc::context *ctxt)
1634 {
1635   return new pass_omp_oacc_kernels_decompose (ctxt);
1636 }
1637