1 /* Loop autoparallelization.
2    Copyright (C) 2006-2022 Free Software Foundation, Inc.
3    Contributed by Sebastian Pop <pop@cri.ensmp.fr>
4    Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "gimple-pretty-print.h"
33 #include "fold-const.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-walk.h"
38 #include "stor-layout.h"
39 #include "tree-nested.h"
40 #include "tree-cfg.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "tree-into-ssa.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "langhooks.h"
49 #include "tree-vectorizer.h"
50 #include "tree-hasher.h"
51 #include "tree-parloops.h"
52 #include "omp-general.h"
53 #include "omp-low.h"
54 #include "tree-ssa.h"
55 #include "tree-ssa-alias.h"
56 #include "tree-eh.h"
57 #include "gomp-constants.h"
58 #include "tree-dfa.h"
59 #include "stringpool.h"
60 #include "attribs.h"
61 
62 /* This pass tries to distribute iterations of loops into several threads.
63    The implementation is straightforward -- for each loop we test whether its
64    iterations are independent, and if it is the case (and some additional
65    conditions regarding profitability and correctness are satisfied), we
66    add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let omp expansion
67    machinery do its job.
68 
69    The most of the complexity is in bringing the code into shape expected
70    by the omp expanders:
71    -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
72       variable and that the exit test is at the start of the loop body
73    -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
74       variables by accesses through pointers, and breaking up ssa chains
75       by storing the values incoming to the parallelized loop to a structure
76       passed to the new function as an argument (something similar is done
77       in omp gimplification, unfortunately only a small part of the code
78       can be shared).
79 
80    TODO:
81    -- if there are several parallelizable loops in a function, it may be
82       possible to generate the threads just once (using synchronization to
83       ensure that cross-loop dependences are obeyed).
84    -- handling of common reduction patterns for outer loops.
85 
86    More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
87 /*
88   Reduction handling:
89   currently we use code inspired by vect_force_simple_reduction to detect
90   reduction patterns.
91   The code transformation will be introduced by an example.
92 
93 
94 parloop
95 {
96   int sum=1;
97 
98   for (i = 0; i < N; i++)
99    {
100     x[i] = i + 3;
101     sum+=x[i];
102    }
103 }
104 
105 gimple-like code:
106 header_bb:
107 
108   # sum_29 = PHI <sum_11(5), 1(3)>
109   # i_28 = PHI <i_12(5), 0(3)>
110   D.1795_8 = i_28 + 3;
111   x[i_28] = D.1795_8;
112   sum_11 = D.1795_8 + sum_29;
113   i_12 = i_28 + 1;
114   if (N_6(D) > i_12)
115     goto header_bb;
116 
117 
118 exit_bb:
119 
120   # sum_21 = PHI <sum_11(4)>
121   printf (&"%d"[0], sum_21);
122 
123 
124 after reduction transformation (only relevant parts):
125 
126 parloop
127 {
128 
129 ....
130 
131 
132   # Storing the initial value given by the user.  #
133 
134   .paral_data_store.32.sum.27 = 1;
135 
136   #pragma omp parallel num_threads(4)
137 
138   #pragma omp for schedule(static)
139 
140   # The neutral element corresponding to the particular
141   reduction's operation, e.g. 0 for PLUS_EXPR,
142   1 for MULT_EXPR, etc. replaces the user's initial value.  #
143 
144   # sum.27_29 = PHI <sum.27_11, 0>
145 
146   sum.27_11 = D.1827_8 + sum.27_29;
147 
148   GIMPLE_OMP_CONTINUE
149 
150   # Adding this reduction phi is done at create_phi_for_local_result() #
151   # sum.27_56 = PHI <sum.27_11, 0>
152   GIMPLE_OMP_RETURN
153 
154   # Creating the atomic operation is done at
155   create_call_for_reduction_1()  #
156 
157   #pragma omp atomic_load
158   D.1839_59 = *&.paral_data_load.33_51->reduction.23;
159   D.1840_60 = sum.27_56 + D.1839_59;
160   #pragma omp atomic_store (D.1840_60);
161 
162   GIMPLE_OMP_RETURN
163 
164  # collecting the result after the join of the threads is done at
165   create_loads_for_reductions().
166   The value computed by the threads is loaded from the
167   shared struct.  #
168 
169 
170   .paral_data_load.33_52 = &.paral_data_store.32;
171   sum_37 =  .paral_data_load.33_52->sum.27;
172   sum_43 = D.1795_41 + sum_37;
173 
174   exit bb:
175   # sum_21 = PHI <sum_43, sum_26>
176   printf (&"%d"[0], sum_21);
177 
178 ...
179 
180 }
181 
182 */
183 
184 /* Error reporting helper for parloops_is_simple_reduction below.  GIMPLE
185    statement STMT is printed with a message MSG. */
186 
187 static void
report_ploop_op(dump_flags_t msg_type,gimple * stmt,const char * msg)188 report_ploop_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
189 {
190   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
191 }
192 
193 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
194    operation.  Return true if the results of DEF_STMT_INFO are something
195    that can be accumulated by such a reduction.  */
196 
197 static bool
parloops_valid_reduction_input_p(stmt_vec_info def_stmt_info)198 parloops_valid_reduction_input_p (stmt_vec_info def_stmt_info)
199 {
200   return (is_gimple_assign (def_stmt_info->stmt)
201             || is_gimple_call (def_stmt_info->stmt)
202             || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
203             || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
204                 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
205                 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
206 }
207 
208 /* Detect SLP reduction of the form:
209 
210    #a1 = phi <a5, a0>
211    a2 = operation (a1)
212    a3 = operation (a2)
213    a4 = operation (a3)
214    a5 = operation (a4)
215 
216    #a = phi <a5>
217 
218    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
219    FIRST_STMT is the first reduction stmt in the chain
220    (a2 = operation (a1)).
221 
222    Return TRUE if a reduction chain was detected.  */
223 
224 static bool
parloops_is_slp_reduction(loop_vec_info loop_info,gimple * phi,gimple * first_stmt)225 parloops_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
226                                  gimple *first_stmt)
227 {
228   class loop *loop = (gimple_bb (phi))->loop_father;
229   class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
230   enum tree_code code;
231   gimple *loop_use_stmt = NULL;
232   stmt_vec_info use_stmt_info;
233   tree lhs;
234   imm_use_iterator imm_iter;
235   use_operand_p use_p;
236   int nloop_uses, size = 0, n_out_of_loop_uses;
237   bool found = false;
238 
239   if (loop != vect_loop)
240     return false;
241 
242   auto_vec<stmt_vec_info, 8> reduc_chain;
243   lhs = PHI_RESULT (phi);
244   code = gimple_assign_rhs_code (first_stmt);
245   while (1)
246     {
247       nloop_uses = 0;
248       n_out_of_loop_uses = 0;
249       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
250         {
251             gimple *use_stmt = USE_STMT (use_p);
252             if (is_gimple_debug (use_stmt))
253               continue;
254 
255           /* Check if we got back to the reduction phi.  */
256             if (use_stmt == phi)
257             {
258                 loop_use_stmt = use_stmt;
259               found = true;
260               break;
261             }
262 
263           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
264             {
265                 loop_use_stmt = use_stmt;
266                 nloop_uses++;
267             }
268            else
269              n_out_of_loop_uses++;
270 
271            /* There are can be either a single use in the loop or two uses in
272               phi nodes.  */
273            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
274              return false;
275         }
276 
277       if (found)
278         break;
279 
280       /* We reached a statement with no loop uses.  */
281       if (nloop_uses == 0)
282           return false;
283 
284       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
285       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
286         return false;
287 
288       if (!is_gimple_assign (loop_use_stmt)
289             || code != gimple_assign_rhs_code (loop_use_stmt)
290             || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
291         return false;
292 
293       /* Insert USE_STMT into reduction chain.  */
294       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
295       reduc_chain.safe_push (use_stmt_info);
296 
297       lhs = gimple_assign_lhs (loop_use_stmt);
298       size++;
299    }
300 
301   if (!found || loop_use_stmt != phi || size < 2)
302     return false;
303 
304   /* Swap the operands, if needed, to make the reduction operand be the second
305      operand.  */
306   lhs = PHI_RESULT (phi);
307   for (unsigned i = 0; i < reduc_chain.length (); ++i)
308     {
309       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
310       if (gimple_assign_rhs2 (next_stmt) == lhs)
311           {
312             tree op = gimple_assign_rhs1 (next_stmt);
313             stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
314 
315             /* Check that the other def is either defined in the loop
316                ("vect_internal_def"), or it's an induction (defined by a
317                loop-header phi-node).  */
318             if (def_stmt_info
319                 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
320                 && parloops_valid_reduction_input_p (def_stmt_info))
321               {
322                 lhs = gimple_assign_lhs (next_stmt);
323                 continue;
324               }
325 
326             return false;
327           }
328       else
329           {
330           tree op = gimple_assign_rhs2 (next_stmt);
331             stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
332 
333           /* Check that the other def is either defined in the loop
334             ("vect_internal_def"), or it's an induction (defined by a
335             loop-header phi-node).  */
336             if (def_stmt_info
337                 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
338                 && parloops_valid_reduction_input_p (def_stmt_info))
339               {
340                 if (dump_enabled_p ())
341                     dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
342                                          next_stmt);
343 
344                 swap_ssa_operands (next_stmt,
345                                          gimple_assign_rhs1_ptr (next_stmt),
346                                  gimple_assign_rhs2_ptr (next_stmt));
347                 update_stmt (next_stmt);
348               }
349             else
350               return false;
351         }
352 
353       lhs = gimple_assign_lhs (next_stmt);
354     }
355 
356   /* Build up the actual chain.  */
357   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
358     {
359       REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
360       REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
361     }
362   REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
363   REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
364 
365   /* Save the chain for further analysis in SLP detection.  */
366   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
367   REDUC_GROUP_SIZE (reduc_chain[0]) = size;
368 
369   return true;
370 }
371 
372 /* Return true if we need an in-order reduction for operation CODE
373    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
374    overflow must wrap.  */
375 
376 static bool
parloops_needs_fold_left_reduction_p(tree type,tree_code code,bool need_wrapping_integral_overflow)377 parloops_needs_fold_left_reduction_p (tree type, tree_code code,
378                                               bool need_wrapping_integral_overflow)
379 {
380   /* CHECKME: check for !flag_finite_math_only too?  */
381   if (SCALAR_FLOAT_TYPE_P (type))
382     switch (code)
383       {
384       case MIN_EXPR:
385       case MAX_EXPR:
386           return false;
387 
388       default:
389           return !flag_associative_math;
390       }
391 
392   if (INTEGRAL_TYPE_P (type))
393     {
394       if (!operation_no_trapping_overflow (type, code))
395           return true;
396       if (need_wrapping_integral_overflow
397             && !TYPE_OVERFLOW_WRAPS (type)
398             && operation_can_overflow (code))
399           return true;
400       return false;
401     }
402 
403   if (SAT_FIXED_POINT_TYPE_P (type))
404     return true;
405 
406   return false;
407 }
408 
409 
410 /* Function parloops_is_simple_reduction
411 
412    (1) Detect a cross-iteration def-use cycle that represents a simple
413    reduction computation.  We look for the following pattern:
414 
415    loop_header:
416      a1 = phi < a0, a2 >
417      a3 = ...
418      a2 = operation (a3, a1)
419 
420    or
421 
422    a3 = ...
423    loop_header:
424      a1 = phi < a0, a2 >
425      a2 = operation (a3, a1)
426 
427    such that:
428    1. operation is commutative and associative and it is safe to
429       change the order of the computation
430    2. no uses for a2 in the loop (a2 is used out of the loop)
431    3. no uses of a1 in the loop besides the reduction operation
432    4. no uses of a1 outside the loop.
433 
434    Conditions 1,4 are tested here.
435    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
436 
437    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
438    nested cycles.
439 
440    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
441    reductions:
442 
443      a1 = phi < a0, a2 >
444      inner loop (def of a3)
445      a2 = phi < a3 >
446 
447    (4) Detect condition expressions, ie:
448      for (int i = 0; i < N; i++)
449        if (a[i] < val)
450           ret_val = a[i];
451 
452 */
453 
454 static stmt_vec_info
parloops_is_simple_reduction(loop_vec_info loop_info,stmt_vec_info phi_info,bool * double_reduc,bool need_wrapping_integral_overflow,enum vect_reduction_type * v_reduc_type)455 parloops_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
456                                 bool *double_reduc,
457                                 bool need_wrapping_integral_overflow,
458                                 enum vect_reduction_type *v_reduc_type)
459 {
460   gphi *phi = as_a <gphi *> (phi_info->stmt);
461   class loop *loop = (gimple_bb (phi))->loop_father;
462   class loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
463   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
464   gimple *phi_use_stmt = NULL;
465   enum tree_code orig_code, code;
466   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
467   tree type;
468   tree name;
469   imm_use_iterator imm_iter;
470   use_operand_p use_p;
471   bool phi_def;
472 
473   *double_reduc = false;
474   *v_reduc_type = TREE_CODE_REDUCTION;
475 
476   tree phi_name = PHI_RESULT (phi);
477   /* ???  If there are no uses of the PHI result the inner loop reduction
478      won't be detected as possibly double-reduction by vectorizable_reduction
479      because that tries to walk the PHI arg from the preheader edge which
480      can be constant.  See PR60382.  */
481   if (has_zero_uses (phi_name))
482     return NULL;
483   unsigned nphi_def_loop_uses = 0;
484   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
485     {
486       gimple *use_stmt = USE_STMT (use_p);
487       if (is_gimple_debug (use_stmt))
488           continue;
489 
490       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
491         {
492           if (dump_enabled_p ())
493               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
494                                    "intermediate value used outside loop.\n");
495 
496           return NULL;
497         }
498 
499       nphi_def_loop_uses++;
500       phi_use_stmt = use_stmt;
501     }
502 
503   edge latch_e = loop_latch_edge (loop);
504   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
505   if (TREE_CODE (loop_arg) != SSA_NAME)
506     {
507       if (dump_enabled_p ())
508           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509                                "reduction: not ssa_name: %T\n", loop_arg);
510       return NULL;
511     }
512 
513   stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
514   if (!def_stmt_info
515       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
516     return NULL;
517 
518   if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
519     {
520       name = gimple_assign_lhs (def_stmt);
521       phi_def = false;
522     }
523   else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
524     {
525       name = PHI_RESULT (def_stmt);
526       phi_def = true;
527     }
528   else
529     {
530       if (dump_enabled_p ())
531           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
532                                "reduction: unhandled reduction operation: %G",
533                                def_stmt_info->stmt);
534       return NULL;
535     }
536 
537   unsigned nlatch_def_loop_uses = 0;
538   auto_vec<gphi *, 3> lcphis;
539   bool inner_loop_of_double_reduc = false;
540   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
541     {
542       gimple *use_stmt = USE_STMT (use_p);
543       if (is_gimple_debug (use_stmt))
544           continue;
545       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
546           nlatch_def_loop_uses++;
547       else
548           {
549             /* We can have more than one loop-closed PHI.  */
550             lcphis.safe_push (as_a <gphi *> (use_stmt));
551             if (nested_in_vect_loop
552                 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
553                       == vect_double_reduction_def))
554               inner_loop_of_double_reduc = true;
555           }
556     }
557 
558   /* If this isn't a nested cycle or if the nested cycle reduction value
559      is used ouside of the inner loop we cannot handle uses of the reduction
560      value.  */
561   if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
562       && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
563     {
564       if (dump_enabled_p ())
565           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
566                                "reduction used in loop.\n");
567       return NULL;
568     }
569 
570   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
571      defined in the inner loop.  */
572   if (phi_def)
573     {
574       gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
575       op1 = PHI_ARG_DEF (def_stmt, 0);
576 
577       if (gimple_phi_num_args (def_stmt) != 1
578           || TREE_CODE (op1) != SSA_NAME)
579         {
580           if (dump_enabled_p ())
581               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582                                    "unsupported phi node definition.\n");
583 
584           return NULL;
585         }
586 
587       gimple *def1 = SSA_NAME_DEF_STMT (op1);
588       if (gimple_bb (def1)
589             && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
590           && loop->inner
591           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
592           && is_gimple_assign (def1)
593             && is_a <gphi *> (phi_use_stmt)
594             && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
595         {
596           if (dump_enabled_p ())
597             report_ploop_op (MSG_NOTE, def_stmt,
598                                    "detected double reduction: ");
599 
600           *double_reduc = true;
601             return def_stmt_info;
602         }
603 
604       return NULL;
605     }
606 
607   /* If we are vectorizing an inner reduction we are executing that
608      in the original order only in case we are not dealing with a
609      double reduction.  */
610   bool check_reduction = true;
611   if (flow_loop_nested_p (vect_loop, loop))
612     {
613       gphi *lcphi;
614       unsigned i;
615       check_reduction = false;
616       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
617           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
618             {
619               gimple *use_stmt = USE_STMT (use_p);
620               if (is_gimple_debug (use_stmt))
621                 continue;
622               if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
623                 check_reduction = true;
624             }
625     }
626 
627   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
628   code = orig_code = gimple_assign_rhs_code (def_stmt);
629 
630   if (nested_in_vect_loop && !check_reduction)
631     {
632       /* FIXME: Even for non-reductions code generation is funneled
633            through vectorizable_reduction for the stmt defining the
634            PHI latch value.  So we have to artificially restrict ourselves
635            for the supported operations.  */
636       switch (get_gimple_rhs_class (code))
637           {
638           case GIMPLE_BINARY_RHS:
639           case GIMPLE_TERNARY_RHS:
640             break;
641           default:
642             /* Not supported by vectorizable_reduction.  */
643             if (dump_enabled_p ())
644               report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
645                                    "nested cycle: not handled operation: ");
646             return NULL;
647           }
648       if (dump_enabled_p ())
649           report_ploop_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
650       return def_stmt_info;
651     }
652 
653   /* We can handle "res -= x[i]", which is non-associative by
654      simply rewriting this into "res += -x[i]".  Avoid changing
655      gimple instruction for the first simple tests and only do this
656      if we're allowed to change code at all.  */
657   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
658     code = PLUS_EXPR;
659 
660   if (code == COND_EXPR)
661     {
662       if (! nested_in_vect_loop)
663           *v_reduc_type = COND_REDUCTION;
664 
665       op3 = gimple_assign_rhs1 (def_stmt);
666       if (COMPARISON_CLASS_P (op3))
667         {
668           op4 = TREE_OPERAND (op3, 1);
669           op3 = TREE_OPERAND (op3, 0);
670         }
671       if (op3 == phi_name || op4 == phi_name)
672           {
673             if (dump_enabled_p ())
674               report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
675                                    "reduction: condition depends on previous"
676                                    " iteration: ");
677             return NULL;
678           }
679 
680       op1 = gimple_assign_rhs2 (def_stmt);
681       op2 = gimple_assign_rhs3 (def_stmt);
682     }
683   else if (!commutative_tree_code (code) || !associative_tree_code (code))
684     {
685       if (dump_enabled_p ())
686           report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
687                                "reduction: not commutative/associative: ");
688       return NULL;
689     }
690   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
691     {
692       op1 = gimple_assign_rhs1 (def_stmt);
693       op2 = gimple_assign_rhs2 (def_stmt);
694     }
695   else
696     {
697       if (dump_enabled_p ())
698           report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
699                                "reduction: not handled operation: ");
700       return NULL;
701     }
702 
703   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
704     {
705       if (dump_enabled_p ())
706           report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
707                                "reduction: both uses not ssa_names: ");
708 
709       return NULL;
710     }
711 
712   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
713   if ((TREE_CODE (op1) == SSA_NAME
714        && !types_compatible_p (type,TREE_TYPE (op1)))
715       || (TREE_CODE (op2) == SSA_NAME
716           && !types_compatible_p (type, TREE_TYPE (op2)))
717       || (op3 && TREE_CODE (op3) == SSA_NAME
718           && !types_compatible_p (type, TREE_TYPE (op3)))
719       || (op4 && TREE_CODE (op4) == SSA_NAME
720           && !types_compatible_p (type, TREE_TYPE (op4))))
721     {
722       if (dump_enabled_p ())
723         {
724           dump_printf_loc (MSG_NOTE, vect_location,
725                                  "reduction: multiple types: operation type: "
726                                  "%T, operands types: %T,%T",
727                                  type,  TREE_TYPE (op1), TREE_TYPE (op2));
728           if (op3)
729               dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
730 
731           if (op4)
732               dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
733           dump_printf (MSG_NOTE, "\n");
734         }
735 
736       return NULL;
737     }
738 
739   /* Check whether it's ok to change the order of the computation.
740      Generally, when vectorizing a reduction we change the order of the
741      computation.  This may change the behavior of the program in some
742      cases, so we need to check that this is ok.  One exception is when
743      vectorizing an outer-loop: the inner-loop is executed sequentially,
744      and therefore vectorizing reductions in the inner-loop during
745      outer-loop vectorization is safe.  */
746   if (check_reduction
747       && *v_reduc_type == TREE_CODE_REDUCTION
748       && parloops_needs_fold_left_reduction_p (type, code,
749                                                          need_wrapping_integral_overflow))
750     *v_reduc_type = FOLD_LEFT_REDUCTION;
751 
752   /* Reduction is safe. We're dealing with one of the following:
753      1) integer arithmetic and no trapv
754      2) floating point arithmetic, and special flags permit this optimization
755      3) nested cycle (i.e., outer loop vectorization).  */
756   stmt_vec_info def1_info = loop_info->lookup_def (op1);
757   stmt_vec_info def2_info = loop_info->lookup_def (op2);
758   if (code != COND_EXPR && !def1_info && !def2_info)
759     {
760       if (dump_enabled_p ())
761           report_ploop_op (MSG_NOTE, def_stmt,
762                                "reduction: no defs for operands: ");
763       return NULL;
764     }
765 
766   /* Check that one def is the reduction def, defined by PHI,
767      the other def is either defined in the loop ("vect_internal_def"),
768      or it's an induction (defined by a loop-header phi-node).  */
769 
770   if (def2_info
771       && def2_info->stmt == phi
772       && (code == COND_EXPR
773             || !def1_info
774             || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
775             || parloops_valid_reduction_input_p (def1_info)))
776     {
777       if (dump_enabled_p ())
778           report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: ");
779       return def_stmt_info;
780     }
781 
782   if (def1_info
783       && def1_info->stmt == phi
784       && (code == COND_EXPR
785             || !def2_info
786             || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
787             || parloops_valid_reduction_input_p (def2_info)))
788     {
789       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
790           {
791             /* Check if we can swap operands (just for simplicity - so that
792                the rest of the code can assume that the reduction variable
793                is always the last (second) argument).  */
794             if (code == COND_EXPR)
795               {
796                 /* Swap cond_expr by inverting the condition.  */
797                 tree cond_expr = gimple_assign_rhs1 (def_stmt);
798                 enum tree_code invert_code = ERROR_MARK;
799                 enum tree_code cond_code = TREE_CODE (cond_expr);
800 
801                 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
802                     {
803                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
804                       invert_code = invert_tree_comparison (cond_code, honor_nans);
805                     }
806                 if (invert_code != ERROR_MARK)
807                     {
808                       TREE_SET_CODE (cond_expr, invert_code);
809                       swap_ssa_operands (def_stmt,
810                                              gimple_assign_rhs2_ptr (def_stmt),
811                                              gimple_assign_rhs3_ptr (def_stmt));
812                     }
813                 else
814                     {
815                       if (dump_enabled_p ())
816                         report_ploop_op (MSG_NOTE, def_stmt,
817                                              "detected reduction: cannot swap operands "
818                                              "for cond_expr");
819                       return NULL;
820                     }
821               }
822             else
823               swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
824                                      gimple_assign_rhs2_ptr (def_stmt));
825 
826             if (dump_enabled_p ())
827               report_ploop_op (MSG_NOTE, def_stmt,
828                                    "detected reduction: need to swap operands: ");
829         }
830       else
831         {
832           if (dump_enabled_p ())
833             report_ploop_op (MSG_NOTE, def_stmt, "detected reduction: ");
834         }
835 
836       return def_stmt_info;
837     }
838 
839   /* Try to find SLP reduction chain.  */
840   if (! nested_in_vect_loop
841       && code != COND_EXPR
842       && orig_code != MINUS_EXPR
843       && parloops_is_slp_reduction (loop_info, phi, def_stmt))
844     {
845       if (dump_enabled_p ())
846         report_ploop_op (MSG_NOTE, def_stmt,
847                                "reduction: detected reduction chain: ");
848 
849       return def_stmt_info;
850     }
851 
852   /* Look for the expression computing loop_arg from loop PHI result.  */
853   if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
854     return def_stmt_info;
855 
856   if (dump_enabled_p ())
857     {
858       report_ploop_op (MSG_MISSED_OPTIMIZATION, def_stmt,
859                            "reduction: unknown pattern: ");
860     }
861 
862   return NULL;
863 }
864 
865 /* Wrapper around vect_is_simple_reduction, which will modify code
866    in-place if it enables detection of more reductions.  Arguments
867    as there.  */
868 
869 stmt_vec_info
parloops_force_simple_reduction(loop_vec_info loop_info,stmt_vec_info phi_info,bool * double_reduc,bool need_wrapping_integral_overflow)870 parloops_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
871                                    bool *double_reduc,
872                                    bool need_wrapping_integral_overflow)
873 {
874   enum vect_reduction_type v_reduc_type;
875   stmt_vec_info def_info
876     = parloops_is_simple_reduction (loop_info, phi_info, double_reduc,
877                                         need_wrapping_integral_overflow,
878                                         &v_reduc_type);
879   if (def_info)
880     {
881       STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
882       STMT_VINFO_REDUC_DEF (phi_info) = def_info;
883       STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
884       STMT_VINFO_REDUC_DEF (def_info) = phi_info;
885     }
886   return def_info;
887 }
888 
889 /* Minimal number of iterations of a loop that should be executed in each
890    thread.  */
891 #define MIN_PER_THREAD param_parloops_min_per_thread
892 
893 /* Element of the hashtable, representing a
894    reduction in the current loop.  */
895 struct reduction_info
896 {
897   gimple *reduc_stmt;                   /* reduction statement.  */
898   gimple *reduc_phi;                    /* The phi node defining the reduction.  */
899   enum tree_code reduction_code;/* code for the reduction operation.  */
900   unsigned reduc_version;     /* SSA_NAME_VERSION of original reduc_phi
901                                            result.  */
902   gphi *keep_res;             /* The PHI_RESULT of this phi is the resulting value
903                                            of the reduction variable when existing the loop. */
904   tree initial_value;                   /* The initial value of the reduction var before entering the loop.  */
905   tree field;                           /*  the name of the field in the parloop data structure intended for reduction.  */
906   tree reduc_addr;            /* The address of the reduction variable for
907                                            openacc reductions.  */
908   tree init;                            /* reduction initialization value.  */
909   gphi *new_phi;              /* (helper field) Newly created phi node whose result
910                                            will be passed to the atomic operation.  Represents
911                                            the local result each thread computed for the reduction
912                                            operation.  */
913 };
914 
915 /* Reduction info hashtable helpers.  */
916 
917 struct reduction_hasher : free_ptr_hash <reduction_info>
918 {
919   static inline hashval_t hash (const reduction_info *);
920   static inline bool equal (const reduction_info *, const reduction_info *);
921 };
922 
923 /* Equality and hash functions for hashtab code.  */
924 
925 inline bool
equal(const reduction_info * a,const reduction_info * b)926 reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
927 {
928   return (a->reduc_phi == b->reduc_phi);
929 }
930 
931 inline hashval_t
hash(const reduction_info * a)932 reduction_hasher::hash (const reduction_info *a)
933 {
934   return a->reduc_version;
935 }
936 
937 typedef hash_table<reduction_hasher> reduction_info_table_type;
938 
939 
940 static struct reduction_info *
reduction_phi(reduction_info_table_type * reduction_list,gimple * phi)941 reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
942 {
943   struct reduction_info tmpred, *red;
944 
945   if (reduction_list->is_empty () || phi == NULL)
946     return NULL;
947 
948   if (gimple_uid (phi) == (unsigned int)-1
949       || gimple_uid (phi) == 0)
950     return NULL;
951 
952   tmpred.reduc_phi = phi;
953   tmpred.reduc_version = gimple_uid (phi);
954   red = reduction_list->find (&tmpred);
955   gcc_assert (red == NULL || red->reduc_phi == phi);
956 
957   return red;
958 }
959 
960 /* Element of hashtable of names to copy.  */
961 
962 struct name_to_copy_elt
963 {
964   unsigned version; /* The version of the name to copy.  */
965   tree new_name;    /* The new name used in the copy.  */
966   tree field;                 /* The field of the structure used to pass the
967                                  value.  */
968 };
969 
970 /* Name copies hashtable helpers.  */
971 
972 struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
973 {
974   static inline hashval_t hash (const name_to_copy_elt *);
975   static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *);
976 };
977 
978 /* Equality and hash functions for hashtab code.  */
979 
980 inline bool
equal(const name_to_copy_elt * a,const name_to_copy_elt * b)981 name_to_copy_hasher::equal (const name_to_copy_elt *a, const name_to_copy_elt *b)
982 {
983   return a->version == b->version;
984 }
985 
986 inline hashval_t
hash(const name_to_copy_elt * a)987 name_to_copy_hasher::hash (const name_to_copy_elt *a)
988 {
989   return (hashval_t) a->version;
990 }
991 
992 typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;
993 
994 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
995    matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
996    represents the denominator for every element in the matrix.  */
997 typedef struct lambda_trans_matrix_s
998 {
999   lambda_matrix matrix;
1000   int rowsize;
1001   int colsize;
1002   int denominator;
1003 } *lambda_trans_matrix;
1004 #define LTM_MATRIX(T) ((T)->matrix)
1005 #define LTM_ROWSIZE(T) ((T)->rowsize)
1006 #define LTM_COLSIZE(T) ((T)->colsize)
1007 #define LTM_DENOMINATOR(T) ((T)->denominator)
1008 
1009 /* Allocate a new transformation matrix.  */
1010 
1011 static lambda_trans_matrix
lambda_trans_matrix_new(int colsize,int rowsize,struct obstack * lambda_obstack)1012 lambda_trans_matrix_new (int colsize, int rowsize,
1013                                struct obstack * lambda_obstack)
1014 {
1015   lambda_trans_matrix ret;
1016 
1017   ret = (lambda_trans_matrix)
1018     obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
1019   LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
1020   LTM_ROWSIZE (ret) = rowsize;
1021   LTM_COLSIZE (ret) = colsize;
1022   LTM_DENOMINATOR (ret) = 1;
1023   return ret;
1024 }
1025 
1026 /* Multiply a vector VEC by a matrix MAT.
1027    MAT is an M*N matrix, and VEC is a vector with length N.  The result
1028    is stored in DEST which must be a vector of length M.  */
1029 
1030 static void
lambda_matrix_vector_mult(lambda_matrix matrix,int m,int n,lambda_vector vec,lambda_vector dest)1031 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
1032                                  lambda_vector vec, lambda_vector dest)
1033 {
1034   int i, j;
1035 
1036   lambda_vector_clear (dest, m);
1037   for (i = 0; i < m; i++)
1038     for (j = 0; j < n; j++)
1039       dest[i] += matrix[i][j] * vec[j];
1040 }
1041 
1042 /* Return true if TRANS is a legal transformation matrix that respects
1043    the dependence vectors in DISTS and DIRS.  The conservative answer
1044    is false.
1045 
1046    "Wolfe proves that a unimodular transformation represented by the
1047    matrix T is legal when applied to a loop nest with a set of
1048    lexicographically non-negative distance vectors RDG if and only if
1049    for each vector d in RDG, (T.d >= 0) is lexicographically positive.
1050    i.e.: if and only if it transforms the lexicographically positive
1051    distance vectors to lexicographically positive vectors.  Note that
1052    a unimodular matrix must transform the zero vector (and only it) to
1053    the zero vector." S.Muchnick.  */
1054 
1055 static bool
lambda_transform_legal_p(lambda_trans_matrix trans,int nb_loops,vec<ddr_p> dependence_relations)1056 lambda_transform_legal_p (lambda_trans_matrix trans,
1057                                 int nb_loops,
1058                                 vec<ddr_p> dependence_relations)
1059 {
1060   unsigned int i, j;
1061   lambda_vector distres;
1062   struct data_dependence_relation *ddr;
1063 
1064   gcc_assert (LTM_COLSIZE (trans) == nb_loops
1065                 && LTM_ROWSIZE (trans) == nb_loops);
1066 
1067   /* When there are no dependences, the transformation is correct.  */
1068   if (dependence_relations.length () == 0)
1069     return true;
1070 
1071   ddr = dependence_relations[0];
1072   if (ddr == NULL)
1073     return true;
1074 
1075   /* When there is an unknown relation in the dependence_relations, we
1076      know that it is no worth looking at this loop nest: give up.  */
1077   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1078     return false;
1079 
1080   distres = lambda_vector_new (nb_loops);
1081 
1082   /* For each distance vector in the dependence graph.  */
1083   FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
1084     {
1085       /* Don't care about relations for which we know that there is no
1086            dependence, nor about read-read (aka. output-dependences):
1087            these data accesses can happen in any order.  */
1088       if (DDR_ARE_DEPENDENT (ddr) == chrec_known
1089             || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
1090           continue;
1091 
1092       /* Conservatively answer: "this transformation is not valid".  */
1093       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1094           return false;
1095 
1096       /* If the dependence could not be captured by a distance vector,
1097            conservatively answer that the transform is not valid.  */
1098       if (DDR_NUM_DIST_VECTS (ddr) == 0)
1099           return false;
1100 
1101       /* Compute trans.dist_vect */
1102       for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
1103           {
1104             lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
1105                                              DDR_DIST_VECT (ddr, j), distres);
1106 
1107             if (!lambda_vector_lexico_pos (distres, nb_loops))
1108               return false;
1109           }
1110     }
1111   return true;
1112 }
1113 
1114 /* Data dependency analysis. Returns true if the iterations of LOOP
1115    are independent on each other (that is, if we can execute them
1116    in parallel).  */
1117 
1118 static bool
loop_parallel_p(class loop * loop,struct obstack * parloop_obstack)1119 loop_parallel_p (class loop *loop, struct obstack * parloop_obstack)
1120 {
1121   vec<ddr_p> dependence_relations;
1122   vec<data_reference_p> datarefs;
1123   lambda_trans_matrix trans;
1124   bool ret = false;
1125 
1126   if (dump_file && (dump_flags & TDF_DETAILS))
1127   {
1128     fprintf (dump_file, "Considering loop %d\n", loop->num);
1129     if (!loop->inner)
1130       fprintf (dump_file, "loop is innermost\n");
1131     else
1132       fprintf (dump_file, "loop NOT innermost\n");
1133    }
1134 
1135   /* Check for problems with dependences.  If the loop can be reversed,
1136      the iterations are independent.  */
1137   auto_vec<loop_p, 3> loop_nest;
1138   datarefs.create (10);
1139   dependence_relations.create (100);
1140   if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
1141                                                      &dependence_relations))
1142     {
1143       if (dump_file && (dump_flags & TDF_DETAILS))
1144           fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
1145       ret = false;
1146       goto end;
1147     }
1148   if (dump_file && (dump_flags & TDF_DETAILS))
1149     dump_data_dependence_relations (dump_file, dependence_relations);
1150 
1151   trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
1152   LTM_MATRIX (trans)[0][0] = -1;
1153 
1154   if (lambda_transform_legal_p (trans, 1, dependence_relations))
1155     {
1156       ret = true;
1157       if (dump_file && (dump_flags & TDF_DETAILS))
1158           fprintf (dump_file, "  SUCCESS: may be parallelized\n");
1159     }
1160   else if (dump_file && (dump_flags & TDF_DETAILS))
1161     fprintf (dump_file,
1162                "  FAILED: data dependencies exist across iterations\n");
1163 
1164  end:
1165   free_dependence_relations (dependence_relations);
1166   free_data_refs (datarefs);
1167 
1168   return ret;
1169 }
1170 
1171 /* Return true when LOOP contains basic blocks marked with the
1172    BB_IRREDUCIBLE_LOOP flag.  */
1173 
1174 static inline bool
loop_has_blocks_with_irreducible_flag(class loop * loop)1175 loop_has_blocks_with_irreducible_flag (class loop *loop)
1176 {
1177   unsigned i;
1178   basic_block *bbs = get_loop_body_in_dom_order (loop);
1179   bool res = true;
1180 
1181   for (i = 0; i < loop->num_nodes; i++)
1182     if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
1183       goto end;
1184 
1185   res = false;
1186  end:
1187   free (bbs);
1188   return res;
1189 }
1190 
1191 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
1192    The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
1193    to their addresses that can be reused.  The address of OBJ is known to
1194    be invariant in the whole function.  Other needed statements are placed
1195    right before GSI.  */
1196 
1197 static tree
take_address_of(tree obj,tree type,edge entry,int_tree_htab_type * decl_address,gimple_stmt_iterator * gsi)1198 take_address_of (tree obj, tree type, edge entry,
1199                      int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
1200 {
1201   int uid;
1202   tree *var_p, name, addr;
1203   gassign *stmt;
1204   gimple_seq stmts;
1205 
1206   /* Since the address of OBJ is invariant, the trees may be shared.
1207      Avoid rewriting unrelated parts of the code.  */
1208   obj = unshare_expr (obj);
1209   for (var_p = &obj;
1210        handled_component_p (*var_p);
1211        var_p = &TREE_OPERAND (*var_p, 0))
1212     continue;
1213 
1214   /* Canonicalize the access to base on a MEM_REF.  */
1215   if (DECL_P (*var_p))
1216     *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));
1217 
1218   /* Assign a canonical SSA name to the address of the base decl used
1219      in the address and share it for all accesses and addresses based
1220      on it.  */
1221   uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
1222   int_tree_map elt;
1223   elt.uid = uid;
1224   int_tree_map *slot = decl_address->find_slot (elt, INSERT);
1225   if (!slot->to)
1226     {
1227       if (gsi == NULL)
1228           return NULL;
1229       addr = TREE_OPERAND (*var_p, 0);
1230       const char *obj_name
1231           = get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
1232       if (obj_name)
1233           name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
1234       else
1235           name = make_ssa_name (TREE_TYPE (addr));
1236       stmt = gimple_build_assign (name, addr);
1237       gsi_insert_on_edge_immediate (entry, stmt);
1238 
1239       slot->uid = uid;
1240       slot->to = name;
1241     }
1242   else
1243     name = slot->to;
1244 
1245   /* Express the address in terms of the canonical SSA name.  */
1246   TREE_OPERAND (*var_p, 0) = name;
1247   if (gsi == NULL)
1248     return build_fold_addr_expr_with_type (obj, type);
1249 
1250   name = force_gimple_operand (build_addr (obj),
1251                                      &stmts, true, NULL_TREE);
1252   if (!gimple_seq_empty_p (stmts))
1253     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
1254 
1255   if (!useless_type_conversion_p (type, TREE_TYPE (name)))
1256     {
1257       name = force_gimple_operand (fold_convert (type, name), &stmts, true,
1258                                            NULL_TREE);
1259       if (!gimple_seq_empty_p (stmts))
1260           gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
1261     }
1262 
1263   return name;
1264 }
1265 
1266 static tree
reduc_stmt_res(gimple * stmt)1267 reduc_stmt_res (gimple *stmt)
1268 {
1269   return (gimple_code (stmt) == GIMPLE_PHI
1270             ? gimple_phi_result (stmt)
1271             : gimple_assign_lhs (stmt));
1272 }
1273 
1274 /* Callback for htab_traverse.  Create the initialization statement
1275    for reduction described in SLOT, and place it at the preheader of
1276    the loop described in DATA.  */
1277 
1278 int
initialize_reductions(reduction_info ** slot,class loop * loop)1279 initialize_reductions (reduction_info **slot, class loop *loop)
1280 {
1281   tree init;
1282   tree type, arg;
1283   edge e;
1284 
1285   struct reduction_info *const reduc = *slot;
1286 
1287   /* Create initialization in preheader:
1288      reduction_variable = initialization value of reduction.  */
1289 
1290   /* In the phi node at the header, replace the argument coming
1291      from the preheader with the reduction initialization value.  */
1292 
1293   /* Initialize the reduction.  */
1294   type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1295   init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
1296                                         reduc->reduction_code, type);
1297   reduc->init = init;
1298 
1299   /* Replace the argument representing the initialization value
1300      with the initialization value for the reduction (neutral
1301      element for the particular operation, e.g. 0 for PLUS_EXPR,
1302      1 for MULT_EXPR, etc).
1303      Keep the old value in a new variable "reduction_initial",
1304      that will be taken in consideration after the parallel
1305      computing is done.  */
1306 
1307   e = loop_preheader_edge (loop);
1308   arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
1309   /* Create new variable to hold the initial value.  */
1310 
1311   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
1312              (reduc->reduc_phi, loop_preheader_edge (loop)), init);
1313   reduc->initial_value = arg;
1314   return 1;
1315 }
1316 
1317 struct elv_data
1318 {
1319   struct walk_stmt_info info;
1320   edge entry;
1321   int_tree_htab_type *decl_address;
1322   gimple_stmt_iterator *gsi;
1323   bool changed;
1324   bool reset;
1325 };
1326 
1327 /* Eliminates references to local variables in *TP out of the single
1328    entry single exit region starting at DTA->ENTRY.
1329    DECL_ADDRESS contains addresses of the references that had their
1330    address taken already.  If the expression is changed, CHANGED is
1331    set to true.  Callback for walk_tree.  */
1332 
1333 static tree
eliminate_local_variables_1(tree * tp,int * walk_subtrees,void * data)1334 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
1335 {
1336   struct elv_data *const dta = (struct elv_data *) data;
1337   tree t = *tp, var, addr, addr_type, type, obj;
1338 
1339   if (DECL_P (t))
1340     {
1341       *walk_subtrees = 0;
1342 
1343       if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
1344           return NULL_TREE;
1345 
1346       type = TREE_TYPE (t);
1347       addr_type = build_pointer_type (type);
1348       addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
1349                                     dta->gsi);
1350       if (dta->gsi == NULL && addr == NULL_TREE)
1351           {
1352             dta->reset = true;
1353             return NULL_TREE;
1354           }
1355 
1356       *tp = build_simple_mem_ref (addr);
1357 
1358       dta->changed = true;
1359       return NULL_TREE;
1360     }
1361 
1362   if (TREE_CODE (t) == ADDR_EXPR)
1363     {
1364       /* ADDR_EXPR may appear in two contexts:
1365            -- as a gimple operand, when the address taken is a function invariant
1366            -- as gimple rhs, when the resulting address in not a function
1367               invariant
1368            We do not need to do anything special in the latter case (the base of
1369            the memory reference whose address is taken may be replaced in the
1370            DECL_P case).  The former case is more complicated, as we need to
1371            ensure that the new address is still a gimple operand.  Thus, it
1372            is not sufficient to replace just the base of the memory reference --
1373            we need to move the whole computation of the address out of the
1374            loop.  */
1375       if (!is_gimple_val (t))
1376           return NULL_TREE;
1377 
1378       *walk_subtrees = 0;
1379       obj = TREE_OPERAND (t, 0);
1380       var = get_base_address (obj);
1381       if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
1382           return NULL_TREE;
1383 
1384       addr_type = TREE_TYPE (t);
1385       addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
1386                                     dta->gsi);
1387       if (dta->gsi == NULL && addr == NULL_TREE)
1388           {
1389             dta->reset = true;
1390             return NULL_TREE;
1391           }
1392       *tp = addr;
1393 
1394       dta->changed = true;
1395       return NULL_TREE;
1396     }
1397 
1398   if (!EXPR_P (t))
1399     *walk_subtrees = 0;
1400 
1401   return NULL_TREE;
1402 }
1403 
1404 /* Moves the references to local variables in STMT at *GSI out of the single
1405    entry single exit region starting at ENTRY.  DECL_ADDRESS contains
1406    addresses of the references that had their address taken
1407    already.  */
1408 
1409 static void
eliminate_local_variables_stmt(edge entry,gimple_stmt_iterator * gsi,int_tree_htab_type * decl_address)1410 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
1411                                         int_tree_htab_type *decl_address)
1412 {
1413   struct elv_data dta;
1414   gimple *stmt = gsi_stmt (*gsi);
1415 
1416   memset (&dta.info, '\0', sizeof (dta.info));
1417   dta.entry = entry;
1418   dta.decl_address = decl_address;
1419   dta.changed = false;
1420   dta.reset = false;
1421 
1422   if (gimple_debug_bind_p (stmt))
1423     {
1424       dta.gsi = NULL;
1425       walk_tree (gimple_debug_bind_get_value_ptr (stmt),
1426                      eliminate_local_variables_1, &dta.info, NULL);
1427       if (dta.reset)
1428           {
1429             gimple_debug_bind_reset_value (stmt);
1430             dta.changed = true;
1431           }
1432     }
1433   else if (gimple_clobber_p (stmt))
1434     {
1435       unlink_stmt_vdef (stmt);
1436       stmt = gimple_build_nop ();
1437       gsi_replace (gsi, stmt, false);
1438       dta.changed = true;
1439     }
1440   else
1441     {
1442       dta.gsi = gsi;
1443       walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
1444     }
1445 
1446   if (dta.changed)
1447     update_stmt (stmt);
1448 }
1449 
1450 /* Eliminates the references to local variables from the single entry
1451    single exit region between the ENTRY and EXIT edges.
1452 
1453    This includes:
1454    1) Taking address of a local variable -- these are moved out of the
1455    region (and temporary variable is created to hold the address if
1456    necessary).
1457 
1458    2) Dereferencing a local variable -- these are replaced with indirect
1459    references.  */
1460 
1461 static void
eliminate_local_variables(edge entry,edge exit)1462 eliminate_local_variables (edge entry, edge exit)
1463 {
1464   basic_block bb;
1465   auto_vec<basic_block, 3> body;
1466   unsigned i;
1467   gimple_stmt_iterator gsi;
1468   bool has_debug_stmt = false;
1469   int_tree_htab_type decl_address (10);
1470   basic_block entry_bb = entry->src;
1471   basic_block exit_bb = exit->dest;
1472 
1473   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
1474 
1475   FOR_EACH_VEC_ELT (body, i, bb)
1476     if (bb != entry_bb && bb != exit_bb)
1477       {
1478         for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1479             if (is_gimple_debug (gsi_stmt (gsi)))
1480               {
1481                 if (gimple_debug_bind_p (gsi_stmt (gsi)))
1482                   has_debug_stmt = true;
1483               }
1484             else
1485               eliminate_local_variables_stmt (entry, &gsi, &decl_address);
1486       }
1487 
1488   if (has_debug_stmt)
1489     FOR_EACH_VEC_ELT (body, i, bb)
1490       if (bb != entry_bb && bb != exit_bb)
1491           for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1492             if (gimple_debug_bind_p (gsi_stmt (gsi)))
1493               eliminate_local_variables_stmt (entry, &gsi, &decl_address);
1494 }
1495 
1496 /* Returns true if expression EXPR is not defined between ENTRY and
1497    EXIT, i.e. if all its operands are defined outside of the region.  */
1498 
1499 static bool
expr_invariant_in_region_p(edge entry,edge exit,tree expr)1500 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
1501 {
1502   basic_block entry_bb = entry->src;
1503   basic_block exit_bb = exit->dest;
1504   basic_block def_bb;
1505 
1506   if (is_gimple_min_invariant (expr))
1507     return true;
1508 
1509   if (TREE_CODE (expr) == SSA_NAME)
1510     {
1511       def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
1512       if (def_bb
1513             && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
1514             && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
1515           return false;
1516 
1517       return true;
1518     }
1519 
1520   return false;
1521 }
1522 
1523 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
1524    The copies are stored to NAME_COPIES, if NAME was already duplicated,
1525    its duplicate stored in NAME_COPIES is returned.
1526 
1527    Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
1528    duplicated, storing the copies in DECL_COPIES.  */
1529 
1530 static tree
separate_decls_in_region_name(tree name,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies,bool copy_name_p)1531 separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
1532                                      int_tree_htab_type *decl_copies,
1533                                      bool copy_name_p)
1534 {
1535   tree copy, var, var_copy;
1536   unsigned idx, uid, nuid;
1537   struct int_tree_map ielt;
1538   struct name_to_copy_elt elt, *nelt;
1539   name_to_copy_elt **slot;
1540   int_tree_map *dslot;
1541 
1542   if (TREE_CODE (name) != SSA_NAME)
1543     return name;
1544 
1545   idx = SSA_NAME_VERSION (name);
1546   elt.version = idx;
1547   slot = name_copies->find_slot_with_hash (&elt, idx,
1548                                                      copy_name_p ? INSERT : NO_INSERT);
1549   if (slot && *slot)
1550     return (*slot)->new_name;
1551 
1552   if (copy_name_p)
1553     {
1554       copy = duplicate_ssa_name (name, NULL);
1555       nelt = XNEW (struct name_to_copy_elt);
1556       nelt->version = idx;
1557       nelt->new_name = copy;
1558       nelt->field = NULL_TREE;
1559       *slot = nelt;
1560     }
1561   else
1562     {
1563       gcc_assert (!slot);
1564       copy = name;
1565     }
1566 
1567   var = SSA_NAME_VAR (name);
1568   if (!var)
1569     return copy;
1570 
1571   uid = DECL_UID (var);
1572   ielt.uid = uid;
1573   dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
1574   if (!dslot->to)
1575     {
1576       var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
1577       DECL_NOT_GIMPLE_REG_P (var_copy) = DECL_NOT_GIMPLE_REG_P (var);
1578       dslot->uid = uid;
1579       dslot->to = var_copy;
1580 
1581       /* Ensure that when we meet this decl next time, we won't duplicate
1582          it again.  */
1583       nuid = DECL_UID (var_copy);
1584       ielt.uid = nuid;
1585       dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
1586       gcc_assert (!dslot->to);
1587       dslot->uid = nuid;
1588       dslot->to = var_copy;
1589     }
1590   else
1591     var_copy = dslot->to;
1592 
1593   replace_ssa_name_symbol (copy, var_copy);
1594   return copy;
1595 }
1596 
1597 /* Finds the ssa names used in STMT that are defined outside the
1598    region between ENTRY and EXIT and replaces such ssa names with
1599    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
1600    decls of all ssa names used in STMT (including those defined in
1601    LOOP) are replaced with the new temporary variables; the
1602    replacement decls are stored in DECL_COPIES.  */
1603 
1604 static void
separate_decls_in_region_stmt(edge entry,edge exit,gimple * stmt,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies)1605 separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
1606                                      name_to_copy_table_type *name_copies,
1607                                      int_tree_htab_type *decl_copies)
1608 {
1609   use_operand_p use;
1610   def_operand_p def;
1611   ssa_op_iter oi;
1612   tree name, copy;
1613   bool copy_name_p;
1614 
1615   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
1616   {
1617     name = DEF_FROM_PTR (def);
1618     gcc_assert (TREE_CODE (name) == SSA_NAME);
1619     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
1620                                                     false);
1621     gcc_assert (copy == name);
1622   }
1623 
1624   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
1625   {
1626     name = USE_FROM_PTR (use);
1627     if (TREE_CODE (name) != SSA_NAME)
1628       continue;
1629 
1630     copy_name_p = expr_invariant_in_region_p (entry, exit, name);
1631     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
1632                                                     copy_name_p);
1633     SET_USE (use, copy);
1634   }
1635 }
1636 
1637 /* Finds the ssa names used in STMT that are defined outside the
1638    region between ENTRY and EXIT and replaces such ssa names with
1639    their duplicates.  The duplicates are stored to NAME_COPIES.  Base
1640    decls of all ssa names used in STMT (including those defined in
1641    LOOP) are replaced with the new temporary variables; the
1642    replacement decls are stored in DECL_COPIES.  */
1643 
1644 static bool
separate_decls_in_region_debug(gimple * stmt,name_to_copy_table_type * name_copies,int_tree_htab_type * decl_copies)1645 separate_decls_in_region_debug (gimple *stmt,
1646                                         name_to_copy_table_type *name_copies,
1647                                         int_tree_htab_type *decl_copies)
1648 {
1649   use_operand_p use;
1650   ssa_op_iter oi;
1651   tree var, name;
1652   struct int_tree_map ielt;
1653   struct name_to_copy_elt elt;
1654   name_to_copy_elt **slot;
1655   int_tree_map *dslot;
1656 
1657   if (gimple_debug_bind_p (stmt))
1658     var = gimple_debug_bind_get_var (stmt);
1659   else if (gimple_debug_source_bind_p (stmt))
1660     var = gimple_debug_source_bind_get_var (stmt);
1661   else
1662     return true;
1663   if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
1664     return true;
1665   gcc_assert (DECL_P (var) && SSA_VAR_P (var));
1666   ielt.uid = DECL_UID (var);
1667   dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
1668   if (!dslot)
1669     return true;
1670   if (gimple_debug_bind_p (stmt))
1671     gimple_debug_bind_set_var (stmt, dslot->to);
1672   else if (gimple_debug_source_bind_p (stmt))
1673     gimple_debug_source_bind_set_var (stmt, dslot->to);
1674 
1675   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
1676   {
1677     name = USE_FROM_PTR (use);
1678     if (TREE_CODE (name) != SSA_NAME)
1679       continue;
1680 
1681     elt.version = SSA_NAME_VERSION (name);
1682     slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
1683     if (!slot)
1684       {
1685           gimple_debug_bind_reset_value (stmt);
1686           update_stmt (stmt);
1687           break;
1688       }
1689 
1690     SET_USE (use, (*slot)->new_name);
1691   }
1692 
1693   return false;
1694 }
1695 
1696 /* Callback for htab_traverse.  Adds a field corresponding to the reduction
1697    specified in SLOT. The type is passed in DATA.  */
1698 
1699 int
add_field_for_reduction(reduction_info ** slot,tree type)1700 add_field_for_reduction (reduction_info **slot, tree type)
1701 {
1702 
1703   struct reduction_info *const red = *slot;
1704   tree var = reduc_stmt_res (red->reduc_stmt);
1705   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
1706                                  SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
1707 
1708   insert_field_into_struct (type, field);
1709 
1710   red->field = field;
1711 
1712   return 1;
1713 }
1714 
1715 /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
1716    described in SLOT. The type is passed in DATA.  */
1717 
1718 int
add_field_for_name(name_to_copy_elt ** slot,tree type)1719 add_field_for_name (name_to_copy_elt **slot, tree type)
1720 {
1721   struct name_to_copy_elt *const elt = *slot;
1722   tree name = ssa_name (elt->version);
1723   tree field = build_decl (UNKNOWN_LOCATION,
1724                                  FIELD_DECL, SSA_NAME_IDENTIFIER (name),
1725                                  TREE_TYPE (name));
1726 
1727   insert_field_into_struct (type, field);
1728   elt->field = field;
1729 
1730   return 1;
1731 }
1732 
1733 /* Callback for htab_traverse.  A local result is the intermediate result
1734    computed by a single
1735    thread, or the initial value in case no iteration was executed.
1736    This function creates a phi node reflecting these values.
1737    The phi's result will be stored in NEW_PHI field of the
1738    reduction's data structure.  */
1739 
1740 int
create_phi_for_local_result(reduction_info ** slot,class loop * loop)1741 create_phi_for_local_result (reduction_info **slot, class loop *loop)
1742 {
1743   struct reduction_info *const reduc = *slot;
1744   edge e;
1745   gphi *new_phi;
1746   basic_block store_bb, continue_bb;
1747   tree local_res;
1748   location_t locus;
1749 
1750   /* STORE_BB is the block where the phi
1751      should be stored.  It is the destination of the loop exit.
1752      (Find the fallthru edge from GIMPLE_OMP_CONTINUE).  */
1753   continue_bb = single_pred (loop->latch);
1754   store_bb = FALLTHRU_EDGE (continue_bb)->dest;
1755 
1756   /* STORE_BB has two predecessors.  One coming from  the loop
1757      (the reduction's result is computed at the loop),
1758      and another coming from a block preceding the loop,
1759      when no iterations
1760      are executed (the initial value should be taken).  */
1761   if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
1762     e = EDGE_PRED (store_bb, 1);
1763   else
1764     e = EDGE_PRED (store_bb, 0);
1765   tree lhs = reduc_stmt_res (reduc->reduc_stmt);
1766   local_res = copy_ssa_name (lhs);
1767   locus = gimple_location (reduc->reduc_stmt);
1768   new_phi = create_phi_node (local_res, store_bb);
1769   add_phi_arg (new_phi, reduc->init, e, locus);
1770   add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
1771   reduc->new_phi = new_phi;
1772 
1773   return 1;
1774 }
1775 
1776 struct clsn_data
1777 {
1778   tree store;
1779   tree load;
1780 
1781   basic_block store_bb;
1782   basic_block load_bb;
1783 };
1784 
1785 /* Callback for htab_traverse.  Create an atomic instruction for the
1786    reduction described in SLOT.
1787    DATA annotates the place in memory the atomic operation relates to,
1788    and the basic block it needs to be generated in.  */
1789 
1790 int
create_call_for_reduction_1(reduction_info ** slot,struct clsn_data * clsn_data)1791 create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
1792 {
1793   struct reduction_info *const reduc = *slot;
1794   gimple_stmt_iterator gsi;
1795   tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1796   tree load_struct;
1797   basic_block bb;
1798   basic_block new_bb;
1799   edge e;
1800   tree t, addr, ref, x;
1801   tree tmp_load, name;
1802   gimple *load;
1803 
1804   if (reduc->reduc_addr == NULL_TREE)
1805     {
1806       load_struct = build_simple_mem_ref (clsn_data->load);
1807       t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
1808 
1809       addr = build_addr (t);
1810     }
1811   else
1812     {
1813       /* Set the address for the atomic store.  */
1814       addr = reduc->reduc_addr;
1815 
1816       /* Remove the non-atomic store '*addr = sum'.  */
1817       tree res = PHI_RESULT (reduc->keep_res);
1818       use_operand_p use_p;
1819       gimple *stmt;
1820       bool single_use_p = single_imm_use (res, &use_p, &stmt);
1821       gcc_assert (single_use_p);
1822       replace_uses_by (gimple_vdef (stmt),
1823                            gimple_vuse (stmt));
1824       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
1825       gsi_remove (&gsi, true);
1826     }
1827 
1828   /* Create phi node.  */
1829   bb = clsn_data->load_bb;
1830 
1831   gsi = gsi_last_bb (bb);
1832   e = split_block (bb, gsi_stmt (gsi));
1833   new_bb = e->dest;
1834 
1835   tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
1836   tmp_load = make_ssa_name (tmp_load);
1837   load = gimple_build_omp_atomic_load (tmp_load, addr,
1838                                                OMP_MEMORY_ORDER_RELAXED);
1839   SSA_NAME_DEF_STMT (tmp_load) = load;
1840   gsi = gsi_start_bb (new_bb);
1841   gsi_insert_after (&gsi, load, GSI_NEW_STMT);
1842 
1843   e = split_block (new_bb, load);
1844   new_bb = e->dest;
1845   gsi = gsi_start_bb (new_bb);
1846   ref = tmp_load;
1847   x = fold_build2 (reduc->reduction_code,
1848                        TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1849                        PHI_RESULT (reduc->new_phi));
1850 
1851   name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
1852                                            GSI_CONTINUE_LINKING);
1853 
1854   gimple *store = gimple_build_omp_atomic_store (name,
1855                                                              OMP_MEMORY_ORDER_RELAXED);
1856   gsi_insert_after (&gsi, store, GSI_NEW_STMT);
1857   return 1;
1858 }
1859 
1860 /* Create the atomic operation at the join point of the threads.
1861    REDUCTION_LIST describes the reductions in the LOOP.
1862    LD_ST_DATA describes the shared data structure where
1863    shared data is stored in and loaded from.  */
1864 static void
create_call_for_reduction(class loop * loop,reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1865 create_call_for_reduction (class loop *loop,
1866                                  reduction_info_table_type *reduction_list,
1867                                  struct clsn_data *ld_st_data)
1868 {
1869   reduction_list->traverse <class loop *, create_phi_for_local_result> (loop);
1870   /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
1871   basic_block continue_bb = single_pred (loop->latch);
1872   ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
1873   reduction_list
1874     ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
1875 }
1876 
1877 /* Callback for htab_traverse.  Loads the final reduction value at the
1878    join point of all threads, and inserts it in the right place.  */
1879 
1880 int
create_loads_for_reductions(reduction_info ** slot,struct clsn_data * clsn_data)1881 create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
1882 {
1883   struct reduction_info *const red = *slot;
1884   gimple *stmt;
1885   gimple_stmt_iterator gsi;
1886   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1887   tree load_struct;
1888   tree name;
1889   tree x;
1890 
1891   /* If there's no exit phi, the result of the reduction is unused.  */
1892   if (red->keep_res == NULL)
1893     return 1;
1894 
1895   gsi = gsi_after_labels (clsn_data->load_bb);
1896   load_struct = build_simple_mem_ref (clsn_data->load);
1897   load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1898                               NULL_TREE);
1899 
1900   x = load_struct;
1901   name = PHI_RESULT (red->keep_res);
1902   stmt = gimple_build_assign (name, x);
1903 
1904   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1905 
1906   for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1907        !gsi_end_p (gsi); gsi_next (&gsi))
1908     if (gsi_stmt (gsi) == red->keep_res)
1909       {
1910           remove_phi_node (&gsi, false);
1911           return 1;
1912       }
1913   gcc_unreachable ();
1914 }
1915 
1916 /* Load the reduction result that was stored in LD_ST_DATA.
1917    REDUCTION_LIST describes the list of reductions that the
1918    loads should be generated for.  */
1919 static void
create_final_loads_for_reduction(reduction_info_table_type * reduction_list,struct clsn_data * ld_st_data)1920 create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
1921                                           struct clsn_data *ld_st_data)
1922 {
1923   gimple_stmt_iterator gsi;
1924   tree t;
1925   gimple *stmt;
1926 
1927   gsi = gsi_after_labels (ld_st_data->load_bb);
1928   t = build_fold_addr_expr (ld_st_data->store);
1929   stmt = gimple_build_assign (ld_st_data->load, t);
1930 
1931   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1932 
1933   reduction_list
1934     ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
1935 
1936 }
1937 
1938 /* Callback for htab_traverse.  Store the neutral value for the
1939   particular reduction's operation, e.g. 0 for PLUS_EXPR,
1940   1 for MULT_EXPR, etc. into the reduction field.
1941   The reduction is specified in SLOT. The store information is
1942   passed in DATA.  */
1943 
1944 int
create_stores_for_reduction(reduction_info ** slot,struct clsn_data * clsn_data)1945 create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
1946 {
1947   struct reduction_info *const red = *slot;
1948   tree t;
1949   gimple *stmt;
1950   gimple_stmt_iterator gsi;
1951   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1952 
1953   gsi = gsi_last_bb (clsn_data->store_bb);
1954   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1955   stmt = gimple_build_assign (t, red->initial_value);
1956   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1957 
1958   return 1;
1959 }
1960 
1961 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1962    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1963    specified in SLOT.  */
1964 
1965 int
create_loads_and_stores_for_name(name_to_copy_elt ** slot,struct clsn_data * clsn_data)1966 create_loads_and_stores_for_name (name_to_copy_elt **slot,
1967                                           struct clsn_data *clsn_data)
1968 {
1969   struct name_to_copy_elt *const elt = *slot;
1970   tree t;
1971   gimple *stmt;
1972   gimple_stmt_iterator gsi;
1973   tree type = TREE_TYPE (elt->new_name);
1974   tree load_struct;
1975 
1976   gsi = gsi_last_bb (clsn_data->store_bb);
1977   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1978   stmt = gimple_build_assign (t, ssa_name (elt->version));
1979   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1980 
1981   gsi = gsi_last_bb (clsn_data->load_bb);
1982   load_struct = build_simple_mem_ref (clsn_data->load);
1983   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1984   stmt = gimple_build_assign (elt->new_name, t);
1985   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1986 
1987   return 1;
1988 }
1989 
1990 /* Moves all the variables used in LOOP and defined outside of it (including
1991    the initial values of loop phi nodes, and *PER_THREAD if it is a ssa
1992    name) to a structure created for this purpose.  The code
1993 
1994    while (1)
1995      {
1996        use (a);
1997        use (b);
1998      }
1999 
2000    is transformed this way:
2001 
2002    bb0:
2003    old.a = a;
2004    old.b = b;
2005 
2006    bb1:
2007    a' = new->a;
2008    b' = new->b;
2009    while (1)
2010      {
2011        use (a');
2012        use (b');
2013      }
2014 
2015    `old' is stored to *ARG_STRUCT and `new' is stored to NEW_ARG_STRUCT.  The
2016    pointer `new' is intentionally not initialized (the loop will be split to a
2017    separate function later, and `new' will be initialized from its arguments).
2018    LD_ST_DATA holds information about the shared data structure used to pass
2019    information among the threads.  It is initialized here, and
2020    gen_parallel_loop will pass it to create_call_for_reduction that
2021    needs this information.  REDUCTION_LIST describes the reductions
2022    in LOOP.  */
2023 
2024 static void
separate_decls_in_region(edge entry,edge exit,reduction_info_table_type * reduction_list,tree * arg_struct,tree * new_arg_struct,struct clsn_data * ld_st_data)2025 separate_decls_in_region (edge entry, edge exit,
2026                                 reduction_info_table_type *reduction_list,
2027                                 tree *arg_struct, tree *new_arg_struct,
2028                                 struct clsn_data *ld_st_data)
2029 
2030 {
2031   basic_block bb1 = split_edge (entry);
2032   basic_block bb0 = single_pred (bb1);
2033   name_to_copy_table_type name_copies (10);
2034   int_tree_htab_type decl_copies (10);
2035   unsigned i;
2036   tree type, type_name, nvar;
2037   gimple_stmt_iterator gsi;
2038   struct clsn_data clsn_data;
2039   auto_vec<basic_block, 3> body;
2040   basic_block bb;
2041   basic_block entry_bb = bb1;
2042   basic_block exit_bb = exit->dest;
2043   bool has_debug_stmt = false;
2044 
2045   entry = single_succ_edge (entry_bb);
2046   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
2047 
2048   FOR_EACH_VEC_ELT (body, i, bb)
2049     {
2050       if (bb != entry_bb && bb != exit_bb)
2051           {
2052             for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2053               separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
2054                                                      &name_copies, &decl_copies);
2055 
2056             for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2057               {
2058                 gimple *stmt = gsi_stmt (gsi);
2059 
2060                 if (is_gimple_debug (stmt))
2061                     has_debug_stmt = true;
2062                 else
2063                     separate_decls_in_region_stmt (entry, exit, stmt,
2064                                                          &name_copies, &decl_copies);
2065               }
2066           }
2067     }
2068 
2069   /* Now process debug bind stmts.  We must not create decls while
2070      processing debug stmts, so we defer their processing so as to
2071      make sure we will have debug info for as many variables as
2072      possible (all of those that were dealt with in the loop above),
2073      and discard those for which we know there's nothing we can
2074      do.  */
2075   if (has_debug_stmt)
2076     FOR_EACH_VEC_ELT (body, i, bb)
2077       if (bb != entry_bb && bb != exit_bb)
2078           {
2079             for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
2080               {
2081                 gimple *stmt = gsi_stmt (gsi);
2082 
2083                 if (is_gimple_debug (stmt))
2084                     {
2085                       if (separate_decls_in_region_debug (stmt, &name_copies,
2086                                                                   &decl_copies))
2087                         {
2088                           gsi_remove (&gsi, true);
2089                           continue;
2090                         }
2091                     }
2092 
2093                 gsi_next (&gsi);
2094               }
2095           }
2096 
2097   if (name_copies.is_empty () && reduction_list->is_empty ())
2098     {
2099       /* It may happen that there is nothing to copy (if there are only
2100          loop carried and external variables in the loop).  */
2101       *arg_struct = NULL;
2102       *new_arg_struct = NULL;
2103     }
2104   else
2105     {
2106       /* Create the type for the structure to store the ssa names to.  */
2107       type = lang_hooks.types.make_type (RECORD_TYPE);
2108       type_name = build_decl (UNKNOWN_LOCATION,
2109                                     TYPE_DECL, create_tmp_var_name (".paral_data"),
2110                                     type);
2111       TYPE_NAME (type) = type_name;
2112 
2113       name_copies.traverse <tree, add_field_for_name> (type);
2114       if (reduction_list && !reduction_list->is_empty ())
2115           {
2116             /* Create the fields for reductions.  */
2117             reduction_list->traverse <tree, add_field_for_reduction> (type);
2118           }
2119       layout_type (type);
2120 
2121       /* Create the loads and stores.  */
2122       *arg_struct = create_tmp_var (type, ".paral_data_store");
2123       nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
2124       *new_arg_struct = make_ssa_name (nvar);
2125 
2126       ld_st_data->store = *arg_struct;
2127       ld_st_data->load = *new_arg_struct;
2128       ld_st_data->store_bb = bb0;
2129       ld_st_data->load_bb = bb1;
2130 
2131       name_copies
2132           .traverse <struct clsn_data *, create_loads_and_stores_for_name>
2133                       (ld_st_data);
2134 
2135       /* Load the calculation from memory (after the join of the threads).  */
2136 
2137       if (reduction_list && !reduction_list->is_empty ())
2138           {
2139             reduction_list
2140               ->traverse <struct clsn_data *, create_stores_for_reduction>
2141               (ld_st_data);
2142             clsn_data.load = make_ssa_name (nvar);
2143             clsn_data.load_bb = exit->dest;
2144             clsn_data.store = ld_st_data->store;
2145             create_final_loads_for_reduction (reduction_list, &clsn_data);
2146           }
2147     }
2148 }
2149 
2150 /* Returns true if FN was created to run in parallel.  */
2151 
2152 bool
parallelized_function_p(tree fndecl)2153 parallelized_function_p (tree fndecl)
2154 {
2155   cgraph_node *node = cgraph_node::get (fndecl);
2156   gcc_assert (node != NULL);
2157   return node->parallelized_function;
2158 }
2159 
2160 /* Creates and returns an empty function that will receive the body of
2161    a parallelized loop.  */
2162 
2163 static tree
create_loop_fn(location_t loc)2164 create_loop_fn (location_t loc)
2165 {
2166   char buf[100];
2167   char *tname;
2168   tree decl, type, name, t;
2169   struct function *act_cfun = cfun;
2170   static unsigned loopfn_num;
2171 
2172   loc = LOCATION_LOCUS (loc);
2173   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
2174   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
2175   clean_symbol_name (tname);
2176   name = get_identifier (tname);
2177   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
2178 
2179   decl = build_decl (loc, FUNCTION_DECL, name, type);
2180   TREE_STATIC (decl) = 1;
2181   TREE_USED (decl) = 1;
2182   DECL_ARTIFICIAL (decl) = 1;
2183   DECL_IGNORED_P (decl) = 0;
2184   TREE_PUBLIC (decl) = 0;
2185   DECL_UNINLINABLE (decl) = 1;
2186   DECL_EXTERNAL (decl) = 0;
2187   DECL_CONTEXT (decl) = NULL_TREE;
2188   DECL_INITIAL (decl) = make_node (BLOCK);
2189   BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;
2190 
2191   t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
2192   DECL_ARTIFICIAL (t) = 1;
2193   DECL_IGNORED_P (t) = 1;
2194   DECL_RESULT (decl) = t;
2195 
2196   t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
2197                       ptr_type_node);
2198   DECL_ARTIFICIAL (t) = 1;
2199   DECL_ARG_TYPE (t) = ptr_type_node;
2200   DECL_CONTEXT (t) = decl;
2201   TREE_USED (t) = 1;
2202   DECL_ARGUMENTS (decl) = t;
2203 
2204   allocate_struct_function (decl, false);
2205 
2206   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
2207      it.  */
2208   set_cfun (act_cfun);
2209 
2210   return decl;
2211 }
2212 
2213 /* Replace uses of NAME by VAL in block BB.  */
2214 
2215 static void
replace_uses_in_bb_by(tree name,tree val,basic_block bb)2216 replace_uses_in_bb_by (tree name, tree val, basic_block bb)
2217 {
2218   gimple *use_stmt;
2219   imm_use_iterator imm_iter;
2220 
2221   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
2222     {
2223       if (gimple_bb (use_stmt) != bb)
2224           continue;
2225 
2226       use_operand_p use_p;
2227       FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
2228           SET_USE (use_p, val);
2229     }
2230 }
2231 
2232 /* Do transformation from:
2233 
2234      <bb preheader>:
2235      ...
2236      goto <bb header>
2237 
2238      <bb header>:
2239      ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
2240      sum_a = PHI <sum_init (preheader), sum_b (latch)>
2241      ...
2242      use (ivtmp_a)
2243      ...
2244      sum_b = sum_a + sum_update
2245      ...
2246      if (ivtmp_a < n)
2247        goto <bb latch>;
2248      else
2249        goto <bb exit>;
2250 
2251      <bb latch>:
2252      ivtmp_b = ivtmp_a + 1;
2253      goto <bb header>
2254 
2255      <bb exit>:
2256      sum_z = PHI <sum_b (cond[1]), ...>
2257 
2258      [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
2259            that's <bb header>.
2260 
2261    to:
2262 
2263      <bb preheader>:
2264      ...
2265      goto <bb newheader>
2266 
2267      <bb header>:
2268      ivtmp_a = PHI <ivtmp_c (latch)>
2269      sum_a = PHI <sum_c (latch)>
2270      ...
2271      use (ivtmp_a)
2272      ...
2273      sum_b = sum_a + sum_update
2274      ...
2275      goto <bb latch>;
2276 
2277      <bb newheader>:
2278      ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
2279      sum_c = PHI <sum_init (preheader), sum_b (latch)>
2280      if (ivtmp_c < n + 1)
2281        goto <bb header>;
2282      else
2283        goto <bb newexit>;
2284 
2285      <bb latch>:
2286      ivtmp_b = ivtmp_a + 1;
2287      goto <bb newheader>
2288 
2289      <bb newexit>:
2290      sum_y = PHI <sum_c (newheader)>
2291 
2292      <bb exit>:
2293      sum_z = PHI <sum_y (newexit), ...>
2294 
2295 
2296    In unified diff format:
2297 
2298       <bb preheader>:
2299       ...
2300 -     goto <bb header>
2301 +     goto <bb newheader>
2302 
2303       <bb header>:
2304 -     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
2305 -     sum_a = PHI <sum_init (preheader), sum_b (latch)>
2306 +     ivtmp_a = PHI <ivtmp_c (latch)>
2307 +     sum_a = PHI <sum_c (latch)>
2308       ...
2309       use (ivtmp_a)
2310       ...
2311       sum_b = sum_a + sum_update
2312       ...
2313 -     if (ivtmp_a < n)
2314 -       goto <bb latch>;
2315 +     goto <bb latch>;
2316 +
2317 +     <bb newheader>:
2318 +     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
2319 +     sum_c = PHI <sum_init (preheader), sum_b (latch)>
2320 +     if (ivtmp_c < n + 1)
2321 +       goto <bb header>;
2322       else
2323           goto <bb exit>;
2324 
2325       <bb latch>:
2326       ivtmp_b = ivtmp_a + 1;
2327 -     goto <bb header>
2328 +     goto <bb newheader>
2329 
2330 +    <bb newexit>:
2331 +    sum_y = PHI <sum_c (newheader)>
2332 
2333       <bb exit>:
2334 -     sum_z = PHI <sum_b (cond[1]), ...>
2335 +     sum_z = PHI <sum_y (newexit), ...>
2336 
2337    Note: the example does not show any virtual phis, but these are handled more
2338    or less as reductions.
2339 
2340 
2341    Moves the exit condition of LOOP to the beginning of its header.
2342    REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
2343    bound.  */
2344 
2345 static void
transform_to_exit_first_loop_alt(class loop * loop,reduction_info_table_type * reduction_list,tree bound)2346 transform_to_exit_first_loop_alt (class loop *loop,
2347                                           reduction_info_table_type *reduction_list,
2348                                           tree bound)
2349 {
2350   basic_block header = loop->header;
2351   basic_block latch = loop->latch;
2352   edge exit = single_dom_exit (loop);
2353   basic_block exit_block = exit->dest;
2354   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
2355   tree control = gimple_cond_lhs (cond_stmt);
2356   edge e;
2357 
2358   /* Rewriting virtuals into loop-closed ssa normal form makes this
2359      transformation simpler.  It also ensures that the virtuals are in
2360      loop-closed ssa normal from after the transformation, which is required by
2361      create_parallel_loop.  */
2362   rewrite_virtuals_into_loop_closed_ssa (loop);
2363 
2364   /* Create the new_header block.  */
2365   basic_block new_header = split_block_before_cond_jump (exit->src);
2366   edge edge_at_split = single_pred_edge (new_header);
2367 
2368   /* Redirect entry edge to new_header.  */
2369   edge entry = loop_preheader_edge (loop);
2370   e = redirect_edge_and_branch (entry, new_header);
2371   gcc_assert (e == entry);
2372 
2373   /* Redirect post_inc_edge to new_header.  */
2374   edge post_inc_edge = single_succ_edge (latch);
2375   e = redirect_edge_and_branch (post_inc_edge, new_header);
2376   gcc_assert (e == post_inc_edge);
2377 
2378   /* Redirect post_cond_edge to header.  */
2379   edge post_cond_edge = single_pred_edge (latch);
2380   e = redirect_edge_and_branch (post_cond_edge, header);
2381   gcc_assert (e == post_cond_edge);
2382 
2383   /* Redirect edge_at_split to latch.  */
2384   e = redirect_edge_and_branch (edge_at_split, latch);
2385   gcc_assert (e == edge_at_split);
2386 
2387   /* Set the new loop bound.  */
2388   gimple_cond_set_rhs (cond_stmt, bound);
2389   update_stmt (cond_stmt);
2390 
2391   /* Repair the ssa.  */
2392   vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
2393   edge_var_map *vm;
2394   gphi_iterator gsi;
2395   int i;
2396   for (gsi = gsi_start_phis (header), i = 0;
2397        !gsi_end_p (gsi) && v->iterate (i, &vm);
2398        gsi_next (&gsi), i++)
2399     {
2400       gphi *phi = gsi.phi ();
2401       tree res_a = PHI_RESULT (phi);
2402 
2403       /* Create new phi.  */
2404       tree res_c = copy_ssa_name (res_a, phi);
2405       gphi *nphi = create_phi_node (res_c, new_header);
2406 
2407       /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
2408       replace_uses_in_bb_by (res_a, res_c, new_header);
2409 
2410       /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
2411       add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);
2412 
2413       /* Replace sum_b with sum_c in exit phi.  */
2414       tree res_b = redirect_edge_var_map_def (vm);
2415       replace_uses_in_bb_by (res_b, res_c, exit_block);
2416 
2417       struct reduction_info *red = reduction_phi (reduction_list, phi);
2418       gcc_assert (virtual_operand_p (res_a)
2419                       || res_a == control
2420                       || red != NULL);
2421 
2422       if (red)
2423           {
2424             /* Register the new reduction phi.  */
2425             red->reduc_phi = nphi;
2426             gimple_set_uid (red->reduc_phi, red->reduc_version);
2427           }
2428     }
2429   gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
2430 
2431   /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
2432   flush_pending_stmts (entry);
2433 
2434   /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
2435   flush_pending_stmts (post_inc_edge);
2436 
2437 
2438   basic_block new_exit_block = NULL;
2439   if (!single_pred_p (exit->dest))
2440     {
2441       /* Create a new empty exit block, inbetween the new loop header and the
2442            old exit block.  The function separate_decls_in_region needs this block
2443            to insert code that is active on loop exit, but not any other path.  */
2444       new_exit_block = split_edge (exit);
2445     }
2446 
2447   /* Insert and register the reduction exit phis.  */
2448   for (gphi_iterator gsi = gsi_start_phis (exit_block);
2449        !gsi_end_p (gsi);
2450        gsi_next (&gsi))
2451     {
2452       gphi *phi = gsi.phi ();
2453       gphi *nphi = NULL;
2454       tree res_z = PHI_RESULT (phi);
2455       tree res_c;
2456 
2457       if (new_exit_block != NULL)
2458           {
2459             /* Now that we have a new exit block, duplicate the phi of the old
2460                exit block in the new exit block to preserve loop-closed ssa.  */
2461             edge succ_new_exit_block = single_succ_edge (new_exit_block);
2462             edge pred_new_exit_block = single_pred_edge (new_exit_block);
2463             tree res_y = copy_ssa_name (res_z, phi);
2464             nphi = create_phi_node (res_y, new_exit_block);
2465             res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
2466             add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
2467             add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
2468           }
2469       else
2470           res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2471 
2472       if (virtual_operand_p (res_z))
2473           continue;
2474 
2475       gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
2476       struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
2477       if (red != NULL)
2478           red->keep_res = (nphi != NULL
2479                                ? nphi
2480                                : phi);
2481     }
2482 
2483   /* We're going to cancel the loop at the end of gen_parallel_loop, but until
2484      then we're still using some fields, so only bother about fields that are
2485      still used: header and latch.
2486      The loop has a new header bb, so we update it.  The latch bb stays the
2487      same.  */
2488   loop->header = new_header;
2489 
2490   /* Recalculate dominance info.  */
2491   free_dominance_info (CDI_DOMINATORS);
2492   calculate_dominance_info (CDI_DOMINATORS);
2493 
2494   checking_verify_ssa (true, true);
2495 }
2496 
2497 /* Tries to moves the exit condition of LOOP to the beginning of its header
2498    without duplication of the loop body.  NIT is the number of iterations of the
2499    loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
2500    transformation is successful.  */
2501 
2502 static bool
try_transform_to_exit_first_loop_alt(class loop * loop,reduction_info_table_type * reduction_list,tree nit)2503 try_transform_to_exit_first_loop_alt (class loop *loop,
2504                                               reduction_info_table_type *reduction_list,
2505                                               tree nit)
2506 {
2507   /* Check whether the latch contains a single statement.  */
2508   if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
2509     return false;
2510 
2511   /* Check whether the latch contains no phis.  */
2512   if (phi_nodes (loop->latch) != NULL)
2513     return false;
2514 
2515   /* Check whether the latch contains the loop iv increment.  */
2516   edge back = single_succ_edge (loop->latch);
2517   edge exit = single_dom_exit (loop);
2518   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
2519   tree control = gimple_cond_lhs (cond_stmt);
2520   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
2521   tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
2522   if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
2523     return false;
2524 
2525   /* Check whether there's no code between the loop condition and the latch.  */
2526   if (!single_pred_p (loop->latch)
2527       || single_pred (loop->latch) != exit->src)
2528     return false;
2529 
2530   tree alt_bound = NULL_TREE;
2531   tree nit_type = TREE_TYPE (nit);
2532 
2533   /* Figure out whether nit + 1 overflows.  */
2534   if (TREE_CODE (nit) == INTEGER_CST)
2535     {
2536       if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
2537           {
2538             alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
2539                                                nit, build_one_cst (nit_type));
2540 
2541             gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
2542             transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
2543             return true;
2544           }
2545       else
2546           {
2547             /* Todo: Figure out if we can trigger this, if it's worth to handle
2548                optimally, and if we can handle it optimally.  */
2549             return false;
2550           }
2551     }
2552 
2553   gcc_assert (TREE_CODE (nit) == SSA_NAME);
2554 
2555   /* Variable nit is the loop bound as returned by canonicalize_loop_ivs, for an
2556      iv with base 0 and step 1 that is incremented in the latch, like this:
2557 
2558      <bb header>:
2559      # iv_1 = PHI <0 (preheader), iv_2 (latch)>
2560      ...
2561      if (iv_1 < nit)
2562        goto <bb latch>;
2563      else
2564        goto <bb exit>;
2565 
2566      <bb latch>:
2567      iv_2 = iv_1 + 1;
2568      goto <bb header>;
2569 
2570      The range of iv_1 is [0, nit].  The latch edge is taken for
2571      iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
2572      number of latch executions is equal to nit.
2573 
2574      The function max_loop_iterations gives us the maximum number of latch
2575      executions, so it gives us the maximum value of nit.  */
2576   widest_int nit_max;
2577   if (!max_loop_iterations (loop, &nit_max))
2578     return false;
2579 
2580   /* Check if nit + 1 overflows.  */
2581   widest_int type_max = wi::to_widest (TYPE_MAX_VALUE (nit_type));
2582   if (nit_max >= type_max)
2583     return false;
2584 
2585   gimple *def = SSA_NAME_DEF_STMT (nit);
2586 
2587   /* Try to find nit + 1, in the form of n in an assignment nit = n - 1.  */
2588   if (def
2589       && is_gimple_assign (def)
2590       && gimple_assign_rhs_code (def) == PLUS_EXPR)
2591     {
2592       tree op1 = gimple_assign_rhs1 (def);
2593       tree op2 = gimple_assign_rhs2 (def);
2594       if (integer_minus_onep (op1))
2595           alt_bound = op2;
2596       else if (integer_minus_onep (op2))
2597           alt_bound = op1;
2598     }
2599 
2600   /* If not found, insert nit + 1.  */
2601   if (alt_bound == NULL_TREE)
2602     {
2603       alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
2604                                      build_int_cst_type (nit_type, 1));
2605 
2606       gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
2607 
2608       alt_bound
2609           = force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
2610                                             GSI_CONTINUE_LINKING);
2611     }
2612 
2613   transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
2614   return true;
2615 }
2616 
2617 /* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
2618    number of iterations of the loop.  REDUCTION_LIST describes the reductions in
2619    LOOP.  */
2620 
2621 static void
transform_to_exit_first_loop(class loop * loop,reduction_info_table_type * reduction_list,tree nit)2622 transform_to_exit_first_loop (class loop *loop,
2623                                     reduction_info_table_type *reduction_list,
2624                                     tree nit)
2625 {
2626   basic_block *bbs, *nbbs, ex_bb, orig_header;
2627   unsigned n;
2628   bool ok;
2629   edge exit = single_dom_exit (loop), hpred;
2630   tree control, control_name, res, t;
2631   gphi *phi, *nphi;
2632   gassign *stmt;
2633   gcond *cond_stmt, *cond_nit;
2634   tree nit_1;
2635 
2636   split_block_after_labels (loop->header);
2637   orig_header = single_succ (loop->header);
2638   hpred = single_succ_edge (loop->header);
2639 
2640   cond_stmt = as_a <gcond *> (last_stmt (exit->src));
2641   control = gimple_cond_lhs (cond_stmt);
2642   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
2643 
2644   /* Make sure that we have phi nodes on exit for all loop header phis
2645      (create_parallel_loop requires that).  */
2646   for (gphi_iterator gsi = gsi_start_phis (loop->header);
2647        !gsi_end_p (gsi);
2648        gsi_next (&gsi))
2649     {
2650       phi = gsi.phi ();
2651       res = PHI_RESULT (phi);
2652       t = copy_ssa_name (res, phi);
2653       SET_PHI_RESULT (phi, t);
2654       nphi = create_phi_node (res, orig_header);
2655       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
2656 
2657       if (res == control)
2658           {
2659             gimple_cond_set_lhs (cond_stmt, t);
2660             update_stmt (cond_stmt);
2661             control = t;
2662           }
2663     }
2664 
2665   bbs = get_loop_body_in_dom_order (loop);
2666 
2667   for (n = 0; bbs[n] != exit->src; n++)
2668    continue;
2669   nbbs = XNEWVEC (basic_block, n);
2670   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
2671                                            bbs + 1, n, nbbs);
2672   gcc_assert (ok);
2673   free (bbs);
2674   ex_bb = nbbs[0];
2675   free (nbbs);
2676 
2677   /* Other than reductions, the only gimple reg that should be copied
2678      out of the loop is the control variable.  */
2679   exit = single_dom_exit (loop);
2680   control_name = NULL_TREE;
2681   for (gphi_iterator gsi = gsi_start_phis (ex_bb);
2682        !gsi_end_p (gsi); )
2683     {
2684       phi = gsi.phi ();
2685       res = PHI_RESULT (phi);
2686       if (virtual_operand_p (res))
2687           {
2688             gsi_next (&gsi);
2689             continue;
2690           }
2691 
2692       /* Check if it is a part of reduction.  If it is,
2693          keep the phi at the reduction's keep_res field.  The
2694          PHI_RESULT of this phi is the resulting value of the reduction
2695          variable when exiting the loop.  */
2696 
2697       if (!reduction_list->is_empty ())
2698           {
2699             struct reduction_info *red;
2700 
2701             tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2702             red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
2703             if (red)
2704               {
2705                 red->keep_res = phi;
2706                 gsi_next (&gsi);
2707                 continue;
2708               }
2709           }
2710       gcc_assert (control_name == NULL_TREE
2711                       && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
2712       control_name = res;
2713       remove_phi_node (&gsi, false);
2714     }
2715   gcc_assert (control_name != NULL_TREE);
2716 
2717   /* Initialize the control variable to number of iterations
2718      according to the rhs of the exit condition.  */
2719   gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
2720   cond_nit = as_a <gcond *> (last_stmt (exit->src));
2721   nit_1 =  gimple_cond_rhs (cond_nit);
2722   nit_1 = force_gimple_operand_gsi (&gsi,
2723                                           fold_convert (TREE_TYPE (control_name), nit_1),
2724                                           false, NULL_TREE, false, GSI_SAME_STMT);
2725   stmt = gimple_build_assign (control_name, nit_1);
2726   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2727 }
2728 
2729 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
2730    LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
2731    NEW_DATA is the variable that should be initialized from the argument
2732    of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0 if
2733    that number is to be determined later.  */
2734 
2735 static void
create_parallel_loop(class loop * loop,tree loop_fn,tree data,tree new_data,unsigned n_threads,location_t loc,bool oacc_kernels_p)2736 create_parallel_loop (class loop *loop, tree loop_fn, tree data,
2737                           tree new_data, unsigned n_threads, location_t loc,
2738                           bool oacc_kernels_p)
2739 {
2740   gimple_stmt_iterator gsi;
2741   basic_block for_bb, ex_bb, continue_bb;
2742   tree t, param;
2743   gomp_parallel *omp_par_stmt;
2744   gimple *omp_return_stmt1, *omp_return_stmt2;
2745   gimple *phi;
2746   gcond *cond_stmt;
2747   gomp_for *for_stmt;
2748   gomp_continue *omp_cont_stmt;
2749   tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
2750   edge exit, nexit, guard, end, e;
2751 
2752   if (oacc_kernels_p)
2753     {
2754       gcc_checking_assert (lookup_attribute ("oacc kernels",
2755                                                        DECL_ATTRIBUTES (cfun->decl)));
2756       /* Indicate to later processing that this is a parallelized OpenACC
2757            kernels construct.  */
2758       DECL_ATTRIBUTES (cfun->decl)
2759           = tree_cons (get_identifier ("oacc kernels parallelized"),
2760                          NULL_TREE, DECL_ATTRIBUTES (cfun->decl));
2761     }
2762   else
2763     {
2764       /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
2765 
2766       basic_block bb = loop_preheader_edge (loop)->src;
2767       basic_block paral_bb = single_pred (bb);
2768       gsi = gsi_last_bb (paral_bb);
2769 
2770       gcc_checking_assert (n_threads != 0);
2771       t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
2772       OMP_CLAUSE_NUM_THREADS_EXPR (t)
2773           = build_int_cst (integer_type_node, n_threads);
2774       omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
2775       gimple_set_location (omp_par_stmt, loc);
2776 
2777       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
2778 
2779       /* Initialize NEW_DATA.  */
2780       if (data)
2781           {
2782             gassign *assign_stmt;
2783 
2784             gsi = gsi_after_labels (bb);
2785 
2786             param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
2787             assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data));
2788             gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2789 
2790             assign_stmt = gimple_build_assign (new_data,
2791                                                        fold_convert (TREE_TYPE (new_data), param));
2792             gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2793           }
2794 
2795       /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
2796       bb = split_loop_exit_edge (single_dom_exit (loop));
2797       gsi = gsi_last_bb (bb);
2798       omp_return_stmt1 = gimple_build_omp_return (false);
2799       gimple_set_location (omp_return_stmt1, loc);
2800       gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
2801     }
2802 
2803   /* Extract data for GIMPLE_OMP_FOR.  */
2804   gcc_assert (loop->header == single_dom_exit (loop)->src);
2805   cond_stmt = as_a <gcond *> (last_stmt (loop->header));
2806 
2807   cvar = gimple_cond_lhs (cond_stmt);
2808   cvar_base = SSA_NAME_VAR (cvar);
2809   phi = SSA_NAME_DEF_STMT (cvar);
2810   cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2811   initvar = copy_ssa_name (cvar);
2812   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
2813              initvar);
2814   cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2815 
2816   gsi = gsi_last_nondebug_bb (loop->latch);
2817   gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
2818   gsi_remove (&gsi, true);
2819 
2820   /* Prepare cfg.  */
2821   for_bb = split_edge (loop_preheader_edge (loop));
2822   ex_bb = split_loop_exit_edge (single_dom_exit (loop));
2823   extract_true_false_edges_from_block (loop->header, &nexit, &exit);
2824   gcc_assert (exit == single_dom_exit (loop));
2825 
2826   guard = make_edge (for_bb, ex_bb, 0);
2827   /* FIXME: What is the probability?  */
2828   guard->probability = profile_probability::guessed_never ();
2829   /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid.  */
2830   loop->latch = split_edge (single_succ_edge (loop->latch));
2831   single_pred_edge (loop->latch)->flags = 0;
2832   end = make_single_succ_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU);
2833   rescan_loop_exit (end, true, false);
2834 
2835   for (gphi_iterator gpi = gsi_start_phis (ex_bb);
2836        !gsi_end_p (gpi); gsi_next (&gpi))
2837     {
2838       location_t locus;
2839       gphi *phi = gpi.phi ();
2840       tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2841       gimple *def_stmt = SSA_NAME_DEF_STMT (def);
2842 
2843       /* If the exit phi is not connected to a header phi in the same loop, this
2844            value is not modified in the loop, and we're done with this phi.  */
2845       if (!(gimple_code (def_stmt) == GIMPLE_PHI
2846               && gimple_bb (def_stmt) == loop->header))
2847           {
2848             locus = gimple_phi_arg_location_from_edge (phi, exit);
2849             add_phi_arg (phi, def, guard, locus);
2850             add_phi_arg (phi, def, end, locus);
2851             continue;
2852           }
2853 
2854       gphi *stmt = as_a <gphi *> (def_stmt);
2855       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2856       locus = gimple_phi_arg_location_from_edge (stmt,
2857                                                              loop_preheader_edge (loop));
2858       add_phi_arg (phi, def, guard, locus);
2859 
2860       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
2861       locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
2862       add_phi_arg (phi, def, end, locus);
2863     }
2864   e = redirect_edge_and_branch (exit, nexit->dest);
2865   PENDING_STMT (e) = NULL;
2866 
2867   /* Emit GIMPLE_OMP_FOR.  */
2868   if (oacc_kernels_p)
2869     /* Parallelized OpenACC kernels constructs use gang parallelism.  See also
2870        omp-offload.cc:execute_oacc_loop_designation.  */
2871     t = build_omp_clause (loc, OMP_CLAUSE_GANG);
2872   else
2873     {
2874       t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
2875       int chunk_size = param_parloops_chunk_size;
2876       switch (param_parloops_schedule)
2877           {
2878           case PARLOOPS_SCHEDULE_STATIC:
2879             OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
2880             break;
2881           case PARLOOPS_SCHEDULE_DYNAMIC:
2882             OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
2883             break;
2884           case PARLOOPS_SCHEDULE_GUIDED:
2885             OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
2886             break;
2887           case PARLOOPS_SCHEDULE_AUTO:
2888             OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
2889             chunk_size = 0;
2890             break;
2891           case PARLOOPS_SCHEDULE_RUNTIME:
2892             OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
2893             chunk_size = 0;
2894             break;
2895           default:
2896             gcc_unreachable ();
2897           }
2898       if (chunk_size != 0)
2899           OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
2900             = build_int_cst (integer_type_node, chunk_size);
2901     }
2902 
2903   for_stmt = gimple_build_omp_for (NULL,
2904                                            (oacc_kernels_p
2905                                             ? GF_OMP_FOR_KIND_OACC_LOOP
2906                                             : GF_OMP_FOR_KIND_FOR),
2907                                            t, 1, NULL);
2908 
2909   gimple_cond_set_lhs (cond_stmt, cvar_base);
2910   type = TREE_TYPE (cvar);
2911   gimple_set_location (for_stmt, loc);
2912   gimple_omp_for_set_index (for_stmt, 0, initvar);
2913   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
2914   gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
2915   gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
2916   gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
2917                                                             cvar_base,
2918                                                             build_int_cst (type, 1)));
2919 
2920   gsi = gsi_last_bb (for_bb);
2921   gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
2922   SSA_NAME_DEF_STMT (initvar) = for_stmt;
2923 
2924   /* Emit GIMPLE_OMP_CONTINUE.  */
2925   continue_bb = single_pred (loop->latch);
2926   gsi = gsi_last_bb (continue_bb);
2927   omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
2928   gimple_set_location (omp_cont_stmt, loc);
2929   gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
2930   SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;
2931 
2932   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
2933   gsi = gsi_last_bb (ex_bb);
2934   omp_return_stmt2 = gimple_build_omp_return (true);
2935   gimple_set_location (omp_return_stmt2, loc);
2936   gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);
2937 
2938   /* After the above dom info is hosed.  Re-compute it.  */
2939   free_dominance_info (CDI_DOMINATORS);
2940   calculate_dominance_info (CDI_DOMINATORS);
2941 }
2942 
2943 /* Return number of phis in bb.  If COUNT_VIRTUAL_P is false, don't count the
2944    virtual phi.  */
2945 
2946 static unsigned int
num_phis(basic_block bb,bool count_virtual_p)2947 num_phis (basic_block bb, bool count_virtual_p)
2948 {
2949   unsigned int nr_phis = 0;
2950   gphi_iterator gsi;
2951   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2952     {
2953       if (!count_virtual_p && virtual_operand_p (PHI_RESULT (gsi.phi ())))
2954           continue;
2955 
2956       nr_phis++;
2957     }
2958 
2959   return nr_phis;
2960 }
2961 
2962 /* Generates code to execute the iterations of LOOP in N_THREADS
2963    threads in parallel, which can be 0 if that number is to be determined
2964    later.
2965 
2966    NITER describes number of iterations of LOOP.
2967    REDUCTION_LIST describes the reductions existent in the LOOP.  */
2968 
2969 static void
gen_parallel_loop(class loop * loop,reduction_info_table_type * reduction_list,unsigned n_threads,class tree_niter_desc * niter,bool oacc_kernels_p)2970 gen_parallel_loop (class loop *loop,
2971                        reduction_info_table_type *reduction_list,
2972                        unsigned n_threads, class tree_niter_desc *niter,
2973                        bool oacc_kernels_p)
2974 {
2975   tree many_iterations_cond, type, nit;
2976   tree arg_struct, new_arg_struct;
2977   gimple_seq stmts;
2978   edge entry, exit;
2979   struct clsn_data clsn_data;
2980   location_t loc;
2981   gimple *cond_stmt;
2982   unsigned int m_p_thread=2;
2983 
2984   /* From
2985 
2986      ---------------------------------------------------------------------
2987      loop
2988        {
2989            IV = phi (INIT, IV + STEP)
2990            BODY1;
2991            if (COND)
2992              break;
2993            BODY2;
2994        }
2995      ---------------------------------------------------------------------
2996 
2997      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
2998      we generate the following code:
2999 
3000      ---------------------------------------------------------------------
3001 
3002      if (MAY_BE_ZERO
3003      || NITER < MIN_PER_THREAD * N_THREADS)
3004      goto original;
3005 
3006      BODY1;
3007      store all local loop-invariant variables used in body of the loop to DATA.
3008      GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
3009      load the variables from DATA.
3010      GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
3011      BODY2;
3012      BODY1;
3013      GIMPLE_OMP_CONTINUE;
3014      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
3015      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
3016      goto end;
3017 
3018      original:
3019      loop
3020        {
3021            IV = phi (INIT, IV + STEP)
3022            BODY1;
3023            if (COND)
3024              break;
3025            BODY2;
3026        }
3027 
3028      end:
3029 
3030    */
3031 
3032   /* Create two versions of the loop -- in the old one, we know that the
3033      number of iterations is large enough, and we will transform it into the
3034      loop that will be split to loop_fn, the new one will be used for the
3035      remaining iterations.  */
3036 
3037   /* We should compute a better number-of-iterations value for outer loops.
3038      That is, if we have
3039 
3040     for (i = 0; i < n; ++i)
3041       for (j = 0; j < m; ++j)
3042         ...
3043 
3044     we should compute nit = n * m, not nit = n.
3045     Also may_be_zero handling would need to be adjusted.  */
3046 
3047   type = TREE_TYPE (niter->niter);
3048   nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
3049                                     NULL_TREE);
3050   if (stmts)
3051     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3052 
3053   if (!oacc_kernels_p)
3054     {
3055       if (loop->inner)
3056           m_p_thread=2;
3057       else
3058           m_p_thread=MIN_PER_THREAD;
3059 
3060       gcc_checking_assert (n_threads != 0);
3061       many_iterations_cond =
3062           fold_build2 (GE_EXPR, boolean_type_node,
3063                          nit, build_int_cst (type, m_p_thread * n_threads - 1));
3064 
3065       many_iterations_cond
3066           = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3067                            invert_truthvalue (unshare_expr (niter->may_be_zero)),
3068                            many_iterations_cond);
3069       many_iterations_cond
3070           = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
3071       if (stmts)
3072           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3073       if (!is_gimple_condexpr (many_iterations_cond))
3074           {
3075             many_iterations_cond
3076               = force_gimple_operand (many_iterations_cond, &stmts,
3077                                             true, NULL_TREE);
3078             if (stmts)
3079               gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
3080                                                         stmts);
3081           }
3082 
3083       initialize_original_copy_tables ();
3084 
3085       /* We assume that the loop usually iterates a lot.  */
3086       loop_version (loop, many_iterations_cond, NULL,
3087                         profile_probability::likely (),
3088                         profile_probability::unlikely (),
3089                         profile_probability::likely (),
3090                         profile_probability::unlikely (), true);
3091       update_ssa (TODO_update_ssa);
3092       free_original_copy_tables ();
3093     }
3094 
3095   /* Base all the induction variables in LOOP on a single control one.  */
3096   canonicalize_loop_ivs (loop, &nit, true);
3097   if (num_phis (loop->header, false) != reduction_list->elements () + 1)
3098     {
3099       /* The call to canonicalize_loop_ivs above failed to "base all the
3100            induction variables in LOOP on a single control one".  Do damage
3101            control.  */
3102       basic_block preheader = loop_preheader_edge (loop)->src;
3103       basic_block cond_bb = single_pred (preheader);
3104       gcond *cond = as_a <gcond *> (gsi_stmt (gsi_last_bb (cond_bb)));
3105       gimple_cond_make_true (cond);
3106       update_stmt (cond);
3107       /* We've gotten rid of the duplicate loop created by loop_version, but
3108            we can't undo whatever canonicalize_loop_ivs has done.
3109            TODO: Fix this properly by ensuring that the call to
3110            canonicalize_loop_ivs succeeds.  */
3111       if (dump_file
3112             && (dump_flags & TDF_DETAILS))
3113           fprintf (dump_file, "canonicalize_loop_ivs failed for loop %d,"
3114                      " aborting transformation\n", loop->num);
3115       return;
3116     }
3117 
3118   /* Ensure that the exit condition is the first statement in the loop.
3119      The common case is that latch of the loop is empty (apart from the
3120      increment) and immediately follows the loop exit test.  Attempt to move the
3121      entry of the loop directly before the exit check and increase the number of
3122      iterations of the loop by one.  */
3123   if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
3124     {
3125       if (dump_file
3126             && (dump_flags & TDF_DETAILS))
3127           fprintf (dump_file,
3128                      "alternative exit-first loop transform succeeded"
3129                      " for loop %d\n", loop->num);
3130     }
3131   else
3132     {
3133       if (oacc_kernels_p)
3134           n_threads = 1;
3135 
3136       /* Fall back on the method that handles more cases, but duplicates the
3137            loop body: move the exit condition of LOOP to the beginning of its
3138            header, and duplicate the part of the last iteration that gets disabled
3139            to the exit of the loop.  */
3140       transform_to_exit_first_loop (loop, reduction_list, nit);
3141     }
3142 
3143   /* Generate initializations for reductions.  */
3144   if (!reduction_list->is_empty ())
3145     reduction_list->traverse <class loop *, initialize_reductions> (loop);
3146 
3147   /* Eliminate the references to local variables from the loop.  */
3148   gcc_assert (single_exit (loop));
3149   entry = loop_preheader_edge (loop);
3150   exit = single_dom_exit (loop);
3151 
3152   /* This rewrites the body in terms of new variables.  This has already
3153      been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
3154   if (!oacc_kernels_p)
3155     {
3156       eliminate_local_variables (entry, exit);
3157       /* In the old loop, move all variables non-local to the loop to a
3158            structure and back, and create separate decls for the variables used in
3159            loop.  */
3160       separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
3161                                         &new_arg_struct, &clsn_data);
3162     }
3163   else
3164     {
3165       arg_struct = NULL_TREE;
3166       new_arg_struct = NULL_TREE;
3167       clsn_data.load = NULL_TREE;
3168       clsn_data.load_bb = exit->dest;
3169       clsn_data.store = NULL_TREE;
3170       clsn_data.store_bb = NULL;
3171     }
3172 
3173   /* Create the parallel constructs.  */
3174   loc = UNKNOWN_LOCATION;
3175   cond_stmt = last_stmt (loop->header);
3176   if (cond_stmt)
3177     loc = gimple_location (cond_stmt);
3178   create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
3179                               n_threads, loc, oacc_kernels_p);
3180   if (!reduction_list->is_empty ())
3181     create_call_for_reduction (loop, reduction_list, &clsn_data);
3182 
3183   scev_reset ();
3184 
3185   /* Free loop bound estimations that could contain references to
3186      removed statements.  */
3187   free_numbers_of_iterations_estimates (cfun);
3188 }
3189 
3190 /* Returns true when LOOP contains vector phi nodes.  */
3191 
3192 static bool
loop_has_vector_phi_nodes(class loop * loop ATTRIBUTE_UNUSED)3193 loop_has_vector_phi_nodes (class loop *loop ATTRIBUTE_UNUSED)
3194 {
3195   unsigned i;
3196   basic_block *bbs = get_loop_body_in_dom_order (loop);
3197   gphi_iterator gsi;
3198   bool res = true;
3199 
3200   for (i = 0; i < loop->num_nodes; i++)
3201     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3202       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi.phi ()))) == VECTOR_TYPE)
3203           goto end;
3204 
3205   res = false;
3206  end:
3207   free (bbs);
3208   return res;
3209 }
3210 
3211 /* Create a reduction_info struct, initialize it with REDUC_STMT
3212    and PHI, insert it to the REDUCTION_LIST.  */
3213 
3214 static void
build_new_reduction(reduction_info_table_type * reduction_list,gimple * reduc_stmt,gphi * phi)3215 build_new_reduction (reduction_info_table_type *reduction_list,
3216                          gimple *reduc_stmt, gphi *phi)
3217 {
3218   reduction_info **slot;
3219   struct reduction_info *new_reduction;
3220   enum tree_code reduction_code;
3221 
3222   gcc_assert (reduc_stmt);
3223 
3224   if (gimple_code (reduc_stmt) == GIMPLE_PHI)
3225     {
3226       tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
3227       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3228       reduction_code = gimple_assign_rhs_code (def1);
3229     }
3230   else
3231     reduction_code = gimple_assign_rhs_code (reduc_stmt);
3232   /* Check for OpenMP supported reduction.  */
3233   switch (reduction_code)
3234     {
3235     case PLUS_EXPR:
3236     case MULT_EXPR:
3237     case MAX_EXPR:
3238     case MIN_EXPR:
3239     case BIT_IOR_EXPR:
3240     case BIT_XOR_EXPR:
3241     case BIT_AND_EXPR:
3242     case TRUTH_OR_EXPR:
3243     case TRUTH_XOR_EXPR:
3244     case TRUTH_AND_EXPR:
3245       break;
3246     default:
3247       return;
3248     }
3249 
3250   if (dump_file && (dump_flags & TDF_DETAILS))
3251     {
3252       fprintf (dump_file,
3253                  "Detected reduction. reduction stmt is:\n");
3254       print_gimple_stmt (dump_file, reduc_stmt, 0);
3255       fprintf (dump_file, "\n");
3256     }
3257 
3258   new_reduction = XCNEW (struct reduction_info);
3259 
3260   new_reduction->reduc_stmt = reduc_stmt;
3261   new_reduction->reduc_phi = phi;
3262   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
3263   new_reduction->reduction_code = reduction_code;
3264   slot = reduction_list->find_slot (new_reduction, INSERT);
3265   *slot = new_reduction;
3266 }
3267 
3268 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
3269 
3270 int
set_reduc_phi_uids(reduction_info ** slot,void * data ATTRIBUTE_UNUSED)3271 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
3272 {
3273   struct reduction_info *const red = *slot;
3274   gimple_set_uid (red->reduc_phi, red->reduc_version);
3275   return 1;
3276 }
3277 
3278 /* Return true if the type of reduction performed by STMT_INFO is suitable
3279    for this pass.  */
3280 
3281 static bool
valid_reduction_p(stmt_vec_info stmt_info)3282 valid_reduction_p (stmt_vec_info stmt_info)
3283 {
3284   /* Parallelization would reassociate the operation, which isn't
3285      allowed for in-order reductions.  */
3286   vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (stmt_info);
3287   return reduc_type != FOLD_LEFT_REDUCTION;
3288 }
3289 
3290 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */
3291 
3292 static void
gather_scalar_reductions(loop_p loop,reduction_info_table_type * reduction_list)3293 gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
3294 {
3295   gphi_iterator gsi;
3296   loop_vec_info simple_loop_info;
3297   auto_vec<gphi *, 4> double_reduc_phis;
3298   auto_vec<gimple *, 4> double_reduc_stmts;
3299 
3300   vec_info_shared shared;
3301   vect_loop_form_info info;
3302   if (!vect_analyze_loop_form (loop, &info))
3303     goto gather_done;
3304 
3305   simple_loop_info = vect_create_loop_vinfo (loop, &shared, &info);
3306   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
3307     {
3308       gphi *phi = gsi.phi ();
3309       affine_iv iv;
3310       tree res = PHI_RESULT (phi);
3311       bool double_reduc;
3312 
3313       if (virtual_operand_p (res))
3314           continue;
3315 
3316       if (simple_iv (loop, loop, res, &iv, true))
3317           continue;
3318 
3319       stmt_vec_info reduc_stmt_info
3320           = parloops_force_simple_reduction (simple_loop_info,
3321                                                      simple_loop_info->lookup_stmt (phi),
3322                                                      &double_reduc, true);
3323       if (!reduc_stmt_info || !valid_reduction_p (reduc_stmt_info))
3324           continue;
3325 
3326       if (double_reduc)
3327           {
3328             if (loop->inner->inner != NULL)
3329               continue;
3330 
3331             double_reduc_phis.safe_push (phi);
3332             double_reduc_stmts.safe_push (reduc_stmt_info->stmt);
3333             continue;
3334           }
3335 
3336       build_new_reduction (reduction_list, reduc_stmt_info->stmt, phi);
3337     }
3338   delete simple_loop_info;
3339 
3340   if (!double_reduc_phis.is_empty ())
3341     {
3342       vec_info_shared shared;
3343       vect_loop_form_info info;
3344       if (vect_analyze_loop_form (loop->inner, &info))
3345           {
3346             simple_loop_info
3347               = vect_create_loop_vinfo (loop->inner, &shared, &info);
3348             gphi *phi;
3349             unsigned int i;
3350 
3351             FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
3352               {
3353                 affine_iv iv;
3354                 tree res = PHI_RESULT (phi);
3355                 bool double_reduc;
3356 
3357                 use_operand_p use_p;
3358                 gimple *inner_stmt;
3359                 bool single_use_p = single_imm_use (res, &use_p, &inner_stmt);
3360                 gcc_assert (single_use_p);
3361                 if (gimple_code (inner_stmt) != GIMPLE_PHI)
3362                     continue;
3363                 gphi *inner_phi = as_a <gphi *> (inner_stmt);
3364                 if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
3365                                    &iv, true))
3366                     continue;
3367 
3368                 stmt_vec_info inner_phi_info
3369                     = simple_loop_info->lookup_stmt (inner_phi);
3370                 stmt_vec_info inner_reduc_stmt_info
3371                     = parloops_force_simple_reduction (simple_loop_info,
3372                                                                inner_phi_info,
3373                                                                &double_reduc, true);
3374                 gcc_assert (!double_reduc);
3375                 if (!inner_reduc_stmt_info
3376                       || !valid_reduction_p (inner_reduc_stmt_info))
3377                     continue;
3378 
3379                 build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
3380               }
3381             delete simple_loop_info;
3382           }
3383     }
3384 
3385  gather_done:
3386   if (reduction_list->is_empty ())
3387     return;
3388 
3389   /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
3390      and delete simple_loop_info, we can set gimple_uid of reduc_phi stmts only
3391      now.  */
3392   basic_block bb;
3393   FOR_EACH_BB_FN (bb, cfun)
3394     for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3395       gimple_set_uid (gsi_stmt (gsi), (unsigned int)-1);
3396   reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
3397 }
3398 
3399 /* Try to initialize NITER for code generation part.  */
3400 
3401 static bool
try_get_loop_niter(loop_p loop,class tree_niter_desc * niter)3402 try_get_loop_niter (loop_p loop, class tree_niter_desc *niter)
3403 {
3404   edge exit = single_dom_exit (loop);
3405 
3406   gcc_assert (exit);
3407 
3408   /* We need to know # of iterations, and there should be no uses of values
3409      defined inside loop outside of it, unless the values are invariants of
3410      the loop.  */
3411   if (!number_of_iterations_exit (loop, exit, niter, false))
3412     {
3413       if (dump_file && (dump_flags & TDF_DETAILS))
3414           fprintf (dump_file, "  FAILED: number of iterations not known\n");
3415       return false;
3416     }
3417 
3418   return true;
3419 }
3420 
3421 /* Return the default def of the first function argument.  */
3422 
3423 static tree
get_omp_data_i_param(void)3424 get_omp_data_i_param (void)
3425 {
3426   tree decl = DECL_ARGUMENTS (cfun->decl);
3427   gcc_assert (DECL_CHAIN (decl) == NULL_TREE);
3428   return ssa_default_def (cfun, decl);
3429 }
3430 
3431 /* For PHI in loop header of LOOP, look for pattern:
3432 
3433    <bb preheader>
3434    .omp_data_i = &.omp_data_arr;
3435    addr = .omp_data_i->sum;
3436    sum_a = *addr;
3437 
3438    <bb header>:
3439    sum_b = PHI <sum_a (preheader), sum_c (latch)>
3440 
3441    and return addr.  Otherwise, return NULL_TREE.  */
3442 
3443 static tree
find_reduc_addr(class loop * loop,gphi * phi)3444 find_reduc_addr (class loop *loop, gphi *phi)
3445 {
3446   edge e = loop_preheader_edge (loop);
3447   tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
3448   gimple *stmt = SSA_NAME_DEF_STMT (arg);
3449   if (!gimple_assign_single_p (stmt))
3450     return NULL_TREE;
3451   tree memref = gimple_assign_rhs1 (stmt);
3452   if (TREE_CODE (memref) != MEM_REF)
3453     return NULL_TREE;
3454   tree addr = TREE_OPERAND (memref, 0);
3455 
3456   gimple *stmt2 = SSA_NAME_DEF_STMT (addr);
3457   if (!gimple_assign_single_p (stmt2))
3458     return NULL_TREE;
3459   tree compref = gimple_assign_rhs1 (stmt2);
3460   if (TREE_CODE (compref) != COMPONENT_REF)
3461     return NULL_TREE;
3462   tree addr2 = TREE_OPERAND (compref, 0);
3463   if (TREE_CODE (addr2) != MEM_REF)
3464     return NULL_TREE;
3465   addr2 = TREE_OPERAND (addr2, 0);
3466   if (TREE_CODE (addr2) != SSA_NAME
3467       || addr2 != get_omp_data_i_param ())
3468     return NULL_TREE;
3469 
3470   return addr;
3471 }
3472 
3473 /* Try to initialize REDUCTION_LIST for code generation part.
3474    REDUCTION_LIST describes the reductions.  */
3475 
3476 static bool
try_create_reduction_list(loop_p loop,reduction_info_table_type * reduction_list,bool oacc_kernels_p)3477 try_create_reduction_list (loop_p loop,
3478                                  reduction_info_table_type *reduction_list,
3479                                  bool oacc_kernels_p)
3480 {
3481   edge exit = single_dom_exit (loop);
3482   gphi_iterator gsi;
3483 
3484   gcc_assert (exit);
3485 
3486   /* Try to get rid of exit phis.  */
3487   final_value_replacement_loop (loop);
3488 
3489   gather_scalar_reductions (loop, reduction_list);
3490 
3491 
3492   for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
3493     {
3494       gphi *phi = gsi.phi ();
3495       struct reduction_info *red;
3496       imm_use_iterator imm_iter;
3497       use_operand_p use_p;
3498       gimple *reduc_phi;
3499       tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
3500 
3501       if (!virtual_operand_p (val))
3502           {
3503             if (TREE_CODE (val) != SSA_NAME)
3504               {
3505                 if (dump_file && (dump_flags & TDF_DETAILS))
3506                     fprintf (dump_file,
3507                                "  FAILED: exit PHI argument invariant.\n");
3508                 return false;
3509               }
3510 
3511             if (dump_file && (dump_flags & TDF_DETAILS))
3512               {
3513                 fprintf (dump_file, "phi is ");
3514                 print_gimple_stmt (dump_file, phi, 0);
3515                 fprintf (dump_file, "arg of phi to exit:   value ");
3516                 print_generic_expr (dump_file, val);
3517                 fprintf (dump_file, " used outside loop\n");
3518                 fprintf (dump_file,
3519                            "  checking if it is part of reduction pattern:\n");
3520               }
3521             if (reduction_list->is_empty ())
3522               {
3523                 if (dump_file && (dump_flags & TDF_DETAILS))
3524                     fprintf (dump_file,
3525                                "  FAILED: it is not a part of reduction.\n");
3526                 return false;
3527               }
3528             reduc_phi = NULL;
3529             FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
3530               {
3531                 if (!gimple_debug_bind_p (USE_STMT (use_p))
3532                       && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3533                     {
3534                       reduc_phi = USE_STMT (use_p);
3535                       break;
3536                     }
3537               }
3538             red = reduction_phi (reduction_list, reduc_phi);
3539             if (red == NULL)
3540               {
3541                 if (dump_file && (dump_flags & TDF_DETAILS))
3542                     fprintf (dump_file,
3543                                "  FAILED: it is not a part of reduction.\n");
3544                 return false;
3545               }
3546             if (red->keep_res != NULL)
3547               {
3548                 if (dump_file && (dump_flags & TDF_DETAILS))
3549                     fprintf (dump_file,
3550                                "  FAILED: reduction has multiple exit phis.\n");
3551                 return false;
3552               }
3553             red->keep_res = phi;
3554             if (dump_file && (dump_flags & TDF_DETAILS))
3555               {
3556                 fprintf (dump_file, "reduction phi is  ");
3557                 print_gimple_stmt (dump_file, red->reduc_phi, 0);
3558                 fprintf (dump_file, "reduction stmt is  ");
3559                 print_gimple_stmt (dump_file, red->reduc_stmt, 0);
3560               }
3561           }
3562     }
3563 
3564   /* The iterations of the loop may communicate only through bivs whose
3565      iteration space can be distributed efficiently.  */
3566   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
3567     {
3568       gphi *phi = gsi.phi ();
3569       tree def = PHI_RESULT (phi);
3570       affine_iv iv;
3571 
3572       if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
3573           {
3574             struct reduction_info *red;
3575 
3576             red = reduction_phi (reduction_list, phi);
3577             if (red == NULL)
3578               {
3579                 if (dump_file && (dump_flags & TDF_DETAILS))
3580                     fprintf (dump_file,
3581                                "  FAILED: scalar dependency between iterations\n");
3582                 return false;
3583               }
3584           }
3585     }
3586 
3587   if (oacc_kernels_p)
3588     {
3589       for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
3590              gsi_next (&gsi))
3591           {
3592             gphi *phi = gsi.phi ();
3593             tree def = PHI_RESULT (phi);
3594             affine_iv iv;
3595 
3596             if (!virtual_operand_p (def)
3597                 && !simple_iv (loop, loop, def, &iv, true))
3598               {
3599                 tree addr = find_reduc_addr (loop, phi);
3600                 if (addr == NULL_TREE)
3601                     return false;
3602                 struct reduction_info *red = reduction_phi (reduction_list, phi);
3603                 red->reduc_addr = addr;
3604               }
3605           }
3606     }
3607 
3608   return true;
3609 }
3610 
3611 /* Return true if LOOP contains phis with ADDR_EXPR in args.  */
3612 
3613 static bool
loop_has_phi_with_address_arg(class loop * loop)3614 loop_has_phi_with_address_arg (class loop *loop)
3615 {
3616   basic_block *bbs = get_loop_body (loop);
3617   bool res = false;
3618 
3619   unsigned i, j;
3620   gphi_iterator gsi;
3621   for (i = 0; i < loop->num_nodes; i++)
3622     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3623       {
3624           gphi *phi = gsi.phi ();
3625           for (j = 0; j < gimple_phi_num_args (phi); j++)
3626             {
3627               tree arg = gimple_phi_arg_def (phi, j);
3628               if (TREE_CODE (arg) == ADDR_EXPR)
3629                 {
3630                     /* This should be handled by eliminate_local_variables, but that
3631                        function currently ignores phis.  */
3632                     res = true;
3633                     goto end;
3634                 }
3635             }
3636       }
3637  end:
3638   free (bbs);
3639 
3640   return res;
3641 }
3642 
3643 /* Return true if memory ref REF (corresponding to the stmt at GSI in
3644    REGIONS_BB[I]) conflicts with the statements in REGIONS_BB[I] after gsi,
3645    or the statements in REGIONS_BB[I + n].  REF_IS_STORE indicates if REF is a
3646    store.  Ignore conflicts with SKIP_STMT.  */
3647 
3648 static bool
ref_conflicts_with_region(gimple_stmt_iterator gsi,ao_ref * ref,bool ref_is_store,vec<basic_block> region_bbs,unsigned int i,gimple * skip_stmt)3649 ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
3650                                  bool ref_is_store, vec<basic_block> region_bbs,
3651                                  unsigned int i, gimple *skip_stmt)
3652 {
3653   basic_block bb = region_bbs[i];
3654   gsi_next (&gsi);
3655 
3656   while (true)
3657     {
3658       for (; !gsi_end_p (gsi);
3659              gsi_next (&gsi))
3660           {
3661             gimple *stmt = gsi_stmt (gsi);
3662             if (stmt == skip_stmt)
3663               {
3664                 if (dump_file)
3665                     {
3666                       fprintf (dump_file, "skipping reduction store: ");
3667                       print_gimple_stmt (dump_file, stmt, 0);
3668                     }
3669                 continue;
3670               }
3671 
3672             if (!gimple_vdef (stmt)
3673                 && !gimple_vuse (stmt))
3674               continue;
3675 
3676             if (gimple_code (stmt) == GIMPLE_RETURN)
3677               continue;
3678 
3679             if (ref_is_store)
3680               {
3681                 if (ref_maybe_used_by_stmt_p (stmt, ref))
3682                     {
3683                       if (dump_file)
3684                         {
3685                           fprintf (dump_file, "Stmt ");
3686                           print_gimple_stmt (dump_file, stmt, 0);
3687                         }
3688                       return true;
3689                     }
3690               }
3691             else
3692               {
3693                 if (stmt_may_clobber_ref_p_1 (stmt, ref))
3694                     {
3695                       if (dump_file)
3696                         {
3697                           fprintf (dump_file, "Stmt ");
3698                           print_gimple_stmt (dump_file, stmt, 0);
3699                         }
3700                       return true;
3701                     }
3702               }
3703           }
3704       i++;
3705       if (i == region_bbs.length ())
3706           break;
3707       bb = region_bbs[i];
3708       gsi = gsi_start_bb (bb);
3709     }
3710 
3711   return false;
3712 }
3713 
3714 /* Return true if the bbs in REGION_BBS but not in in_loop_bbs can be executed
3715    in parallel with REGION_BBS containing the loop.  Return the stores of
3716    reduction results in REDUCTION_STORES.  */
3717 
3718 static bool
oacc_entry_exit_ok_1(bitmap in_loop_bbs,const vec<basic_block> & region_bbs,reduction_info_table_type * reduction_list,bitmap reduction_stores)3719 oacc_entry_exit_ok_1 (bitmap in_loop_bbs, const vec<basic_block> &region_bbs,
3720                           reduction_info_table_type *reduction_list,
3721                           bitmap reduction_stores)
3722 {
3723   tree omp_data_i = get_omp_data_i_param ();
3724 
3725   unsigned i;
3726   basic_block bb;
3727   FOR_EACH_VEC_ELT (region_bbs, i, bb)
3728     {
3729       if (bitmap_bit_p (in_loop_bbs, bb->index))
3730           continue;
3731 
3732       gimple_stmt_iterator gsi;
3733       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
3734              gsi_next (&gsi))
3735           {
3736             gimple *stmt = gsi_stmt (gsi);
3737             gimple *skip_stmt = NULL;
3738 
3739             if (is_gimple_debug (stmt)
3740                 || gimple_code (stmt) == GIMPLE_COND)
3741               continue;
3742 
3743             ao_ref ref;
3744             bool ref_is_store = false;
3745             if (gimple_assign_load_p (stmt))
3746               {
3747                 tree rhs = gimple_assign_rhs1 (stmt);
3748                 tree base = get_base_address (rhs);
3749                 if (TREE_CODE (base) == MEM_REF
3750                       && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0))
3751                     continue;
3752 
3753                 tree lhs = gimple_assign_lhs (stmt);
3754                 if (TREE_CODE (lhs) == SSA_NAME
3755                       && has_single_use (lhs))
3756                     {
3757                       use_operand_p use_p;
3758                       gimple *use_stmt;
3759                       struct reduction_info *red;
3760                       single_imm_use (lhs, &use_p, &use_stmt);
3761                       if (gimple_code (use_stmt) == GIMPLE_PHI
3762                           && (red = reduction_phi (reduction_list, use_stmt)))
3763                         {
3764                           tree val = PHI_RESULT (red->keep_res);
3765                           if (has_single_use (val))
3766                               {
3767                                 single_imm_use (val, &use_p, &use_stmt);
3768                                 if (gimple_store_p (use_stmt))
3769                                   {
3770                                     unsigned int id
3771                                         = SSA_NAME_VERSION (gimple_vdef (use_stmt));
3772                                     bitmap_set_bit (reduction_stores, id);
3773                                     skip_stmt = use_stmt;
3774                                     if (dump_file)
3775                                         {
3776                                           fprintf (dump_file, "found reduction load: ");
3777                                           print_gimple_stmt (dump_file, stmt, 0);
3778                                         }
3779                                   }
3780                               }
3781                         }
3782                     }
3783 
3784                 ao_ref_init (&ref, rhs);
3785               }
3786             else if (gimple_store_p (stmt))
3787               {
3788                 ao_ref_init (&ref, gimple_assign_lhs (stmt));
3789                 ref_is_store = true;
3790               }
3791             else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
3792               continue;
3793             else if (!gimple_has_side_effects (stmt)
3794                        && !gimple_could_trap_p (stmt)
3795                        && !stmt_could_throw_p (cfun, stmt)
3796                        && !gimple_vdef (stmt)
3797                        && !gimple_vuse (stmt))
3798               continue;
3799             else if (gimple_call_internal_p (stmt, IFN_GOACC_DIM_POS))
3800               continue;
3801             else if (gimple_code (stmt) == GIMPLE_RETURN)
3802               continue;
3803             else
3804               {
3805                 if (dump_file)
3806                     {
3807                       fprintf (dump_file, "Unhandled stmt in entry/exit: ");
3808                       print_gimple_stmt (dump_file, stmt, 0);
3809                     }
3810                 return false;
3811               }
3812 
3813             if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs,
3814                                                    i, skip_stmt))
3815               {
3816                 if (dump_file)
3817                     {
3818                       fprintf (dump_file, "conflicts with entry/exit stmt: ");
3819                       print_gimple_stmt (dump_file, stmt, 0);
3820                     }
3821                 return false;
3822               }
3823           }
3824     }
3825 
3826   return true;
3827 }
3828 
3829 /* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with
3830    gang_pos == 0, except when the stores are REDUCTION_STORES.  Return true
3831    if any changes were made.  */
3832 
3833 static bool
oacc_entry_exit_single_gang(bitmap in_loop_bbs,const vec<basic_block> & region_bbs,bitmap reduction_stores)3834 oacc_entry_exit_single_gang (bitmap in_loop_bbs,
3835                                    const vec<basic_block> &region_bbs,
3836                                    bitmap reduction_stores)
3837 {
3838   tree gang_pos = NULL_TREE;
3839   bool changed = false;
3840 
3841   unsigned i;
3842   basic_block bb;
3843   FOR_EACH_VEC_ELT (region_bbs, i, bb)
3844     {
3845       if (bitmap_bit_p (in_loop_bbs, bb->index))
3846           continue;
3847 
3848       gimple_stmt_iterator gsi;
3849       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
3850           {
3851             gimple *stmt = gsi_stmt (gsi);
3852 
3853             if (!gimple_store_p (stmt))
3854               {
3855                 /* Update gsi to point to next stmt.  */
3856                 gsi_next (&gsi);
3857                 continue;
3858               }
3859 
3860             if (bitmap_bit_p (reduction_stores,
3861                                   SSA_NAME_VERSION (gimple_vdef (stmt))))
3862               {
3863                 if (dump_file)
3864                     {
3865                       fprintf (dump_file,
3866                                  "skipped reduction store for single-gang"
3867                                  " neutering: ");
3868                       print_gimple_stmt (dump_file, stmt, 0);
3869                     }
3870 
3871                 /* Update gsi to point to next stmt.  */
3872                 gsi_next (&gsi);
3873                 continue;
3874               }
3875 
3876             changed = true;
3877 
3878             if (gang_pos == NULL_TREE)
3879               {
3880                 tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG);
3881                 gcall *gang_single
3882                     = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg);
3883                 gang_pos = make_ssa_name (integer_type_node);
3884                 gimple_call_set_lhs (gang_single, gang_pos);
3885                 gimple_stmt_iterator start
3886                     = gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
3887                 tree vuse = ssa_default_def (cfun, gimple_vop (cfun));
3888                 gimple_set_vuse (gang_single, vuse);
3889                 gsi_insert_before (&start, gang_single, GSI_SAME_STMT);
3890               }
3891 
3892             if (dump_file)
3893               {
3894                 fprintf (dump_file,
3895                            "found store that needs single-gang neutering: ");
3896                 print_gimple_stmt (dump_file, stmt, 0);
3897               }
3898 
3899             {
3900               /* Split block before store.  */
3901               gimple_stmt_iterator gsi2 = gsi;
3902               gsi_prev (&gsi2);
3903               edge e;
3904               if (gsi_end_p (gsi2))
3905                 {
3906                     e = split_block_after_labels (bb);
3907                     gsi2 = gsi_last_bb (bb);
3908                 }
3909               else
3910                 e = split_block (bb, gsi_stmt (gsi2));
3911               basic_block bb2 = e->dest;
3912 
3913               /* Split block after store.  */
3914               gimple_stmt_iterator gsi3 = gsi_start_bb (bb2);
3915               edge e2 = split_block (bb2, gsi_stmt (gsi3));
3916               basic_block bb3 = e2->dest;
3917 
3918               gimple *cond
3919                 = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node,
3920                                            NULL_TREE, NULL_TREE);
3921               gsi_insert_after (&gsi2, cond, GSI_NEW_STMT);
3922 
3923               edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE);
3924               /* FIXME: What is the probability?  */
3925               e3->probability = profile_probability::guessed_never ();
3926               e->flags = EDGE_TRUE_VALUE;
3927 
3928               tree vdef = gimple_vdef (stmt);
3929               tree vuse = gimple_vuse (stmt);
3930 
3931               tree phi_res = copy_ssa_name (vdef);
3932               gphi *new_phi = create_phi_node (phi_res, bb3);
3933               replace_uses_by (vdef, phi_res);
3934               add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION);
3935               add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION);
3936 
3937               /* Update gsi to point to next stmt.  */
3938               bb = bb3;
3939               gsi = gsi_start_bb (bb);
3940             }
3941           }
3942     }
3943 
3944   return changed;
3945 }
3946 
3947 /* Return true if the statements before and after the LOOP can be executed in
3948    parallel with the function containing the loop.  Resolve conflicting stores
3949    outside LOOP by guarding them such that only a single gang executes them.  */
3950 
3951 static bool
oacc_entry_exit_ok(class loop * loop,reduction_info_table_type * reduction_list)3952 oacc_entry_exit_ok (class loop *loop,
3953                         reduction_info_table_type *reduction_list)
3954 {
3955   basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
3956   auto_vec<basic_block> region_bbs
3957     = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));
3958 
3959   bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
3960   bitmap_clear (in_loop_bbs);
3961   for (unsigned int i = 0; i < loop->num_nodes; i++)
3962     bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);
3963 
3964   bitmap reduction_stores = BITMAP_ALLOC (NULL);
3965   bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
3966                                            reduction_stores);
3967 
3968   if (res)
3969     {
3970       bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
3971                                                               reduction_stores);
3972       if (changed)
3973           {
3974             free_dominance_info (CDI_DOMINATORS);
3975             calculate_dominance_info (CDI_DOMINATORS);
3976           }
3977     }
3978 
3979   free (loop_bbs);
3980 
3981   BITMAP_FREE (in_loop_bbs);
3982   BITMAP_FREE (reduction_stores);
3983 
3984   return res;
3985 }
3986 
3987 /* Detect parallel loops and generate parallel code using libgomp
3988    primitives.  Returns true if some loop was parallelized, false
3989    otherwise.  */
3990 
3991 static bool
parallelize_loops(bool oacc_kernels_p)3992 parallelize_loops (bool oacc_kernels_p)
3993 {
3994   unsigned n_threads;
3995   bool changed = false;
3996   class loop *skip_loop = NULL;
3997   class tree_niter_desc niter_desc;
3998   struct obstack parloop_obstack;
3999   HOST_WIDE_INT estimated;
4000 
4001   /* Do not parallelize loops in the functions created by parallelization.  */
4002   if (!oacc_kernels_p
4003       && parallelized_function_p (cfun->decl))
4004     return false;
4005 
4006   /* Do not parallelize loops in offloaded functions.  */
4007   if (!oacc_kernels_p
4008       && oacc_get_fn_attrib (cfun->decl) != NULL)
4009      return false;
4010 
4011   if (cfun->has_nonlocal_label)
4012     return false;
4013 
4014   /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
4015      the argument to -ftree-parallelize-loops.  */
4016   if (oacc_kernels_p)
4017     n_threads = 0;
4018   else
4019     n_threads = flag_tree_parallelize_loops;
4020 
4021   gcc_obstack_init (&parloop_obstack);
4022   reduction_info_table_type reduction_list (10);
4023 
4024   calculate_dominance_info (CDI_DOMINATORS);
4025 
4026   for (auto loop : loops_list (cfun, 0))
4027     {
4028       if (loop == skip_loop)
4029           {
4030             if (!loop->in_oacc_kernels_region
4031                 && dump_file && (dump_flags & TDF_DETAILS))
4032               fprintf (dump_file,
4033                          "Skipping loop %d as inner loop of parallelized loop\n",
4034                          loop->num);
4035 
4036             skip_loop = loop->inner;
4037             continue;
4038           }
4039       else
4040           skip_loop = NULL;
4041 
4042       reduction_list.empty ();
4043 
4044       if (oacc_kernels_p)
4045           {
4046             if (!loop->in_oacc_kernels_region)
4047               continue;
4048 
4049             /* Don't try to parallelize inner loops in an oacc kernels region.  */
4050             if (loop->inner)
4051               skip_loop = loop->inner;
4052 
4053             if (dump_file && (dump_flags & TDF_DETAILS))
4054               fprintf (dump_file,
4055                          "Trying loop %d with header bb %d in oacc kernels"
4056                          " region\n", loop->num, loop->header->index);
4057           }
4058 
4059       if (dump_file && (dump_flags & TDF_DETAILS))
4060       {
4061         fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
4062           if (loop->inner)
4063             fprintf (dump_file, "loop %d is not innermost\n",loop->num);
4064           else
4065             fprintf (dump_file, "loop %d is innermost\n",loop->num);
4066       }
4067 
4068       if (!single_dom_exit (loop))
4069       {
4070 
4071         if (dump_file && (dump_flags & TDF_DETAILS))
4072             fprintf (dump_file, "loop is !single_dom_exit\n");
4073 
4074           continue;
4075       }
4076 
4077       if (/* And of course, the loop must be parallelizable.  */
4078             !can_duplicate_loop_p (loop)
4079             || loop_has_blocks_with_irreducible_flag (loop)
4080             || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
4081             /* FIXME: the check for vector phi nodes could be removed.  */
4082             || loop_has_vector_phi_nodes (loop))
4083           continue;
4084 
4085       estimated = estimated_loop_iterations_int (loop);
4086       if (estimated == -1)
4087           estimated = get_likely_max_loop_iterations_int (loop);
4088       /* FIXME: Bypass this check as graphite doesn't update the
4089            count and frequency correctly now.  */
4090       if (!flag_loop_parallelize_all
4091             && !oacc_kernels_p
4092             && ((estimated != -1
4093                  && (estimated
4094                        < ((HOST_WIDE_INT) n_threads
4095                           * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
4096                 /* Do not bother with loops in cold areas.  */
4097                 || optimize_loop_nest_for_size_p (loop)))
4098           continue;
4099 
4100       if (!try_get_loop_niter (loop, &niter_desc))
4101           continue;
4102 
4103       if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
4104           continue;
4105 
4106       if (loop_has_phi_with_address_arg (loop))
4107           continue;
4108 
4109       if (!loop->can_be_parallel
4110             && !loop_parallel_p (loop, &parloop_obstack))
4111           continue;
4112 
4113       if (oacc_kernels_p
4114           && !oacc_entry_exit_ok (loop, &reduction_list))
4115           {
4116             if (dump_file)
4117               fprintf (dump_file, "entry/exit not ok: FAILED\n");
4118             continue;
4119           }
4120 
4121       changed = true;
4122       skip_loop = loop->inner;
4123 
4124       if (dump_enabled_p ())
4125           {
4126             dump_user_location_t loop_loc = find_loop_location (loop);
4127             if (loop->inner)
4128               dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
4129                                    "parallelizing outer loop %d\n", loop->num);
4130             else
4131               dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
4132                                    "parallelizing inner loop %d\n", loop->num);
4133           }
4134 
4135       gen_parallel_loop (loop, &reduction_list,
4136                                n_threads, &niter_desc, oacc_kernels_p);
4137     }
4138 
4139   obstack_free (&parloop_obstack, NULL);
4140 
4141   /* Parallelization will cause new function calls to be inserted through
4142      which local variables will escape.  Reset the points-to solution
4143      for ESCAPED.  */
4144   if (changed)
4145     pt_solution_reset (&cfun->gimple_df->escaped);
4146 
4147   return changed;
4148 }
4149 
4150 /* Parallelization.  */
4151 
4152 namespace {
4153 
4154 const pass_data pass_data_parallelize_loops =
4155 {
4156   GIMPLE_PASS, /* type */
4157   "parloops", /* name */
4158   OPTGROUP_LOOP, /* optinfo_flags */
4159   TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
4160   ( PROP_cfg | PROP_ssa ), /* properties_required */
4161   0, /* properties_provided */
4162   0, /* properties_destroyed */
4163   0, /* todo_flags_start */
4164   0, /* todo_flags_finish */
4165 };
4166 
4167 class pass_parallelize_loops : public gimple_opt_pass
4168 {
4169 public:
pass_parallelize_loops(gcc::context * ctxt)4170   pass_parallelize_loops (gcc::context *ctxt)
4171     : gimple_opt_pass (pass_data_parallelize_loops, ctxt),
4172       oacc_kernels_p (false)
4173   {}
4174 
4175   /* opt_pass methods: */
gate(function *)4176   virtual bool gate (function *)
4177   {
4178     if (oacc_kernels_p)
4179       return flag_openacc;
4180     else
4181       return flag_tree_parallelize_loops > 1;
4182   }
4183   virtual unsigned int execute (function *);
clone()4184   opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); }
set_pass_param(unsigned int n,bool param)4185   void set_pass_param (unsigned int n, bool param)
4186     {
4187       gcc_assert (n == 0);
4188       oacc_kernels_p = param;
4189     }
4190 
4191  private:
4192   bool oacc_kernels_p;
4193 }; // class pass_parallelize_loops
4194 
4195 unsigned
execute(function * fun)4196 pass_parallelize_loops::execute (function *fun)
4197 {
4198   tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
4199   if (nthreads == NULL_TREE)
4200     return 0;
4201 
4202   bool in_loop_pipeline = scev_initialized_p ();
4203   if (!in_loop_pipeline)
4204     loop_optimizer_init (LOOPS_NORMAL
4205                                | LOOPS_HAVE_RECORDED_EXITS);
4206 
4207   if (number_of_loops (fun) <= 1)
4208     return 0;
4209 
4210   if (!in_loop_pipeline)
4211     {
4212       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
4213       scev_initialize ();
4214     }
4215 
4216   unsigned int todo = 0;
4217   if (parallelize_loops (oacc_kernels_p))
4218     {
4219       fun->curr_properties &= ~(PROP_gimple_eomp);
4220 
4221       checking_verify_loop_structure ();
4222 
4223       todo |= TODO_update_ssa;
4224     }
4225 
4226   if (!in_loop_pipeline)
4227     {
4228       scev_finalize ();
4229       loop_optimizer_finalize ();
4230     }
4231 
4232   return todo;
4233 }
4234 
4235 } // anon namespace
4236 
4237 gimple_opt_pass *
make_pass_parallelize_loops(gcc::context * ctxt)4238 make_pass_parallelize_loops (gcc::context *ctxt)
4239 {
4240   return new pass_parallelize_loops (ctxt);
4241 }
4242