1 /* Loop Vectorization
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 
59 /* Loop Vectorization Pass.
60 
61    This pass tries to vectorize loops.
62 
63    For example, the vectorizer transforms the following simple loop:
64 
65         short a[N]; short b[N]; short c[N]; int i;
66 
67         for (i=0; i<N; i++){
68           a[i] = b[i] + c[i];
69         }
70 
71    as if it was manually vectorized by rewriting the source code into:
72 
73         typedef int __attribute__((mode(V8HI))) v8hi;
74         short a[N];  short b[N]; short c[N];   int i;
75         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76         v8hi va, vb, vc;
77 
78         for (i=0; i<N/8; i++){
79           vb = pb[i];
80           vc = pc[i];
81           va = vb + vc;
82           pa[i] = va;
83         }
84 
85         The main entry to this pass is vectorize_loops(), in which
86    the vectorizer applies a set of analyses on a given set of loops,
87    followed by the actual vectorization transformation for the loops that
88    had successfully passed the analysis phase.
89         Throughout this pass we make a distinction between two types of
90    data: scalars (which are represented by SSA_NAMES), and memory references
91    ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.
96 
97    Analysis phase:
98    ===============
99         The driver for the analysis phase is vect_analyze_loop().
100    It applies a set of analyses, some of which rely on the scalar evolution
101    analyzer (scev) developed by Sebastian Pop.
102 
103         During the analysis phase the vectorizer records some information
104    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105    loop, as well as general information about the loop as a whole, which is
106    recorded in a "loop_vec_info" struct attached to each loop.
107 
108    Transformation phase:
109    =====================
110         The loop transformation phase scans all the stmts in the loop, and
111    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112    the loop that needs to be vectorized.  It inserts the vector code sequence
113    just before the scalar stmt S, and records a pointer to the vector code
114    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115    attached to S).  This pointer will be used for the vectorization of following
116    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117    otherwise, we rely on dead code elimination for removing it.
118 
119         For example, say stmt S1 was vectorized into stmt VS1:
120 
121    VS1: vb = px[i];
122    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123    S2:  a = b;
124 
125    To vectorize stmt S2, the vectorizer first finds the stmt that defines
126    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
128    resulting sequence would be:
129 
130    VS1: vb = px[i];
131    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132    VS2: va = vb;
133    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 
135         Operands that are not SSA_NAMEs, are data-refs that appear in
136    load/store operations (like 'x[i]' in S1), and are handled differently.
137 
138    Target modeling:
139    =================
140         Currently the only target specific information that is used is the
141    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142    Targets that can support different sizes of vectors, for now will need
143    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
144    flexibility will be added in the future.
145 
        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.
152 
153    For additional information on this project see:
154    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 */
156 
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158                                                             unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160                                                          bool *, bool *);
161 
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164    may already be set for general statements (not just data refs).  */
165 
166 static opt_result
vect_determine_vf_for_stmt_1(vec_info * vinfo,stmt_vec_info stmt_info,bool vectype_maybe_set_p,poly_uint64 * vf)167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168                                     bool vectype_maybe_set_p,
169                                     poly_uint64 *vf)
170 {
171   gimple *stmt = stmt_info->stmt;
172 
173   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174        && !STMT_VINFO_LIVE_P (stmt_info))
175       || gimple_clobber_p (stmt))
176     {
177       if (dump_enabled_p ())
178           dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179       return opt_result::success ();
180     }
181 
182   tree stmt_vectype, nunits_vectype;
183   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184                                                                &stmt_vectype,
185                                                                &nunits_vectype);
186   if (!res)
187     return res;
188 
189   if (stmt_vectype)
190     {
191       if (STMT_VINFO_VECTYPE (stmt_info))
192           /* The only case when a vectype had been already set is for stmts
193              that contain a data ref, or for "pattern-stmts" (stmts generated
194              by the vectorizer to represent/replace a certain idiom).  */
195           gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196                          || vectype_maybe_set_p)
197                         && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198       else
199           STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200     }
201 
202   if (nunits_vectype)
203     vect_update_max_nunits (vf, nunits_vectype);
204 
205   return opt_result::success ();
206 }
207 
208 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
209    types of STMT_INFO and all attached pattern statements and update
210    the vectorization factor VF accordingly.  Return true on success
211    or false if something prevented vectorization.  */
212 
213 static opt_result
vect_determine_vf_for_stmt(vec_info * vinfo,stmt_vec_info stmt_info,poly_uint64 * vf)214 vect_determine_vf_for_stmt (vec_info *vinfo,
215                                   stmt_vec_info stmt_info, poly_uint64 *vf)
216 {
217   if (dump_enabled_p ())
218     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219                          stmt_info->stmt);
220   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221   if (!res)
222     return res;
223 
224   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225       && STMT_VINFO_RELATED_STMT (stmt_info))
226     {
227       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
229 
230       /* If a pattern statement has def stmts, analyze them too.  */
231       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232              !gsi_end_p (si); gsi_next (&si))
233           {
234             stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235             if (dump_enabled_p ())
236               dump_printf_loc (MSG_NOTE, vect_location,
237                                    "==> examining pattern def stmt: %G",
238                                    def_stmt_info->stmt);
239             res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240             if (!res)
241               return res;
242           }
243 
244       if (dump_enabled_p ())
245           dump_printf_loc (MSG_NOTE, vect_location,
246                                "==> examining pattern statement: %G",
247                                stmt_info->stmt);
248       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249       if (!res)
250           return res;
251     }
252 
253   return opt_result::success ();
254 }
255 
256 /* Function vect_determine_vectorization_factor
257 
258    Determine the vectorization factor (VF).  VF is the number of data elements
259    that are operated upon in parallel in a single iteration of the vectorized
260    loop.  For example, when vectorizing a loop that operates on 4byte elements,
261    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262    elements can fit in a single vector register.
263 
264    We currently support vectorization of loops in which all types operated upon
265    are of the same size.  Therefore this function currently sets VF according to
266    the size of the types operated upon, and fails if there are multiple sizes
267    in the loop.
268 
269    VF is also the factor by which the loop iterations are strip-mined, e.g.:
270    original loop:
271         for (i=0; i<N; i++){
272           a[i] = b[i] + c[i];
273         }
274 
275    vectorized loop:
276         for (i=0; i<N; i+=VF){
277           a[i:VF] = b[i:VF] + c[i:VF];
278         }
279 */
280 
281 static opt_result
vect_determine_vectorization_factor(loop_vec_info loop_vinfo)282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
283 {
284   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286   unsigned nbbs = loop->num_nodes;
287   poly_uint64 vectorization_factor = 1;
288   tree scalar_type = NULL_TREE;
289   gphi *phi;
290   tree vectype;
291   stmt_vec_info stmt_info;
292   unsigned i;
293 
294   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
295 
296   for (i = 0; i < nbbs; i++)
297     {
298       basic_block bb = bbs[i];
299 
300       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301              gsi_next (&si))
302           {
303             phi = si.phi ();
304             stmt_info = loop_vinfo->lookup_stmt (phi);
305             if (dump_enabled_p ())
306               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307                                    phi);
308 
309             gcc_assert (stmt_info);
310 
311             if (STMT_VINFO_RELEVANT_P (stmt_info)
312                 || STMT_VINFO_LIVE_P (stmt_info))
313             {
314                 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315               scalar_type = TREE_TYPE (PHI_RESULT (phi));
316 
317                 if (dump_enabled_p ())
318                     dump_printf_loc (MSG_NOTE, vect_location,
319                                          "get vectype for scalar type:  %T\n",
320                                          scalar_type);
321 
322                 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323                 if (!vectype)
324                     return opt_result::failure_at (phi,
325                                                          "not vectorized: unsupported "
326                                                          "data-type %T\n",
327                                                          scalar_type);
328                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
329 
330                 if (dump_enabled_p ())
331                     dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332                                          vectype);
333 
334                 if (dump_enabled_p ())
335                     {
336                       dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337                       dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338                       dump_printf (MSG_NOTE, "\n");
339                     }
340 
341                 vect_update_max_nunits (&vectorization_factor, vectype);
342               }
343           }
344 
345       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346              gsi_next (&si))
347           {
348             if (is_gimple_debug (gsi_stmt (si)))
349               continue;
350             stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351             opt_result res
352               = vect_determine_vf_for_stmt (loop_vinfo,
353                                                     stmt_info, &vectorization_factor);
354             if (!res)
355               return res;
356         }
357     }
358 
359   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
360   if (dump_enabled_p ())
361     {
362       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363       dump_dec (MSG_NOTE, vectorization_factor);
364       dump_printf (MSG_NOTE, "\n");
365     }
366 
367   if (known_le (vectorization_factor, 1U))
368     return opt_result::failure_at (vect_location,
369                                            "not vectorized: unsupported data-type\n");
370   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371   return opt_result::success ();
372 }
373 
374 
375 /* Function vect_is_simple_iv_evolution.
376 
377    FORNOW: A simple evolution of an induction variables in the loop is
378    considered a polynomial evolution.  */
379 
380 static bool
vect_is_simple_iv_evolution(unsigned loop_nb,tree access_fn,tree * init,tree * step)381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382                              tree * step)
383 {
384   tree init_expr;
385   tree step_expr;
386   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387   basic_block bb;
388 
389   /* When there is no evolution in this loop, the evolution function
390      is not "simple".  */
391   if (evolution_part == NULL_TREE)
392     return false;
393 
394   /* When the evolution is a polynomial of degree >= 2
395      the evolution function is not "simple".  */
396   if (tree_is_chrec (evolution_part))
397     return false;
398 
399   step_expr = evolution_part;
400   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
401 
402   if (dump_enabled_p ())
403     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
404                          step_expr, init_expr);
405 
406   *init = init_expr;
407   *step = step_expr;
408 
409   if (TREE_CODE (step_expr) != INTEGER_CST
410       && (TREE_CODE (step_expr) != SSA_NAME
411             || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412                 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413             || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414                 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415                       || !flag_associative_math)))
416       && (TREE_CODE (step_expr) != REAL_CST
417             || !flag_associative_math))
418     {
419       if (dump_enabled_p ())
420         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421                          "step unknown.\n");
422       return false;
423     }
424 
425   return true;
426 }
427 
428 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
429    what we are assuming is a double reduction.  For example, given
430    a structure like this:
431 
432       outer1:
433           x_1 = PHI <x_4(outer2), ...>;
434           ...
435 
436       inner:
437           x_2 = PHI <x_1(outer1), ...>;
438           ...
439           x_3 = ...;
440           ...
441 
442       outer2:
443           x_4 = PHI <x_3(inner)>;
444           ...
445 
446    outer loop analysis would treat x_1 as a double reduction phi and
447    this function would then return true for x_2.  */
448 
449 static bool
vect_inner_phi_in_double_reduction_p(loop_vec_info loop_vinfo,gphi * phi)450 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
451 {
452   use_operand_p use_p;
453   ssa_op_iter op_iter;
454   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
455     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
456       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
457           return true;
458   return false;
459 }
460 
461 /* Function vect_analyze_scalar_cycles_1.
462 
463    Examine the cross iteration def-use cycles of scalar variables
464    in LOOP.  LOOP_VINFO represents the loop that is now being
465    considered for vectorization (can be LOOP, or an outer-loop
466    enclosing LOOP).  */
467 
468 static void
vect_analyze_scalar_cycles_1(loop_vec_info loop_vinfo,class loop * loop)469 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
470 {
471   basic_block bb = loop->header;
472   tree init, step;
473   auto_vec<stmt_vec_info, 64> worklist;
474   gphi_iterator gsi;
475   bool double_reduc, reduc_chain;
476 
477   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
478 
479   /* First - identify all inductions.  Reduction detection assumes that all the
480      inductions have been identified, therefore, this order must not be
481      changed.  */
482   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
483     {
484       gphi *phi = gsi.phi ();
485       tree access_fn = NULL;
486       tree def = PHI_RESULT (phi);
487       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
488 
489       if (dump_enabled_p ())
490           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
491 
492       /* Skip virtual phi's.  The data dependences that are associated with
493          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
494       if (virtual_operand_p (def))
495           continue;
496 
497       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
498 
499       /* Analyze the evolution function.  */
500       access_fn = analyze_scalar_evolution (loop, def);
501       if (access_fn)
502           {
503             STRIP_NOPS (access_fn);
504             if (dump_enabled_p ())
505               dump_printf_loc (MSG_NOTE, vect_location,
506                                    "Access function of PHI: %T\n", access_fn);
507             STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
508               = initial_condition_in_loop_num (access_fn, loop->num);
509             STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
510               = evolution_part_in_loop_num (access_fn, loop->num);
511           }
512 
513       if (!access_fn
514             || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
515             || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
516             || (LOOP_VINFO_LOOP (loop_vinfo) != loop
517                 && TREE_CODE (step) != INTEGER_CST))
518           {
519             worklist.safe_push (stmt_vinfo);
520             continue;
521           }
522 
523       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
524                       != NULL_TREE);
525       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
526 
527       if (dump_enabled_p ())
528           dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
529       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530     }
531 
532 
533   /* Second - identify all reductions and nested cycles.  */
534   while (worklist.length () > 0)
535     {
536       stmt_vec_info stmt_vinfo = worklist.pop ();
537       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
538       tree def = PHI_RESULT (phi);
539 
540       if (dump_enabled_p ())
541           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
542 
543       gcc_assert (!virtual_operand_p (def)
544                       && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
545 
546       stmt_vec_info reduc_stmt_info
547           = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
548                                             &reduc_chain);
549       if (reduc_stmt_info)
550         {
551             STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
552             STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
553             if (double_reduc)
554               {
555                 if (dump_enabled_p ())
556                     dump_printf_loc (MSG_NOTE, vect_location,
557                                          "Detected double reduction.\n");
558 
559               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
560                 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
561             }
562           else
563             {
564               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
565                 {
566                   if (dump_enabled_p ())
567                     dump_printf_loc (MSG_NOTE, vect_location,
568                                              "Detected vectorizable nested cycle.\n");
569 
570                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
571                 }
572               else
573                 {
574                   if (dump_enabled_p ())
575                     dump_printf_loc (MSG_NOTE, vect_location,
576                                              "Detected reduction.\n");
577 
578                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
579                       STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
580                   /* Store the reduction cycles for possible vectorization in
581                      loop-aware SLP if it was not detected as reduction
582                          chain.  */
583                       if (! reduc_chain)
584                         LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
585                           (reduc_stmt_info);
586                 }
587             }
588         }
589       else
590         if (dump_enabled_p ())
591           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
592                                  "Unknown def-use cycle pattern.\n");
593     }
594 }
595 
596 
597 /* Function vect_analyze_scalar_cycles.
598 
599    Examine the cross iteration def-use cycles of scalar variables, by
600    analyzing the loop-header PHIs of scalar variables.  Classify each
601    cycle as one of the following: invariant, induction, reduction, unknown.
602    We do that for the loop represented by LOOP_VINFO, and also to its
603    inner-loop, if exists.
604    Examples for scalar cycles:
605 
606    Example1: reduction:
607 
608               loop1:
609               for (i=0; i<N; i++)
610                  sum += a[i];
611 
612    Example2: induction:
613 
614               loop2:
615               for (i=0; i<N; i++)
616                  a[i] = i;  */
617 
618 static void
vect_analyze_scalar_cycles(loop_vec_info loop_vinfo)619 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
620 {
621   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
622 
623   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
624 
625   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
626      Reductions in such inner-loop therefore have different properties than
627      the reductions in the nest that gets vectorized:
628      1. When vectorized, they are executed in the same order as in the original
629         scalar loop, so we can't change the order of computation when
630         vectorizing them.
631      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
632         current checks are too strict.  */
633 
634   if (loop->inner)
635     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
636 }
637 
638 /* Transfer group and reduction information from STMT_INFO to its
639    pattern stmt.  */
640 
641 static void
vect_fixup_reduc_chain(stmt_vec_info stmt_info)642 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
643 {
644   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
645   stmt_vec_info stmtp;
646   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
647                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
648   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
649   do
650     {
651       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
652       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
653                                  == STMT_VINFO_DEF_TYPE (stmt_info));
654       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
655       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
656       if (stmt_info)
657           REDUC_GROUP_NEXT_ELEMENT (stmtp)
658             = STMT_VINFO_RELATED_STMT (stmt_info);
659     }
660   while (stmt_info);
661 }
662 
663 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
664 
665 static void
vect_fixup_scalar_cycles_with_patterns(loop_vec_info loop_vinfo)666 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
667 {
668   stmt_vec_info first;
669   unsigned i;
670 
671   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
672     {
673       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
674       while (next)
675           {
676             if ((STMT_VINFO_IN_PATTERN_P (next)
677                  != STMT_VINFO_IN_PATTERN_P (first))
678                 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
679               break;
680             next = REDUC_GROUP_NEXT_ELEMENT (next);
681           }
682       /* If all reduction chain members are well-formed patterns adjust
683            the group to group the pattern stmts instead.  */
684       if (! next
685             && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
686           {
687             if (STMT_VINFO_IN_PATTERN_P (first))
688               {
689                 vect_fixup_reduc_chain (first);
690                 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
691                     = STMT_VINFO_RELATED_STMT (first);
692               }
693           }
694       /* If not all stmt in the chain are patterns or if we failed
695            to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
696            it as regular reduction instead.  */
697       else
698           {
699             stmt_vec_info vinfo = first;
700             stmt_vec_info last = NULL;
701             while (vinfo)
702               {
703                 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
704                 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
705                 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
706                 last = vinfo;
707                 vinfo = next;
708               }
709             STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
710               = vect_internal_def;
711             loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
712             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
713             --i;
714           }
715     }
716 }
717 
718 /* Function vect_get_loop_niters.
719 
720    Determine how many iterations the loop is executed and place it
721    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
722    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
723    niter information holds in ASSUMPTIONS.
724 
725    Return the loop exit condition.  */
726 
727 
728 static gcond *
vect_get_loop_niters(class loop * loop,tree * assumptions,tree * number_of_iterations,tree * number_of_iterationsm1)729 vect_get_loop_niters (class loop *loop, tree *assumptions,
730                           tree *number_of_iterations, tree *number_of_iterationsm1)
731 {
732   edge exit = single_exit (loop);
733   class tree_niter_desc niter_desc;
734   tree niter_assumptions, niter, may_be_zero;
735   gcond *cond = get_loop_exit_condition (loop);
736 
737   *assumptions = boolean_true_node;
738   *number_of_iterationsm1 = chrec_dont_know;
739   *number_of_iterations = chrec_dont_know;
740   DUMP_VECT_SCOPE ("get_loop_niters");
741 
742   if (!exit)
743     return cond;
744 
745   may_be_zero = NULL_TREE;
746   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
747       || chrec_contains_undetermined (niter_desc.niter))
748     return cond;
749 
750   niter_assumptions = niter_desc.assumptions;
751   may_be_zero = niter_desc.may_be_zero;
752   niter = niter_desc.niter;
753 
754   if (may_be_zero && integer_zerop (may_be_zero))
755     may_be_zero = NULL_TREE;
756 
757   if (may_be_zero)
758     {
759       if (COMPARISON_CLASS_P (may_be_zero))
760           {
761             /* Try to combine may_be_zero with assumptions, this can simplify
762                computation of niter expression.  */
763             if (niter_assumptions && !integer_nonzerop (niter_assumptions))
764               niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
765                                                        niter_assumptions,
766                                                        fold_build1 (TRUTH_NOT_EXPR,
767                                                                         boolean_type_node,
768                                                                         may_be_zero));
769             else
770               niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
771                                          build_int_cst (TREE_TYPE (niter), 0),
772                                          rewrite_to_non_trapping_overflow (niter));
773 
774             may_be_zero = NULL_TREE;
775           }
776       else if (integer_nonzerop (may_be_zero))
777           {
778             *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
779             *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
780             return cond;
781           }
782       else
783           return cond;
784     }
785 
786   *assumptions = niter_assumptions;
787   *number_of_iterationsm1 = niter;
788 
789   /* We want the number of loop header executions which is the number
790      of latch executions plus one.
791      ???  For UINT_MAX latch executions this number overflows to zero
792      for loops like do { n++; } while (n != 0);  */
793   if (niter && !chrec_contains_undetermined (niter))
794     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
795                                 build_int_cst (TREE_TYPE (niter), 1));
796   *number_of_iterations = niter;
797 
798   return cond;
799 }
800 
801 /* Function bb_in_loop_p
802 
803    Used as predicate for dfs order traversal of the loop bbs.  */
804 
805 static bool
bb_in_loop_p(const_basic_block bb,const void * data)806 bb_in_loop_p (const_basic_block bb, const void *data)
807 {
808   const class loop *const loop = (const class loop *)data;
809   if (flow_bb_inside_loop_p (loop, bb))
810     return true;
811   return false;
812 }
813 
814 
815 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
816    stmt_vec_info structs for all the stmts in LOOP_IN.  */
817 
_loop_vec_info(class loop * loop_in,vec_info_shared * shared)818 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
819   : vec_info (vec_info::loop, shared),
820     loop (loop_in),
821     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
822     num_itersm1 (NULL_TREE),
823     num_iters (NULL_TREE),
824     num_iters_unchanged (NULL_TREE),
825     num_iters_assumptions (NULL_TREE),
826     vector_costs (nullptr),
827     scalar_costs (nullptr),
828     th (0),
829     versioning_threshold (0),
830     vectorization_factor (0),
831     main_loop_edge (nullptr),
832     skip_main_loop_edge (nullptr),
833     skip_this_loop_edge (nullptr),
834     reusable_accumulators (),
835     suggested_unroll_factor (1),
836     max_vectorization_factor (0),
837     mask_skip_niters (NULL_TREE),
838     rgroup_compare_type (NULL_TREE),
839     simd_if_cond (NULL_TREE),
840     unaligned_dr (NULL),
841     peeling_for_alignment (0),
842     ptr_mask (0),
843     ivexpr_map (NULL),
844     scan_map (NULL),
845     slp_unrolling_factor (1),
846     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
847     vectorizable (false),
848     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
849     using_partial_vectors_p (false),
850     epil_using_partial_vectors_p (false),
851     partial_load_store_bias (0),
852     peeling_for_gaps (false),
853     peeling_for_niter (false),
854     no_data_dependencies (false),
855     has_mask_store (false),
856     scalar_loop_scaling (profile_probability::uninitialized ()),
857     scalar_loop (NULL),
858     orig_loop_info (NULL)
859 {
860   /* CHECKME: We want to visit all BBs before their successors (except for
861      latch blocks, for which this assertion wouldn't hold).  In the simple
862      case of the loop forms we allow, a dfs order of the BBs would the same
863      as reversed postorder traversal, so we are safe.  */
864 
865   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866                                                     bbs, loop->num_nodes, loop);
867   gcc_assert (nbbs == loop->num_nodes);
868 
869   for (unsigned int i = 0; i < nbbs; i++)
870     {
871       basic_block bb = bbs[i];
872       gimple_stmt_iterator si;
873 
874       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
875           {
876             gimple *phi = gsi_stmt (si);
877             gimple_set_uid (phi, 0);
878             add_stmt (phi);
879           }
880 
881       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
882           {
883             gimple *stmt = gsi_stmt (si);
884             gimple_set_uid (stmt, 0);
885             if (is_gimple_debug (stmt))
886               continue;
887             add_stmt (stmt);
888             /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
889                third argument is the #pragma omp simd if (x) condition, when 0,
890                loop shouldn't be vectorized, when non-zero constant, it should
891                be vectorized normally, otherwise versioned with vectorized loop
892                done if the condition is non-zero at runtime.  */
893             if (loop_in->simduid
894                 && is_gimple_call (stmt)
895                 && gimple_call_internal_p (stmt)
896                 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
897                 && gimple_call_num_args (stmt) >= 3
898                 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
899                 && (loop_in->simduid
900                       == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
901               {
902                 tree arg = gimple_call_arg (stmt, 2);
903                 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
904                     simd_if_cond = arg;
905                 else
906                     gcc_assert (integer_nonzerop (arg));
907               }
908           }
909     }
910 
911   epilogue_vinfos.create (6);
912 }
913 
914 /* Free all levels of rgroup CONTROLS.  */
915 
916 void
release_vec_loop_controls(vec<rgroup_controls> * controls)917 release_vec_loop_controls (vec<rgroup_controls> *controls)
918 {
919   rgroup_controls *rgc;
920   unsigned int i;
921   FOR_EACH_VEC_ELT (*controls, i, rgc)
922     rgc->controls.release ();
923   controls->release ();
924 }
925 
926 /* Free all memory used by the _loop_vec_info, as well as all the
927    stmt_vec_info structs of all the stmts in the loop.  */
928 
~_loop_vec_info()929 _loop_vec_info::~_loop_vec_info ()
930 {
931   free (bbs);
932 
933   release_vec_loop_controls (&masks);
934   release_vec_loop_controls (&lens);
935   delete ivexpr_map;
936   delete scan_map;
937   epilogue_vinfos.release ();
938   delete scalar_costs;
939   delete vector_costs;
940 
941   /* When we release an epiloge vinfo that we do not intend to use
942      avoid clearing AUX of the main loop which should continue to
943      point to the main loop vinfo since otherwise we'll leak that.  */
944   if (loop->aux == this)
945     loop->aux = NULL;
946 }
947 
948 /* Return an invariant or register for EXPR and emit necessary
949    computations in the LOOP_VINFO loop preheader.  */
950 
951 tree
cse_and_gimplify_to_preheader(loop_vec_info loop_vinfo,tree expr)952 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
953 {
954   if (is_gimple_reg (expr)
955       || is_gimple_min_invariant (expr))
956     return expr;
957 
958   if (! loop_vinfo->ivexpr_map)
959     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
960   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
961   if (! cached)
962     {
963       gimple_seq stmts = NULL;
964       cached = force_gimple_operand (unshare_expr (expr),
965                                              &stmts, true, NULL_TREE);
966       if (stmts)
967           {
968             edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
969             gsi_insert_seq_on_edge_immediate (e, stmts);
970           }
971     }
972   return cached;
973 }
974 
975 /* Return true if we can use CMP_TYPE as the comparison type to produce
976    all masks required to mask LOOP_VINFO.  */
977 
978 static bool
can_produce_all_loop_masks_p(loop_vec_info loop_vinfo,tree cmp_type)979 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
980 {
981   rgroup_controls *rgm;
982   unsigned int i;
983   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
984     if (rgm->type != NULL_TREE
985           && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
986                                                       cmp_type, rgm->type,
987                                                       OPTIMIZE_FOR_SPEED))
988       return false;
989   return true;
990 }
991 
992 /* Calculate the maximum number of scalars per iteration for every
993    rgroup in LOOP_VINFO.  */
994 
995 static unsigned int
vect_get_max_nscalars_per_iter(loop_vec_info loop_vinfo)996 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
997 {
998   unsigned int res = 1;
999   unsigned int i;
1000   rgroup_controls *rgm;
1001   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002     res = MAX (res, rgm->max_nscalars_per_iter);
1003   return res;
1004 }
1005 
1006 /* Calculate the minimum precision necessary to represent:
1007 
1008       MAX_NITERS * FACTOR
1009 
1010    as an unsigned integer, where MAX_NITERS is the maximum number of
1011    loop header iterations for the original scalar form of LOOP_VINFO.  */
1012 
1013 static unsigned
vect_min_prec_for_max_niters(loop_vec_info loop_vinfo,unsigned int factor)1014 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1015 {
1016   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1017 
1018   /* Get the maximum number of iterations that is representable
1019      in the counter type.  */
1020   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022 
1023   /* Get a more refined estimate for the number of iterations.  */
1024   widest_int max_back_edges;
1025   if (max_loop_iterations (loop, &max_back_edges))
1026     max_ni = wi::smin (max_ni, max_back_edges + 1);
1027 
1028   /* Work out how many bits we need to represent the limit.  */
1029   return wi::min_precision (max_ni * factor, UNSIGNED);
1030 }
1031 
1032 /* True if the loop needs peeling or partial vectors when vectorized.  */
1033 
1034 static bool
vect_need_peeling_or_partial_vectors_p(loop_vec_info loop_vinfo)1035 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1036 {
1037   unsigned HOST_WIDE_INT const_vf;
1038   HOST_WIDE_INT max_niter
1039     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1040 
1041   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1042   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1043     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1044                                                     (loop_vinfo));
1045 
1046   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1047       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1048     {
1049       /* Work out the (constant) number of iterations that need to be
1050            peeled for reasons other than niters.  */
1051       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1052       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1053           peel_niter += 1;
1054       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1055                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1056           return true;
1057     }
1058   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1059       /* ??? When peeling for gaps but not alignment, we could
1060            try to check whether the (variable) niters is known to be
1061            VF * N + 1.  That's something of a niche case though.  */
1062       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1063       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1064       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1065              < (unsigned) exact_log2 (const_vf))
1066             /* In case of versioning, check if the maximum number of
1067                iterations is greater than th.  If they are identical,
1068                the epilogue is unnecessary.  */
1069             && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1070                 || ((unsigned HOST_WIDE_INT) max_niter
1071                       > (th / const_vf) * const_vf))))
1072     return true;
1073 
1074   return false;
1075 }
1076 
1077 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1078    whether we can actually generate the masks required.  Return true if so,
1079    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1080 
1081 static bool
vect_verify_full_masking(loop_vec_info loop_vinfo)1082 vect_verify_full_masking (loop_vec_info loop_vinfo)
1083 {
1084   unsigned int min_ni_width;
1085   unsigned int max_nscalars_per_iter
1086     = vect_get_max_nscalars_per_iter (loop_vinfo);
1087 
1088   /* Use a normal loop if there are no statements that need masking.
1089      This only happens in rare degenerate cases: it means that the loop
1090      has no loads, no stores, and no live-out values.  */
1091   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1092     return false;
1093 
1094   /* Work out how many bits we need to represent the limit.  */
1095   min_ni_width
1096     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1097 
1098   /* Find a scalar mode for which WHILE_ULT is supported.  */
1099   opt_scalar_int_mode cmp_mode_iter;
1100   tree cmp_type = NULL_TREE;
1101   tree iv_type = NULL_TREE;
1102   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1103   unsigned int iv_precision = UINT_MAX;
1104 
1105   if (iv_limit != -1)
1106     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1107                                               UNSIGNED);
1108 
1109   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1110     {
1111       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1112       if (cmp_bits >= min_ni_width
1113             && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1114           {
1115             tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1116             if (this_type
1117                 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1118               {
1119                 /* Although we could stop as soon as we find a valid mode,
1120                      there are at least two reasons why that's not always the
1121                      best choice:
1122 
1123                      - An IV that's Pmode or wider is more likely to be reusable
1124                        in address calculations than an IV that's narrower than
1125                        Pmode.
1126 
1127                      - Doing the comparison in IV_PRECISION or wider allows
1128                        a natural 0-based IV, whereas using a narrower comparison
1129                        type requires mitigations against wrap-around.
1130 
1131                      Conversely, if the IV limit is variable, doing the comparison
1132                      in a wider type than the original type can introduce
1133                      unnecessary extensions, so picking the widest valid mode
1134                      is not always a good choice either.
1135 
1136                      Here we prefer the first IV type that's Pmode or wider,
1137                      and the first comparison type that's IV_PRECISION or wider.
1138                      (The comparison type must be no wider than the IV type,
1139                      to avoid extensions in the vector loop.)
1140 
1141                      ??? We might want to try continuing beyond Pmode for ILP32
1142                      targets if CMP_BITS < IV_PRECISION.  */
1143                 iv_type = this_type;
1144                 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1145                     cmp_type = this_type;
1146                 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1147                     break;
1148               }
1149           }
1150     }
1151 
1152   if (!cmp_type)
1153     return false;
1154 
1155   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1156   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1157   return true;
1158 }
1159 
1160 /* Check whether we can use vector access with length based on precison
1161    comparison.  So far, to keep it simple, we only allow the case that the
1162    precision of the target supported length is larger than the precision
1163    required by loop niters.  */
1164 
1165 static bool
vect_verify_loop_lens(loop_vec_info loop_vinfo)1166 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1167 {
1168   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1169     return false;
1170 
1171   machine_mode len_load_mode = get_len_load_store_mode
1172     (loop_vinfo->vector_mode, true).require ();
1173   machine_mode len_store_mode = get_len_load_store_mode
1174     (loop_vinfo->vector_mode, false).require ();
1175 
1176   signed char partial_load_bias = internal_len_load_store_bias
1177     (IFN_LEN_LOAD, len_load_mode);
1178 
1179   signed char partial_store_bias = internal_len_load_store_bias
1180     (IFN_LEN_STORE, len_store_mode);
1181 
1182   gcc_assert (partial_load_bias == partial_store_bias);
1183 
1184   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1185     return false;
1186 
1187   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1188      len_loads with a length of zero.  In order to avoid that we prohibit
1189      more than one loop length here.  */
1190   if (partial_load_bias == -1
1191       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1192     return false;
1193 
1194   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1195 
1196   unsigned int max_nitems_per_iter = 1;
1197   unsigned int i;
1198   rgroup_controls *rgl;
1199   /* Find the maximum number of items per iteration for every rgroup.  */
1200   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1201     {
1202       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1203       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1204     }
1205 
1206   /* Work out how many bits we need to represent the length limit.  */
1207   unsigned int min_ni_prec
1208     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1209 
1210   /* Now use the maximum of below precisions for one suitable IV type:
1211      - the IV's natural precision
1212      - the precision needed to hold: the maximum number of scalar
1213        iterations multiplied by the scale factor (min_ni_prec above)
1214      - the Pmode precision
1215 
1216      If min_ni_prec is less than the precision of the current niters,
1217      we perfer to still use the niters type.  Prefer to use Pmode and
1218      wider IV to avoid narrow conversions.  */
1219 
1220   unsigned int ni_prec
1221     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1222   min_ni_prec = MAX (min_ni_prec, ni_prec);
1223   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1224 
1225   tree iv_type = NULL_TREE;
1226   opt_scalar_int_mode tmode_iter;
1227   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1228     {
1229       scalar_mode tmode = tmode_iter.require ();
1230       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1231 
1232       /* ??? Do we really want to construct one IV whose precision exceeds
1233            BITS_PER_WORD?  */
1234       if (tbits > BITS_PER_WORD)
1235           break;
1236 
1237       /* Find the first available standard integral type.  */
1238       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1239           {
1240             iv_type = build_nonstandard_integer_type (tbits, true);
1241             break;
1242           }
1243     }
1244 
1245   if (!iv_type)
1246     {
1247       if (dump_enabled_p ())
1248           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                                "can't vectorize with length-based partial vectors"
1250                                " because there is no suitable iv type.\n");
1251       return false;
1252     }
1253 
1254   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1255   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1256 
1257   return true;
1258 }
1259 
1260 /* Calculate the cost of one scalar iteration of the loop.  */
1261 static void
vect_compute_single_scalar_iteration_cost(loop_vec_info loop_vinfo)1262 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1263 {
1264   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1265   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1266   int nbbs = loop->num_nodes, factor;
1267   int innerloop_iters, i;
1268 
1269   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1270 
1271   /* Gather costs for statements in the scalar loop.  */
1272 
1273   /* FORNOW.  */
1274   innerloop_iters = 1;
1275   if (loop->inner)
1276     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1277 
1278   for (i = 0; i < nbbs; i++)
1279     {
1280       gimple_stmt_iterator si;
1281       basic_block bb = bbs[i];
1282 
1283       if (bb->loop_father == loop->inner)
1284         factor = innerloop_iters;
1285       else
1286         factor = 1;
1287 
1288       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1289         {
1290             gimple *stmt = gsi_stmt (si);
1291             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1292 
1293           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1294             continue;
1295 
1296           /* Skip stmts that are not vectorized inside the loop.  */
1297             stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1298           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1299               && (!STMT_VINFO_LIVE_P (vstmt_info)
1300                   || !VECTORIZABLE_CYCLE_DEF
1301                               (STMT_VINFO_DEF_TYPE (vstmt_info))))
1302             continue;
1303 
1304             vect_cost_for_stmt kind;
1305           if (STMT_VINFO_DATA_REF (stmt_info))
1306             {
1307               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1308                kind = scalar_load;
1309              else
1310                kind = scalar_store;
1311             }
1312             else if (vect_nop_conversion_p (stmt_info))
1313               continue;
1314             else
1315             kind = scalar_stmt;
1316 
1317             /* We are using vect_prologue here to avoid scaling twice
1318                by the inner loop factor.  */
1319             record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1320                                   factor, kind, stmt_info, 0, vect_prologue);
1321         }
1322     }
1323 
1324   /* Now accumulate cost.  */
1325   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1326   add_stmt_costs (loop_vinfo->scalar_costs,
1327                       &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1328   loop_vinfo->scalar_costs->finish_cost (nullptr);
1329 }
1330 
1331 
1332 /* Function vect_analyze_loop_form.
1333 
1334    Verify that certain CFG restrictions hold, including:
1335    - the loop has a pre-header
1336    - the loop has a single entry and exit
1337    - the loop exit condition is simple enough
1338    - the number of iterations can be analyzed, i.e, a countable loop.  The
1339      niter could be analyzed under some assumptions.  */
1340 
1341 opt_result
vect_analyze_loop_form(class loop * loop,vect_loop_form_info * info)1342 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1343 {
1344   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1345 
1346   /* Different restrictions apply when we are considering an inner-most loop,
1347      vs. an outer (nested) loop.
1348      (FORNOW. May want to relax some of these restrictions in the future).  */
1349 
1350   info->inner_loop_cond = NULL;
1351   if (!loop->inner)
1352     {
1353       /* Inner-most loop.  We currently require that the number of BBs is
1354            exactly 2 (the header and latch).  Vectorizable inner-most loops
1355            look like this:
1356 
1357                         (pre-header)
1358                            |
1359                           header <--------+
1360                            | |            |
1361                            | +--> latch --+
1362                            |
1363                         (exit-bb)  */
1364 
1365       if (loop->num_nodes != 2)
1366           return opt_result::failure_at (vect_location,
1367                                                "not vectorized:"
1368                                                " control flow in loop.\n");
1369 
1370       if (empty_block_p (loop->header))
1371           return opt_result::failure_at (vect_location,
1372                                                "not vectorized: empty loop.\n");
1373     }
1374   else
1375     {
1376       class loop *innerloop = loop->inner;
1377       edge entryedge;
1378 
1379       /* Nested loop. We currently require that the loop is doubly-nested,
1380            contains a single inner loop, and the number of BBs is exactly 5.
1381            Vectorizable outer-loops look like this:
1382 
1383                               (pre-header)
1384                                  |
1385                                 header <---+
1386                                  |         |
1387                               inner-loop |
1388                                  |         |
1389                                 tail ------+
1390                                  |
1391                             (exit-bb)
1392 
1393            The inner-loop has the properties expected of inner-most loops
1394            as described above.  */
1395 
1396       if ((loop->inner)->inner || (loop->inner)->next)
1397           return opt_result::failure_at (vect_location,
1398                                                "not vectorized:"
1399                                                " multiple nested loops.\n");
1400 
1401       if (loop->num_nodes != 5)
1402           return opt_result::failure_at (vect_location,
1403                                                "not vectorized:"
1404                                                " control flow in loop.\n");
1405 
1406       entryedge = loop_preheader_edge (innerloop);
1407       if (entryedge->src != loop->header
1408             || !single_exit (innerloop)
1409             || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1410           return opt_result::failure_at (vect_location,
1411                                                "not vectorized:"
1412                                                " unsupported outerloop form.\n");
1413 
1414       /* Analyze the inner-loop.  */
1415       vect_loop_form_info inner;
1416       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1417       if (!res)
1418           {
1419             if (dump_enabled_p ())
1420               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421                                    "not vectorized: Bad inner loop.\n");
1422             return res;
1423           }
1424 
1425       /* Don't support analyzing niter under assumptions for inner
1426            loop.  */
1427       if (!integer_onep (inner.assumptions))
1428           return opt_result::failure_at (vect_location,
1429                                                "not vectorized: Bad inner loop.\n");
1430 
1431       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1432           return opt_result::failure_at (vect_location,
1433                                                "not vectorized: inner-loop count not"
1434                                                " invariant.\n");
1435 
1436       if (dump_enabled_p ())
1437         dump_printf_loc (MSG_NOTE, vect_location,
1438                                "Considering outer-loop vectorization.\n");
1439       info->inner_loop_cond = inner.loop_cond;
1440     }
1441 
1442   if (!single_exit (loop))
1443     return opt_result::failure_at (vect_location,
1444                                            "not vectorized: multiple exits.\n");
1445   if (EDGE_COUNT (loop->header->preds) != 2)
1446     return opt_result::failure_at (vect_location,
1447                                            "not vectorized:"
1448                                            " too many incoming edges.\n");
1449 
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     return opt_result::failure_at (vect_location,
1457                                            "not vectorized: latch block not empty.\n");
1458 
1459   /* Make sure the exit is not abnormal.  */
1460   edge e = single_exit (loop);
1461   if (e->flags & EDGE_ABNORMAL)
1462     return opt_result::failure_at (vect_location,
1463                                            "not vectorized:"
1464                                            " abnormal loop exit edge.\n");
1465 
1466   info->loop_cond
1467     = vect_get_loop_niters (loop, &info->assumptions,
1468                                   &info->number_of_iterations,
1469                                   &info->number_of_iterationsm1);
1470   if (!info->loop_cond)
1471     return opt_result::failure_at
1472       (vect_location,
1473        "not vectorized: complicated exit condition.\n");
1474 
1475   if (integer_zerop (info->assumptions)
1476       || !info->number_of_iterations
1477       || chrec_contains_undetermined (info->number_of_iterations))
1478     return opt_result::failure_at
1479       (info->loop_cond,
1480        "not vectorized: number of iterations cannot be computed.\n");
1481 
1482   if (integer_zerop (info->number_of_iterations))
1483     return opt_result::failure_at
1484       (info->loop_cond,
1485        "not vectorized: number of iterations = 0.\n");
1486 
1487   if (!(tree_fits_shwi_p (info->number_of_iterations)
1488           && tree_to_shwi (info->number_of_iterations) > 0))
1489     {
1490       if (dump_enabled_p ())
1491           {
1492             dump_printf_loc (MSG_NOTE, vect_location,
1493                                  "Symbolic number of iterations is ");
1494             dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1495             dump_printf (MSG_NOTE, "\n");
1496           }
1497     }
1498 
1499   return opt_result::success ();
1500 }
1501 
1502 /* Create a loop_vec_info for LOOP with SHARED and the
1503    vect_analyze_loop_form result.  */
1504 
1505 loop_vec_info
vect_create_loop_vinfo(class loop * loop,vec_info_shared * shared,const vect_loop_form_info * info,loop_vec_info main_loop_info)1506 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1507                               const vect_loop_form_info *info,
1508                               loop_vec_info main_loop_info)
1509 {
1510   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1511   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1512   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1513   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1514   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1515   /* Also record the assumptions for versioning.  */
1516   if (!integer_onep (info->assumptions) && !main_loop_info)
1517     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1518 
1519   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1520   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1521   if (info->inner_loop_cond)
1522     {
1523       stmt_vec_info inner_loop_cond_info
1524           = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1525       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1526       /* If we have an estimate on the number of iterations of the inner
1527            loop use that to limit the scale for costing, otherwise use
1528            --param vect-inner-loop-cost-factor literally.  */
1529       widest_int nit;
1530       if (estimated_stmt_executions (loop->inner, &nit))
1531           LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1532             = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1533     }
1534 
1535   return loop_vinfo;
1536 }
1537 
1538 
1539 
1540 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1541    statements update the vectorization factor.  */
1542 
1543 static void
vect_update_vf_for_slp(loop_vec_info loop_vinfo)1544 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1545 {
1546   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1547   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1548   int nbbs = loop->num_nodes;
1549   poly_uint64 vectorization_factor;
1550   int i;
1551 
1552   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1553 
1554   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555   gcc_assert (known_ne (vectorization_factor, 0U));
1556 
1557   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1558      vectorization factor of the loop is the unrolling factor required by
1559      the SLP instances.  If that unrolling factor is 1, we say, that we
1560      perform pure SLP on loop - cross iteration parallelism is not
1561      exploited.  */
1562   bool only_slp_in_loop = true;
1563   for (i = 0; i < nbbs; i++)
1564     {
1565       basic_block bb = bbs[i];
1566       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1567              gsi_next (&si))
1568           {
1569             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1570             if (!stmt_info)
1571               continue;
1572             if ((STMT_VINFO_RELEVANT_P (stmt_info)
1573                  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1574                 && !PURE_SLP_STMT (stmt_info))
1575               /* STMT needs both SLP and loop-based vectorization.  */
1576               only_slp_in_loop = false;
1577           }
1578       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579              gsi_next (&si))
1580           {
1581             if (is_gimple_debug (gsi_stmt (si)))
1582               continue;
1583             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1584             stmt_info = vect_stmt_to_vectorize (stmt_info);
1585             if ((STMT_VINFO_RELEVANT_P (stmt_info)
1586                  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1587                 && !PURE_SLP_STMT (stmt_info))
1588               /* STMT needs both SLP and loop-based vectorization.  */
1589               only_slp_in_loop = false;
1590           }
1591     }
1592 
1593   if (only_slp_in_loop)
1594     {
1595       if (dump_enabled_p ())
1596           dump_printf_loc (MSG_NOTE, vect_location,
1597                                "Loop contains only SLP stmts\n");
1598       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1599     }
1600   else
1601     {
1602       if (dump_enabled_p ())
1603           dump_printf_loc (MSG_NOTE, vect_location,
1604                                "Loop contains SLP and non-SLP stmts\n");
1605       /* Both the vectorization factor and unroll factor have the form
1606            GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1607            so they must have a common multiple.  */
1608       vectorization_factor
1609           = force_common_multiple (vectorization_factor,
1610                                          LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1611     }
1612 
1613   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1614   if (dump_enabled_p ())
1615     {
1616       dump_printf_loc (MSG_NOTE, vect_location,
1617                            "Updating vectorization factor to ");
1618       dump_dec (MSG_NOTE, vectorization_factor);
1619       dump_printf (MSG_NOTE, ".\n");
1620     }
1621 }
1622 
1623 /* Return true if STMT_INFO describes a double reduction phi and if
1624    the other phi in the reduction is also relevant for vectorization.
1625    This rejects cases such as:
1626 
1627       outer1:
1628           x_1 = PHI <x_3(outer2), ...>;
1629           ...
1630 
1631       inner:
1632           x_2 = ...;
1633           ...
1634 
1635       outer2:
1636           x_3 = PHI <x_2(inner)>;
1637 
1638    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1639 
1640 static bool
vect_active_double_reduction_p(stmt_vec_info stmt_info)1641 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1642 {
1643   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1644     return false;
1645 
1646   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1647 }
1648 
1649 /* Function vect_analyze_loop_operations.
1650 
1651    Scan the loop stmts and make sure they are all vectorizable.  */
1652 
1653 static opt_result
vect_analyze_loop_operations(loop_vec_info loop_vinfo)1654 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1655 {
1656   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1657   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1658   int nbbs = loop->num_nodes;
1659   int i;
1660   stmt_vec_info stmt_info;
1661   bool need_to_vectorize = false;
1662   bool ok;
1663 
1664   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1665 
1666   auto_vec<stmt_info_for_cost> cost_vec;
1667 
1668   for (i = 0; i < nbbs; i++)
1669     {
1670       basic_block bb = bbs[i];
1671 
1672       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1673              gsi_next (&si))
1674         {
1675           gphi *phi = si.phi ();
1676           ok = true;
1677 
1678             stmt_info = loop_vinfo->lookup_stmt (phi);
1679           if (dump_enabled_p ())
1680               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1681             if (virtual_operand_p (gimple_phi_result (phi)))
1682               continue;
1683 
1684           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685              (i.e., a phi in the tail of the outer-loop).  */
1686           if (! is_loop_header_bb_p (bb))
1687             {
1688               /* FORNOW: we currently don't support the case that these phis
1689                  are not used in the outerloop (unless it is double reduction,
1690                  i.e., this phi is vect_reduction_def), cause this case
1691                  requires to actually do something here.  */
1692               if (STMT_VINFO_LIVE_P (stmt_info)
1693                       && !vect_active_double_reduction_p (stmt_info))
1694                     return opt_result::failure_at (phi,
1695                                                          "Unsupported loop-closed phi"
1696                                                          " in outer-loop.\n");
1697 
1698               /* If PHI is used in the outer loop, we check that its operand
1699                  is defined in the inner loop.  */
1700               if (STMT_VINFO_RELEVANT_P (stmt_info))
1701                 {
1702                   tree phi_op;
1703 
1704                   if (gimple_phi_num_args (phi) != 1)
1705                     return opt_result::failure_at (phi, "unsupported phi");
1706 
1707                   phi_op = PHI_ARG_DEF (phi, 0);
1708                       stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1709                       if (!op_def_info)
1710                         return opt_result::failure_at (phi, "unsupported phi\n");
1711 
1712                       if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1713                           && (STMT_VINFO_RELEVANT (op_def_info)
1714                                 != vect_used_in_outer_by_reduction))
1715                         return opt_result::failure_at (phi, "unsupported phi\n");
1716 
1717                       if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1718                            || (STMT_VINFO_DEF_TYPE (stmt_info)
1719                                  == vect_double_reduction_def))
1720                           && !vectorizable_lc_phi (loop_vinfo,
1721                                                          stmt_info, NULL, NULL))
1722                         return opt_result::failure_at (phi, "unsupported phi\n");
1723                 }
1724 
1725               continue;
1726             }
1727 
1728           gcc_assert (stmt_info);
1729 
1730           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1731                || STMT_VINFO_LIVE_P (stmt_info))
1732               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1733               /* A scalar-dependence cycle that we don't support.  */
1734               return opt_result::failure_at (phi,
1735                                                      "not vectorized:"
1736                                                      " scalar dependence cycle.\n");
1737 
1738           if (STMT_VINFO_RELEVANT_P (stmt_info))
1739             {
1740               need_to_vectorize = true;
1741               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1742                       && ! PURE_SLP_STMT (stmt_info))
1743                     ok = vectorizable_induction (loop_vinfo,
1744                                                        stmt_info, NULL, NULL,
1745                                                        &cost_vec);
1746                 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1747                               || (STMT_VINFO_DEF_TYPE (stmt_info)
1748                                   == vect_double_reduction_def)
1749                               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1750                            && ! PURE_SLP_STMT (stmt_info))
1751                     ok = vectorizable_reduction (loop_vinfo,
1752                                                        stmt_info, NULL, NULL, &cost_vec);
1753             }
1754 
1755             /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1756             if (ok
1757                 && STMT_VINFO_LIVE_P (stmt_info)
1758                 && !PURE_SLP_STMT (stmt_info))
1759               ok = vectorizable_live_operation (loop_vinfo,
1760                                                         stmt_info, NULL, NULL, NULL,
1761                                                         -1, false, &cost_vec);
1762 
1763           if (!ok)
1764               return opt_result::failure_at (phi,
1765                                                      "not vectorized: relevant phi not "
1766                                                      "supported: %G",
1767                                                      static_cast <gimple *> (phi));
1768         }
1769 
1770       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771              gsi_next (&si))
1772         {
1773             gimple *stmt = gsi_stmt (si);
1774             if (!gimple_clobber_p (stmt)
1775                 && !is_gimple_debug (stmt))
1776               {
1777                 opt_result res
1778                     = vect_analyze_stmt (loop_vinfo,
1779                                              loop_vinfo->lookup_stmt (stmt),
1780                                              &need_to_vectorize,
1781                                              NULL, NULL, &cost_vec);
1782                 if (!res)
1783                     return res;
1784               }
1785         }
1786     } /* bbs */
1787 
1788   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1789 
1790   /* All operations in the loop are either irrelevant (deal with loop
1791      control, or dead), or only used outside the loop and can be moved
1792      out of the loop (e.g. invariants, inductions).  The loop can be
1793      optimized away by scalar optimizations.  We're better off not
1794      touching this loop.  */
1795   if (!need_to_vectorize)
1796     {
1797       if (dump_enabled_p ())
1798         dump_printf_loc (MSG_NOTE, vect_location,
1799                                "All the computation can be taken out of the loop.\n");
1800       return opt_result::failure_at
1801           (vect_location,
1802            "not vectorized: redundant loop. no profit to vectorize.\n");
1803     }
1804 
1805   return opt_result::success ();
1806 }
1807 
1808 /* Return true if we know that the iteration count is smaller than the
1809    vectorization factor.  Return false if it isn't, or if we can't be sure
1810    either way.  */
1811 
1812 static bool
vect_known_niters_smaller_than_vf(loop_vec_info loop_vinfo)1813 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1814 {
1815   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 
1817   HOST_WIDE_INT max_niter;
1818   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1819     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1820   else
1821     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1822 
1823   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1824     return true;
1825 
1826   return false;
1827 }
1828 
1829 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1830    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1831    definitely no, or -1 if it's worth retrying.  */
1832 
1833 static int
vect_analyze_loop_costing(loop_vec_info loop_vinfo,unsigned * suggested_unroll_factor)1834 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1835                                  unsigned *suggested_unroll_factor)
1836 {
1837   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1838   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1839 
1840   /* Only loops that can handle partially-populated vectors can have iteration
1841      counts less than the vectorization factor.  */
1842   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1843     {
1844       if (vect_known_niters_smaller_than_vf (loop_vinfo))
1845           {
1846             if (dump_enabled_p ())
1847               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848                                    "not vectorized: iteration count smaller than "
1849                                    "vectorization factor.\n");
1850             return 0;
1851           }
1852     }
1853 
1854   /* If using the "very cheap" model. reject cases in which we'd keep
1855      a copy of the scalar code (even if we might be able to vectorize it).  */
1856   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1857       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858             || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1859             || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1860     {
1861       if (dump_enabled_p ())
1862           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863                                "some scalar iterations would need to be peeled\n");
1864       return 0;
1865     }
1866 
1867   int min_profitable_iters, min_profitable_estimate;
1868   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1869                                               &min_profitable_estimate,
1870                                               suggested_unroll_factor);
1871 
1872   if (min_profitable_iters < 0)
1873     {
1874       if (dump_enabled_p ())
1875           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876                                "not vectorized: vectorization not profitable.\n");
1877       if (dump_enabled_p ())
1878           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1879                                "not vectorized: vector version will never be "
1880                                "profitable.\n");
1881       return -1;
1882     }
1883 
1884   int min_scalar_loop_bound = (param_min_vect_loop_bound
1885                                      * assumed_vf);
1886 
1887   /* Use the cost model only if it is more conservative than user specified
1888      threshold.  */
1889   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1890                                             min_profitable_iters);
1891 
1892   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1893 
1894   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1895       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1896     {
1897       if (dump_enabled_p ())
1898           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899                                "not vectorized: vectorization not profitable.\n");
1900       if (dump_enabled_p ())
1901           dump_printf_loc (MSG_NOTE, vect_location,
1902                                "not vectorized: iteration count smaller than user "
1903                                "specified loop bound parameter or minimum profitable "
1904                                "iterations (whichever is more conservative).\n");
1905       return 0;
1906     }
1907 
1908   /* The static profitablity threshold min_profitable_estimate includes
1909      the cost of having to check at runtime whether the scalar loop
1910      should be used instead.  If it turns out that we don't need or want
1911      such a check, the threshold we should use for the static estimate
1912      is simply the point at which the vector loop becomes more profitable
1913      than the scalar loop.  */
1914   if (min_profitable_estimate > min_profitable_iters
1915       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1916       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1917       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1918       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1919     {
1920       if (dump_enabled_p ())
1921           dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1922                                " choice between the scalar and vector loops\n");
1923       min_profitable_estimate = min_profitable_iters;
1924     }
1925 
1926   /* If the vector loop needs multiple iterations to be beneficial then
1927      things are probably too close to call, and the conservative thing
1928      would be to stick with the scalar code.  */
1929   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1930       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1931     {
1932       if (dump_enabled_p ())
1933           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934                                "one iteration of the vector loop would be"
1935                                " more expensive than the equivalent number of"
1936                                " iterations of the scalar loop\n");
1937       return 0;
1938     }
1939 
1940   HOST_WIDE_INT estimated_niter;
1941 
1942   /* If we are vectorizing an epilogue then we know the maximum number of
1943      scalar iterations it will cover is at least one lower than the
1944      vectorization factor of the main loop.  */
1945   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1946     estimated_niter
1947       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1948   else
1949     {
1950       estimated_niter = estimated_stmt_executions_int (loop);
1951       if (estimated_niter == -1)
1952           estimated_niter = likely_max_stmt_executions_int (loop);
1953     }
1954   if (estimated_niter != -1
1955       && ((unsigned HOST_WIDE_INT) estimated_niter
1956             < MAX (th, (unsigned) min_profitable_estimate)))
1957     {
1958       if (dump_enabled_p ())
1959           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960                                "not vectorized: estimated iteration count too "
1961                                "small.\n");
1962       if (dump_enabled_p ())
1963           dump_printf_loc (MSG_NOTE, vect_location,
1964                                "not vectorized: estimated iteration count smaller "
1965                                "than specified loop bound parameter or minimum "
1966                                "profitable iterations (whichever is more "
1967                                "conservative).\n");
1968       return -1;
1969     }
1970 
1971   return 1;
1972 }
1973 
1974 static opt_result
vect_get_datarefs_in_loop(loop_p loop,basic_block * bbs,vec<data_reference_p> * datarefs,unsigned int * n_stmts)1975 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1976                                  vec<data_reference_p> *datarefs,
1977                                  unsigned int *n_stmts)
1978 {
1979   *n_stmts = 0;
1980   for (unsigned i = 0; i < loop->num_nodes; i++)
1981     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1982            !gsi_end_p (gsi); gsi_next (&gsi))
1983       {
1984           gimple *stmt = gsi_stmt (gsi);
1985           if (is_gimple_debug (stmt))
1986             continue;
1987           ++(*n_stmts);
1988           opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1989                                                                       NULL, 0);
1990           if (!res)
1991             {
1992               if (is_gimple_call (stmt) && loop->safelen)
1993                 {
1994                     tree fndecl = gimple_call_fndecl (stmt), op;
1995                     if (fndecl != NULL_TREE)
1996                       {
1997                         cgraph_node *node = cgraph_node::get (fndecl);
1998                         if (node != NULL && node->simd_clones != NULL)
1999                           {
2000                               unsigned int j, n = gimple_call_num_args (stmt);
2001                               for (j = 0; j < n; j++)
2002                                 {
2003                                   op = gimple_call_arg (stmt, j);
2004                                   if (DECL_P (op)
2005                                         || (REFERENCE_CLASS_P (op)
2006                                             && get_base_address (op)))
2007                                     break;
2008                                 }
2009                               op = gimple_call_lhs (stmt);
2010                               /* Ignore #pragma omp declare simd functions
2011                                  if they don't have data references in the
2012                                  call stmt itself.  */
2013                               if (j == n
2014                                   && !(op
2015                                          && (DECL_P (op)
2016                                              || (REFERENCE_CLASS_P (op)
2017                                                    && get_base_address (op)))))
2018                                 continue;
2019                           }
2020                       }
2021                 }
2022               return res;
2023             }
2024           /* If dependence analysis will give up due to the limit on the
2025              number of datarefs stop here and fail fatally.  */
2026           if (datarefs->length ()
2027               > (unsigned)param_loop_max_datarefs_for_datadeps)
2028             return opt_result::failure_at (stmt, "exceeded param "
2029                                                    "loop-max-datarefs-for-datadeps\n");
2030       }
2031   return opt_result::success ();
2032 }
2033 
2034 /* Look for SLP-only access groups and turn each individual access into its own
2035    group.  */
2036 static void
vect_dissolve_slp_only_groups(loop_vec_info loop_vinfo)2037 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2038 {
2039   unsigned int i;
2040   struct data_reference *dr;
2041 
2042   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2043 
2044   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2045   FOR_EACH_VEC_ELT (datarefs, i, dr)
2046     {
2047       gcc_assert (DR_REF (dr));
2048       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2049 
2050       /* Check if the load is a part of an interleaving chain.  */
2051       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052           {
2053             stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2054             dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2055             unsigned int group_size = DR_GROUP_SIZE (first_element);
2056 
2057             /* Check if SLP-only groups.  */
2058             if (!STMT_SLP_TYPE (stmt_info)
2059                 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2060               {
2061                 /* Dissolve the group.  */
2062                 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2063 
2064                 stmt_vec_info vinfo = first_element;
2065                 while (vinfo)
2066                     {
2067                       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2068                       DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2069                       DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2070                       DR_GROUP_SIZE (vinfo) = 1;
2071                       if (STMT_VINFO_STRIDED_P (first_element))
2072                         DR_GROUP_GAP (vinfo) = 0;
2073                       else
2074                         DR_GROUP_GAP (vinfo) = group_size - 1;
2075                       /* Duplicate and adjust alignment info, it needs to
2076                          be present on each group leader, see dr_misalignment.  */
2077                       if (vinfo != first_element)
2078                         {
2079                           dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2080                           dr_info2->target_alignment = dr_info->target_alignment;
2081                           int misalignment = dr_info->misalignment;
2082                           if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2083                               {
2084                                 HOST_WIDE_INT diff
2085                                   = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2086                                      - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2087                                 unsigned HOST_WIDE_INT align_c
2088                                   = dr_info->target_alignment.to_constant ();
2089                                 misalignment = (misalignment + diff) % align_c;
2090                               }
2091                           dr_info2->misalignment = misalignment;
2092                         }
2093                       vinfo = next;
2094                     }
2095               }
2096           }
2097     }
2098 }
2099 
2100 /* Determine if operating on full vectors for LOOP_VINFO might leave
2101    some scalar iterations still to do.  If so, decide how we should
2102    handle those scalar iterations.  The possibilities are:
2103 
2104    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2105        In this case:
2106 
2107            LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2108            LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2109            LOOP_VINFO_PEELING_FOR_NITER == false
2110 
2111    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2112        to handle the remaining scalar iterations.  In this case:
2113 
2114            LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2115            LOOP_VINFO_PEELING_FOR_NITER == true
2116 
2117        There are two choices:
2118 
2119        (2a) Consider vectorizing the epilogue loop at the same VF as the
2120               main loop, but using partial vectors instead of full vectors.
2121               In this case:
2122 
2123                 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2124 
2125        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2126               In this case:
2127 
2128                 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2129 
2130    When FOR_EPILOGUE_P is true, make this determination based on the
2131    assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2132    based on the assumption that LOOP_VINFO is the main loop.  The caller
2133    has made sure that the number of iterations is set appropriately for
2134    this value of FOR_EPILOGUE_P.  */
2135 
2136 opt_result
vect_determine_partial_vectors_and_peeling(loop_vec_info loop_vinfo,bool for_epilogue_p)2137 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2138                                                       bool for_epilogue_p)
2139 {
2140   /* Determine whether there would be any scalar iterations left over.  */
2141   bool need_peeling_or_partial_vectors_p
2142     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2143 
2144   /* Decide whether to vectorize the loop with partial vectors.  */
2145   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2146   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2147   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2148       && need_peeling_or_partial_vectors_p)
2149     {
2150       /* For partial-vector-usage=1, try to push the handling of partial
2151            vectors to the epilogue, with the main loop continuing to operate
2152            on full vectors.
2153 
2154            If we are unrolling we also do not want to use partial vectors. This
2155            is to avoid the overhead of generating multiple masks and also to
2156            avoid having to execute entire iterations of FALSE masked instructions
2157            when dealing with one or less full iterations.
2158 
2159            ??? We could then end up failing to use partial vectors if we
2160            decide to peel iterations into a prologue, and if the main loop
2161            then ends up processing fewer than VF iterations.  */
2162       if ((param_vect_partial_vector_usage == 1
2163              || loop_vinfo->suggested_unroll_factor > 1)
2164             && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2165             && !vect_known_niters_smaller_than_vf (loop_vinfo))
2166           LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2167       else
2168           LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2169     }
2170 
2171   if (dump_enabled_p ())
2172     {
2173       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2174           dump_printf_loc (MSG_NOTE, vect_location,
2175                                "operating on partial vectors%s.\n",
2176                                for_epilogue_p ? " for epilogue loop" : "");
2177       else
2178           dump_printf_loc (MSG_NOTE, vect_location,
2179                                "operating only on full vectors%s.\n",
2180                                for_epilogue_p ? " for epilogue loop" : "");
2181     }
2182 
2183   if (for_epilogue_p)
2184     {
2185       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2186       gcc_assert (orig_loop_vinfo);
2187       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2188           gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2189                                     LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2190     }
2191 
2192   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2194     {
2195       /* Check that the loop processes at least one full vector.  */
2196       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197       tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2198       if (known_lt (wi::to_widest (scalar_niters), vf))
2199           return opt_result::failure_at (vect_location,
2200                                                "loop does not have enough iterations"
2201                                                " to support vectorization.\n");
2202 
2203       /* If we need to peel an extra epilogue iteration to handle data
2204            accesses with gaps, check that there are enough scalar iterations
2205            available.
2206 
2207            The check above is redundant with this one when peeling for gaps,
2208            but the distinction is useful for diagnostics.  */
2209       tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2210       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2211             && known_lt (wi::to_widest (scalar_nitersm1), vf))
2212           return opt_result::failure_at (vect_location,
2213                                                "loop does not have enough iterations"
2214                                                " to support peeling for gaps.\n");
2215     }
2216 
2217   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2218     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2219        && need_peeling_or_partial_vectors_p);
2220 
2221   return opt_result::success ();
2222 }
2223 
2224 /* Function vect_analyze_loop_2.
2225 
2226    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2227    for it.  The different analyses will record information in the
2228    loop_vec_info struct.  */
2229 static opt_result
vect_analyze_loop_2(loop_vec_info loop_vinfo,bool & fatal,unsigned * suggested_unroll_factor)2230 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2231                          unsigned *suggested_unroll_factor)
2232 {
2233   opt_result ok = opt_result::success ();
2234   int res;
2235   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2236   poly_uint64 min_vf = 2;
2237   loop_vec_info orig_loop_vinfo = NULL;
2238 
2239   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2240      loop_vec_info of the first vectorized loop.  */
2241   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2242     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2243   else
2244     orig_loop_vinfo = loop_vinfo;
2245   gcc_assert (orig_loop_vinfo);
2246 
2247   /* The first group of checks is independent of the vector size.  */
2248   fatal = true;
2249 
2250   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2251       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2252     return opt_result::failure_at (vect_location,
2253                                            "not vectorized: simd if(0)\n");
2254 
2255   /* Find all data references in the loop (which correspond to vdefs/vuses)
2256      and analyze their evolution in the loop.  */
2257 
2258   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2259 
2260   /* Gather the data references and count stmts in the loop.  */
2261   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2262     {
2263       opt_result res
2264           = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2265                                              &LOOP_VINFO_DATAREFS (loop_vinfo),
2266                                              &LOOP_VINFO_N_STMTS (loop_vinfo));
2267       if (!res)
2268           {
2269             if (dump_enabled_p ())
2270               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271                                    "not vectorized: loop contains function "
2272                                    "calls or data references that cannot "
2273                                    "be analyzed\n");
2274             return res;
2275           }
2276       loop_vinfo->shared->save_datarefs ();
2277     }
2278   else
2279     loop_vinfo->shared->check_datarefs ();
2280 
2281   /* Analyze the data references and also adjust the minimal
2282      vectorization factor according to the loads and stores.  */
2283 
2284   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289                                "bad data references.\n");
2290       return ok;
2291     }
2292 
2293   /* Classify all cross-iteration scalar data-flow cycles.
2294      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2295   vect_analyze_scalar_cycles (loop_vinfo);
2296 
2297   vect_pattern_recog (loop_vinfo);
2298 
2299   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2300 
2301   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2302      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2303 
2304   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2305   if (!ok)
2306     {
2307       if (dump_enabled_p ())
2308           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309                                "bad data access.\n");
2310       return ok;
2311     }
2312 
2313   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2314 
2315   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2316   if (!ok)
2317     {
2318       if (dump_enabled_p ())
2319           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320                                "unexpected pattern.\n");
2321       return ok;
2322     }
2323 
2324   /* While the rest of the analysis below depends on it in some way.  */
2325   fatal = false;
2326 
2327   /* Analyze data dependences between the data-refs in the loop
2328      and adjust the maximum vectorization factor according to
2329      the dependences.
2330      FORNOW: fail at the first data dependence that we encounter.  */
2331 
2332   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2333   if (!ok)
2334     {
2335       if (dump_enabled_p ())
2336           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337                                "bad data dependence.\n");
2338       return ok;
2339     }
2340   if (max_vf != MAX_VECTORIZATION_FACTOR
2341       && maybe_lt (max_vf, min_vf))
2342     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2343   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2344 
2345   ok = vect_determine_vectorization_factor (loop_vinfo);
2346   if (!ok)
2347     {
2348       if (dump_enabled_p ())
2349           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2350                                "can't determine vectorization factor.\n");
2351       return ok;
2352     }
2353   if (max_vf != MAX_VECTORIZATION_FACTOR
2354       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2355     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2356 
2357   /* Compute the scalar iteration cost.  */
2358   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2359 
2360   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 
2362   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2363   ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2364   if (!ok)
2365     return ok;
2366 
2367   /* If there are any SLP instances mark them as pure_slp.  */
2368   bool slp = vect_make_slp_decision (loop_vinfo);
2369   if (slp)
2370     {
2371       /* Find stmts that need to be both vectorized and SLPed.  */
2372       vect_detect_hybrid_slp (loop_vinfo);
2373 
2374       /* Update the vectorization factor based on the SLP decision.  */
2375       vect_update_vf_for_slp (loop_vinfo);
2376 
2377       /* Optimize the SLP graph with the vectorization factor fixed.  */
2378       vect_optimize_slp (loop_vinfo);
2379 
2380       /* Gather the loads reachable from the SLP graph entries.  */
2381       vect_gather_slp_loads (loop_vinfo);
2382     }
2383 
2384   bool saved_can_use_partial_vectors_p
2385     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2386 
2387   /* We don't expect to have to roll back to anything other than an empty
2388      set of rgroups.  */
2389   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2390 
2391   /* This is the point where we can re-start analysis with SLP forced off.  */
2392 start_over:
2393 
2394   /* Apply the suggested unrolling factor, this was determined by the backend
2395      during finish_cost the first time we ran the analyzis for this
2396      vector mode.  */
2397   if (loop_vinfo->suggested_unroll_factor > 1)
2398     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2399 
2400   /* Now the vectorization factor is final.  */
2401   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2402   gcc_assert (known_ne (vectorization_factor, 0U));
2403 
2404   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2405     {
2406       dump_printf_loc (MSG_NOTE, vect_location,
2407                            "vectorization_factor = ");
2408       dump_dec (MSG_NOTE, vectorization_factor);
2409       dump_printf (MSG_NOTE, ", niters = %wd\n",
2410                        LOOP_VINFO_INT_NITERS (loop_vinfo));
2411     }
2412 
2413   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2414 
2415   /* Analyze the alignment of the data-refs in the loop.
2416      Fail if a data reference is found that cannot be vectorized.  */
2417 
2418   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2419   if (!ok)
2420     {
2421       if (dump_enabled_p ())
2422           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2423                                "bad data alignment.\n");
2424       return ok;
2425     }
2426 
2427   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2428      It is important to call pruning after vect_analyze_data_ref_accesses,
2429      since we use grouping information gathered by interleaving analysis.  */
2430   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2431   if (!ok)
2432     return ok;
2433 
2434   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2435      vectorization, since we do not want to add extra peeling or
2436      add versioning for alignment.  */
2437   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2438     /* This pass will decide on using loop versioning and/or loop peeling in
2439        order to enhance the alignment of data references in the loop.  */
2440     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2441   if (!ok)
2442     return ok;
2443 
2444   if (slp)
2445     {
2446       /* Analyze operations in the SLP instances.  Note this may
2447            remove unsupported SLP instances which makes the above
2448            SLP kind detection invalid.  */
2449       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2450       vect_slp_analyze_operations (loop_vinfo);
2451       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2452           {
2453             ok = opt_result::failure_at (vect_location,
2454                                                "unsupported SLP instances\n");
2455             goto again;
2456           }
2457 
2458       /* Check whether any load in ALL SLP instances is possibly permuted.  */
2459       slp_tree load_node, slp_root;
2460       unsigned i, x;
2461       slp_instance instance;
2462       bool can_use_lanes = true;
2463       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2464           {
2465             slp_root = SLP_INSTANCE_TREE (instance);
2466             int group_size = SLP_TREE_LANES (slp_root);
2467             tree vectype = SLP_TREE_VECTYPE (slp_root);
2468             bool loads_permuted = false;
2469             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2470               {
2471                 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2472                     continue;
2473                 unsigned j;
2474                 stmt_vec_info load_info;
2475                 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2476                     if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2477                       {
2478                         loads_permuted = true;
2479                         break;
2480                       }
2481               }
2482 
2483             /* If the loads and stores can be handled with load/store-lane
2484                instructions record it and move on to the next instance.  */
2485             if (loads_permuted
2486                 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2487                 && vect_store_lanes_supported (vectype, group_size, false))
2488               {
2489                 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2490                     {
2491                       stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2492                           (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2493                       /* Use SLP for strided accesses (or if we can't
2494                          load-lanes).  */
2495                       if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2496                           || ! vect_load_lanes_supported
2497                                   (STMT_VINFO_VECTYPE (stmt_vinfo),
2498                                    DR_GROUP_SIZE (stmt_vinfo), false))
2499                         break;
2500                     }
2501 
2502                 can_use_lanes
2503                     = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2504 
2505                 if (can_use_lanes && dump_enabled_p ())
2506                     dump_printf_loc (MSG_NOTE, vect_location,
2507                                          "SLP instance %p can use load/store-lanes\n",
2508                                          instance);
2509               }
2510             else
2511               {
2512                 can_use_lanes = false;
2513                 break;
2514               }
2515           }
2516 
2517       /* If all SLP instances can use load/store-lanes abort SLP and try again
2518            with SLP disabled.  */
2519       if (can_use_lanes)
2520           {
2521             ok = opt_result::failure_at (vect_location,
2522                                                "Built SLP cancelled: can use "
2523                                                "load/store-lanes\n");
2524             if (dump_enabled_p ())
2525               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526                                    "Built SLP cancelled: all SLP instances support "
2527                                    "load/store-lanes\n");
2528             goto again;
2529           }
2530     }
2531 
2532   /* Dissolve SLP-only groups.  */
2533   vect_dissolve_slp_only_groups (loop_vinfo);
2534 
2535   /* Scan all the remaining operations in the loop that are not subject
2536      to SLP and make sure they are vectorizable.  */
2537   ok = vect_analyze_loop_operations (loop_vinfo);
2538   if (!ok)
2539     {
2540       if (dump_enabled_p ())
2541           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542                                "bad operation or unsupported loop bound.\n");
2543       return ok;
2544     }
2545 
2546   /* For now, we don't expect to mix both masking and length approaches for one
2547      loop, disable it if both are recorded.  */
2548   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2549       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2550       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2551     {
2552       if (dump_enabled_p ())
2553           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554                                "can't vectorize a loop with partial vectors"
2555                                " because we don't expect to mix different"
2556                                " approaches with partial vectors for the"
2557                                " same loop.\n");
2558       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2559     }
2560 
2561   /* If we still have the option of using partial vectors,
2562      check whether we can generate the necessary loop controls.  */
2563   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2564       && !vect_verify_full_masking (loop_vinfo)
2565       && !vect_verify_loop_lens (loop_vinfo))
2566     LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2567 
2568   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2569      to be able to handle fewer than VF scalars, or needs to have a lower VF
2570      than the main loop.  */
2571   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2572       && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2573       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2574                        LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2575     return opt_result::failure_at (vect_location,
2576                                            "Vectorization factor too high for"
2577                                            " epilogue loop.\n");
2578 
2579   /* Decide whether this loop_vinfo should use partial vectors or peeling,
2580      assuming that the loop will be used as a main loop.  We will redo
2581      this analysis later if we instead decide to use the loop as an
2582      epilogue loop.  */
2583   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2584   if (!ok)
2585     return ok;
2586 
2587   /* Check the costings of the loop make vectorizing worthwhile.  */
2588   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2589   if (res < 0)
2590     {
2591       ok = opt_result::failure_at (vect_location,
2592                                            "Loop costings may not be worthwhile.\n");
2593       goto again;
2594     }
2595   if (!res)
2596     return opt_result::failure_at (vect_location,
2597                                            "Loop costings not worthwhile.\n");
2598 
2599   /* If an epilogue loop is required make sure we can create one.  */
2600   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2601       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2602     {
2603       if (dump_enabled_p ())
2604         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2605       if (!vect_can_advance_ivs_p (loop_vinfo)
2606             || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2607                                                      single_exit (LOOP_VINFO_LOOP
2608                                                                        (loop_vinfo))))
2609         {
2610             ok = opt_result::failure_at (vect_location,
2611                                                "not vectorized: can't create required "
2612                                                "epilog loop\n");
2613           goto again;
2614         }
2615     }
2616 
2617   /* During peeling, we need to check if number of loop iterations is
2618      enough for both peeled prolog loop and vector loop.  This check
2619      can be merged along with threshold check of loop versioning, so
2620      increase threshold for this case if necessary.
2621 
2622      If we are analyzing an epilogue we still want to check what its
2623      versioning threshold would be.  If we decide to vectorize the epilogues we
2624      will want to use the lowest versioning threshold of all epilogues and main
2625      loop.  This will enable us to enter a vectorized epilogue even when
2626      versioning the loop.  We can't simply check whether the epilogue requires
2627      versioning though since we may have skipped some versioning checks when
2628      analyzing the epilogue.  For instance, checks for alias versioning will be
2629      skipped when dealing with epilogues as we assume we already checked them
2630      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2631   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2632     {
2633       poly_uint64 niters_th = 0;
2634       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2635 
2636       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2637           {
2638             /* Niters for peeled prolog loop.  */
2639             if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2640               {
2641                 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2642                 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2643                 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2644               }
2645             else
2646               niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2647           }
2648 
2649       /* Niters for at least one iteration of vectorized loop.  */
2650       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2651           niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2652       /* One additional iteration because of peeling for gap.  */
2653       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2654           niters_th += 1;
2655 
2656       /*  Use the same condition as vect_transform_loop to decide when to use
2657             the cost to determine a versioning threshold.  */
2658       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2659             && ordered_p (th, niters_th))
2660           niters_th = ordered_max (poly_uint64 (th), niters_th);
2661 
2662       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2663     }
2664 
2665   gcc_assert (known_eq (vectorization_factor,
2666                               LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2667 
2668   /* Ok to vectorize!  */
2669   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2670   return opt_result::success ();
2671 
2672 again:
2673   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2674   gcc_assert (!ok);
2675 
2676   /* Try again with SLP forced off but if we didn't do any SLP there is
2677      no point in re-trying.  */
2678   if (!slp)
2679     return ok;
2680 
2681   /* If there are reduction chains re-trying will fail anyway.  */
2682   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2683     return ok;
2684 
2685   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2686      via interleaving or lane instructions.  */
2687   slp_instance instance;
2688   slp_tree node;
2689   unsigned i, j;
2690   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2691     {
2692       stmt_vec_info vinfo;
2693       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2694       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2695           continue;
2696       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2697       unsigned int size = DR_GROUP_SIZE (vinfo);
2698       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2699       if (! vect_store_lanes_supported (vectype, size, false)
2700            && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2701            && ! vect_grouped_store_supported (vectype, size))
2702           return opt_result::failure_at (vinfo->stmt,
2703                                                "unsupported grouped store\n");
2704       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2705           {
2706             vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2707             vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2708             bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2709             size = DR_GROUP_SIZE (vinfo);
2710             vectype = STMT_VINFO_VECTYPE (vinfo);
2711             if (! vect_load_lanes_supported (vectype, size, false)
2712                 && ! vect_grouped_load_supported (vectype, single_element_p,
2713                                                             size))
2714               return opt_result::failure_at (vinfo->stmt,
2715                                                      "unsupported grouped load\n");
2716           }
2717     }
2718 
2719   if (dump_enabled_p ())
2720     dump_printf_loc (MSG_NOTE, vect_location,
2721                          "re-trying with SLP disabled\n");
2722 
2723   /* Roll back state appropriately.  No SLP this time.  */
2724   slp = false;
2725   /* Restore vectorization factor as it were without SLP.  */
2726   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2727   /* Free the SLP instances.  */
2728   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2729     vect_free_slp_instance (instance);
2730   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2731   /* Reset SLP type to loop_vect on all stmts.  */
2732   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2733     {
2734       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2735       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2736              !gsi_end_p (si); gsi_next (&si))
2737           {
2738             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2739             STMT_SLP_TYPE (stmt_info) = loop_vect;
2740             if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2741                 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2742               {
2743                 /* vectorizable_reduction adjusts reduction stmt def-types,
2744                      restore them to that of the PHI.  */
2745                 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2746                     = STMT_VINFO_DEF_TYPE (stmt_info);
2747                 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2748                                                   (STMT_VINFO_REDUC_DEF (stmt_info)))
2749                     = STMT_VINFO_DEF_TYPE (stmt_info);
2750               }
2751           }
2752       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2753              !gsi_end_p (si); gsi_next (&si))
2754           {
2755             if (is_gimple_debug (gsi_stmt (si)))
2756               continue;
2757             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2758             STMT_SLP_TYPE (stmt_info) = loop_vect;
2759             if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2760               {
2761                 stmt_vec_info pattern_stmt_info
2762                     = STMT_VINFO_RELATED_STMT (stmt_info);
2763                 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2764                     STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2765 
2766                 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2767                 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2768                 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2769                        !gsi_end_p (pi); gsi_next (&pi))
2770                     STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2771                       = loop_vect;
2772               }
2773           }
2774     }
2775   /* Free optimized alias test DDRS.  */
2776   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2777   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2778   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2779   /* Reset target cost data.  */
2780   delete loop_vinfo->vector_costs;
2781   loop_vinfo->vector_costs = nullptr;
2782   /* Reset accumulated rgroup information.  */
2783   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2784   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2785   /* Reset assorted flags.  */
2786   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2787   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2788   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2789   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2790   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2791     = saved_can_use_partial_vectors_p;
2792 
2793   goto start_over;
2794 }
2795 
2796 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2797    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2798    OLD_LOOP_VINFO is better unless something specifically indicates
2799    otherwise.
2800 
2801    Note that this deliberately isn't a partial order.  */
2802 
2803 static bool
vect_better_loop_vinfo_p(loop_vec_info new_loop_vinfo,loop_vec_info old_loop_vinfo)2804 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2805                                 loop_vec_info old_loop_vinfo)
2806 {
2807   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2808   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2809 
2810   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2811   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2812 
2813   /* Always prefer a VF of loop->simdlen over any other VF.  */
2814   if (loop->simdlen)
2815     {
2816       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2817       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2818       if (new_simdlen_p != old_simdlen_p)
2819           return new_simdlen_p;
2820     }
2821 
2822   const auto *old_costs = old_loop_vinfo->vector_costs;
2823   const auto *new_costs = new_loop_vinfo->vector_costs;
2824   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2825     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2826 
2827   return new_costs->better_main_loop_than_p (old_costs);
2828 }
2829 
2830 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2831    true if we should.  */
2832 
2833 static bool
vect_joust_loop_vinfos(loop_vec_info new_loop_vinfo,loop_vec_info old_loop_vinfo)2834 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2835                               loop_vec_info old_loop_vinfo)
2836 {
2837   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2838     return false;
2839 
2840   if (dump_enabled_p ())
2841     dump_printf_loc (MSG_NOTE, vect_location,
2842                          "***** Preferring vector mode %s to vector mode %s\n",
2843                          GET_MODE_NAME (new_loop_vinfo->vector_mode),
2844                          GET_MODE_NAME (old_loop_vinfo->vector_mode));
2845   return true;
2846 }
2847 
2848 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2849    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2850    MODE_I to the next mode useful to analyze.
2851    Return the loop_vinfo on success and wrapped null on failure.  */
2852 
2853 static opt_loop_vec_info
vect_analyze_loop_1(class loop * loop,vec_info_shared * shared,const vect_loop_form_info * loop_form_info,loop_vec_info main_loop_vinfo,const vector_modes & vector_modes,unsigned & mode_i,machine_mode & autodetected_vector_mode,bool & fatal)2854 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2855                          const vect_loop_form_info *loop_form_info,
2856                          loop_vec_info main_loop_vinfo,
2857                          const vector_modes &vector_modes, unsigned &mode_i,
2858                          machine_mode &autodetected_vector_mode,
2859                          bool &fatal)
2860 {
2861   loop_vec_info loop_vinfo
2862     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2863 
2864   machine_mode vector_mode = vector_modes[mode_i];
2865   loop_vinfo->vector_mode = vector_mode;
2866   unsigned int suggested_unroll_factor = 1;
2867 
2868   /* Run the main analysis.  */
2869   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2870                                                   &suggested_unroll_factor);
2871   if (dump_enabled_p ())
2872     dump_printf_loc (MSG_NOTE, vect_location,
2873                          "***** Analysis %s with vector mode %s\n",
2874                          res ? "succeeded" : " failed",
2875                          GET_MODE_NAME (loop_vinfo->vector_mode));
2876 
2877   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
2878     {
2879       if (dump_enabled_p ())
2880           dump_printf_loc (MSG_NOTE, vect_location,
2881                                "***** Re-trying analysis for unrolling"
2882                                " with unroll factor %d.\n",
2883                                suggested_unroll_factor);
2884       loop_vec_info unroll_vinfo
2885           = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2886       unroll_vinfo->vector_mode = vector_mode;
2887       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2888       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL);
2889       if (new_res)
2890           {
2891             delete loop_vinfo;
2892             loop_vinfo = unroll_vinfo;
2893           }
2894       else
2895           delete unroll_vinfo;
2896     }
2897 
2898   /* Remember the autodetected vector mode.  */
2899   if (vector_mode == VOIDmode)
2900     autodetected_vector_mode = loop_vinfo->vector_mode;
2901 
2902   /* Advance mode_i, first skipping modes that would result in the
2903      same analysis result.  */
2904   while (mode_i + 1 < vector_modes.length ()
2905            && vect_chooses_same_modes_p (loop_vinfo,
2906                                                vector_modes[mode_i + 1]))
2907     {
2908       if (dump_enabled_p ())
2909           dump_printf_loc (MSG_NOTE, vect_location,
2910                                "***** The result for vector mode %s would"
2911                                " be the same\n",
2912                                GET_MODE_NAME (vector_modes[mode_i + 1]));
2913       mode_i += 1;
2914     }
2915   if (mode_i + 1 < vector_modes.length ()
2916       && VECTOR_MODE_P (autodetected_vector_mode)
2917       && (related_vector_mode (vector_modes[mode_i + 1],
2918                                      GET_MODE_INNER (autodetected_vector_mode))
2919             == autodetected_vector_mode)
2920       && (related_vector_mode (autodetected_vector_mode,
2921                                      GET_MODE_INNER (vector_modes[mode_i + 1]))
2922             == vector_modes[mode_i + 1]))
2923     {
2924       if (dump_enabled_p ())
2925           dump_printf_loc (MSG_NOTE, vect_location,
2926                                "***** Skipping vector mode %s, which would"
2927                                " repeat the analysis for %s\n",
2928                                GET_MODE_NAME (vector_modes[mode_i + 1]),
2929                                GET_MODE_NAME (autodetected_vector_mode));
2930       mode_i += 1;
2931     }
2932   mode_i++;
2933 
2934   if (!res)
2935     {
2936       delete loop_vinfo;
2937       if (fatal)
2938           gcc_checking_assert (main_loop_vinfo == NULL);
2939       return opt_loop_vec_info::propagate_failure (res);
2940     }
2941 
2942   return opt_loop_vec_info::success (loop_vinfo);
2943 }
2944 
2945 /* Function vect_analyze_loop.
2946 
2947    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2948    for it.  The different analyses will record information in the
2949    loop_vec_info struct.  */
2950 opt_loop_vec_info
vect_analyze_loop(class loop * loop,vec_info_shared * shared)2951 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2952 {
2953   DUMP_VECT_SCOPE ("analyze_loop_nest");
2954 
2955   if (loop_outer (loop)
2956       && loop_vec_info_for_loop (loop_outer (loop))
2957       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2958     return opt_loop_vec_info::failure_at (vect_location,
2959                                                     "outer-loop already vectorized.\n");
2960 
2961   if (!find_loop_nest (loop, &shared->loop_nest))
2962     return opt_loop_vec_info::failure_at
2963       (vect_location,
2964        "not vectorized: loop nest containing two or more consecutive inner"
2965        " loops cannot be vectorized\n");
2966 
2967   /* Analyze the loop form.  */
2968   vect_loop_form_info loop_form_info;
2969   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2970   if (!res)
2971     {
2972       if (dump_enabled_p ())
2973           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974                                "bad loop form.\n");
2975       return opt_loop_vec_info::propagate_failure (res);
2976     }
2977   if (!integer_onep (loop_form_info.assumptions))
2978     {
2979       /* We consider to vectorize this loop by versioning it under
2980            some assumptions.  In order to do this, we need to clear
2981            existing information computed by scev and niter analyzer.  */
2982       scev_reset_htab ();
2983       free_numbers_of_iterations_estimates (loop);
2984       /* Also set flag for this loop so that following scev and niter
2985            analysis are done under the assumptions.  */
2986       loop_constraint_set (loop, LOOP_C_FINITE);
2987     }
2988 
2989   auto_vector_modes vector_modes;
2990   /* Autodetect first vector size we try.  */
2991   vector_modes.safe_push (VOIDmode);
2992   unsigned int autovec_flags
2993     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2994                                                                 loop->simdlen != 0);
2995   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2996                                    && !unlimited_cost_model (loop));
2997   machine_mode autodetected_vector_mode = VOIDmode;
2998   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2999   unsigned int mode_i = 0;
3000   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3001 
3002   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3003      a mode has not been analyzed.  */
3004   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3005   for (unsigned i = 0; i < vector_modes.length (); ++i)
3006     cached_vf_per_mode.safe_push (0);
3007 
3008   /* First determine the main loop vectorization mode, either the first
3009      one that works, starting with auto-detecting the vector mode and then
3010      following the targets order of preference, or the one with the
3011      lowest cost if pick_lowest_cost_p.  */
3012   while (1)
3013     {
3014       bool fatal;
3015       unsigned int last_mode_i = mode_i;
3016       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3017            failed.  */
3018       cached_vf_per_mode[last_mode_i] = -1;
3019       opt_loop_vec_info loop_vinfo
3020           = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3021                                      NULL, vector_modes, mode_i,
3022                                      autodetected_vector_mode, fatal);
3023       if (fatal)
3024           break;
3025 
3026       if (loop_vinfo)
3027           {
3028             /*  Analyzis has been successful so update the VF value.  The
3029                 VF should always be a multiple of unroll_factor and we want to
3030                 capture the original VF here.  */
3031             cached_vf_per_mode[last_mode_i]
3032               = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3033                                loop_vinfo->suggested_unroll_factor);
3034             /* Once we hit the desired simdlen for the first time,
3035                discard any previous attempts.  */
3036             if (simdlen
3037                 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3038               {
3039                 delete first_loop_vinfo;
3040                 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3041                 simdlen = 0;
3042               }
3043             else if (pick_lowest_cost_p
3044                        && first_loop_vinfo
3045                        && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3046               {
3047                 /* Pick loop_vinfo over first_loop_vinfo.  */
3048                 delete first_loop_vinfo;
3049                 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050               }
3051             if (first_loop_vinfo == NULL)
3052               first_loop_vinfo = loop_vinfo;
3053             else
3054               {
3055                 delete loop_vinfo;
3056                 loop_vinfo = opt_loop_vec_info::success (NULL);
3057               }
3058 
3059             /* Commit to first_loop_vinfo if we have no reason to try
3060                alternatives.  */
3061             if (!simdlen && !pick_lowest_cost_p)
3062               break;
3063           }
3064       if (mode_i == vector_modes.length ()
3065             || autodetected_vector_mode == VOIDmode)
3066           break;
3067 
3068       /* Try the next biggest vector size.  */
3069       if (dump_enabled_p ())
3070           dump_printf_loc (MSG_NOTE, vect_location,
3071                                "***** Re-trying analysis with vector mode %s\n",
3072                                GET_MODE_NAME (vector_modes[mode_i]));
3073     }
3074   if (!first_loop_vinfo)
3075     return opt_loop_vec_info::propagate_failure (res);
3076 
3077   if (dump_enabled_p ())
3078     dump_printf_loc (MSG_NOTE, vect_location,
3079                          "***** Choosing vector mode %s\n",
3080                          GET_MODE_NAME (first_loop_vinfo->vector_mode));
3081 
3082   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3083      enabled, SIMDUID is not set, it is the innermost loop and we have
3084      either already found the loop's SIMDLEN or there was no SIMDLEN to
3085      begin with.
3086      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3087   bool vect_epilogues = (!simdlen
3088                                && loop->inner == NULL
3089                                && param_vect_epilogues_nomask
3090                                && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3091                                && !loop->simduid);
3092   if (!vect_epilogues)
3093     return first_loop_vinfo;
3094 
3095   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3096   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3097 
3098   /* For epilogues start the analysis from the first mode.  The motivation
3099      behind starting from the beginning comes from cases where the VECTOR_MODES
3100      array may contain length-agnostic and length-specific modes.  Their
3101      ordering is not guaranteed, so we could end up picking a mode for the main
3102      loop that is after the epilogue's optimal mode.  */
3103   vector_modes[0] = autodetected_vector_mode;
3104   mode_i = 0;
3105 
3106   bool supports_partial_vectors =
3107     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3108   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3109 
3110   while (1)
3111     {
3112       /* If the target does not support partial vectors we can shorten the
3113            number of modes to analyze for the epilogue as we know we can't pick a
3114            mode that would lead to a VF at least as big as the
3115            FIRST_VINFO_VF.  */
3116       if (!supports_partial_vectors
3117             && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3118           {
3119             mode_i++;
3120             if (mode_i == vector_modes.length ())
3121               break;
3122             continue;
3123           }
3124 
3125       if (dump_enabled_p ())
3126           dump_printf_loc (MSG_NOTE, vect_location,
3127                                "***** Re-trying epilogue analysis with vector "
3128                                "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3129 
3130       bool fatal;
3131       opt_loop_vec_info loop_vinfo
3132           = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133                                      first_loop_vinfo,
3134                                      vector_modes, mode_i,
3135                                      autodetected_vector_mode, fatal);
3136       if (fatal)
3137           break;
3138 
3139       if (loop_vinfo)
3140           {
3141             if (pick_lowest_cost_p)
3142               {
3143                 /* Keep trying to roll back vectorization attempts while the
3144                      loop_vec_infos they produced were worse than this one.  */
3145                 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3146                 while (!vinfos.is_empty ()
3147                          && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3148                     {
3149                       gcc_assert (vect_epilogues);
3150                       delete vinfos.pop ();
3151                     }
3152               }
3153             /* For now only allow one epilogue loop.  */
3154             if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3155               {
3156                 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3157                 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3158                 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3159                                 || maybe_ne (lowest_th, 0U));
3160                 /* Keep track of the known smallest versioning
3161                      threshold.  */
3162                 if (ordered_p (lowest_th, th))
3163                     lowest_th = ordered_min (lowest_th, th);
3164               }
3165             else
3166               {
3167                 delete loop_vinfo;
3168                 loop_vinfo = opt_loop_vec_info::success (NULL);
3169               }
3170 
3171             /* For now only allow one epilogue loop, but allow
3172                pick_lowest_cost_p to replace it, so commit to the
3173                first epilogue if we have no reason to try alternatives.  */
3174             if (!pick_lowest_cost_p)
3175               break;
3176           }
3177 
3178       if (mode_i == vector_modes.length ())
3179           break;
3180 
3181     }
3182 
3183   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3184     {
3185       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3186       if (dump_enabled_p ())
3187           dump_printf_loc (MSG_NOTE, vect_location,
3188                                "***** Choosing epilogue vector mode %s\n",
3189                                GET_MODE_NAME
3190                                  (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3191     }
3192 
3193   return first_loop_vinfo;
3194 }
3195 
3196 /* Return true if there is an in-order reduction function for CODE, storing
3197    it in *REDUC_FN if so.  */
3198 
3199 static bool
fold_left_reduction_fn(code_helper code,internal_fn * reduc_fn)3200 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3201 {
3202   if (code == PLUS_EXPR)
3203     {
3204       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3205       return true;
3206     }
3207   return false;
3208 }
3209 
3210 /* Function reduction_fn_for_scalar_code
3211 
3212    Input:
3213    CODE - tree_code of a reduction operations.
3214 
3215    Output:
3216    REDUC_FN - the corresponding internal function to be used to reduce the
3217       vector of partial results into a single scalar result, or IFN_LAST
3218       if the operation is a supported reduction operation, but does not have
3219       such an internal function.
3220 
3221    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3222 
3223 bool
reduction_fn_for_scalar_code(code_helper code,internal_fn * reduc_fn)3224 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3225 {
3226   if (code.is_tree_code ())
3227     switch (tree_code (code))
3228       {
3229       case MAX_EXPR:
3230           *reduc_fn = IFN_REDUC_MAX;
3231           return true;
3232 
3233       case MIN_EXPR:
3234           *reduc_fn = IFN_REDUC_MIN;
3235           return true;
3236 
3237       case PLUS_EXPR:
3238           *reduc_fn = IFN_REDUC_PLUS;
3239           return true;
3240 
3241       case BIT_AND_EXPR:
3242           *reduc_fn = IFN_REDUC_AND;
3243           return true;
3244 
3245       case BIT_IOR_EXPR:
3246           *reduc_fn = IFN_REDUC_IOR;
3247           return true;
3248 
3249       case BIT_XOR_EXPR:
3250           *reduc_fn = IFN_REDUC_XOR;
3251           return true;
3252 
3253       case MULT_EXPR:
3254       case MINUS_EXPR:
3255           *reduc_fn = IFN_LAST;
3256           return true;
3257 
3258       default:
3259           return false;
3260       }
3261   else
3262     switch (combined_fn (code))
3263       {
3264       CASE_CFN_FMAX:
3265           *reduc_fn = IFN_REDUC_FMAX;
3266           return true;
3267 
3268       CASE_CFN_FMIN:
3269           *reduc_fn = IFN_REDUC_FMIN;
3270           return true;
3271 
3272       default:
3273           return false;
3274       }
3275 }
3276 
3277 /* If there is a neutral value X such that a reduction would not be affected
3278    by the introduction of additional X elements, return that X, otherwise
3279    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3280    of the scalar elements.  If the reduction has just a single initial value
3281    then INITIAL_VALUE is that value, otherwise it is null.  */
3282 
3283 tree
neutral_op_for_reduction(tree scalar_type,code_helper code,tree initial_value)3284 neutral_op_for_reduction (tree scalar_type, code_helper code,
3285                                 tree initial_value)
3286 {
3287   if (code.is_tree_code ())
3288     switch (tree_code (code))
3289       {
3290       case WIDEN_SUM_EXPR:
3291       case DOT_PROD_EXPR:
3292       case SAD_EXPR:
3293       case PLUS_EXPR:
3294       case MINUS_EXPR:
3295       case BIT_IOR_EXPR:
3296       case BIT_XOR_EXPR:
3297           return build_zero_cst (scalar_type);
3298 
3299       case MULT_EXPR:
3300           return build_one_cst (scalar_type);
3301 
3302       case BIT_AND_EXPR:
3303           return build_all_ones_cst (scalar_type);
3304 
3305       case MAX_EXPR:
3306       case MIN_EXPR:
3307           return initial_value;
3308 
3309       default:
3310           return NULL_TREE;
3311       }
3312   else
3313     switch (combined_fn (code))
3314       {
3315       CASE_CFN_FMIN:
3316       CASE_CFN_FMAX:
3317           return initial_value;
3318 
3319       default:
3320           return NULL_TREE;
3321       }
3322 }
3323 
3324 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3325    STMT is printed with a message MSG. */
3326 
3327 static void
report_vect_op(dump_flags_t msg_type,gimple * stmt,const char * msg)3328 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3329 {
3330   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3331 }
3332 
3333 /* Return true if we need an in-order reduction for operation CODE
3334    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3335    overflow must wrap.  */
3336 
3337 bool
needs_fold_left_reduction_p(tree type,code_helper code)3338 needs_fold_left_reduction_p (tree type, code_helper code)
3339 {
3340   /* CHECKME: check for !flag_finite_math_only too?  */
3341   if (SCALAR_FLOAT_TYPE_P (type))
3342     {
3343       if (code.is_tree_code ())
3344           switch (tree_code (code))
3345             {
3346             case MIN_EXPR:
3347             case MAX_EXPR:
3348               return false;
3349 
3350             default:
3351               return !flag_associative_math;
3352             }
3353       else
3354           switch (combined_fn (code))
3355             {
3356             CASE_CFN_FMIN:
3357             CASE_CFN_FMAX:
3358               return false;
3359 
3360             default:
3361               return !flag_associative_math;
3362             }
3363     }
3364 
3365   if (INTEGRAL_TYPE_P (type))
3366     return (!code.is_tree_code ()
3367               || !operation_no_trapping_overflow (type, tree_code (code)));
3368 
3369   if (SAT_FIXED_POINT_TYPE_P (type))
3370     return true;
3371 
3372   return false;
3373 }
3374 
3375 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3376    has a handled computation expression.  Store the main reduction
3377    operation in *CODE.  */
3378 
3379 static bool
check_reduction_path(dump_user_location_t loc,loop_p loop,gphi * phi,tree loop_arg,code_helper * code,vec<std::pair<ssa_op_iter,use_operand_p>> & path)3380 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3381                           tree loop_arg, code_helper *code,
3382                           vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3383 {
3384   auto_bitmap visited;
3385   tree lookfor = PHI_RESULT (phi);
3386   ssa_op_iter curri;
3387   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3388   while (USE_FROM_PTR (curr) != loop_arg)
3389     curr = op_iter_next_use (&curri);
3390   curri.i = curri.numops;
3391   do
3392     {
3393       path.safe_push (std::make_pair (curri, curr));
3394       tree use = USE_FROM_PTR (curr);
3395       if (use == lookfor)
3396           break;
3397       gimple *def = SSA_NAME_DEF_STMT (use);
3398       if (gimple_nop_p (def)
3399             || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3400           {
3401 pop:
3402             do
3403               {
3404                 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3405                 curri = x.first;
3406                 curr = x.second;
3407                 do
3408                     curr = op_iter_next_use (&curri);
3409                 /* Skip already visited or non-SSA operands (from iterating
3410                    over PHI args).  */
3411                 while (curr != NULL_USE_OPERAND_P
3412                          && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3413                                || ! bitmap_set_bit (visited,
3414                                                         SSA_NAME_VERSION
3415                                                           (USE_FROM_PTR (curr)))));
3416               }
3417             while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3418             if (curr == NULL_USE_OPERAND_P)
3419               break;
3420           }
3421       else
3422           {
3423             if (gimple_code (def) == GIMPLE_PHI)
3424               curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3425             else
3426               curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3427             while (curr != NULL_USE_OPERAND_P
3428                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3429                          || ! bitmap_set_bit (visited,
3430                                                     SSA_NAME_VERSION
3431                                                       (USE_FROM_PTR (curr)))))
3432               curr = op_iter_next_use (&curri);
3433             if (curr == NULL_USE_OPERAND_P)
3434               goto pop;
3435           }
3436     }
3437   while (1);
3438   if (dump_file && (dump_flags & TDF_DETAILS))
3439     {
3440       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3441       unsigned i;
3442       std::pair<ssa_op_iter, use_operand_p> *x;
3443       FOR_EACH_VEC_ELT (path, i, x)
3444           dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3445       dump_printf (MSG_NOTE, "\n");
3446     }
3447 
3448   /* Check whether the reduction path detected is valid.  */
3449   bool fail = path.length () == 0;
3450   bool neg = false;
3451   int sign = -1;
3452   *code = ERROR_MARK;
3453   for (unsigned i = 1; i < path.length (); ++i)
3454     {
3455       gimple *use_stmt = USE_STMT (path[i].second);
3456       gimple_match_op op;
3457       if (!gimple_extract_op (use_stmt, &op))
3458           {
3459             fail = true;
3460             break;
3461           }
3462       unsigned int opi = op.num_ops;
3463       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3464           {
3465             /* The following make sure we can compute the operand index
3466                easily plus it mostly disallows chaining via COND_EXPR condition
3467                operands.  */
3468             for (opi = 0; opi < op.num_ops; ++opi)
3469               if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3470                 break;
3471           }
3472       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3473           {
3474             for (opi = 0; opi < op.num_ops; ++opi)
3475               if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3476                 break;
3477           }
3478       if (opi == op.num_ops)
3479           {
3480             fail = true;
3481             break;
3482           }
3483       op.code = canonicalize_code (op.code, op.type);
3484       if (op.code == MINUS_EXPR)
3485           {
3486             op.code = PLUS_EXPR;
3487             /* Track whether we negate the reduction value each iteration.  */
3488             if (op.ops[1] == op.ops[opi])
3489               neg = ! neg;
3490           }
3491       if (CONVERT_EXPR_CODE_P (op.code)
3492             && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3493           ;
3494       else if (*code == ERROR_MARK)
3495           {
3496             *code = op.code;
3497             sign = TYPE_SIGN (op.type);
3498           }
3499       else if (op.code != *code)
3500           {
3501             fail = true;
3502             break;
3503           }
3504       else if ((op.code == MIN_EXPR
3505                     || op.code == MAX_EXPR)
3506                  && sign != TYPE_SIGN (op.type))
3507           {
3508             fail = true;
3509             break;
3510           }
3511       /* Check there's only a single stmt the op is used on.  For the
3512            not value-changing tail and the last stmt allow out-of-loop uses.
3513            ???  We could relax this and handle arbitrary live stmts by
3514            forcing a scalar epilogue for example.  */
3515       imm_use_iterator imm_iter;
3516       use_operand_p use_p;
3517       gimple *op_use_stmt;
3518       unsigned cnt = 0;
3519       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3520           if (!is_gimple_debug (op_use_stmt)
3521               && (*code != ERROR_MARK
3522                     || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3523             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3524               cnt++;
3525       if (cnt != 1)
3526           {
3527             fail = true;
3528             break;
3529           }
3530     }
3531   return ! fail && ! neg && *code != ERROR_MARK;
3532 }
3533 
3534 bool
check_reduction_path(dump_user_location_t loc,loop_p loop,gphi * phi,tree loop_arg,enum tree_code code)3535 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3536                           tree loop_arg, enum tree_code code)
3537 {
3538   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3539   code_helper code_;
3540   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3541             && code_ == code);
3542 }
3543 
3544 
3545 
3546 /* Function vect_is_simple_reduction
3547 
3548    (1) Detect a cross-iteration def-use cycle that represents a simple
3549    reduction computation.  We look for the following pattern:
3550 
3551    loop_header:
3552      a1 = phi < a0, a2 >
3553      a3 = ...
3554      a2 = operation (a3, a1)
3555 
3556    or
3557 
3558    a3 = ...
3559    loop_header:
3560      a1 = phi < a0, a2 >
3561      a2 = operation (a3, a1)
3562 
3563    such that:
3564    1. operation is commutative and associative and it is safe to
3565       change the order of the computation
3566    2. no uses for a2 in the loop (a2 is used out of the loop)
3567    3. no uses of a1 in the loop besides the reduction operation
3568    4. no uses of a1 outside the loop.
3569 
3570    Conditions 1,4 are tested here.
3571    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3572 
3573    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3574    nested cycles.
3575 
3576    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3577    reductions:
3578 
3579      a1 = phi < a0, a2 >
3580      inner loop (def of a3)
3581      a2 = phi < a3 >
3582 
3583    (4) Detect condition expressions, ie:
3584      for (int i = 0; i < N; i++)
3585        if (a[i] < val)
3586           ret_val = a[i];
3587 
3588 */
3589 
3590 static stmt_vec_info
vect_is_simple_reduction(loop_vec_info loop_info,stmt_vec_info phi_info,bool * double_reduc,bool * reduc_chain_p)3591 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3592                                 bool *double_reduc, bool *reduc_chain_p)
3593 {
3594   gphi *phi = as_a <gphi *> (phi_info->stmt);
3595   gimple *phi_use_stmt = NULL;
3596   imm_use_iterator imm_iter;
3597   use_operand_p use_p;
3598 
3599   *double_reduc = false;
3600   *reduc_chain_p = false;
3601   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3602 
3603   tree phi_name = PHI_RESULT (phi);
3604   /* ???  If there are no uses of the PHI result the inner loop reduction
3605      won't be detected as possibly double-reduction by vectorizable_reduction
3606      because that tries to walk the PHI arg from the preheader edge which
3607      can be constant.  See PR60382.  */
3608   if (has_zero_uses (phi_name))
3609     return NULL;
3610   class loop *loop = (gimple_bb (phi))->loop_father;
3611   unsigned nphi_def_loop_uses = 0;
3612   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3613     {
3614       gimple *use_stmt = USE_STMT (use_p);
3615       if (is_gimple_debug (use_stmt))
3616           continue;
3617 
3618       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3619         {
3620           if (dump_enabled_p ())
3621               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622                                    "intermediate value used outside loop.\n");
3623 
3624           return NULL;
3625         }
3626 
3627       nphi_def_loop_uses++;
3628       phi_use_stmt = use_stmt;
3629     }
3630 
3631   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3632   if (TREE_CODE (latch_def) != SSA_NAME)
3633     {
3634       if (dump_enabled_p ())
3635           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636                                "reduction: not ssa_name: %T\n", latch_def);
3637       return NULL;
3638     }
3639 
3640   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3641   if (!def_stmt_info
3642       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3643     return NULL;
3644 
3645   bool nested_in_vect_loop
3646     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3647   unsigned nlatch_def_loop_uses = 0;
3648   auto_vec<gphi *, 3> lcphis;
3649   bool inner_loop_of_double_reduc = false;
3650   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3651     {
3652       gimple *use_stmt = USE_STMT (use_p);
3653       if (is_gimple_debug (use_stmt))
3654           continue;
3655       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3656           nlatch_def_loop_uses++;
3657       else
3658           {
3659             /* We can have more than one loop-closed PHI.  */
3660             lcphis.safe_push (as_a <gphi *> (use_stmt));
3661             if (nested_in_vect_loop
3662                 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3663                       == vect_double_reduction_def))
3664               inner_loop_of_double_reduc = true;
3665           }
3666     }
3667 
3668   /* If we are vectorizing an inner reduction we are executing that
3669      in the original order only in case we are not dealing with a
3670      double reduction.  */
3671   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3672     {
3673       if (dump_enabled_p ())
3674           report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3675                               "detected nested cycle: ");
3676       return def_stmt_info;
3677     }
3678 
3679   /* When the inner loop of a double reduction ends up with more than
3680      one loop-closed PHI we have failed to classify alternate such
3681      PHIs as double reduction, leading to wrong code.  See PR103237.  */
3682   if (inner_loop_of_double_reduc && lcphis.length () != 1)
3683     {
3684       if (dump_enabled_p ())
3685           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3686                                "unhandle double reduction\n");
3687       return NULL;
3688     }
3689 
3690   /* If this isn't a nested cycle or if the nested cycle reduction value
3691      is used outside of the inner loop we cannot handle uses of the reduction
3692      value.  */
3693   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3694     {
3695       if (dump_enabled_p ())
3696           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697                                "reduction used in loop.\n");
3698       return NULL;
3699     }
3700 
3701   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3702      defined in the inner loop.  */
3703   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3704     {
3705       tree op1 = PHI_ARG_DEF (def_stmt, 0);
3706       if (gimple_phi_num_args (def_stmt) != 1
3707           || TREE_CODE (op1) != SSA_NAME)
3708         {
3709           if (dump_enabled_p ())
3710               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711                                    "unsupported phi node definition.\n");
3712 
3713           return NULL;
3714         }
3715 
3716       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3717       if (gimple_bb (def1)
3718             && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3719             && loop->inner
3720             && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3721             && (is_gimple_assign (def1) || is_gimple_call (def1))
3722             && is_a <gphi *> (phi_use_stmt)
3723             && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3724         {
3725           if (dump_enabled_p ())
3726             report_vect_op (MSG_NOTE, def_stmt,
3727                                   "detected double reduction: ");
3728 
3729           *double_reduc = true;
3730             return def_stmt_info;
3731         }
3732 
3733       return NULL;
3734     }
3735 
3736   /* Look for the expression computing latch_def from the loop PHI result.  */
3737   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3738   code_helper code;
3739   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3740                                   path))
3741     {
3742       STMT_VINFO_REDUC_CODE (phi_info) = code;
3743       if (code == COND_EXPR && !nested_in_vect_loop)
3744           STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3745 
3746       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3747            reduction chain for which the additional restriction is that
3748            all operations in the chain are the same.  */
3749       auto_vec<stmt_vec_info, 8> reduc_chain;
3750       unsigned i;
3751       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3752       for (i = path.length () - 1; i >= 1; --i)
3753           {
3754             gimple *stmt = USE_STMT (path[i].second);
3755             stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3756             gimple_match_op op;
3757             if (!gimple_extract_op (stmt, &op))
3758               gcc_unreachable ();
3759             if (gassign *assign = dyn_cast<gassign *> (stmt))
3760               STMT_VINFO_REDUC_IDX (stmt_info)
3761                 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3762             else
3763               {
3764                 gcall *call = as_a<gcall *> (stmt);
3765                 STMT_VINFO_REDUC_IDX (stmt_info)
3766                     = path[i].second->use - gimple_call_arg_ptr (call, 0);
3767               }
3768             bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3769                                              && (i == 1 || i == path.length () - 1));
3770             if ((op.code != code && !leading_conversion)
3771                 /* We can only handle the final value in epilogue
3772                      generation for reduction chains.  */
3773                 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3774               is_slp_reduc = false;
3775             /* For reduction chains we support a trailing/leading
3776                conversions.  We do not store those in the actual chain.  */
3777             if (leading_conversion)
3778               continue;
3779             reduc_chain.safe_push (stmt_info);
3780           }
3781       if (is_slp_reduc && reduc_chain.length () > 1)
3782           {
3783             for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3784               {
3785                 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3786                 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3787               }
3788             REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3789             REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3790 
3791             /* Save the chain for further analysis in SLP detection.  */
3792             LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3793             REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3794 
3795             *reduc_chain_p = true;
3796             if (dump_enabled_p ())
3797               dump_printf_loc (MSG_NOTE, vect_location,
3798                                   "reduction: detected reduction chain\n");
3799           }
3800       else if (dump_enabled_p ())
3801           dump_printf_loc (MSG_NOTE, vect_location,
3802                                "reduction: detected reduction\n");
3803 
3804       return def_stmt_info;
3805     }
3806 
3807   if (dump_enabled_p ())
3808     dump_printf_loc (MSG_NOTE, vect_location,
3809                          "reduction: unknown pattern\n");
3810 
3811   return NULL;
3812 }
3813 
3814 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3815    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3816    or -1 if not known.  */
3817 
3818 static int
vect_get_peel_iters_epilogue(loop_vec_info loop_vinfo,int peel_iters_prologue)3819 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3820 {
3821   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3822   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3823     {
3824       if (dump_enabled_p ())
3825           dump_printf_loc (MSG_NOTE, vect_location,
3826                                "cost model: epilogue peel iters set to vf/2 "
3827                                "because loop iterations are unknown .\n");
3828       return assumed_vf / 2;
3829     }
3830   else
3831     {
3832       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3833       peel_iters_prologue = MIN (niters, peel_iters_prologue);
3834       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3835       /* If we need to peel for gaps, but no peeling is required, we have to
3836            peel VF iterations.  */
3837       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3838           peel_iters_epilogue = assumed_vf;
3839       return peel_iters_epilogue;
3840     }
3841 }
3842 
3843 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3844 int
vect_get_known_peeling_cost(loop_vec_info loop_vinfo,int peel_iters_prologue,int * peel_iters_epilogue,stmt_vector_for_cost * scalar_cost_vec,stmt_vector_for_cost * prologue_cost_vec,stmt_vector_for_cost * epilogue_cost_vec)3845 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3846                                    int *peel_iters_epilogue,
3847                                    stmt_vector_for_cost *scalar_cost_vec,
3848                                    stmt_vector_for_cost *prologue_cost_vec,
3849                                    stmt_vector_for_cost *epilogue_cost_vec)
3850 {
3851   int retval = 0;
3852 
3853   *peel_iters_epilogue
3854     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3855 
3856   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3857     {
3858       /* If peeled iterations are known but number of scalar loop
3859            iterations are unknown, count a taken branch per peeled loop.  */
3860       if (peel_iters_prologue > 0)
3861           retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3862                                            vect_prologue);
3863       if (*peel_iters_epilogue > 0)
3864           retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3865                                             vect_epilogue);
3866     }
3867 
3868   stmt_info_for_cost *si;
3869   int j;
3870   if (peel_iters_prologue)
3871     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3872       retval += record_stmt_cost (prologue_cost_vec,
3873                                           si->count * peel_iters_prologue,
3874                                           si->kind, si->stmt_info, si->misalign,
3875                                           vect_prologue);
3876   if (*peel_iters_epilogue)
3877     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3878       retval += record_stmt_cost (epilogue_cost_vec,
3879                                           si->count * *peel_iters_epilogue,
3880                                           si->kind, si->stmt_info, si->misalign,
3881                                           vect_epilogue);
3882 
3883   return retval;
3884 }
3885 
3886 /* Function vect_estimate_min_profitable_iters
3887 
3888    Return the number of iterations required for the vector version of the
3889    loop to be profitable relative to the cost of the scalar version of the
3890    loop.
3891 
3892    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3893    of iterations for vectorization.  -1 value means loop vectorization
3894    is not profitable.  This returned value may be used for dynamic
3895    profitability check.
3896 
3897    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3898    for static check against estimated number of iterations.  */
3899 
3900 static void
vect_estimate_min_profitable_iters(loop_vec_info loop_vinfo,int * ret_min_profitable_niters,int * ret_min_profitable_estimate,unsigned * suggested_unroll_factor)3901 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3902                                             int *ret_min_profitable_niters,
3903                                             int *ret_min_profitable_estimate,
3904                                             unsigned *suggested_unroll_factor)
3905 {
3906   int min_profitable_iters;
3907   int min_profitable_estimate;
3908   int peel_iters_prologue;
3909   int peel_iters_epilogue;
3910   unsigned vec_inside_cost = 0;
3911   int vec_outside_cost = 0;
3912   unsigned vec_prologue_cost = 0;
3913   unsigned vec_epilogue_cost = 0;
3914   int scalar_single_iter_cost = 0;
3915   int scalar_outside_cost = 0;
3916   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3917   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3918   vector_costs *target_cost_data = loop_vinfo->vector_costs;
3919 
3920   /* Cost model disabled.  */
3921   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3922     {
3923       if (dump_enabled_p ())
3924           dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3925       *ret_min_profitable_niters = 0;
3926       *ret_min_profitable_estimate = 0;
3927       return;
3928     }
3929 
3930   /* Requires loop versioning tests to handle misalignment.  */
3931   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3932     {
3933       /*  FIXME: Make cost depend on complexity of individual check.  */
3934       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3935       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3936       if (dump_enabled_p ())
3937           dump_printf (MSG_NOTE,
3938                          "cost model: Adding cost of checks for loop "
3939                          "versioning to treat misalignment.\n");
3940     }
3941 
3942   /* Requires loop versioning with alias checks.  */
3943   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3944     {
3945       /*  FIXME: Make cost depend on complexity of individual check.  */
3946       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3947       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3948       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3949       if (len)
3950           /* Count LEN - 1 ANDs and LEN comparisons.  */
3951           (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3952                                     scalar_stmt, vect_prologue);
3953       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3954       if (len)
3955           {
3956             /* Count LEN - 1 ANDs and LEN comparisons.  */
3957             unsigned int nstmts = len * 2 - 1;
3958             /* +1 for each bias that needs adding.  */
3959             for (unsigned int i = 0; i < len; ++i)
3960               if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3961                 nstmts += 1;
3962             (void) add_stmt_cost (target_cost_data, nstmts,
3963                                         scalar_stmt, vect_prologue);
3964           }
3965       if (dump_enabled_p ())
3966           dump_printf (MSG_NOTE,
3967                          "cost model: Adding cost of checks for loop "
3968                          "versioning aliasing.\n");
3969     }
3970 
3971   /* Requires loop versioning with niter checks.  */
3972   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3973     {
3974       /*  FIXME: Make cost depend on complexity of individual check.  */
3975       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3976                                   NULL, NULL, NULL_TREE, 0, vect_prologue);
3977       if (dump_enabled_p ())
3978           dump_printf (MSG_NOTE,
3979                          "cost model: Adding cost of checks for loop "
3980                          "versioning niters.\n");
3981     }
3982 
3983   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3984     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3985                                 vect_prologue);
3986 
3987   /* Count statements in scalar loop.  Using this as scalar cost for a single
3988      iteration for now.
3989 
3990      TODO: Add outer loop support.
3991 
3992      TODO: Consider assigning different costs to different scalar
3993      statements.  */
3994 
3995   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3996 
3997   /* Add additional cost for the peeled instructions in prologue and epilogue
3998      loop.  (For fully-masked loops there will be no peeling.)
3999 
4000      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4001      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4002 
4003      TODO: Build an expression that represents peel_iters for prologue and
4004      epilogue to be used in a run-time test.  */
4005 
4006   bool prologue_need_br_taken_cost = false;
4007   bool prologue_need_br_not_taken_cost = false;
4008 
4009   /* Calculate peel_iters_prologue.  */
4010   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4011     peel_iters_prologue = 0;
4012   else if (npeel < 0)
4013     {
4014       peel_iters_prologue = assumed_vf / 2;
4015       if (dump_enabled_p ())
4016           dump_printf (MSG_NOTE, "cost model: "
4017                          "prologue peel iters set to vf/2.\n");
4018 
4019       /* If peeled iterations are unknown, count a taken branch and a not taken
4020            branch per peeled loop.  Even if scalar loop iterations are known,
4021            vector iterations are not known since peeled prologue iterations are
4022            not known.  Hence guards remain the same.  */
4023       prologue_need_br_taken_cost = true;
4024       prologue_need_br_not_taken_cost = true;
4025     }
4026   else
4027     {
4028       peel_iters_prologue = npeel;
4029       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4030           /* If peeled iterations are known but number of scalar loop
4031              iterations are unknown, count a taken branch per peeled loop.  */
4032           prologue_need_br_taken_cost = true;
4033     }
4034 
4035   bool epilogue_need_br_taken_cost = false;
4036   bool epilogue_need_br_not_taken_cost = false;
4037 
4038   /* Calculate peel_iters_epilogue.  */
4039   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4040     /* We need to peel exactly one iteration for gaps.  */
4041     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4042   else if (npeel < 0)
4043     {
4044       /* If peeling for alignment is unknown, loop bound of main loop
4045            becomes unknown.  */
4046       peel_iters_epilogue = assumed_vf / 2;
4047       if (dump_enabled_p ())
4048           dump_printf (MSG_NOTE, "cost model: "
4049                          "epilogue peel iters set to vf/2 because "
4050                          "peeling for alignment is unknown.\n");
4051 
4052       /* See the same reason above in peel_iters_prologue calculation.  */
4053       epilogue_need_br_taken_cost = true;
4054       epilogue_need_br_not_taken_cost = true;
4055     }
4056   else
4057     {
4058       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4059       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4060           /* If peeled iterations are known but number of scalar loop
4061              iterations are unknown, count a taken branch per peeled loop.  */
4062           epilogue_need_br_taken_cost = true;
4063     }
4064 
4065   stmt_info_for_cost *si;
4066   int j;
4067   /* Add costs associated with peel_iters_prologue.  */
4068   if (peel_iters_prologue)
4069     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4070       {
4071           (void) add_stmt_cost (target_cost_data,
4072                                     si->count * peel_iters_prologue, si->kind,
4073                                     si->stmt_info, si->node, si->vectype,
4074                                     si->misalign, vect_prologue);
4075       }
4076 
4077   /* Add costs associated with peel_iters_epilogue.  */
4078   if (peel_iters_epilogue)
4079     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4080       {
4081           (void) add_stmt_cost (target_cost_data,
4082                                     si->count * peel_iters_epilogue, si->kind,
4083                                     si->stmt_info, si->node, si->vectype,
4084                                     si->misalign, vect_epilogue);
4085       }
4086 
4087   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4088 
4089   if (prologue_need_br_taken_cost)
4090     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4091                                 vect_prologue);
4092 
4093   if (prologue_need_br_not_taken_cost)
4094     (void) add_stmt_cost (target_cost_data, 1,
4095                                 cond_branch_not_taken, vect_prologue);
4096 
4097   if (epilogue_need_br_taken_cost)
4098     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4099                                 vect_epilogue);
4100 
4101   if (epilogue_need_br_not_taken_cost)
4102     (void) add_stmt_cost (target_cost_data, 1,
4103                                 cond_branch_not_taken, vect_epilogue);
4104 
4105   /* Take care of special costs for rgroup controls of partial vectors.  */
4106   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4107     {
4108       /* Calculate how many masks we need to generate.  */
4109       unsigned int num_masks = 0;
4110       rgroup_controls *rgm;
4111       unsigned int num_vectors_m1;
4112       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4113           if (rgm->type)
4114             num_masks += num_vectors_m1 + 1;
4115       gcc_assert (num_masks > 0);
4116 
4117       /* In the worst case, we need to generate each mask in the prologue
4118            and in the loop body.  One of the loop body mask instructions
4119            replaces the comparison in the scalar loop, and since we don't
4120            count the scalar comparison against the scalar body, we shouldn't
4121            count that vector instruction against the vector body either.
4122 
4123            Sometimes we can use unpacks instead of generating prologue
4124            masks and sometimes the prologue mask will fold to a constant,
4125            so the actual prologue cost might be smaller.  However, it's
4126            simpler and safer to use the worst-case cost; if this ends up
4127            being the tie-breaker between vectorizing or not, then it's
4128            probably better not to vectorize.  */
4129       (void) add_stmt_cost (target_cost_data, num_masks,
4130                                   vector_stmt, NULL, NULL, NULL_TREE, 0,
4131                                   vect_prologue);
4132       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4133                                   vector_stmt, NULL, NULL, NULL_TREE, 0,
4134                                   vect_body);
4135     }
4136   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4137     {
4138       /* Referring to the functions vect_set_loop_condition_partial_vectors
4139            and vect_set_loop_controls_directly, we need to generate each
4140            length in the prologue and in the loop body if required. Although
4141            there are some possible optimizations, we consider the worst case
4142            here.  */
4143 
4144       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4145       signed char partial_load_store_bias
4146           = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4147       bool need_iterate_p
4148           = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4149              && !vect_known_niters_smaller_than_vf (loop_vinfo));
4150 
4151       /* Calculate how many statements to be added.  */
4152       unsigned int prologue_stmts = 0;
4153       unsigned int body_stmts = 0;
4154 
4155       rgroup_controls *rgc;
4156       unsigned int num_vectors_m1;
4157       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4158           if (rgc->type)
4159             {
4160               /* May need one SHIFT for nitems_total computation.  */
4161               unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4162               if (nitems != 1 && !niters_known_p)
4163                 prologue_stmts += 1;
4164 
4165               /* May need one MAX and one MINUS for wrap around.  */
4166               if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4167                 prologue_stmts += 2;
4168 
4169               /* Need one MAX and one MINUS for each batch limit excepting for
4170                  the 1st one.  */
4171               prologue_stmts += num_vectors_m1 * 2;
4172 
4173               unsigned int num_vectors = num_vectors_m1 + 1;
4174 
4175               /* Need to set up lengths in prologue, only one MIN required
4176                  for each since start index is zero.  */
4177               prologue_stmts += num_vectors;
4178 
4179               /* If we have a non-zero partial load bias, we need one PLUS
4180                  to adjust the load length.  */
4181               if (partial_load_store_bias != 0)
4182                 body_stmts += 1;
4183 
4184               /* Each may need two MINs and one MINUS to update lengths in body
4185                  for next iteration.  */
4186               if (need_iterate_p)
4187                 body_stmts += 3 * num_vectors;
4188             }
4189 
4190       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4191                                   scalar_stmt, vect_prologue);
4192       (void) add_stmt_cost (target_cost_data, body_stmts,
4193                                   scalar_stmt, vect_body);
4194     }
4195 
4196   /* FORNOW: The scalar outside cost is incremented in one of the
4197      following ways:
4198 
4199      1. The vectorizer checks for alignment and aliasing and generates
4200      a condition that allows dynamic vectorization.  A cost model
4201      check is ANDED with the versioning condition.  Hence scalar code
4202      path now has the added cost of the versioning check.
4203 
4204        if (cost > th & versioning_check)
4205          jmp to vector code
4206 
4207      Hence run-time scalar is incremented by not-taken branch cost.
4208 
4209      2. The vectorizer then checks if a prologue is required.  If the
4210      cost model check was not done before during versioning, it has to
4211      be done before the prologue check.
4212 
4213        if (cost <= th)
4214          prologue = scalar_iters
4215        if (prologue == 0)
4216          jmp to vector code
4217        else
4218          execute prologue
4219        if (prologue == num_iters)
4220            go to exit
4221 
4222      Hence the run-time scalar cost is incremented by a taken branch,
4223      plus a not-taken branch, plus a taken branch cost.
4224 
4225      3. The vectorizer then checks if an epilogue is required.  If the
4226      cost model check was not done before during prologue check, it
4227      has to be done with the epilogue check.
4228 
4229        if (prologue == 0)
4230          jmp to vector code
4231        else
4232          execute prologue
4233        if (prologue == num_iters)
4234            go to exit
4235        vector code:
4236          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4237            jmp to epilogue
4238 
4239      Hence the run-time scalar cost should be incremented by 2 taken
4240      branches.
4241 
4242      TODO: The back end may reorder the BBS's differently and reverse
4243      conditions/branch directions.  Change the estimates below to
4244      something more reasonable.  */
4245 
4246   /* If the number of iterations is known and we do not do versioning, we can
4247      decide whether to vectorize at compile time.  Hence the scalar version
4248      do not carry cost model guard costs.  */
4249   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4250       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4251     {
4252       /* Cost model check occurs at versioning.  */
4253       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4254           scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4255       else
4256           {
4257             /* Cost model check occurs at prologue generation.  */
4258             if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4259               scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4260                 + vect_get_stmt_cost (cond_branch_not_taken);
4261             /* Cost model check occurs at epilogue generation.  */
4262             else
4263               scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4264           }
4265     }
4266 
4267   /* Complete the target-specific cost calculations.  */
4268   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4269                  &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4270                  suggested_unroll_factor);
4271 
4272   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4273       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4274       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4275                         *suggested_unroll_factor,
4276                         LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4277     {
4278       if (dump_enabled_p ())
4279           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4280                                "can't unroll as unrolled vectorization factor larger"
4281                                " than maximum vectorization factor: "
4282                                HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4283                                LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4284       *suggested_unroll_factor = 1;
4285     }
4286 
4287   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4288 
4289   if (dump_enabled_p ())
4290     {
4291       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4292       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4293                    vec_inside_cost);
4294       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4295                    vec_prologue_cost);
4296       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4297                    vec_epilogue_cost);
4298       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4299                    scalar_single_iter_cost);
4300       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4301                    scalar_outside_cost);
4302       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4303                    vec_outside_cost);
4304       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4305                    peel_iters_prologue);
4306       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4307                    peel_iters_epilogue);
4308     }
4309 
4310   /* Calculate number of iterations required to make the vector version
4311      profitable, relative to the loop bodies only.  The following condition
4312      must hold true:
4313      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4314      where
4315      SIC = scalar iteration cost, VIC = vector iteration cost,
4316      VOC = vector outside cost, VF = vectorization factor,
4317      NPEEL = prologue iterations + epilogue iterations,
4318      SOC = scalar outside cost for run time cost model check.  */
4319 
4320   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4321                                 - vec_inside_cost);
4322   if (saving_per_viter <= 0)
4323     {
4324       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4325           warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4326                         "vectorization did not happen for a simd loop");
4327 
4328       if (dump_enabled_p ())
4329         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4330                                "cost model: the vector iteration cost = %d "
4331                                "divided by the scalar iteration cost = %d "
4332                                "is greater or equal to the vectorization factor = %d"
4333                          ".\n",
4334                                vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4335       *ret_min_profitable_niters = -1;
4336       *ret_min_profitable_estimate = -1;
4337       return;
4338     }
4339 
4340   /* ??? The "if" arm is written to handle all cases; see below for what
4341      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4342   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4343     {
4344       /* Rewriting the condition above in terms of the number of
4345            vector iterations (vniters) rather than the number of
4346            scalar iterations (niters) gives:
4347 
4348            SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4349 
4350            <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4351 
4352            For integer N, X and Y when X > 0:
4353 
4354            N * X > Y <==> N >= (Y /[floor] X) + 1.  */
4355       int outside_overhead = (vec_outside_cost
4356                                     - scalar_single_iter_cost * peel_iters_prologue
4357                                     - scalar_single_iter_cost * peel_iters_epilogue
4358                                     - scalar_outside_cost);
4359       /* We're only interested in cases that require at least one
4360            vector iteration.  */
4361       int min_vec_niters = 1;
4362       if (outside_overhead > 0)
4363           min_vec_niters = outside_overhead / saving_per_viter + 1;
4364 
4365       if (dump_enabled_p ())
4366           dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
4367                          min_vec_niters);
4368 
4369       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4370           {
4371             /* Now that we know the minimum number of vector iterations,
4372                find the minimum niters for which the scalar cost is larger:
4373 
4374                SIC * niters > VIC * vniters + VOC - SOC
4375 
4376                We know that the minimum niters is no more than
4377                vniters * VF + NPEEL, but it might be (and often is) less
4378                than that if a partial vector iteration is cheaper than the
4379                equivalent scalar code.  */
4380             int threshold = (vec_inside_cost * min_vec_niters
4381                                  + vec_outside_cost
4382                                  - scalar_outside_cost);
4383             if (threshold <= 0)
4384               min_profitable_iters = 1;
4385             else
4386               min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4387           }
4388       else
4389           /* Convert the number of vector iterations into a number of
4390              scalar iterations.  */
4391           min_profitable_iters = (min_vec_niters * assumed_vf
4392                                         + peel_iters_prologue
4393                                         + peel_iters_epilogue);
4394     }
4395   else
4396     {
4397       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4398                                     * assumed_vf
4399                                     - vec_inside_cost * peel_iters_prologue
4400                                     - vec_inside_cost * peel_iters_epilogue);
4401       if (min_profitable_iters <= 0)
4402         min_profitable_iters = 0;
4403       else
4404           {
4405             min_profitable_iters /= saving_per_viter;
4406 
4407             if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4408                 <= (((int) vec_inside_cost * min_profitable_iters)
4409                       + (((int) vec_outside_cost - scalar_outside_cost)
4410                          * assumed_vf)))
4411               min_profitable_iters++;
4412           }
4413     }
4414 
4415   if (dump_enabled_p ())
4416     dump_printf (MSG_NOTE,
4417                      "  Calculated minimum iters for profitability: %d\n",
4418                      min_profitable_iters);
4419 
4420   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4421       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4422     /* We want the vectorized loop to execute at least once.  */
4423     min_profitable_iters = assumed_vf + peel_iters_prologue;
4424   else if (min_profitable_iters < peel_iters_prologue)
4425     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4426        vectorized loop executes at least once.  */
4427     min_profitable_iters = peel_iters_prologue;
4428 
4429   if (dump_enabled_p ())
4430     dump_printf_loc (MSG_NOTE, vect_location,
4431                      "  Runtime profitability threshold = %d\n",
4432                      min_profitable_iters);
4433 
4434   *ret_min_profitable_niters = min_profitable_iters;
4435 
4436   /* Calculate number of iterations required to make the vector version
4437      profitable, relative to the loop bodies only.
4438 
4439      Non-vectorized variant is SIC * niters and it must win over vector
4440      variant on the expected loop trip count.  The following condition must hold true:
4441      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
4442 
4443   if (vec_outside_cost <= 0)
4444     min_profitable_estimate = 0;
4445   /* ??? This "else if" arm is written to handle all cases; see below for
4446      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
4447   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4448     {
4449       /* This is a repeat of the code above, but with + SOC rather
4450            than - SOC.  */
4451       int outside_overhead = (vec_outside_cost
4452                                     - scalar_single_iter_cost * peel_iters_prologue
4453                                     - scalar_single_iter_cost * peel_iters_epilogue
4454                                     + scalar_outside_cost);
4455       int min_vec_niters = 1;
4456       if (outside_overhead > 0)
4457           min_vec_niters = outside_overhead / saving_per_viter + 1;
4458 
4459       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4460           {
4461             int threshold = (vec_inside_cost * min_vec_niters
4462                                  + vec_outside_cost
4463                                  + scalar_outside_cost);
4464             min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4465           }
4466       else
4467           min_profitable_estimate = (min_vec_niters * assumed_vf
4468                                            + peel_iters_prologue
4469                                            + peel_iters_epilogue);
4470     }
4471   else
4472     {
4473       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4474                                          * assumed_vf
4475                                          - vec_inside_cost * peel_iters_prologue
4476                                          - vec_inside_cost * peel_iters_epilogue)
4477                                          / ((scalar_single_iter_cost * assumed_vf)
4478                                            - vec_inside_cost);
4479     }
4480   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4481   if (dump_enabled_p ())
4482     dump_printf_loc (MSG_NOTE, vect_location,
4483                          "  Static estimate profitability threshold = %d\n",
4484                          min_profitable_estimate);
4485 
4486   *ret_min_profitable_estimate = min_profitable_estimate;
4487 }
4488 
4489 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4490    vector elements (not bits) for a vector with NELT elements.  */
4491 static void
calc_vec_perm_mask_for_shift(unsigned int offset,unsigned int nelt,vec_perm_builder * sel)4492 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4493                                     vec_perm_builder *sel)
4494 {
4495   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4496      by vec_perm_indices.  */
4497   sel->new_vector (nelt, 1, 3);
4498   for (unsigned int i = 0; i < 3; i++)
4499     sel->quick_push (i + offset);
4500 }
4501 
4502 /* Checks whether the target supports whole-vector shifts for vectors of mode
4503    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4504    it supports vec_perm_const with masks for all necessary shift amounts.  */
4505 static bool
have_whole_vector_shift(machine_mode mode)4506 have_whole_vector_shift (machine_mode mode)
4507 {
4508   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4509     return true;
4510 
4511   /* Variable-length vectors should be handled via the optab.  */
4512   unsigned int nelt;
4513   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4514     return false;
4515 
4516   vec_perm_builder sel;
4517   vec_perm_indices indices;
4518   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4519     {
4520       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4521       indices.new_vector (sel, 2, nelt);
4522       if (!can_vec_perm_const_p (mode, indices, false))
4523           return false;
4524     }
4525   return true;
4526 }
4527 
4528 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4529    functions. Design better to avoid maintenance issues.  */
4530 
4531 /* Function vect_model_reduction_cost.
4532 
4533    Models cost for a reduction operation, including the vector ops
4534    generated within the strip-mine loop in some cases, the initial
4535    definition before the loop, and the epilogue code that must be generated.  */
4536 
4537 static void
vect_model_reduction_cost(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,internal_fn reduc_fn,vect_reduction_type reduction_type,int ncopies,stmt_vector_for_cost * cost_vec)4538 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4539                                  stmt_vec_info stmt_info, internal_fn reduc_fn,
4540                                  vect_reduction_type reduction_type,
4541                                  int ncopies, stmt_vector_for_cost *cost_vec)
4542 {
4543   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4544   tree vectype;
4545   machine_mode mode;
4546   class loop *loop = NULL;
4547 
4548   if (loop_vinfo)
4549     loop = LOOP_VINFO_LOOP (loop_vinfo);
4550 
4551   /* Condition reductions generate two reductions in the loop.  */
4552   if (reduction_type == COND_REDUCTION)
4553     ncopies *= 2;
4554 
4555   vectype = STMT_VINFO_VECTYPE (stmt_info);
4556   mode = TYPE_MODE (vectype);
4557   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4558 
4559   gimple_match_op op;
4560   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4561     gcc_unreachable ();
4562 
4563   if (reduction_type == EXTRACT_LAST_REDUCTION)
4564     /* No extra instructions are needed in the prologue.  The loop body
4565        operations are costed in vectorizable_condition.  */
4566     inside_cost = 0;
4567   else if (reduction_type == FOLD_LEFT_REDUCTION)
4568     {
4569       /* No extra instructions needed in the prologue.  */
4570       prologue_cost = 0;
4571 
4572       if (reduc_fn != IFN_LAST)
4573           /* Count one reduction-like operation per vector.  */
4574           inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4575                                                   stmt_info, 0, vect_body);
4576       else
4577           {
4578             /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4579             unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4580             inside_cost = record_stmt_cost (cost_vec, nelements,
4581                                                     vec_to_scalar, stmt_info, 0,
4582                                                     vect_body);
4583             inside_cost += record_stmt_cost (cost_vec, nelements,
4584                                                      scalar_stmt, stmt_info, 0,
4585                                                      vect_body);
4586           }
4587     }
4588   else
4589     {
4590       /* Add in cost for initial definition.
4591            For cond reduction we have four vectors: initial index, step,
4592            initial result of the data reduction, initial value of the index
4593            reduction.  */
4594       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4595       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4596                                                    scalar_to_vec, stmt_info, 0,
4597                                                    vect_prologue);
4598     }
4599 
4600   /* Determine cost of epilogue code.
4601 
4602      We have a reduction operator that will reduce the vector in one statement.
4603      Also requires scalar extract.  */
4604 
4605   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4606     {
4607       if (reduc_fn != IFN_LAST)
4608           {
4609             if (reduction_type == COND_REDUCTION)
4610               {
4611                 /* An EQ stmt and an COND_EXPR stmt.  */
4612                 epilogue_cost += record_stmt_cost (cost_vec, 2,
4613                                                              vector_stmt, stmt_info, 0,
4614                                                              vect_epilogue);
4615                 /* Reduction of the max index and a reduction of the found
4616                      values.  */
4617                 epilogue_cost += record_stmt_cost (cost_vec, 2,
4618                                                              vec_to_scalar, stmt_info, 0,
4619                                                              vect_epilogue);
4620                 /* A broadcast of the max value.  */
4621                 epilogue_cost += record_stmt_cost (cost_vec, 1,
4622                                                              scalar_to_vec, stmt_info, 0,
4623                                                              vect_epilogue);
4624               }
4625             else
4626               {
4627                 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4628                                                              stmt_info, 0, vect_epilogue);
4629                 epilogue_cost += record_stmt_cost (cost_vec, 1,
4630                                                              vec_to_scalar, stmt_info, 0,
4631                                                              vect_epilogue);
4632               }
4633           }
4634       else if (reduction_type == COND_REDUCTION)
4635           {
4636             unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4637             /* Extraction of scalar elements.  */
4638             epilogue_cost += record_stmt_cost (cost_vec,
4639                                                        2 * estimated_nunits,
4640                                                        vec_to_scalar, stmt_info, 0,
4641                                                        vect_epilogue);
4642             /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4643             epilogue_cost += record_stmt_cost (cost_vec,
4644                                                        2 * estimated_nunits - 3,
4645                                                        scalar_stmt, stmt_info, 0,
4646                                                        vect_epilogue);
4647           }
4648       else if (reduction_type == EXTRACT_LAST_REDUCTION
4649                  || reduction_type == FOLD_LEFT_REDUCTION)
4650           /* No extra instructions need in the epilogue.  */
4651           ;
4652       else
4653           {
4654             int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4655             tree bitsize = TYPE_SIZE (op.type);
4656             int element_bitsize = tree_to_uhwi (bitsize);
4657             int nelements = vec_size_in_bits / element_bitsize;
4658 
4659             if (op.code == COND_EXPR)
4660               op.code = MAX_EXPR;
4661 
4662             /* We have a whole vector shift available.  */
4663             if (VECTOR_MODE_P (mode)
4664                 && directly_supported_p (op.code, vectype)
4665                 && have_whole_vector_shift (mode))
4666               {
4667                 /* Final reduction via vector shifts and the reduction operator.
4668                      Also requires scalar extract.  */
4669                 epilogue_cost += record_stmt_cost (cost_vec,
4670                                                              exact_log2 (nelements) * 2,
4671                                                              vector_stmt, stmt_info, 0,
4672                                                              vect_epilogue);
4673                 epilogue_cost += record_stmt_cost (cost_vec, 1,
4674                                                              vec_to_scalar, stmt_info, 0,
4675                                                              vect_epilogue);
4676               }
4677             else
4678               /* Use extracts and reduction op for final reduction.  For N
4679                  elements, we have N extracts and N-1 reduction ops.  */
4680               epilogue_cost += record_stmt_cost (cost_vec,
4681                                                          nelements + nelements - 1,
4682                                                          vector_stmt, stmt_info, 0,
4683                                                          vect_epilogue);
4684           }
4685     }
4686 
4687   if (dump_enabled_p ())
4688     dump_printf (MSG_NOTE,
4689                  "vect_model_reduction_cost: inside_cost = %d, "
4690                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4691                  prologue_cost, epilogue_cost);
4692 }
4693 
4694 /* SEQ is a sequence of instructions that initialize the reduction
4695    described by REDUC_INFO.  Emit them in the appropriate place.  */
4696 
4697 static void
vect_emit_reduction_init_stmts(loop_vec_info loop_vinfo,stmt_vec_info reduc_info,gimple * seq)4698 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4699                                         stmt_vec_info reduc_info, gimple *seq)
4700 {
4701   if (reduc_info->reused_accumulator)
4702     {
4703       /* When reusing an accumulator from the main loop, we only need
4704            initialization instructions if the main loop can be skipped.
4705            In that case, emit the initialization instructions at the end
4706            of the guard block that does the skip.  */
4707       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4708       gcc_assert (skip_edge);
4709       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4710       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4711     }
4712   else
4713     {
4714       /* The normal case: emit the initialization instructions on the
4715            preheader edge.  */
4716       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4717       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4718     }
4719 }
4720 
4721 /* Function get_initial_def_for_reduction
4722 
4723    Input:
4724    REDUC_INFO - the info_for_reduction
4725    INIT_VAL - the initial value of the reduction variable
4726    NEUTRAL_OP - a value that has no effect on the reduction, as per
4727                     neutral_op_for_reduction
4728 
4729    Output:
4730    Return a vector variable, initialized according to the operation that
4731           STMT_VINFO performs. This vector will be used as the initial value
4732           of the vector of partial results.
4733 
4734    The value we need is a vector in which element 0 has value INIT_VAL
4735    and every other element has value NEUTRAL_OP.  */
4736 
4737 static tree
get_initial_def_for_reduction(loop_vec_info loop_vinfo,stmt_vec_info reduc_info,tree init_val,tree neutral_op)4738 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4739                                      stmt_vec_info reduc_info,
4740                                      tree init_val, tree neutral_op)
4741 {
4742   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4743   tree scalar_type = TREE_TYPE (init_val);
4744   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4745   tree init_def;
4746   gimple_seq stmts = NULL;
4747 
4748   gcc_assert (vectype);
4749 
4750   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4751                 || SCALAR_FLOAT_TYPE_P (scalar_type));
4752 
4753   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4754                 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4755 
4756   if (operand_equal_p (init_val, neutral_op))
4757     {
4758       /* If both elements are equal then the vector described above is
4759            just a splat.  */
4760       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4761       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4762     }
4763   else
4764     {
4765       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4766       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4767       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4768           {
4769             /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4770                element 0.  */
4771             init_def = gimple_build_vector_from_val (&stmts, vectype,
4772                                                                neutral_op);
4773             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4774                                            vectype, init_def, init_val);
4775           }
4776       else
4777           {
4778             /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
4779             tree_vector_builder elts (vectype, 1, 2);
4780             elts.quick_push (init_val);
4781             elts.quick_push (neutral_op);
4782             init_def = gimple_build_vector (&stmts, &elts);
4783           }
4784     }
4785 
4786   if (stmts)
4787     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4788   return init_def;
4789 }
4790 
4791 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4792    which performs a reduction involving GROUP_SIZE scalar statements.
4793    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
4794    is nonnull, introducing extra elements of that value will not change the
4795    result.  */
4796 
4797 static void
get_initial_defs_for_reduction(loop_vec_info loop_vinfo,stmt_vec_info reduc_info,vec<tree> * vec_oprnds,unsigned int number_of_vectors,unsigned int group_size,tree neutral_op)4798 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4799                                         stmt_vec_info reduc_info,
4800                                         vec<tree> *vec_oprnds,
4801                                         unsigned int number_of_vectors,
4802                                         unsigned int group_size, tree neutral_op)
4803 {
4804   vec<tree> &initial_values = reduc_info->reduc_initial_values;
4805   unsigned HOST_WIDE_INT nunits;
4806   unsigned j, number_of_places_left_in_vector;
4807   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4808   unsigned int i;
4809 
4810   gcc_assert (group_size == initial_values.length () || neutral_op);
4811 
4812   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4813      created vectors. It is greater than 1 if unrolling is performed.
4814 
4815      For example, we have two scalar operands, s1 and s2 (e.g., group of
4816      strided accesses of size two), while NUNITS is four (i.e., four scalars
4817      of this type can be packed in a vector).  The output vector will contain
4818      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4819      will be 2).
4820 
4821      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4822      vectors containing the operands.
4823 
4824      For example, NUNITS is four as before, and the group size is 8
4825      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4826      {s5, s6, s7, s8}.  */
4827 
4828   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4829     nunits = group_size;
4830 
4831   number_of_places_left_in_vector = nunits;
4832   bool constant_p = true;
4833   tree_vector_builder elts (vector_type, nunits, 1);
4834   elts.quick_grow (nunits);
4835   gimple_seq ctor_seq = NULL;
4836   for (j = 0; j < nunits * number_of_vectors; ++j)
4837     {
4838       tree op;
4839       i = j % group_size;
4840 
4841       /* Get the def before the loop.  In reduction chain we have only
4842            one initial value.  Else we have as many as PHIs in the group.  */
4843       if (i >= initial_values.length () || (j > i && neutral_op))
4844           op = neutral_op;
4845       else
4846           op = initial_values[i];
4847 
4848       /* Create 'vect_ = {op0,op1,...,opn}'.  */
4849       number_of_places_left_in_vector--;
4850       elts[nunits - number_of_places_left_in_vector - 1] = op;
4851       if (!CONSTANT_CLASS_P (op))
4852           constant_p = false;
4853 
4854       if (number_of_places_left_in_vector == 0)
4855           {
4856             tree init;
4857             if (constant_p && !neutral_op
4858                 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4859                 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4860               /* Build the vector directly from ELTS.  */
4861               init = gimple_build_vector (&ctor_seq, &elts);
4862             else if (neutral_op)
4863               {
4864                 /* Build a vector of the neutral value and shift the
4865                      other elements into place.  */
4866                 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4867                                                                neutral_op);
4868                 int k = nunits;
4869                 while (k > 0 && elts[k - 1] == neutral_op)
4870                     k -= 1;
4871                 while (k > 0)
4872                     {
4873                       k -= 1;
4874                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4875                                                vector_type, init, elts[k]);
4876                     }
4877               }
4878             else
4879               {
4880                 /* First time round, duplicate ELTS to fill the
4881                      required number of vectors.  */
4882                 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4883                                                   elts, number_of_vectors, *vec_oprnds);
4884                 break;
4885               }
4886             vec_oprnds->quick_push (init);
4887 
4888             number_of_places_left_in_vector = nunits;
4889             elts.new_vector (vector_type, nunits, 1);
4890             elts.quick_grow (nunits);
4891             constant_p = true;
4892           }
4893     }
4894   if (ctor_seq != NULL)
4895     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4896 }
4897 
4898 /* For a statement STMT_INFO taking part in a reduction operation return
4899    the stmt_vec_info the meta information is stored on.  */
4900 
4901 stmt_vec_info
info_for_reduction(vec_info * vinfo,stmt_vec_info stmt_info)4902 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4903 {
4904   stmt_info = vect_orig_stmt (stmt_info);
4905   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4906   if (!is_a <gphi *> (stmt_info->stmt)
4907       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4908     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4909   gphi *phi = as_a <gphi *> (stmt_info->stmt);
4910   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4911     {
4912       if (gimple_phi_num_args (phi) == 1)
4913           stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4914     }
4915   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4916     {
4917       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4918       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4919           stmt_info = info;
4920     }
4921   return stmt_info;
4922 }
4923 
4924 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4925    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
4926    return false.  */
4927 
4928 static bool
vect_find_reusable_accumulator(loop_vec_info loop_vinfo,stmt_vec_info reduc_info)4929 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4930                                         stmt_vec_info reduc_info)
4931 {
4932   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4933   if (!main_loop_vinfo)
4934     return false;
4935 
4936   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4937     return false;
4938 
4939   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4940   auto_vec<tree, 16> main_loop_results (num_phis);
4941   auto_vec<tree, 16> initial_values (num_phis);
4942   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4943     {
4944       /* The epilogue loop can be entered either from the main loop or
4945            from an earlier guard block.  */
4946       edge skip_edge = loop_vinfo->skip_main_loop_edge;
4947       for (tree incoming_value : reduc_info->reduc_initial_values)
4948           {
4949             /* Look for:
4950 
4951                  INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4952                                             INITIAL_VALUE(guard block)>.  */
4953             gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4954 
4955             gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4956             gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4957 
4958             tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4959             tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4960 
4961             main_loop_results.quick_push (from_main_loop);
4962             initial_values.quick_push (from_skip);
4963           }
4964     }
4965   else
4966     /* The main loop dominates the epilogue loop.  */
4967     main_loop_results.splice (reduc_info->reduc_initial_values);
4968 
4969   /* See if the main loop has the kind of accumulator we need.  */
4970   vect_reusable_accumulator *accumulator
4971     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4972   if (!accumulator
4973       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4974       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4975                           accumulator->reduc_info->reduc_scalar_results.begin ()))
4976     return false;
4977 
4978   /* Handle the case where we can reduce wider vectors to narrower ones.  */
4979   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4980   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4981   unsigned HOST_WIDE_INT m;
4982   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4983                                   TYPE_VECTOR_SUBPARTS (vectype), &m))
4984     return false;
4985   /* Check the intermediate vector types and operations are available.  */
4986   tree prev_vectype = old_vectype;
4987   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
4988   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
4989     {
4990       intermediate_nunits = exact_div (intermediate_nunits, 2);
4991       tree intermediate_vectype = get_related_vectype_for_scalar_type
4992           (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
4993       if (!intermediate_vectype
4994             || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
4995                                             intermediate_vectype)
4996             || !can_vec_extract (TYPE_MODE (prev_vectype),
4997                                      TYPE_MODE (intermediate_vectype)))
4998           return false;
4999       prev_vectype = intermediate_vectype;
5000     }
5001 
5002   /* Non-SLP reductions might apply an adjustment after the reduction
5003      operation, in order to simplify the initialization of the accumulator.
5004      If the epilogue loop carries on from where the main loop left off,
5005      it should apply the same adjustment to the final reduction result.
5006 
5007      If the epilogue loop can also be entered directly (rather than via
5008      the main loop), we need to be able to handle that case in the same way,
5009      with the same adjustment.  (In principle we could add a PHI node
5010      to select the correct adjustment, but in practice that shouldn't be
5011      necessary.)  */
5012   tree main_adjustment
5013     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5014   if (loop_vinfo->main_loop_edge && main_adjustment)
5015     {
5016       gcc_assert (num_phis == 1);
5017       tree initial_value = initial_values[0];
5018       /* Check that we can use INITIAL_VALUE as the adjustment and
5019            initialize the accumulator with a neutral value instead.  */
5020       if (!operand_equal_p (initial_value, main_adjustment))
5021           return false;
5022       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5023       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5024                                                                 code, initial_value);
5025     }
5026   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5027   reduc_info->reduc_initial_values.truncate (0);
5028   reduc_info->reduc_initial_values.splice (initial_values);
5029   reduc_info->reused_accumulator = accumulator;
5030   return true;
5031 }
5032 
5033 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5034    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5035 
5036 static tree
vect_create_partial_epilog(tree vec_def,tree vectype,code_helper code,gimple_seq * seq)5037 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5038                                   gimple_seq *seq)
5039 {
5040   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5041   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5042   tree stype = TREE_TYPE (vectype);
5043   tree new_temp = vec_def;
5044   while (nunits > nunits1)
5045     {
5046       nunits /= 2;
5047       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5048                                                                          stype, nunits);
5049       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5050 
5051       /* The target has to make sure we support lowpart/highpart
5052            extraction, either via direct vector extract or through
5053            an integer mode punning.  */
5054       tree dst1, dst2;
5055       gimple *epilog_stmt;
5056       if (convert_optab_handler (vec_extract_optab,
5057                                          TYPE_MODE (TREE_TYPE (new_temp)),
5058                                          TYPE_MODE (vectype1))
5059             != CODE_FOR_nothing)
5060           {
5061             /* Extract sub-vectors directly once vec_extract becomes
5062                a conversion optab.  */
5063             dst1 = make_ssa_name (vectype1);
5064             epilog_stmt
5065                 = gimple_build_assign (dst1, BIT_FIELD_REF,
5066                                              build3 (BIT_FIELD_REF, vectype1,
5067                                                        new_temp, TYPE_SIZE (vectype1),
5068                                                        bitsize_int (0)));
5069             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5070             dst2 =  make_ssa_name (vectype1);
5071             epilog_stmt
5072                 = gimple_build_assign (dst2, BIT_FIELD_REF,
5073                                              build3 (BIT_FIELD_REF, vectype1,
5074                                                        new_temp, TYPE_SIZE (vectype1),
5075                                                        bitsize_int (bitsize)));
5076             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5077           }
5078       else
5079           {
5080             /* Extract via punning to appropriately sized integer mode
5081                vector.  */
5082             tree eltype = build_nonstandard_integer_type (bitsize, 1);
5083             tree etype = build_vector_type (eltype, 2);
5084             gcc_assert (convert_optab_handler (vec_extract_optab,
5085                                                        TYPE_MODE (etype),
5086                                                        TYPE_MODE (eltype))
5087                           != CODE_FOR_nothing);
5088             tree tem = make_ssa_name (etype);
5089             epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5090                                                        build1 (VIEW_CONVERT_EXPR,
5091                                                                  etype, new_temp));
5092             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5093             new_temp = tem;
5094             tem = make_ssa_name (eltype);
5095             epilog_stmt
5096                 = gimple_build_assign (tem, BIT_FIELD_REF,
5097                                              build3 (BIT_FIELD_REF, eltype,
5098                                                        new_temp, TYPE_SIZE (eltype),
5099                                                        bitsize_int (0)));
5100             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5101             dst1 = make_ssa_name (vectype1);
5102             epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5103                                                        build1 (VIEW_CONVERT_EXPR,
5104                                                                  vectype1, tem));
5105             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5106             tem = make_ssa_name (eltype);
5107             epilog_stmt
5108                 = gimple_build_assign (tem, BIT_FIELD_REF,
5109                                              build3 (BIT_FIELD_REF, eltype,
5110                                                        new_temp, TYPE_SIZE (eltype),
5111                                                        bitsize_int (bitsize)));
5112             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5113             dst2 =  make_ssa_name (vectype1);
5114             epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5115                                                        build1 (VIEW_CONVERT_EXPR,
5116                                                                  vectype1, tem));
5117             gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5118           }
5119 
5120       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5121     }
5122 
5123   return new_temp;
5124 }
5125 
5126 /* Function vect_create_epilog_for_reduction
5127 
5128    Create code at the loop-epilog to finalize the result of a reduction
5129    computation.
5130 
5131    STMT_INFO is the scalar reduction stmt that is being vectorized.
5132    SLP_NODE is an SLP node containing a group of reduction statements. The
5133      first one in this group is STMT_INFO.
5134    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5135    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5136      (counting from 0)
5137 
5138    This function:
5139    1. Completes the reduction def-use cycles.
5140    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5141       by calling the function specified by REDUC_FN if available, or by
5142       other means (whole-vector shifts or a scalar loop).
5143       The function also creates a new phi node at the loop exit to preserve
5144       loop-closed form, as illustrated below.
5145 
5146      The flow at the entry to this function:
5147 
5148         loop:
5149           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5150           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5151           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5152         loop_exit:
5153           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5154           use <s_out0>
5155           use <s_out0>
5156 
5157      The above is transformed by this function into:
5158 
5159         loop:
5160           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5161           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5162           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5163         loop_exit:
5164           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5165           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5166           v_out2 = reduce <v_out1>
5167           s_out3 = extract_field <v_out2, 0>
5168           s_out4 = adjust_result <s_out3>
5169           use <s_out4>
5170           use <s_out4>
5171 */
5172 
5173 static void
vect_create_epilog_for_reduction(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,slp_tree slp_node,slp_instance slp_node_instance)5174 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5175                                           stmt_vec_info stmt_info,
5176                                           slp_tree slp_node,
5177                                           slp_instance slp_node_instance)
5178 {
5179   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5180   gcc_assert (reduc_info->is_reduc_info);
5181   /* For double reductions we need to get at the inner loop reduction
5182      stmt which has the meta info attached.  Our stmt_info is that of the
5183      loop-closed PHI of the inner loop which we remember as
5184      def for the reduction PHI generation.  */
5185   bool double_reduc = false;
5186   stmt_vec_info rdef_info = stmt_info;
5187   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5188     {
5189       gcc_assert (!slp_node);
5190       double_reduc = true;
5191       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5192                                                       (stmt_info->stmt, 0));
5193       stmt_info = vect_stmt_to_vectorize (stmt_info);
5194     }
5195   gphi *reduc_def_stmt
5196     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5197   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5198   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5199   tree vectype;
5200   machine_mode mode;
5201   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5202   basic_block exit_bb;
5203   tree scalar_dest;
5204   tree scalar_type;
5205   gimple *new_phi = NULL, *phi;
5206   gimple_stmt_iterator exit_gsi;
5207   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5208   gimple *epilog_stmt = NULL;
5209   gimple *exit_phi;
5210   tree bitsize;
5211   tree def;
5212   tree orig_name, scalar_result;
5213   imm_use_iterator imm_iter, phi_imm_iter;
5214   use_operand_p use_p, phi_use_p;
5215   gimple *use_stmt;
5216   auto_vec<tree> reduc_inputs;
5217   int j, i;
5218   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5219   unsigned int group_size = 1, k;
5220   auto_vec<gimple *> phis;
5221   /* SLP reduction without reduction chain, e.g.,
5222      # a1 = phi <a2, a0>
5223      # b1 = phi <b2, b0>
5224      a2 = operation (a1)
5225      b2 = operation (b1)  */
5226   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5227   bool direct_slp_reduc;
5228   tree induction_index = NULL_TREE;
5229 
5230   if (slp_node)
5231     group_size = SLP_TREE_LANES (slp_node);
5232 
5233   if (nested_in_vect_loop_p (loop, stmt_info))
5234     {
5235       outer_loop = loop;
5236       loop = loop->inner;
5237       gcc_assert (!slp_node && double_reduc);
5238     }
5239 
5240   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5241   gcc_assert (vectype);
5242   mode = TYPE_MODE (vectype);
5243 
5244   tree induc_val = NULL_TREE;
5245   tree adjustment_def = NULL;
5246   if (slp_node)
5247     ;
5248   else
5249     {
5250       /* Optimize: for induction condition reduction, if we can't use zero
5251          for induc_val, use initial_def.  */
5252       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5253           induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5254       else if (double_reduc)
5255           ;
5256       else
5257           adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5258     }
5259 
5260   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5261   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5262   if (slp_reduc)
5263     /* All statements produce live-out values.  */
5264     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5265   else if (slp_node)
5266     {
5267       /* The last statement in the reduction chain produces the live-out
5268            value.  Note SLP optimization can shuffle scalar stmts to
5269            optimize permutations so we have to search for the last stmt.  */
5270       for (k = 0; k < group_size; ++k)
5271           if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5272             {
5273               single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5274               break;
5275             }
5276     }
5277 
5278   unsigned vec_num;
5279   int ncopies;
5280   if (slp_node)
5281     {
5282       vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5283       ncopies = 1;
5284     }
5285   else
5286     {
5287       stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5288       vec_num = 1;
5289       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5290     }
5291 
5292   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5293      which is updated with the current index of the loop for every match of
5294      the original loop's cond_expr (VEC_STMT).  This results in a vector
5295      containing the last time the condition passed for that vector lane.
5296      The first match will be a 1 to allow 0 to be used for non-matching
5297      indexes.  If there are no matches at all then the vector will be all
5298      zeroes.
5299 
5300      PR92772: This algorithm is broken for architectures that support
5301      masked vectors, but do not provide fold_extract_last.  */
5302   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5303     {
5304       auto_vec<std::pair<tree, bool>, 2> ccompares;
5305       stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5306       cond_info = vect_stmt_to_vectorize (cond_info);
5307       while (cond_info != reduc_info)
5308           {
5309             if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5310               {
5311                 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5312                 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5313                 ccompares.safe_push
5314                     (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5315                                          STMT_VINFO_REDUC_IDX (cond_info) == 2));
5316               }
5317             cond_info
5318               = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5319                                                              1 + STMT_VINFO_REDUC_IDX
5320                                                                       (cond_info)));
5321             cond_info = vect_stmt_to_vectorize (cond_info);
5322           }
5323       gcc_assert (ccompares.length () != 0);
5324 
5325       tree indx_before_incr, indx_after_incr;
5326       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5327       int scalar_precision
5328           = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5329       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5330       tree cr_index_vector_type = get_related_vectype_for_scalar_type
5331           (TYPE_MODE (vectype), cr_index_scalar_type,
5332            TYPE_VECTOR_SUBPARTS (vectype));
5333 
5334       /* First we create a simple vector induction variable which starts
5335            with the values {1,2,3,...} (SERIES_VECT) and increments by the
5336            vector size (STEP).  */
5337 
5338       /* Create a {1,2,3,...} vector.  */
5339       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5340 
5341       /* Create a vector of the step value.  */
5342       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5343       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5344 
5345       /* Create an induction variable.  */
5346       gimple_stmt_iterator incr_gsi;
5347       bool insert_after;
5348       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5349       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5350                      insert_after, &indx_before_incr, &indx_after_incr);
5351 
5352       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5353            filled with zeros (VEC_ZERO).  */
5354 
5355       /* Create a vector of 0s.  */
5356       tree zero = build_zero_cst (cr_index_scalar_type);
5357       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5358 
5359       /* Create a vector phi node.  */
5360       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5361       new_phi = create_phi_node (new_phi_tree, loop->header);
5362       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5363                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
5364 
5365       /* Now take the condition from the loops original cond_exprs
5366            and produce a new cond_exprs (INDEX_COND_EXPR) which for
5367            every match uses values from the induction variable
5368            (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5369            (NEW_PHI_TREE).
5370            Finally, we update the phi (NEW_PHI_TREE) to take the value of
5371            the new cond_expr (INDEX_COND_EXPR).  */
5372       gimple_seq stmts = NULL;
5373       for (int i = ccompares.length () - 1; i != -1; --i)
5374           {
5375             tree ccompare = ccompares[i].first;
5376             if (ccompares[i].second)
5377               new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5378                                                    cr_index_vector_type,
5379                                                    ccompare,
5380                                                    indx_before_incr, new_phi_tree);
5381             else
5382               new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5383                                                    cr_index_vector_type,
5384                                                    ccompare,
5385                                                    new_phi_tree, indx_before_incr);
5386           }
5387       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5388 
5389       /* Update the phi with the vec cond.  */
5390       induction_index = new_phi_tree;
5391       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5392                        loop_latch_edge (loop), UNKNOWN_LOCATION);
5393     }
5394 
5395   /* 2. Create epilog code.
5396         The reduction epilog code operates across the elements of the vector
5397         of partial results computed by the vectorized loop.
5398         The reduction epilog code consists of:
5399 
5400         step 1: compute the scalar result in a vector (v_out2)
5401         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5402         step 3: adjust the scalar result (s_out3) if needed.
5403 
5404         Step 1 can be accomplished using one the following three schemes:
5405           (scheme 1) using reduc_fn, if available.
5406           (scheme 2) using whole-vector shifts, if available.
5407           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5408                      combined.
5409 
5410           The overall epilog code looks like this:
5411 
5412           s_out0 = phi <s_loop>         # original EXIT_PHI
5413           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5414           v_out2 = reduce <v_out1>              # step 1
5415           s_out3 = extract_field <v_out2, 0>    # step 2
5416           s_out4 = adjust_result <s_out3>       # step 3
5417 
5418           (step 3 is optional, and steps 1 and 2 may be combined).
5419           Lastly, the uses of s_out0 are replaced by s_out4.  */
5420 
5421 
5422   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5423          v_out1 = phi <VECT_DEF>
5424          Store them in NEW_PHIS.  */
5425   if (double_reduc)
5426     loop = outer_loop;
5427   exit_bb = single_exit (loop)->dest;
5428   exit_gsi = gsi_after_labels (exit_bb);
5429   reduc_inputs.create (slp_node ? vec_num : ncopies);
5430   for (unsigned i = 0; i < vec_num; i++)
5431     {
5432       gimple_seq stmts = NULL;
5433       if (slp_node)
5434           def = vect_get_slp_vect_def (slp_node, i);
5435       else
5436           def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5437       for (j = 0; j < ncopies; j++)
5438           {
5439             tree new_def = copy_ssa_name (def);
5440             phi = create_phi_node (new_def, exit_bb);
5441             if (j)
5442               def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5443             SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5444             new_def = gimple_convert (&stmts, vectype, new_def);
5445             reduc_inputs.quick_push (new_def);
5446           }
5447       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5448     }
5449 
5450   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5451          (i.e. when reduc_fn is not available) and in the final adjustment
5452            code (if needed).  Also get the original scalar reduction variable as
5453          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5454          represents a reduction pattern), the tree-code and scalar-def are
5455          taken from the original stmt that the pattern-stmt (STMT) replaces.
5456          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5457          are taken from STMT.  */
5458 
5459   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5460   if (orig_stmt_info != stmt_info)
5461     {
5462       /* Reduction pattern  */
5463       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5464       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5465     }
5466 
5467   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5468   scalar_type = TREE_TYPE (scalar_dest);
5469   scalar_results.truncate (0);
5470   scalar_results.reserve_exact (group_size);
5471   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5472   bitsize = TYPE_SIZE (scalar_type);
5473 
5474   /* True if we should implement SLP_REDUC using native reduction operations
5475      instead of scalar operations.  */
5476   direct_slp_reduc = (reduc_fn != IFN_LAST
5477                           && slp_reduc
5478                           && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5479 
5480   /* In case of reduction chain, e.g.,
5481      # a1 = phi <a3, a0>
5482      a2 = operation (a1)
5483      a3 = operation (a2),
5484 
5485      we may end up with more than one vector result.  Here we reduce them
5486      to one vector.
5487 
5488      The same is true if we couldn't use a single defuse cycle.  */
5489   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5490       || direct_slp_reduc
5491       || ncopies > 1)
5492     {
5493       gimple_seq stmts = NULL;
5494       tree single_input = reduc_inputs[0];
5495       for (k = 1; k < reduc_inputs.length (); k++)
5496           single_input = gimple_build (&stmts, code, vectype,
5497                                              single_input, reduc_inputs[k]);
5498       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5499 
5500       reduc_inputs.truncate (0);
5501       reduc_inputs.safe_push (single_input);
5502     }
5503 
5504   tree orig_reduc_input = reduc_inputs[0];
5505 
5506   /* If this loop is an epilogue loop that can be skipped after the
5507      main loop, we can only share a reduction operation between the
5508      main loop and the epilogue if we put it at the target of the
5509      skip edge.
5510 
5511      We can still reuse accumulators if this check fails.  Doing so has
5512      the minor(?) benefit of making the epilogue loop's scalar result
5513      independent of the main loop's scalar result.  */
5514   bool unify_with_main_loop_p = false;
5515   if (reduc_info->reused_accumulator
5516       && loop_vinfo->skip_this_loop_edge
5517       && single_succ_p (exit_bb)
5518       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5519     {
5520       unify_with_main_loop_p = true;
5521 
5522       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5523       reduc_inputs[0] = make_ssa_name (vectype);
5524       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5525       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5526                        UNKNOWN_LOCATION);
5527       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5528                        loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5529       exit_gsi = gsi_after_labels (reduc_block);
5530     }
5531 
5532   /* Shouldn't be used beyond this point.  */
5533   exit_bb = nullptr;
5534 
5535   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5536       && reduc_fn != IFN_LAST)
5537     {
5538       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5539            various data values where the condition matched and another vector
5540            (INDUCTION_INDEX) containing all the indexes of those matches.  We
5541            need to extract the last matching index (which will be the index with
5542            highest value) and use this to index into the data vector.
5543            For the case where there were no matches, the data vector will contain
5544            all default values and the index vector will be all zeros.  */
5545 
5546       /* Get various versions of the type of the vector of indexes.  */
5547       tree index_vec_type = TREE_TYPE (induction_index);
5548       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5549       tree index_scalar_type = TREE_TYPE (index_vec_type);
5550       tree index_vec_cmp_type = truth_type_for (index_vec_type);
5551 
5552       /* Get an unsigned integer version of the type of the data vector.  */
5553       int scalar_precision
5554           = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5555       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5556       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5557                                                             vectype);
5558 
5559       /* First we need to create a vector (ZERO_VEC) of zeros and another
5560            vector (MAX_INDEX_VEC) filled with the last matching index, which we
5561            can create using a MAX reduction and then expanding.
5562            In the case where the loop never made any matches, the max index will
5563            be zero.  */
5564 
5565       /* Vector of {0, 0, 0,...}.  */
5566       tree zero_vec = build_zero_cst (vectype);
5567 
5568       /* Find maximum value from the vector of found indexes.  */
5569       tree max_index = make_ssa_name (index_scalar_type);
5570       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5571                                                                         1, induction_index);
5572       gimple_call_set_lhs (max_index_stmt, max_index);
5573       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5574 
5575       /* Vector of {max_index, max_index, max_index,...}.  */
5576       tree max_index_vec = make_ssa_name (index_vec_type);
5577       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5578                                                                   max_index);
5579       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5580                                                                       max_index_vec_rhs);
5581       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5582 
5583       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5584            with the vector (INDUCTION_INDEX) of found indexes, choosing values
5585            from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5586            otherwise.  Only one value should match, resulting in a vector
5587            (VEC_COND) with one data value and the rest zeros.
5588            In the case where the loop never made any matches, every index will
5589            match, resulting in a vector with all data values (which will all be
5590            the default value).  */
5591 
5592       /* Compare the max index vector to the vector of found indexes to find
5593            the position of the max value.  */
5594       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5595       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5596                                                                   induction_index,
5597                                                                   max_index_vec);
5598       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5599 
5600       /* Use the compare to choose either values from the data vector or
5601            zero.  */
5602       tree vec_cond = make_ssa_name (vectype);
5603       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5604                                                                vec_compare,
5605                                                                reduc_inputs[0],
5606                                                                zero_vec);
5607       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5608 
5609       /* Finally we need to extract the data value from the vector (VEC_COND)
5610            into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5611            reduction, but because this doesn't exist, we can use a MAX reduction
5612            instead.  The data value might be signed or a float so we need to cast
5613            it first.
5614            In the case where the loop never made any matches, the data values are
5615            all identical, and so will reduce down correctly.  */
5616 
5617       /* Make the matched data values unsigned.  */
5618       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5619       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5620                                                vec_cond);
5621       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5622                                                                       VIEW_CONVERT_EXPR,
5623                                                                       vec_cond_cast_rhs);
5624       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5625 
5626       /* Reduce down to a scalar value.  */
5627       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5628       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5629                                                                          1, vec_cond_cast);
5630       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5631       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5632 
5633       /* Convert the reduced value back to the result type and set as the
5634            result.  */
5635       gimple_seq stmts = NULL;
5636       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5637                                      data_reduc);
5638       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5639       scalar_results.safe_push (new_temp);
5640     }
5641   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5642              && reduc_fn == IFN_LAST)
5643     {
5644       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5645            idx = 0;
5646          idx_val = induction_index[0];
5647            val = data_reduc[0];
5648          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5649              if (induction_index[i] > idx_val)
5650                val = data_reduc[i], idx_val = induction_index[i];
5651            return val;  */
5652 
5653       tree data_eltype = TREE_TYPE (vectype);
5654       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5655       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5656       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5657       /* Enforced by vectorizable_reduction, which ensures we have target
5658            support before allowing a conditional reduction on variable-length
5659            vectors.  */
5660       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5661       tree idx_val = NULL_TREE, val = NULL_TREE;
5662       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5663           {
5664             tree old_idx_val = idx_val;
5665             tree old_val = val;
5666             idx_val = make_ssa_name (idx_eltype);
5667             epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5668                                                        build3 (BIT_FIELD_REF, idx_eltype,
5669                                                                  induction_index,
5670                                                                  bitsize_int (el_size),
5671                                                                  bitsize_int (off)));
5672             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5673             val = make_ssa_name (data_eltype);
5674             epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5675                                                        build3 (BIT_FIELD_REF,
5676                                                                  data_eltype,
5677                                                                  reduc_inputs[0],
5678                                                                  bitsize_int (el_size),
5679                                                                  bitsize_int (off)));
5680             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5681             if (off != 0)
5682               {
5683                 tree new_idx_val = idx_val;
5684                 if (off != v_size - el_size)
5685                     {
5686                       new_idx_val = make_ssa_name (idx_eltype);
5687                       epilog_stmt = gimple_build_assign (new_idx_val,
5688                                                                  MAX_EXPR, idx_val,
5689                                                                  old_idx_val);
5690                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5691                     }
5692                 tree new_val = make_ssa_name (data_eltype);
5693                 epilog_stmt = gimple_build_assign (new_val,
5694                                                              COND_EXPR,
5695                                                              build2 (GT_EXPR,
5696                                                                        boolean_type_node,
5697                                                                        idx_val,
5698                                                                        old_idx_val),
5699                                                              val, old_val);
5700                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5701                 idx_val = new_idx_val;
5702                 val = new_val;
5703               }
5704           }
5705       /* Convert the reduced value back to the result type and set as the
5706            result.  */
5707       gimple_seq stmts = NULL;
5708       val = gimple_convert (&stmts, scalar_type, val);
5709       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5710       scalar_results.safe_push (val);
5711     }
5712 
5713   /* 2.3 Create the reduction code, using one of the three schemes described
5714          above. In SLP we simply need to extract all the elements from the
5715          vector (without reducing them), so we use scalar shifts.  */
5716   else if (reduc_fn != IFN_LAST && !slp_reduc)
5717     {
5718       tree tmp;
5719       tree vec_elem_type;
5720 
5721       /* Case 1:  Create:
5722          v_out2 = reduc_expr <v_out1>  */
5723 
5724       if (dump_enabled_p ())
5725         dump_printf_loc (MSG_NOTE, vect_location,
5726                                "Reduce using direct vector reduction.\n");
5727 
5728       gimple_seq stmts = NULL;
5729       vec_elem_type = TREE_TYPE (vectype);
5730       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5731                                      vec_elem_type, reduc_inputs[0]);
5732       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5733       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5734 
5735       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5736             && induc_val)
5737           {
            /* Earlier we set the initial value to be a vector of induc_val
5739                values.  Check the result and if it is induc_val then replace
5740                with the original initial value, unless induc_val is
5741                the same as initial_def already.  */
5742             tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5743                                           induc_val);
5744             tree initial_def = reduc_info->reduc_initial_values[0];
5745 
5746             tmp = make_ssa_name (new_scalar_dest);
5747             epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5748                                                        initial_def, new_temp);
5749             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750             new_temp = tmp;
5751           }
5752 
5753       scalar_results.safe_push (new_temp);
5754     }
5755   else if (direct_slp_reduc)
5756     {
5757       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5758            with the elements for other SLP statements replaced with the
5759            neutral value.  We can then do a normal reduction on each vector.  */
5760 
5761       /* Enforced by vectorizable_reduction.  */
5762       gcc_assert (reduc_inputs.length () == 1);
5763       gcc_assert (pow2p_hwi (group_size));
5764 
5765       gimple_seq seq = NULL;
5766 
5767       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5768            and the same element size as VECTYPE.  */
5769       tree index = build_index_vector (vectype, 0, 1);
5770       tree index_type = TREE_TYPE (index);
5771       tree index_elt_type = TREE_TYPE (index_type);
5772       tree mask_type = truth_type_for (index_type);
5773 
5774       /* Create a vector that, for each element, identifies which of
5775            the REDUC_GROUP_SIZE results should use it.  */
5776       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5777       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5778                                   build_vector_from_val (index_type, index_mask));
5779 
5780       /* Get a neutral vector value.  This is simply a splat of the neutral
5781            scalar value if we have one, otherwise the initial scalar value
5782            is itself a neutral value.  */
5783       tree vector_identity = NULL_TREE;
5784       tree neutral_op = NULL_TREE;
5785       if (slp_node)
5786           {
5787             tree initial_value = NULL_TREE;
5788             if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5789               initial_value = reduc_info->reduc_initial_values[0];
5790             neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5791                                                              initial_value);
5792           }
5793       if (neutral_op)
5794           vector_identity = gimple_build_vector_from_val (&seq, vectype,
5795                                                                       neutral_op);
5796       for (unsigned int i = 0; i < group_size; ++i)
5797           {
            /* If there's no universal neutral value, we can use the
5799                initial scalar value from the original PHI.  This is used
5800                for MIN and MAX reduction, for example.  */
5801             if (!neutral_op)
5802               {
5803                 tree scalar_value = reduc_info->reduc_initial_values[i];
5804                 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5805                                                        scalar_value);
5806                 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5807                                                                             scalar_value);
5808               }
5809 
5810             /* Calculate the equivalent of:
5811 
5812                sel[j] = (index[j] == i);
5813 
5814                which selects the elements of REDUC_INPUTS[0] that should
5815                be included in the result.  */
5816             tree compare_val = build_int_cst (index_elt_type, i);
5817             compare_val = build_vector_from_val (index_type, compare_val);
5818             tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5819                                            index, compare_val);
5820 
5821             /* Calculate the equivalent of:
5822 
               vec = sel ? reduc_inputs[0] : vector_identity;
5824 
5825                VEC is now suitable for a full vector reduction.  */
5826             tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5827                                            sel, reduc_inputs[0], vector_identity);
5828 
5829             /* Do the reduction and convert it to the appropriate type.  */
5830             tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5831                                               TREE_TYPE (vectype), vec);
5832             scalar = gimple_convert (&seq, scalar_type, scalar);
5833             scalar_results.safe_push (scalar);
5834           }
5835       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5836     }
5837   else
5838     {
5839       bool reduce_with_shift;
5840       tree vec_temp;
5841 
5842       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5843 
5844       /* See if the target wants to do the final (shift) reduction
5845            in a vector mode of smaller size and first reduce upper/lower
5846            halves against each other.  */
5847       enum machine_mode mode1 = mode;
5848       tree stype = TREE_TYPE (vectype);
5849       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5850       unsigned nunits1 = nunits;
5851       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5852             && reduc_inputs.length () == 1)
5853           {
5854             nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5855             /* For SLP reductions we have to make sure lanes match up, but
5856                since we're doing individual element final reduction reducing
5857                vector width here is even more important.
5858                ???  We can also separate lanes with permutes, for the common
5859                case of power-of-two group-size odd/even extracts would work.  */
5860             if (slp_reduc && nunits != nunits1)
5861               {
5862                 nunits1 = least_common_multiple (nunits1, group_size);
5863                 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5864               }
5865           }
5866       if (!slp_reduc
5867             && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5868           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5869 
5870       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5871                                                                          stype, nunits1);
5872       reduce_with_shift = have_whole_vector_shift (mode1);
5873       if (!VECTOR_MODE_P (mode1)
5874             || !directly_supported_p (code, vectype1))
5875           reduce_with_shift = false;
5876 
5877       /* First reduce the vector to the desired vector size we should
5878            do shift reduction on by combining upper and lower halves.  */
5879       gimple_seq stmts = NULL;
5880       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5881                                                        code, &stmts);
5882       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5883       reduc_inputs[0] = new_temp;
5884 
5885       if (reduce_with_shift && !slp_reduc)
5886           {
5887             int element_bitsize = tree_to_uhwi (bitsize);
5888             /* Enforced by vectorizable_reduction, which disallows SLP reductions
5889                for variable-length vectors and also requires direct target support
5890                for loop reductions.  */
5891             int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5892             int nelements = vec_size_in_bits / element_bitsize;
5893             vec_perm_builder sel;
5894             vec_perm_indices indices;
5895 
5896           int elt_offset;
5897 
5898           tree zero_vec = build_zero_cst (vectype1);
5899           /* Case 2: Create:
5900              for (offset = nelements/2; offset >= 1; offset/=2)
5901                 {
5902                   Create:  va' = vec_shift <va, offset>
5903                   Create:  va = vop <va, va'>
5904                 }  */
5905 
5906           tree rhs;
5907 
5908           if (dump_enabled_p ())
5909             dump_printf_loc (MSG_NOTE, vect_location,
5910                                    "Reduce using vector shifts\n");
5911 
5912             gimple_seq stmts = NULL;
5913             new_temp = gimple_convert (&stmts, vectype1, new_temp);
5914           for (elt_offset = nelements / 2;
5915                elt_offset >= 1;
5916                elt_offset /= 2)
5917             {
5918                 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5919                 indices.new_vector (sel, 2, nelements);
5920                 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5921                 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5922                                                new_temp, zero_vec, mask);
5923                 new_temp = gimple_build (&stmts, code,
5924                                                vectype1, new_name, new_temp);
5925             }
5926             gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5927 
5928             /* 2.4  Extract the final scalar result.  Create:
5929                s_out3 = extract_field <v_out2, bitpos>  */
5930 
5931             if (dump_enabled_p ())
5932               dump_printf_loc (MSG_NOTE, vect_location,
5933                                    "extract scalar result\n");
5934 
5935             rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5936                               bitsize, bitsize_zero_node);
5937             epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5938             new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5939             gimple_assign_set_lhs (epilog_stmt, new_temp);
5940             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5941             scalar_results.safe_push (new_temp);
5942         }
5943       else
5944         {
5945           /* Case 3: Create:
5946              s = extract_field <v_out2, 0>
5947              for (offset = element_size;
5948                   offset < vector_size;
5949                   offset += element_size;)
5950                {
5951                  Create:  s' = extract_field <v_out2, offset>
5952                  Create:  s = op <s, s'>  // For non SLP cases
5953                }  */
5954 
5955           if (dump_enabled_p ())
5956             dump_printf_loc (MSG_NOTE, vect_location,
5957                                    "Reduce using scalar code.\n");
5958 
5959             int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5960             int element_bitsize = tree_to_uhwi (bitsize);
5961             tree compute_type = TREE_TYPE (vectype);
5962             gimple_seq stmts = NULL;
5963             FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5964             {
5965               int bit_offset;
5966                 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5967                                                vec_temp, bitsize, bitsize_zero_node);
5968 
5969               /* In SLP we don't need to apply reduction operation, so we just
5970                  collect s' values in SCALAR_RESULTS.  */
5971               if (slp_reduc)
5972                 scalar_results.safe_push (new_temp);
5973 
5974               for (bit_offset = element_bitsize;
5975                    bit_offset < vec_size_in_bits;
5976                    bit_offset += element_bitsize)
5977                 {
5978                   tree bitpos = bitsize_int (bit_offset);
5979                       new_name = gimple_build (&stmts, BIT_FIELD_REF,
5980                                                      compute_type, vec_temp,
5981                                                      bitsize, bitpos);
5982                   if (slp_reduc)
5983                     {
5984                       /* In SLP we don't need to apply reduction operation, so
5985                          we just collect s' values in SCALAR_RESULTS.  */
5986                       new_temp = new_name;
5987                       scalar_results.safe_push (new_name);
5988                     }
5989                   else
5990                         new_temp = gimple_build (&stmts, code, compute_type,
5991                                                        new_name, new_temp);
5992                 }
5993             }
5994 
5995           /* The only case where we need to reduce scalar results in SLP, is
5996              unrolling.  If the size of SCALAR_RESULTS is greater than
5997              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5998              REDUC_GROUP_SIZE.  */
5999           if (slp_reduc)
6000             {
6001               tree res, first_res, new_res;
6002 
6003               /* Reduce multiple scalar results in case of SLP unrolling.  */
6004               for (j = group_size; scalar_results.iterate (j, &res);
6005                    j++)
6006                 {
6007                   first_res = scalar_results[j % group_size];
6008                       new_res = gimple_build (&stmts, code, compute_type,
6009                                                     first_res, res);
6010                   scalar_results[j % group_size] = new_res;
6011                 }
6012                 scalar_results.truncate (group_size);
6013                 for (k = 0; k < group_size; k++)
6014                     scalar_results[k] = gimple_convert (&stmts, scalar_type,
6015                                                                 scalar_results[k]);
6016             }
6017           else
6018               {
6019                 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6020                 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6021                 scalar_results.safe_push (new_temp);
6022               }
6023 
6024             gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6025         }
6026 
6027       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6028             && induc_val)
6029           {
            /* Earlier we set the initial value to be a vector of induc_val
6031                values.  Check the result and if it is induc_val then replace
6032                with the original initial value, unless induc_val is
6033                the same as initial_def already.  */
6034             tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
6035                                           induc_val);
6036             tree initial_def = reduc_info->reduc_initial_values[0];
6037 
6038             tree tmp = make_ssa_name (new_scalar_dest);
6039             epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6040                                                        initial_def, new_temp);
6041             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6042             scalar_results[0] = tmp;
6043           }
6044     }
6045 
6046   /* 2.5 Adjust the final result by the initial value of the reduction
6047            variable. (When such adjustment is not needed, then
6048            'adjustment_def' is zero).  For example, if code is PLUS we create:
6049            new_temp = loop_exit_def + adjustment_def  */
6050 
6051   if (adjustment_def)
6052     {
6053       gcc_assert (!slp_reduc);
6054       gimple_seq stmts = NULL;
6055       if (double_reduc)
6056           {
6057             gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6058             adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6059             new_temp = gimple_build (&stmts, code, vectype,
6060                                            reduc_inputs[0], adjustment_def);
6061           }
6062       else
6063           {
6064           new_temp = scalar_results[0];
6065             gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6066             adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6067                                                      adjustment_def);
6068             new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6069             new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6070                                            new_temp, adjustment_def);
6071             new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6072           }
6073 
6074       epilog_stmt = gimple_seq_last_stmt (stmts);
6075       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6076       scalar_results[0] = new_temp;
6077     }
6078 
6079   /* Record this operation if it could be reused by the epilogue loop.  */
6080   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6081       && vec_num == 1)
6082     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6083                                                      { orig_reduc_input, reduc_info });
6084 
6085   if (double_reduc)
6086     loop = outer_loop;
6087 
6088   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6089           phis with new adjusted scalar results, i.e., replace use <s_out0>
6090           with use <s_out4>.
6091 
6092      Transform:
6093         loop_exit:
6094           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6095           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6096           v_out2 = reduce <v_out1>
6097           s_out3 = extract_field <v_out2, 0>
6098           s_out4 = adjust_result <s_out3>
6099           use <s_out0>
6100           use <s_out0>
6101 
6102      into:
6103 
6104         loop_exit:
6105           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6106           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6107           v_out2 = reduce <v_out1>
6108           s_out3 = extract_field <v_out2, 0>
6109           s_out4 = adjust_result <s_out3>
6110           use <s_out4>
6111           use <s_out4> */
6112 
6113   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6114   for (k = 0; k < live_out_stmts.size (); k++)
6115     {
6116       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6117       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6118 
6119       phis.create (3);
6120       /* Find the loop-closed-use at the loop exit of the original scalar
6121          result.  (The reduction result is expected to have two immediate uses,
6122          one at the latch block, and one at the loop exit).  For double
6123          reductions we are looking for exit phis of the outer loop.  */
6124       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6125         {
6126           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6127               {
6128                 if (!is_gimple_debug (USE_STMT (use_p)))
6129                     phis.safe_push (USE_STMT (use_p));
6130               }
6131           else
6132             {
6133               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6134                 {
6135                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6136 
6137                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6138                     {
6139                       if (!flow_bb_inside_loop_p (loop,
6140                                              gimple_bb (USE_STMT (phi_use_p)))
6141                                 && !is_gimple_debug (USE_STMT (phi_use_p)))
6142                         phis.safe_push (USE_STMT (phi_use_p));
6143                     }
6144                 }
6145             }
6146         }
6147 
6148       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6149         {
6150           /* Replace the uses:  */
6151           orig_name = PHI_RESULT (exit_phi);
6152 
6153             /* Look for a single use at the target of the skip edge.  */
6154             if (unify_with_main_loop_p)
6155               {
6156                 use_operand_p use_p;
6157                 gimple *user;
6158                 if (!single_imm_use (orig_name, &use_p, &user))
6159                     gcc_unreachable ();
6160                 orig_name = gimple_get_lhs (user);
6161               }
6162 
6163           scalar_result = scalar_results[k];
6164           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6165               {
6166                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6167                     SET_USE (use_p, scalar_result);
6168                 update_stmt (use_stmt);
6169               }
6170         }
6171 
6172       phis.release ();
6173     }
6174 }
6175 
6176 /* Return a vector of type VECTYPE that is equal to the vector select
6177    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6178    before GSI.  */
6179 
6180 static tree
merge_with_identity(gimple_stmt_iterator * gsi,tree mask,tree vectype,tree vec,tree identity)6181 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6182                          tree vec, tree identity)
6183 {
6184   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6185   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6186                                                     mask, vec, identity);
6187   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6188   return cond;
6189 }
6190 
6191 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6192    order, starting with LHS.  Insert the extraction statements before GSI and
6193    associate the new scalar SSA names with variable SCALAR_DEST.
6194    Return the SSA name for the result.  */
6195 
6196 static tree
vect_expand_fold_left(gimple_stmt_iterator * gsi,tree scalar_dest,tree_code code,tree lhs,tree vector_rhs)6197 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6198                            tree_code code, tree lhs, tree vector_rhs)
6199 {
6200   tree vectype = TREE_TYPE (vector_rhs);
6201   tree scalar_type = TREE_TYPE (vectype);
6202   tree bitsize = TYPE_SIZE (scalar_type);
6203   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6204   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6205 
6206   for (unsigned HOST_WIDE_INT bit_offset = 0;
6207        bit_offset < vec_size_in_bits;
6208        bit_offset += element_bitsize)
6209     {
6210       tree bitpos = bitsize_int (bit_offset);
6211       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6212                                bitsize, bitpos);
6213 
6214       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6215       rhs = make_ssa_name (scalar_dest, stmt);
6216       gimple_assign_set_lhs (stmt, rhs);
6217       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6218 
6219       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6220       tree new_name = make_ssa_name (scalar_dest, stmt);
6221       gimple_assign_set_lhs (stmt, new_name);
6222       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6223       lhs = new_name;
6224     }
6225   return lhs;
6226 }
6227 
6228 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6229    type of the vector input.  */
6230 
6231 static internal_fn
get_masked_reduction_fn(internal_fn reduc_fn,tree vectype_in)6232 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6233 {
6234   internal_fn mask_reduc_fn;
6235 
6236   switch (reduc_fn)
6237     {
6238     case IFN_FOLD_LEFT_PLUS:
6239       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6240       break;
6241 
6242     default:
6243       return IFN_LAST;
6244     }
6245 
6246   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6247                                               OPTIMIZE_FOR_SPEED))
6248     return mask_reduc_fn;
6249   return IFN_LAST;
6250 }
6251 
6252 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
6253    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6254    statement.  CODE is the operation performed by STMT_INFO and OPS are
6255    its scalar operands.  REDUC_INDEX is the index of the operand in
6256    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6257    implements in-order reduction, or IFN_LAST if we should open-code it.
6258    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6259    that should be used to control the operation in a fully-masked loop.  */
6260 
6261 static bool
vectorize_fold_left_reduction(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node,gimple * reduc_def_stmt,tree_code code,internal_fn reduc_fn,tree ops[3],tree vectype_in,int reduc_index,vec_loop_masks * masks)6262 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6263                                      stmt_vec_info stmt_info,
6264                                      gimple_stmt_iterator *gsi,
6265                                      gimple **vec_stmt, slp_tree slp_node,
6266                                      gimple *reduc_def_stmt,
6267                                      tree_code code, internal_fn reduc_fn,
6268                                      tree ops[3], tree vectype_in,
6269                                      int reduc_index, vec_loop_masks *masks)
6270 {
6271   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6272   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6273   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6274 
6275   int ncopies;
6276   if (slp_node)
6277     ncopies = 1;
6278   else
6279     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6280 
6281   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6282   gcc_assert (ncopies == 1);
6283   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6284 
6285   if (slp_node)
6286     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6287                                 TYPE_VECTOR_SUBPARTS (vectype_in)));
6288 
6289   tree op0 = ops[1 - reduc_index];
6290 
6291   int group_size = 1;
6292   stmt_vec_info scalar_dest_def_info;
6293   auto_vec<tree> vec_oprnds0;
6294   if (slp_node)
6295     {
6296       auto_vec<vec<tree> > vec_defs (2);
6297       vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6298       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6299       vec_defs[0].release ();
6300       vec_defs[1].release ();
6301       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6302       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6303     }
6304   else
6305     {
6306       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6307                                              op0, &vec_oprnds0);
6308       scalar_dest_def_info = stmt_info;
6309     }
6310 
6311   tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6312   tree scalar_type = TREE_TYPE (scalar_dest);
6313   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6314 
6315   int vec_num = vec_oprnds0.length ();
6316   gcc_assert (vec_num == 1 || slp_node);
6317   tree vec_elem_type = TREE_TYPE (vectype_out);
6318   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6319 
6320   tree vector_identity = NULL_TREE;
6321   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6322     vector_identity = build_zero_cst (vectype_out);
6323 
6324   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6325   int i;
6326   tree def0;
6327   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6328     {
6329       gimple *new_stmt;
6330       tree mask = NULL_TREE;
6331       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6332           mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6333 
6334       /* Handle MINUS by adding the negative.  */
6335       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6336           {
6337             tree negated = make_ssa_name (vectype_out);
6338             new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6339             gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6340             def0 = negated;
6341           }
6342 
6343       if (mask && mask_reduc_fn == IFN_LAST)
6344           def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6345                                             vector_identity);
6346 
6347       /* On the first iteration the input is simply the scalar phi
6348            result, and for subsequent iterations it is the output of
6349            the preceding operation.  */
6350       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6351           {
6352             if (mask && mask_reduc_fn != IFN_LAST)
6353               new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6354                                                                def0, mask);
6355             else
6356               new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6357                                                                def0);
6358             /* For chained SLP reductions the output of the previous reduction
6359                operation serves as the input of the next. For the final statement
6360                the output cannot be a temporary - we reuse the original
6361                scalar destination of the last statement.  */
6362             if (i != vec_num - 1)
6363               {
6364                 gimple_set_lhs (new_stmt, scalar_dest_var);
6365                 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6366                 gimple_set_lhs (new_stmt, reduc_var);
6367               }
6368           }
6369       else
6370           {
6371             reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6372                                                        reduc_var, def0);
6373             new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6374             /* Remove the statement, so that we can use the same code paths
6375                as for statements that we've just created.  */
6376             gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6377             gsi_remove (&tmp_gsi, true);
6378           }
6379 
6380       if (i == vec_num - 1)
6381           {
6382             gimple_set_lhs (new_stmt, scalar_dest);
6383             vect_finish_replace_stmt (loop_vinfo,
6384                                             scalar_dest_def_info,
6385                                             new_stmt);
6386           }
6387       else
6388           vect_finish_stmt_generation (loop_vinfo,
6389                                              scalar_dest_def_info,
6390                                              new_stmt, gsi);
6391 
6392       if (slp_node)
6393           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6394       else
6395           {
6396             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6397             *vec_stmt = new_stmt;
6398           }
6399     }
6400 
6401   return true;
6402 }
6403 
6404 /* Function is_nonwrapping_integer_induction.
6405 
6406    Check if STMT_VINO (which is part of loop LOOP) both increments and
6407    does not cause overflow.  */
6408 
6409 static bool
is_nonwrapping_integer_induction(stmt_vec_info stmt_vinfo,class loop * loop)6410 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6411 {
6412   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6413   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6414   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6415   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6416   widest_int ni, max_loop_value, lhs_max;
6417   wi::overflow_type overflow = wi::OVF_NONE;
6418 
6419   /* Make sure the loop is integer based.  */
6420   if (TREE_CODE (base) != INTEGER_CST
6421       || TREE_CODE (step) != INTEGER_CST)
6422     return false;
6423 
6424   /* Check that the max size of the loop will not wrap.  */
6425 
6426   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6427     return true;
6428 
6429   if (! max_stmt_executions (loop, &ni))
6430     return false;
6431 
6432   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6433                                   &overflow);
6434   if (overflow)
6435     return false;
6436 
6437   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6438                                   TYPE_SIGN (lhs_type), &overflow);
6439   if (overflow)
6440     return false;
6441 
6442   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6443             <= TYPE_PRECISION (lhs_type));
6444 }
6445 
6446 /* Check if masking can be supported by inserting a conditional expression.
6447    CODE is the code for the operation.  COND_FN is the conditional internal
6448    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
6449 static bool
use_mask_by_cond_expr_p(code_helper code,internal_fn cond_fn,tree vectype_in)6450 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6451                                tree vectype_in)
6452 {
6453   if (cond_fn != IFN_LAST
6454       && direct_internal_fn_supported_p (cond_fn, vectype_in,
6455                                                    OPTIMIZE_FOR_SPEED))
6456     return false;
6457 
6458   if (code.is_tree_code ())
6459     switch (tree_code (code))
6460       {
6461       case DOT_PROD_EXPR:
6462       case SAD_EXPR:
6463           return true;
6464 
6465       default:
6466           break;
6467       }
6468   return false;
6469 }
6470 
6471 /* Insert a conditional expression to enable masked vectorization.  CODE is the
6472    code for the operation.  VOP is the array of operands.  MASK is the loop
6473    mask.  GSI is a statement iterator used to place the new conditional
6474    expression.  */
6475 static void
build_vect_cond_expr(code_helper code,tree vop[3],tree mask,gimple_stmt_iterator * gsi)6476 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6477                           gimple_stmt_iterator *gsi)
6478 {
6479   switch (tree_code (code))
6480     {
6481     case DOT_PROD_EXPR:
6482       {
6483           tree vectype = TREE_TYPE (vop[1]);
6484           tree zero = build_zero_cst (vectype);
6485           tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6486           gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6487                                                          mask, vop[1], zero);
6488           gsi_insert_before (gsi, select, GSI_SAME_STMT);
6489           vop[1] = masked_op1;
6490           break;
6491       }
6492 
6493     case SAD_EXPR:
6494       {
6495           tree vectype = TREE_TYPE (vop[1]);
6496           tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6497           gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6498                                                          mask, vop[1], vop[0]);
6499           gsi_insert_before (gsi, select, GSI_SAME_STMT);
6500           vop[1] = masked_op1;
6501           break;
6502       }
6503 
6504     default:
6505       gcc_unreachable ();
6506     }
6507 }
6508 
6509 /* Function vectorizable_reduction.
6510 
6511    Check if STMT_INFO performs a reduction operation that can be vectorized.
6512    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6513    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6514    Return true if STMT_INFO is vectorizable in this way.
6515 
6516    This function also handles reduction idioms (patterns) that have been
6517    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
6518    may be of this form:
6519      X = pattern_expr (arg0, arg1, ..., X)
6520    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6521    sequence that had been detected and replaced by the pattern-stmt
6522    (STMT_INFO).
6523 
6524    This function also handles reduction of condition expressions, for example:
6525      for (int i = 0; i < N; i++)
6526        if (a[i] < value)
6527            last = a[i];
6528    This is handled by vectorising the loop and creating an additional vector
6529    containing the loop indexes for which "a[i] < value" was true.  In the
6530    function epilogue this is reduced to a single max value and then used to
6531    index into the vector of results.
6532 
6533    In some cases of reduction patterns, the type of the reduction variable X is
6534    different than the type of the other arguments of STMT_INFO.
6535    In such cases, the vectype that is used when transforming STMT_INFO into
6536    a vector stmt is different than the vectype that is used to determine the
6537    vectorization factor, because it consists of a different number of elements
6538    than the actual number of elements that are being operated upon in parallel.
6539 
6540    For example, consider an accumulation of shorts into an int accumulator.
6541    On some targets it's possible to vectorize this pattern operating on 8
6542    shorts at a time (hence, the vectype for purposes of determining the
6543    vectorization factor should be V8HI); on the other hand, the vectype that
6544    is used to create the vector form is actually V4SI (the type of the result).
6545 
6546    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6547    indicates what is the actual level of parallelism (V8HI in the example), so
6548    that the right vectorization factor would be derived.  This vectype
6549    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6550    be used to create the vectorized stmt.  The right vectype for the vectorized
6551    stmt is obtained from the type of the result X:
6552       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6553 
6554    This means that, contrary to "regular" reductions (or "regular" stmts in
6555    general), the following equation:
6556       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6557    does *NOT* necessarily hold for reduction patterns.  */
6558 
6559 bool
vectorizable_reduction(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,slp_tree slp_node,slp_instance slp_node_instance,stmt_vector_for_cost * cost_vec)6560 vectorizable_reduction (loop_vec_info loop_vinfo,
6561                               stmt_vec_info stmt_info, slp_tree slp_node,
6562                               slp_instance slp_node_instance,
6563                               stmt_vector_for_cost *cost_vec)
6564 {
6565   tree vectype_in = NULL_TREE;
6566   tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
6567   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6568   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6569   stmt_vec_info cond_stmt_vinfo = NULL;
6570   int i;
6571   int ncopies;
6572   bool single_defuse_cycle = false;
6573   bool nested_cycle = false;
6574   bool double_reduc = false;
6575   int vec_num;
6576   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6577   tree cond_reduc_val = NULL_TREE;
6578 
6579   /* Make sure it was already recognized as a reduction computation.  */
6580   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6581       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6582       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6583     return false;
6584 
6585   /* The stmt we store reduction analysis meta on.  */
6586   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6587   reduc_info->is_reduc_info = true;
6588 
6589   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6590     {
6591       if (is_a <gphi *> (stmt_info->stmt))
6592           {
6593             if (slp_node)
6594               {
6595                 /* We eventually need to set a vector type on invariant
6596                      arguments.  */
6597                 unsigned j;
6598                 slp_tree child;
6599                 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6600                     if (!vect_maybe_update_slp_op_vectype
6601                            (child, SLP_TREE_VECTYPE (slp_node)))
6602                       {
6603                         if (dump_enabled_p ())
6604                           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605                                                "incompatible vector types for "
6606                                                "invariants\n");
6607                         return false;
6608                       }
6609               }
6610             /* Analysis for double-reduction is done on the outer
6611                loop PHI, nested cycles have no further restrictions.  */
6612             STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6613           }
6614       else
6615           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6616       return true;
6617     }
6618 
6619   stmt_vec_info orig_stmt_of_analysis = stmt_info;
6620   stmt_vec_info phi_info = stmt_info;
6621   if (!is_a <gphi *> (stmt_info->stmt))
6622     {
6623       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6624       return true;
6625     }
6626   if (slp_node)
6627     {
6628       slp_node_instance->reduc_phis = slp_node;
6629       /* ???  We're leaving slp_node to point to the PHIs, we only
6630            need it to get at the number of vector stmts which wasn't
6631            yet initialized for the instance root.  */
6632     }
6633   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6634     stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6635   else
6636     {
6637       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6638                       == vect_double_reduction_def);
6639       use_operand_p use_p;
6640       gimple *use_stmt;
6641       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6642                                          &use_p, &use_stmt);
6643       gcc_assert (res);
6644       phi_info = loop_vinfo->lookup_stmt (use_stmt);
6645       stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6646     }
6647 
6648   /* PHIs should not participate in patterns.  */
6649   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6650   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6651 
6652   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6653      and compute the reduction chain length.  Discover the real
6654      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
6655   tree reduc_def
6656     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6657                                    loop_latch_edge
6658                                      (gimple_bb (reduc_def_phi)->loop_father));
6659   unsigned reduc_chain_length = 0;
6660   bool only_slp_reduc_chain = true;
6661   stmt_info = NULL;
6662   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6663   while (reduc_def != PHI_RESULT (reduc_def_phi))
6664     {
6665       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6666       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6667       if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6668           {
6669             if (dump_enabled_p ())
6670               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6671                                    "reduction chain broken by patterns.\n");
6672             return false;
6673           }
6674       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6675           only_slp_reduc_chain = false;
6676       /* For epilogue generation live members of the chain need
6677          to point back to the PHI via their original stmt for
6678            info_for_reduction to work.  For SLP we need to look at
6679            all lanes here - even though we only will vectorize from
6680            the SLP node with live lane zero the other live lanes also
6681            need to be identified as part of a reduction to be able
6682            to skip code generation for them.  */
6683       if (slp_for_stmt_info)
6684           {
6685             for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6686               if (STMT_VINFO_LIVE_P (s))
6687                 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6688           }
6689       else if (STMT_VINFO_LIVE_P (vdef))
6690           STMT_VINFO_REDUC_DEF (def) = phi_info;
6691       gimple_match_op op;
6692       if (!gimple_extract_op (vdef->stmt, &op))
6693           {
6694             if (dump_enabled_p ())
6695               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6696                                    "reduction chain includes unsupported"
6697                                    " statement type.\n");
6698             return false;
6699           }
6700       if (CONVERT_EXPR_CODE_P (op.code))
6701           {
6702             if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6703               {
6704                 if (dump_enabled_p ())
6705                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6706                                          "conversion in the reduction chain.\n");
6707                 return false;
6708               }
6709           }
6710       else if (!stmt_info)
6711           /* First non-conversion stmt.  */
6712           stmt_info = vdef;
6713       reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6714       reduc_chain_length++;
6715       if (!stmt_info && slp_node)
6716           slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6717     }
6718   /* PHIs should not participate in patterns.  */
6719   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6720 
6721   if (nested_in_vect_loop_p (loop, stmt_info))
6722     {
6723       loop = loop->inner;
6724       nested_cycle = true;
6725     }
6726 
6727   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6728      element.  */
6729   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6730     {
6731       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6732       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6733     }
6734   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6735     gcc_assert (slp_node
6736                     && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6737 
6738   /* 1. Is vectorizable reduction?  */
6739   /* Not supportable if the reduction variable is used in the loop, unless
6740      it's a reduction chain.  */
6741   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6742       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6743     return false;
6744 
6745   /* Reductions that are not used even in an enclosing outer-loop,
6746      are expected to be "live" (used out of the loop).  */
6747   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6748       && !STMT_VINFO_LIVE_P (stmt_info))
6749     return false;
6750 
6751   /* 2. Has this been recognized as a reduction pattern?
6752 
6753      Check if STMT represents a pattern that has been recognized
6754      in earlier analysis stages.  For stmts that represent a pattern,
6755      the STMT_VINFO_RELATED_STMT field records the last stmt in
6756      the original sequence that constitutes the pattern.  */
6757 
6758   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6759   if (orig_stmt_info)
6760     {
6761       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6762       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6763     }
6764 
6765   /* 3. Check the operands of the operation.  The first operands are defined
6766         inside the loop body. The last operand is the reduction variable,
6767         which is defined by the loop-header-phi.  */
6768 
6769   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6770   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6771   gimple_match_op op;
6772   if (!gimple_extract_op (stmt_info->stmt, &op))
6773     gcc_unreachable ();
6774   bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6775                                   || op.code == WIDEN_SUM_EXPR
6776                                   || op.code == SAD_EXPR);
6777   enum optab_subtype optab_query_kind = optab_vector;
6778   if (op.code == DOT_PROD_EXPR
6779       && (TYPE_SIGN (TREE_TYPE (op.ops[0]))
6780             != TYPE_SIGN (TREE_TYPE (op.ops[1]))))
6781     optab_query_kind = optab_vector_mixed_sign;
6782 
6783   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6784       && !SCALAR_FLOAT_TYPE_P (op.type))
6785     return false;
6786 
6787   /* Do not try to vectorize bit-precision reductions.  */
6788   if (!type_has_mode_precision_p (op.type))
6789     return false;
6790 
6791   /* For lane-reducing ops we're reducing the number of reduction PHIs
6792      which means the only use of that may be in the lane-reducing operation.  */
6793   if (lane_reduc_code_p
6794       && reduc_chain_length != 1
6795       && !only_slp_reduc_chain)
6796     {
6797       if (dump_enabled_p ())
6798           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799                                "lane-reducing reduction with extra stmts.\n");
6800       return false;
6801     }
6802 
6803   /* All uses but the last are expected to be defined in the loop.
6804      The last use is the reduction variable.  In case of nested cycle this
6805      assumption is not true: we use reduc_index to record the index of the
6806      reduction variable.  */
6807   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6808   /* We need to skip an extra operand for COND_EXPRs with embedded
6809      comparison.  */
6810   unsigned opno_adjust = 0;
6811   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6812     opno_adjust = 1;
6813   for (i = 0; i < (int) op.num_ops; i++)
6814     {
6815       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6816       if (i == 0 && op.code == COND_EXPR)
6817         continue;
6818 
6819       stmt_vec_info def_stmt_info;
6820       enum vect_def_type dt;
6821       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6822                                      i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6823                                      &vectype_op[i], &def_stmt_info))
6824           {
6825             if (dump_enabled_p ())
6826               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6827                                    "use not simple.\n");
6828             return false;
6829           }
6830       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6831           continue;
6832 
6833       /* There should be only one cycle def in the stmt, the one
6834          leading to reduc_def.  */
6835       if (VECTORIZABLE_CYCLE_DEF (dt))
6836           return false;
6837 
6838       if (!vectype_op[i])
6839           vectype_op[i]
6840             = get_vectype_for_scalar_type (loop_vinfo,
6841                                                    TREE_TYPE (op.ops[i]), slp_op[i]);
6842 
6843       /* To properly compute ncopies we are interested in the widest
6844            non-reduction input type in case we're looking at a widening
6845            accumulation that we later handle in vect_transform_reduction.  */
6846       if (lane_reduc_code_p
6847             && vectype_op[i]
6848             && (!vectype_in
6849                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6850                       < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
6851           vectype_in = vectype_op[i];
6852 
6853       /* Record how the non-reduction-def value of COND_EXPR is defined.
6854            ???  For a chain of multiple CONDs we'd have to match them up all.  */
6855       if (op.code == COND_EXPR && reduc_chain_length == 1)
6856           {
6857             if (dt == vect_constant_def)
6858               {
6859                 cond_reduc_dt = dt;
6860                 cond_reduc_val = op.ops[i];
6861               }
6862             else if (dt == vect_induction_def
6863                        && def_stmt_info
6864                        && is_nonwrapping_integer_induction (def_stmt_info, loop))
6865               {
6866                 cond_reduc_dt = dt;
6867                 cond_stmt_vinfo = def_stmt_info;
6868               }
6869           }
6870     }
6871   if (!vectype_in)
6872     vectype_in = STMT_VINFO_VECTYPE (phi_info);
6873   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6874 
6875   enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6876   STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6877   /* If we have a condition reduction, see if we can simplify it further.  */
6878   if (v_reduc_type == COND_REDUCTION)
6879     {
6880       if (slp_node)
6881           return false;
6882 
6883       /* When the condition uses the reduction value in the condition, fail.  */
6884       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6885           {
6886             if (dump_enabled_p ())
6887               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6888                                    "condition depends on previous iteration\n");
6889             return false;
6890           }
6891 
6892       if (reduc_chain_length == 1
6893             && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6894                                                        vectype_in, OPTIMIZE_FOR_SPEED))
6895           {
6896             if (dump_enabled_p ())
6897               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898                                    "optimizing condition reduction with"
6899                                    " FOLD_EXTRACT_LAST.\n");
6900             STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6901           }
6902       else if (cond_reduc_dt == vect_induction_def)
6903           {
6904             tree base
6905               = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6906             tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6907 
6908             gcc_assert (TREE_CODE (base) == INTEGER_CST
6909                           && TREE_CODE (step) == INTEGER_CST);
6910             cond_reduc_val = NULL_TREE;
6911             enum tree_code cond_reduc_op_code = ERROR_MARK;
6912             tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6913             if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6914               ;
6915             /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6916                above base; punt if base is the minimum value of the type for
6917                MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6918             else if (tree_int_cst_sgn (step) == -1)
6919               {
6920                 cond_reduc_op_code = MIN_EXPR;
6921                 if (tree_int_cst_sgn (base) == -1)
6922                     cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6923                 else if (tree_int_cst_lt (base,
6924                                                   TYPE_MAX_VALUE (TREE_TYPE (base))))
6925                     cond_reduc_val
6926                       = int_const_binop (PLUS_EXPR, base, integer_one_node);
6927               }
6928             else
6929               {
6930                 cond_reduc_op_code = MAX_EXPR;
6931                 if (tree_int_cst_sgn (base) == 1)
6932                     cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6933                 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6934                                                   base))
6935                     cond_reduc_val
6936                       = int_const_binop (MINUS_EXPR, base, integer_one_node);
6937               }
6938             if (cond_reduc_val)
6939               {
6940                 if (dump_enabled_p ())
6941                     dump_printf_loc (MSG_NOTE, vect_location,
6942                                          "condition expression based on "
6943                                          "integer induction.\n");
6944                 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6945                 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6946                     = cond_reduc_val;
6947                 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6948               }
6949           }
6950       else if (cond_reduc_dt == vect_constant_def)
6951           {
6952             enum vect_def_type cond_initial_dt;
6953             tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6954             vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6955             if (cond_initial_dt == vect_constant_def
6956                 && types_compatible_p (TREE_TYPE (cond_initial_val),
6957                                              TREE_TYPE (cond_reduc_val)))
6958               {
6959                 tree e = fold_binary (LE_EXPR, boolean_type_node,
6960                                             cond_initial_val, cond_reduc_val);
6961                 if (e && (integer_onep (e) || integer_zerop (e)))
6962                     {
6963                       if (dump_enabled_p ())
6964                         dump_printf_loc (MSG_NOTE, vect_location,
6965                                              "condition expression based on "
6966                                              "compile time constant.\n");
6967                       /* Record reduction code at analysis stage.  */
6968                       STMT_VINFO_REDUC_CODE (reduc_info)
6969                         = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6970                       STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6971                     }
6972               }
6973           }
6974     }
6975 
6976   if (STMT_VINFO_LIVE_P (phi_info))
6977     return false;
6978 
6979   if (slp_node)
6980     ncopies = 1;
6981   else
6982     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6983 
6984   gcc_assert (ncopies >= 1);
6985 
6986   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6987 
6988   if (nested_cycle)
6989     {
6990       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6991                       == vect_double_reduction_def);
6992       double_reduc = true;
6993     }
6994 
6995   /* 4.2. Check support for the epilog operation.
6996 
6997           If STMT represents a reduction pattern, then the type of the
6998           reduction variable may be different than the type of the rest
6999           of the arguments.  For example, consider the case of accumulation
7000           of shorts into an int accumulator; The original code:
7001                         S1: int_a = (int) short_a;
7002           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7003 
7004           was replaced with:
7005                         STMT: int_acc = widen_sum <short_a, int_acc>
7006 
7007           This means that:
7008           1. The tree-code that is used to create the vector operation in the
7009              epilog code (that reduces the partial results) is not the
7010              tree-code of STMT, but is rather the tree-code of the original
7011              stmt from the pattern that STMT is replacing.  I.e, in the example
7012              above we want to use 'widen_sum' in the loop, but 'plus' in the
7013              epilog.
7014           2. The type (mode) we use to check available target support
7015              for the vector operation to be created in the *epilog*, is
7016              determined by the type of the reduction variable (in the example
7017              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7018              However the type (mode) we use to check available target support
7019              for the vector operation to be created *inside the loop*, is
7020              determined by the type of the other arguments to STMT (in the
7021              example we'd check this: optab_handler (widen_sum_optab,
7022                vect_short_mode)).
7023 
7024           This is contrary to "regular" reductions, in which the types of all
7025           the arguments are the same as the type of the reduction variable.
7026           For "regular" reductions we can therefore use the same vector type
7027           (and also the same tree-code) when generating the epilog code and
7028           when generating the code inside the loop.  */
7029 
7030   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7031   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7032 
7033   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7034   if (reduction_type == TREE_CODE_REDUCTION)
7035     {
7036       /* Check whether it's ok to change the order of the computation.
7037            Generally, when vectorizing a reduction we change the order of the
7038            computation.  This may change the behavior of the program in some
7039            cases, so we need to check that this is ok.  One exception is when
7040            vectorizing an outer-loop: the inner-loop is executed sequentially,
7041            and therefore vectorizing reductions in the inner-loop during
7042            outer-loop vectorization is safe.  Likewise when we are vectorizing
7043            a series of reductions using SLP and the VF is one the reductions
7044            are performed in scalar order.  */
7045       if (slp_node
7046             && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7047             && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7048           ;
7049       else if (needs_fold_left_reduction_p (op.type, orig_code))
7050           {
7051             /* When vectorizing a reduction chain w/o SLP the reduction PHI
7052                is not directy used in stmt.  */
7053             if (!only_slp_reduc_chain
7054                 && reduc_chain_length != 1)
7055               {
7056                 if (dump_enabled_p ())
7057                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7058                                          "in-order reduction chain without SLP.\n");
7059                 return false;
7060               }
7061             STMT_VINFO_REDUC_TYPE (reduc_info)
7062               = reduction_type = FOLD_LEFT_REDUCTION;
7063           }
7064       else if (!commutative_binary_op_p (orig_code, op.type)
7065                  || !associative_binary_op_p (orig_code, op.type))
7066           {
7067             if (dump_enabled_p ())
7068               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7069                                   "reduction: not commutative/associative");
7070             return false;
7071           }
7072     }
7073 
7074   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7075       && ncopies > 1)
7076     {
7077       if (dump_enabled_p ())
7078           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7079                                "multiple types in double reduction or condition "
7080                                "reduction or fold-left reduction.\n");
7081       return false;
7082     }
7083 
7084   internal_fn reduc_fn = IFN_LAST;
7085   if (reduction_type == TREE_CODE_REDUCTION
7086       || reduction_type == FOLD_LEFT_REDUCTION
7087       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7088       || reduction_type == CONST_COND_REDUCTION)
7089     {
7090       if (reduction_type == FOLD_LEFT_REDUCTION
7091             ? fold_left_reduction_fn (orig_code, &reduc_fn)
7092             : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7093           {
7094             if (reduc_fn != IFN_LAST
7095                 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7096                                                               OPTIMIZE_FOR_SPEED))
7097               {
7098                 if (dump_enabled_p ())
7099                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7100                                          "reduc op not supported by target.\n");
7101 
7102                 reduc_fn = IFN_LAST;
7103               }
7104           }
7105       else
7106           {
7107             if (!nested_cycle || double_reduc)
7108               {
7109                 if (dump_enabled_p ())
7110                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111                                          "no reduc code for scalar code.\n");
7112 
7113                 return false;
7114               }
7115           }
7116     }
7117   else if (reduction_type == COND_REDUCTION)
7118     {
7119       int scalar_precision
7120           = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7121       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7122       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7123                                                             vectype_out);
7124 
7125       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7126                                                     OPTIMIZE_FOR_SPEED))
7127           reduc_fn = IFN_REDUC_MAX;
7128     }
7129   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7130 
7131   if (reduction_type != EXTRACT_LAST_REDUCTION
7132       && (!nested_cycle || double_reduc)
7133       && reduc_fn == IFN_LAST
7134       && !nunits_out.is_constant ())
7135     {
7136       if (dump_enabled_p ())
7137           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7138                                "missing target support for reduction on"
7139                                " variable-length vectors.\n");
7140       return false;
7141     }
7142 
7143   /* For SLP reductions, see if there is a neutral value we can use.  */
7144   tree neutral_op = NULL_TREE;
7145   if (slp_node)
7146     {
7147       tree initial_value = NULL_TREE;
7148       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7149           initial_value = vect_phi_initial_value (reduc_def_phi);
7150       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7151                                                        orig_code, initial_value);
7152     }
7153 
7154   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7155     {
7156       /* We can't support in-order reductions of code such as this:
7157 
7158              for (int i = 0; i < n1; ++i)
7159                for (int j = 0; j < n2; ++j)
7160                  l += a[j];
7161 
7162            since GCC effectively transforms the loop when vectorizing:
7163 
7164              for (int i = 0; i < n1 / VF; ++i)
7165                for (int j = 0; j < n2; ++j)
7166                  for (int k = 0; k < VF; ++k)
7167                      l += a[j];
7168 
7169            which is a reassociation of the original operation.  */
7170       if (dump_enabled_p ())
7171           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7172                                "in-order double reduction not supported.\n");
7173 
7174       return false;
7175     }
7176 
7177   if (reduction_type == FOLD_LEFT_REDUCTION
7178       && slp_node
7179       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7180     {
7181       /* We cannot use in-order reductions in this case because there is
7182            an implicit reassociation of the operations involved.  */
7183       if (dump_enabled_p ())
7184           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7185                                "in-order unchained SLP reductions not supported.\n");
7186       return false;
7187     }
7188 
7189   /* For double reductions, and for SLP reductions with a neutral value,
7190      we construct a variable-length initial vector by loading a vector
7191      full of the neutral value and then shift-and-inserting the start
7192      values into the low-numbered elements.  */
7193   if ((double_reduc || neutral_op)
7194       && !nunits_out.is_constant ()
7195       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7196                                                     vectype_out, OPTIMIZE_FOR_SPEED))
7197     {
7198       if (dump_enabled_p ())
7199           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7200                                "reduction on variable-length vectors requires"
7201                                " target support for a vector-shift-and-insert"
7202                                " operation.\n");
7203       return false;
7204     }
7205 
7206   /* Check extra constraints for variable-length unchained SLP reductions.  */
7207   if (slp_node
7208       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7209       && !nunits_out.is_constant ())
7210     {
7211       /* We checked above that we could build the initial vector when
7212            there's a neutral element value.  Check here for the case in
7213            which each SLP statement has its own initial value and in which
7214            that value needs to be repeated for every instance of the
7215            statement within the initial vector.  */
7216       unsigned int group_size = SLP_TREE_LANES (slp_node);
7217       if (!neutral_op
7218             && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7219                                                         TREE_TYPE (vectype_out)))
7220           {
7221             if (dump_enabled_p ())
7222               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7223                                    "unsupported form of SLP reduction for"
7224                                    " variable-length vectors: cannot build"
7225                                    " initial vector.\n");
7226             return false;
7227           }
7228       /* The epilogue code relies on the number of elements being a multiple
7229            of the group size.  The duplicate-and-interleave approach to setting
7230            up the initial vector does too.  */
7231       if (!multiple_p (nunits_out, group_size))
7232           {
7233             if (dump_enabled_p ())
7234               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7235                                    "unsupported form of SLP reduction for"
7236                                    " variable-length vectors: the vector size"
7237                                    " is not a multiple of the number of results.\n");
7238             return false;
7239           }
7240     }
7241 
7242   if (reduction_type == COND_REDUCTION)
7243     {
7244       widest_int ni;
7245 
7246       if (! max_loop_iterations (loop, &ni))
7247           {
7248             if (dump_enabled_p ())
7249               dump_printf_loc (MSG_NOTE, vect_location,
7250                                    "loop count not known, cannot create cond "
7251                                    "reduction.\n");
7252             return false;
7253           }
7254       /* Convert backedges to iterations.  */
7255       ni += 1;
7256 
7257       /* The additional index will be the same type as the condition.  Check
7258            that the loop can fit into this less one (because we'll use up the
7259            zero slot for when there are no matches).  */
7260       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7261       if (wi::geu_p (ni, wi::to_widest (max_index)))
7262           {
7263             if (dump_enabled_p ())
7264               dump_printf_loc (MSG_NOTE, vect_location,
7265                                    "loop size is greater than data size.\n");
7266             return false;
7267           }
7268     }
7269 
7270   /* In case the vectorization factor (VF) is bigger than the number
7271      of elements that we can fit in a vectype (nunits), we have to generate
7272      more than one vector stmt - i.e - we need to "unroll" the
7273      vector stmt by a factor VF/nunits.  For more details see documentation
7274      in vectorizable_operation.  */
7275 
7276   /* If the reduction is used in an outer loop we need to generate
7277      VF intermediate results, like so (e.g. for ncopies=2):
7278           r0 = phi (init, r0)
7279           r1 = phi (init, r1)
7280           r0 = x0 + r0;
7281         r1 = x1 + r1;
7282     (i.e. we generate VF results in 2 registers).
7283     In this case we have a separate def-use cycle for each copy, and therefore
7284     for each copy we get the vector def for the reduction variable from the
7285     respective phi node created for this copy.
7286 
7287     Otherwise (the reduction is unused in the loop nest), we can combine
7288     together intermediate results, like so (e.g. for ncopies=2):
7289           r = phi (init, r)
7290           r = x0 + r;
7291           r = x1 + r;
7292    (i.e. we generate VF/2 results in a single register).
7293    In this case for each copy we get the vector def for the reduction variable
7294    from the vectorized reduction operation generated in the previous iteration.
7295 
7296    This only works when we see both the reduction PHI and its only consumer
7297    in vectorizable_reduction and there are no intermediate stmts
7298    participating.  When unrolling we want each unrolled iteration to have its
7299    own reduction accumulator since one of the main goals of unrolling a
7300    reduction is to reduce the aggregate loop-carried latency.  */
7301   if (ncopies > 1
7302       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7303       && reduc_chain_length == 1
7304       && loop_vinfo->suggested_unroll_factor == 1)
7305     single_defuse_cycle = true;
7306 
7307   if (single_defuse_cycle || lane_reduc_code_p)
7308     {
7309       gcc_assert (op.code != COND_EXPR);
7310 
7311       /* 4. Supportable by target?  */
7312       bool ok = true;
7313 
7314       /* 4.1. check support for the operation in the loop  */
7315       machine_mode vec_mode = TYPE_MODE (vectype_in);
7316       if (!directly_supported_p (op.code, vectype_in, optab_query_kind))
7317         {
7318           if (dump_enabled_p ())
7319             dump_printf (MSG_NOTE, "op not supported by target.\n");
7320             if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7321                 || !vect_can_vectorize_without_simd_p (op.code))
7322               ok = false;
7323             else
7324               if (dump_enabled_p ())
7325                 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7326         }
7327 
7328       if (vect_emulated_vector_p (vectype_in)
7329             && !vect_can_vectorize_without_simd_p (op.code))
7330           {
7331             if (dump_enabled_p ())
7332               dump_printf (MSG_NOTE, "using word mode not possible.\n");
7333             return false;
7334           }
7335 
7336       /* lane-reducing operations have to go through vect_transform_reduction.
7337          For the other cases try without the single cycle optimization.  */
7338       if (!ok)
7339           {
7340             if (lane_reduc_code_p)
7341               return false;
7342             else
7343               single_defuse_cycle = false;
7344           }
7345     }
7346   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7347 
7348   /* If the reduction stmt is one of the patterns that have lane
7349      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7350   if ((ncopies > 1 && ! single_defuse_cycle)
7351       && lane_reduc_code_p)
7352     {
7353       if (dump_enabled_p ())
7354           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7355                                "multi def-use cycle not possible for lane-reducing "
7356                                "reduction operation\n");
7357       return false;
7358     }
7359 
7360   if (slp_node
7361       && !(!single_defuse_cycle
7362              && !lane_reduc_code_p
7363              && reduction_type != FOLD_LEFT_REDUCTION))
7364     for (i = 0; i < (int) op.num_ops; i++)
7365       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7366           {
7367             if (dump_enabled_p ())
7368               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7369                                    "incompatible vector types for invariants\n");
7370             return false;
7371           }
7372 
7373   if (slp_node)
7374     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7375   else
7376     vec_num = 1;
7377 
7378   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7379                                    reduction_type, ncopies, cost_vec);
7380   /* Cost the reduction op inside the loop if transformed via
7381      vect_transform_reduction.  Otherwise this is costed by the
7382      separate vectorizable_* routines.  */
7383   if (single_defuse_cycle || lane_reduc_code_p)
7384     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7385 
7386   if (dump_enabled_p ()
7387       && reduction_type == FOLD_LEFT_REDUCTION)
7388     dump_printf_loc (MSG_NOTE, vect_location,
7389                          "using an in-order (fold-left) reduction.\n");
7390   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7391   /* All but single defuse-cycle optimized, lane-reducing and fold-left
7392      reductions go through their own vectorizable_* routines.  */
7393   if (!single_defuse_cycle
7394       && !lane_reduc_code_p
7395       && reduction_type != FOLD_LEFT_REDUCTION)
7396     {
7397       stmt_vec_info tem
7398           = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7399       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7400           {
7401             gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7402             tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7403           }
7404       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7405       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7406     }
7407   else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7408     {
7409       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7410       internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7411 
7412       if (reduction_type != FOLD_LEFT_REDUCTION
7413             && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7414             && (cond_fn == IFN_LAST
7415                 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7416                                                               OPTIMIZE_FOR_SPEED)))
7417           {
7418             if (dump_enabled_p ())
7419               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7420                                    "can't operate on partial vectors because"
7421                                    " no conditional operation is available.\n");
7422             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7423           }
7424       else if (reduction_type == FOLD_LEFT_REDUCTION
7425                  && reduc_fn == IFN_LAST
7426                  && !expand_vec_cond_expr_p (vectype_in,
7427                                                      truth_type_for (vectype_in),
7428                                                      SSA_NAME))
7429           {
7430             if (dump_enabled_p ())
7431               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7432                                    "can't operate on partial vectors because"
7433                                    " no conditional operation is available.\n");
7434             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7435           }
7436       else
7437           vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7438                                      vectype_in, NULL);
7439     }
7440   return true;
7441 }
7442 
7443 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7444    value.  */
7445 
7446 bool
vect_transform_reduction(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node)7447 vect_transform_reduction (loop_vec_info loop_vinfo,
7448                                 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7449                                 gimple **vec_stmt, slp_tree slp_node)
7450 {
7451   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7452   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7453   int i;
7454   int ncopies;
7455   int vec_num;
7456 
7457   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7458   gcc_assert (reduc_info->is_reduc_info);
7459 
7460   if (nested_in_vect_loop_p (loop, stmt_info))
7461     {
7462       loop = loop->inner;
7463       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7464     }
7465 
7466   gimple_match_op op;
7467   if (!gimple_extract_op (stmt_info->stmt, &op))
7468     gcc_unreachable ();
7469 
7470   /* All uses but the last are expected to be defined in the loop.
7471      The last use is the reduction variable.  In case of nested cycle this
7472      assumption is not true: we use reduc_index to record the index of the
7473      reduction variable.  */
7474   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7475   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7476   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7477   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7478 
7479   if (slp_node)
7480     {
7481       ncopies = 1;
7482       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7483     }
7484   else
7485     {
7486       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7487       vec_num = 1;
7488     }
7489 
7490   code_helper code = canonicalize_code (op.code, op.type);
7491   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7492   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7493   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7494 
7495   /* Transform.  */
7496   tree new_temp = NULL_TREE;
7497   auto_vec<tree> vec_oprnds0;
7498   auto_vec<tree> vec_oprnds1;
7499   auto_vec<tree> vec_oprnds2;
7500   tree def0;
7501 
7502   if (dump_enabled_p ())
7503     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7504 
7505   /* FORNOW: Multiple types are not supported for condition.  */
7506   if (code == COND_EXPR)
7507     gcc_assert (ncopies == 1);
7508 
7509   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7510 
7511   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7512   if (reduction_type == FOLD_LEFT_REDUCTION)
7513     {
7514       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7515       gcc_assert (code.is_tree_code ());
7516       return vectorize_fold_left_reduction
7517             (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7518              tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7519     }
7520 
7521   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7522   gcc_assert (single_defuse_cycle
7523                 || code == DOT_PROD_EXPR
7524                 || code == WIDEN_SUM_EXPR
7525                 || code == SAD_EXPR);
7526 
7527   /* Create the destination vector  */
7528   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7529   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7530 
7531   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7532                          single_defuse_cycle && reduc_index == 0
7533                          ? NULL_TREE : op.ops[0], &vec_oprnds0,
7534                          single_defuse_cycle && reduc_index == 1
7535                          ? NULL_TREE : op.ops[1], &vec_oprnds1,
7536                          op.num_ops == 3
7537                          && !(single_defuse_cycle && reduc_index == 2)
7538                          ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7539   if (single_defuse_cycle)
7540     {
7541       gcc_assert (!slp_node);
7542       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7543                                              op.ops[reduc_index],
7544                                              reduc_index == 0 ? &vec_oprnds0
7545                                              : (reduc_index == 1 ? &vec_oprnds1
7546                                                   : &vec_oprnds2));
7547     }
7548 
7549   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7550     {
7551       gimple *new_stmt;
7552       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7553       if (masked_loop_p && !mask_by_cond_expr)
7554           {
7555             /* Make sure that the reduction accumulator is vop[0].  */
7556             if (reduc_index == 1)
7557               {
7558                 gcc_assert (commutative_binary_op_p (code, op.type));
7559                 std::swap (vop[0], vop[1]);
7560               }
7561             tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7562                                                     vectype_in, i);
7563             gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7564                                                                 vop[0], vop[1], vop[0]);
7565             new_temp = make_ssa_name (vec_dest, call);
7566             gimple_call_set_lhs (call, new_temp);
7567             gimple_call_set_nothrow (call, true);
7568             vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7569             new_stmt = call;
7570           }
7571       else
7572           {
7573             if (op.num_ops == 3)
7574               vop[2] = vec_oprnds2[i];
7575 
7576             if (masked_loop_p && mask_by_cond_expr)
7577               {
7578                 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7579                                                         vectype_in, i);
7580                 build_vect_cond_expr (code, vop, mask, gsi);
7581               }
7582 
7583             if (code.is_internal_fn ())
7584               new_stmt = gimple_build_call_internal (internal_fn (code),
7585                                                                op.num_ops,
7586                                                                vop[0], vop[1], vop[2]);
7587             else
7588               new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7589                                                       vop[0], vop[1], vop[2]);
7590             new_temp = make_ssa_name (vec_dest, new_stmt);
7591             gimple_set_lhs (new_stmt, new_temp);
7592             vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7593           }
7594 
7595       if (slp_node)
7596           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7597       else if (single_defuse_cycle
7598                  && i < ncopies - 1)
7599           {
7600             if (reduc_index == 0)
7601               vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7602             else if (reduc_index == 1)
7603               vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7604             else if (reduc_index == 2)
7605               vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7606           }
7607       else
7608           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7609     }
7610 
7611   if (!slp_node)
7612     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7613 
7614   return true;
7615 }
7616 
7617 /* Transform phase of a cycle PHI.  */
7618 
7619 bool
vect_transform_cycle_phi(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,gimple ** vec_stmt,slp_tree slp_node,slp_instance slp_node_instance)7620 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7621                                 stmt_vec_info stmt_info, gimple **vec_stmt,
7622                                 slp_tree slp_node, slp_instance slp_node_instance)
7623 {
7624   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7625   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7626   int i;
7627   int ncopies;
7628   int j;
7629   bool nested_cycle = false;
7630   int vec_num;
7631 
7632   if (nested_in_vect_loop_p (loop, stmt_info))
7633     {
7634       loop = loop->inner;
7635       nested_cycle = true;
7636     }
7637 
7638   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7639   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7640   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7641   gcc_assert (reduc_info->is_reduc_info);
7642 
7643   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7644       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7645     /* Leave the scalar phi in place.  */
7646     return true;
7647 
7648   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7649   /* For a nested cycle we do not fill the above.  */
7650   if (!vectype_in)
7651     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7652   gcc_assert (vectype_in);
7653 
7654   if (slp_node)
7655     {
7656       /* The size vect_schedule_slp_instance computes is off for us.  */
7657       vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7658                                               * SLP_TREE_LANES (slp_node), vectype_in);
7659       ncopies = 1;
7660     }
7661   else
7662     {
7663       vec_num = 1;
7664       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7665     }
7666 
7667   /* Check whether we should use a single PHI node and accumulate
7668      vectors to one before the backedge.  */
7669   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7670     ncopies = 1;
7671 
7672   /* Create the destination vector  */
7673   gphi *phi = as_a <gphi *> (stmt_info->stmt);
7674   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7675                                                          vectype_out);
7676 
7677   /* Get the loop-entry arguments.  */
7678   tree vec_initial_def = NULL_TREE;
7679   auto_vec<tree> vec_initial_defs;
7680   if (slp_node)
7681     {
7682       vec_initial_defs.reserve (vec_num);
7683       if (nested_cycle)
7684           {
7685             unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7686             vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7687                                    &vec_initial_defs);
7688           }
7689       else
7690           {
7691             gcc_assert (slp_node == slp_node_instance->reduc_phis);
7692             vec<tree> &initial_values = reduc_info->reduc_initial_values;
7693             vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7694 
7695             unsigned int num_phis = stmts.length ();
7696             if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7697               num_phis = 1;
7698             initial_values.reserve (num_phis);
7699             for (unsigned int i = 0; i < num_phis; ++i)
7700               {
7701                 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7702                 initial_values.quick_push (vect_phi_initial_value (this_phi));
7703               }
7704             if (vec_num == 1)
7705               vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7706             if (!initial_values.is_empty ())
7707               {
7708                 tree initial_value
7709                     = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7710                 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7711                 tree neutral_op
7712                     = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7713                                                       code, initial_value);
7714                 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7715                                                         &vec_initial_defs, vec_num,
7716                                                         stmts.length (), neutral_op);
7717               }
7718           }
7719     }
7720   else
7721     {
7722       /* Get at the scalar def before the loop, that defines the initial
7723            value of the reduction variable.  */
7724       tree initial_def = vect_phi_initial_value (phi);
7725       reduc_info->reduc_initial_values.safe_push (initial_def);
7726       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7727            and we can't use zero for induc_val, use initial_def.  Similarly
7728            for REDUC_MIN and initial_def larger than the base.  */
7729       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7730           {
7731             tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7732             if (TREE_CODE (initial_def) == INTEGER_CST
7733                 && !integer_zerop (induc_val)
7734                 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7735                        && tree_int_cst_lt (initial_def, induc_val))
7736                       || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7737                           && tree_int_cst_lt (induc_val, initial_def))))
7738               {
7739                 induc_val = initial_def;
7740                 /* Communicate we used the initial_def to epilouge
7741                      generation.  */
7742                 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7743               }
7744             vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7745           }
7746       else if (nested_cycle)
7747           {
7748             /* Do not use an adjustment def as that case is not supported
7749                correctly if ncopies is not one.  */
7750             vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7751                                                    ncopies, initial_def,
7752                                                    &vec_initial_defs);
7753           }
7754       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7755                  || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7756           /* Fill the initial vector with the initial scalar value.  */
7757           vec_initial_def
7758             = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7759                                                      initial_def, initial_def);
7760       else
7761           {
7762             if (ncopies == 1)
7763               vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7764             if (!reduc_info->reduc_initial_values.is_empty ())
7765               {
7766                 initial_def = reduc_info->reduc_initial_values[0];
7767                 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7768                 tree neutral_op
7769                     = neutral_op_for_reduction (TREE_TYPE (initial_def),
7770                                                       code, initial_def);
7771                 gcc_assert (neutral_op);
7772                 /* Try to simplify the vector initialization by applying an
7773                      adjustment after the reduction has been performed.  */
7774                 if (!reduc_info->reused_accumulator
7775                       && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7776                       && !operand_equal_p (neutral_op, initial_def))
7777                     {
7778                       STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7779                         = initial_def;
7780                       initial_def = neutral_op;
7781                     }
7782                 vec_initial_def
7783                     = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7784                                                              initial_def, neutral_op);
7785               }
7786           }
7787     }
7788 
7789   if (vec_initial_def)
7790     {
7791       vec_initial_defs.create (ncopies);
7792       for (i = 0; i < ncopies; ++i)
7793           vec_initial_defs.quick_push (vec_initial_def);
7794     }
7795 
7796   if (auto *accumulator = reduc_info->reused_accumulator)
7797     {
7798       tree def = accumulator->reduc_input;
7799       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7800           {
7801             unsigned int nreduc;
7802             bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7803                                                       (TREE_TYPE (def)),
7804                                                     TYPE_VECTOR_SUBPARTS (vectype_out),
7805                                                     &nreduc);
7806             gcc_assert (res);
7807             gimple_seq stmts = NULL;
7808             /* Reduce the single vector to a smaller one.  */
7809             if (nreduc != 1)
7810               {
7811                 /* Perform the reduction in the appropriate type.  */
7812                 tree rvectype = vectype_out;
7813                 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7814                                                         TREE_TYPE (TREE_TYPE (def))))
7815                     rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7816                                                         TYPE_VECTOR_SUBPARTS
7817                                                             (vectype_out));
7818                 def = vect_create_partial_epilog (def, rvectype,
7819                                                             STMT_VINFO_REDUC_CODE
7820                                                               (reduc_info),
7821                                                             &stmts);
7822               }
7823             /* The epilogue loop might use a different vector mode, like
7824                VNx2DI vs. V2DI.  */
7825             if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7826               {
7827                 tree reduc_type = build_vector_type_for_mode
7828                     (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7829                 def = gimple_convert (&stmts, reduc_type, def);
7830               }
7831             /* Adjust the input so we pick up the partially reduced value
7832                for the skip edge in vect_create_epilog_for_reduction.  */
7833             accumulator->reduc_input = def;
7834             /* And the reduction could be carried out using a different sign.  */
7835             if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7836               def = gimple_convert (&stmts, vectype_out, def);
7837             if (loop_vinfo->main_loop_edge)
7838               {
7839                 /* While we'd like to insert on the edge this will split
7840                      blocks and disturb bookkeeping, we also will eventually
7841                      need this on the skip edge.  Rely on sinking to
7842                      fixup optimal placement and insert in the pred.  */
7843                 gimple_stmt_iterator gsi
7844                     = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7845                 /* Insert before a cond that eventually skips the
7846                      epilogue.  */
7847                 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7848                     gsi_prev (&gsi);
7849                 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7850               }
7851             else
7852               gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7853                                                         stmts);
7854           }
7855       if (loop_vinfo->main_loop_edge)
7856           vec_initial_defs[0]
7857             = vect_get_main_loop_result (loop_vinfo, def,
7858                                                vec_initial_defs[0]);
7859       else
7860           vec_initial_defs.safe_push (def);
7861     }
7862 
7863   /* Generate the reduction PHIs upfront.  */
7864   for (i = 0; i < vec_num; i++)
7865     {
7866       tree vec_init_def = vec_initial_defs[i];
7867       for (j = 0; j < ncopies; j++)
7868           {
7869             /* Create the reduction-phi that defines the reduction
7870                operand.  */
7871             gphi *new_phi = create_phi_node (vec_dest, loop->header);
7872 
7873             /* Set the loop-entry arg of the reduction-phi.  */
7874             if (j != 0 && nested_cycle)
7875               vec_init_def = vec_initial_defs[j];
7876             add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7877                            UNKNOWN_LOCATION);
7878 
7879             /* The loop-latch arg is set in epilogue processing.  */
7880 
7881             if (slp_node)
7882               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7883             else
7884               {
7885                 if (j == 0)
7886                     *vec_stmt = new_phi;
7887                 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7888               }
7889           }
7890     }
7891 
7892   return true;
7893 }
7894 
7895 /* Vectorizes LC PHIs.  */
7896 
7897 bool
vectorizable_lc_phi(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,gimple ** vec_stmt,slp_tree slp_node)7898 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7899                          stmt_vec_info stmt_info, gimple **vec_stmt,
7900                          slp_tree slp_node)
7901 {
7902   if (!loop_vinfo
7903       || !is_a <gphi *> (stmt_info->stmt)
7904       || gimple_phi_num_args (stmt_info->stmt) != 1)
7905     return false;
7906 
7907   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7908       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7909     return false;
7910 
7911   if (!vec_stmt) /* transformation not required.  */
7912     {
7913       /* Deal with copies from externs or constants that disguise as
7914            loop-closed PHI nodes (PR97886).  */
7915       if (slp_node
7916             && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7917                                                             SLP_TREE_VECTYPE (slp_node)))
7918           {
7919             if (dump_enabled_p ())
7920               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7921                                    "incompatible vector types for invariants\n");
7922             return false;
7923           }
7924       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7925       return true;
7926     }
7927 
7928   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7929   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7930   basic_block bb = gimple_bb (stmt_info->stmt);
7931   edge e = single_pred_edge (bb);
7932   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7933   auto_vec<tree> vec_oprnds;
7934   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7935                          !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7936                          gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7937   for (unsigned i = 0; i < vec_oprnds.length (); i++)
7938     {
7939       /* Create the vectorized LC PHI node.  */
7940       gphi *new_phi = create_phi_node (vec_dest, bb);
7941       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7942       if (slp_node)
7943           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7944       else
7945           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7946     }
7947   if (!slp_node)
7948     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7949 
7950   return true;
7951 }
7952 
7953 /* Vectorizes PHIs.  */
7954 
7955 bool
vectorizable_phi(vec_info *,stmt_vec_info stmt_info,gimple ** vec_stmt,slp_tree slp_node,stmt_vector_for_cost * cost_vec)7956 vectorizable_phi (vec_info *,
7957                       stmt_vec_info stmt_info, gimple **vec_stmt,
7958                       slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7959 {
7960   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7961     return false;
7962 
7963   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7964     return false;
7965 
7966   tree vectype = SLP_TREE_VECTYPE (slp_node);
7967 
7968   if (!vec_stmt) /* transformation not required.  */
7969     {
7970       slp_tree child;
7971       unsigned i;
7972       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7973           if (!child)
7974             {
7975               if (dump_enabled_p ())
7976                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977                                      "PHI node with unvectorized backedge def\n");
7978               return false;
7979             }
7980           else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7981             {
7982               if (dump_enabled_p ())
7983                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7984                                      "incompatible vector types for invariants\n");
7985               return false;
7986             }
7987           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7988                      && !useless_type_conversion_p (vectype,
7989                                                             SLP_TREE_VECTYPE (child)))
7990             {
7991               /* With bools we can have mask and non-mask precision vectors
7992                  or different non-mask precisions.  while pattern recog is
7993                  supposed to guarantee consistency here bugs in it can cause
7994                  mismatches (PR103489 and PR103800 for example).
7995                  Deal with them here instead of ICEing later.  */
7996               if (dump_enabled_p ())
7997                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7998                                      "incompatible vector type setup from "
7999                                      "bool pattern detection\n");
8000               return false;
8001             }
8002 
8003       /* For single-argument PHIs assume coalescing which means zero cost
8004            for the scalar and the vector PHIs.  This avoids artificially
8005            favoring the vector path (but may pessimize it in some cases).  */
8006       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8007           record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8008                                 vector_stmt, stmt_info, vectype, 0, vect_body);
8009       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8010       return true;
8011     }
8012 
8013   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8014   basic_block bb = gimple_bb (stmt_info->stmt);
8015   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8016   auto_vec<gphi *> new_phis;
8017   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8018     {
8019       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8020 
8021       /* Skip not yet vectorized defs.  */
8022       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8023             && SLP_TREE_VEC_STMTS (child).is_empty ())
8024           continue;
8025 
8026       auto_vec<tree> vec_oprnds;
8027       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8028       if (!new_phis.exists ())
8029           {
8030             new_phis.create (vec_oprnds.length ());
8031             for (unsigned j = 0; j < vec_oprnds.length (); j++)
8032               {
8033                 /* Create the vectorized LC PHI node.  */
8034                 new_phis.quick_push (create_phi_node (vec_dest, bb));
8035                 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8036               }
8037           }
8038       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8039       for (unsigned j = 0; j < vec_oprnds.length (); j++)
8040           add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8041     }
8042   /* We should have at least one already vectorized child.  */
8043   gcc_assert (new_phis.exists ());
8044 
8045   return true;
8046 }
8047 
8048 /* Return true if VECTYPE represents a vector that requires lowering
8049    by the vector lowering pass.  */
8050 
8051 bool
vect_emulated_vector_p(tree vectype)8052 vect_emulated_vector_p (tree vectype)
8053 {
8054   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8055             && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8056                 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8057 }
8058 
8059 /* Return true if we can emulate CODE on an integer mode representation
8060    of a vector.  */
8061 
8062 bool
vect_can_vectorize_without_simd_p(tree_code code)8063 vect_can_vectorize_without_simd_p (tree_code code)
8064 {
8065   switch (code)
8066     {
8067     case PLUS_EXPR:
8068     case MINUS_EXPR:
8069     case NEGATE_EXPR:
8070     case BIT_AND_EXPR:
8071     case BIT_IOR_EXPR:
8072     case BIT_XOR_EXPR:
8073     case BIT_NOT_EXPR:
8074       return true;
8075 
8076     default:
8077       return false;
8078     }
8079 }
8080 
8081 /* Likewise, but taking a code_helper.  */
8082 
8083 bool
vect_can_vectorize_without_simd_p(code_helper code)8084 vect_can_vectorize_without_simd_p (code_helper code)
8085 {
8086   return (code.is_tree_code ()
8087             && vect_can_vectorize_without_simd_p (tree_code (code)));
8088 }
8089 
8090 /* Function vectorizable_induction
8091 
8092    Check if STMT_INFO performs an induction computation that can be vectorized.
8093    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8094    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8095    Return true if STMT_INFO is vectorizable in this way.  */
8096 
8097 bool
vectorizable_induction(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,gimple ** vec_stmt,slp_tree slp_node,stmt_vector_for_cost * cost_vec)8098 vectorizable_induction (loop_vec_info loop_vinfo,
8099                               stmt_vec_info stmt_info,
8100                               gimple **vec_stmt, slp_tree slp_node,
8101                               stmt_vector_for_cost *cost_vec)
8102 {
8103   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8104   unsigned ncopies;
8105   bool nested_in_vect_loop = false;
8106   class loop *iv_loop;
8107   tree vec_def;
8108   edge pe = loop_preheader_edge (loop);
8109   basic_block new_bb;
8110   tree new_vec, vec_init, vec_step, t;
8111   tree new_name;
8112   gimple *new_stmt;
8113   gphi *induction_phi;
8114   tree induc_def, vec_dest;
8115   tree init_expr, step_expr;
8116   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8117   unsigned i;
8118   tree expr;
8119   gimple_stmt_iterator si;
8120 
8121   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8122   if (!phi)
8123     return false;
8124 
8125   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8126     return false;
8127 
8128   /* Make sure it was recognized as induction computation.  */
8129   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8130     return false;
8131 
8132   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8133   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8134 
8135   if (slp_node)
8136     ncopies = 1;
8137   else
8138     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8139   gcc_assert (ncopies >= 1);
8140 
8141   /* FORNOW. These restrictions should be relaxed.  */
8142   if (nested_in_vect_loop_p (loop, stmt_info))
8143     {
8144       imm_use_iterator imm_iter;
8145       use_operand_p use_p;
8146       gimple *exit_phi;
8147       edge latch_e;
8148       tree loop_arg;
8149 
8150       if (ncopies > 1)
8151           {
8152             if (dump_enabled_p ())
8153               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8154                                    "multiple types in nested loop.\n");
8155             return false;
8156           }
8157 
8158       exit_phi = NULL;
8159       latch_e = loop_latch_edge (loop->inner);
8160       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8161       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8162           {
8163             gimple *use_stmt = USE_STMT (use_p);
8164             if (is_gimple_debug (use_stmt))
8165               continue;
8166 
8167             if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
8168               {
8169                 exit_phi = use_stmt;
8170                 break;
8171               }
8172           }
8173       if (exit_phi)
8174           {
8175             stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8176             if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8177                     && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8178               {
8179                 if (dump_enabled_p ())
8180                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8181                                          "inner-loop induction only used outside "
8182                                          "of the outer vectorized loop.\n");
8183                 return false;
8184               }
8185           }
8186 
8187       nested_in_vect_loop = true;
8188       iv_loop = loop->inner;
8189     }
8190   else
8191     iv_loop = loop;
8192   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8193 
8194   if (slp_node && !nunits.is_constant ())
8195     {
8196       /* The current SLP code creates the step value element-by-element.  */
8197       if (dump_enabled_p ())
8198           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8199                                "SLP induction not supported for variable-length"
8200                                " vectors.\n");
8201       return false;
8202     }
8203 
8204   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
8205     {
8206       if (dump_enabled_p ())
8207           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8208                                "floating point induction vectorization disabled\n");
8209       return false;
8210     }
8211 
8212   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8213   gcc_assert (step_expr != NULL_TREE);
8214   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
8215       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
8216     {
8217       if (dump_enabled_p ())
8218           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8219                                "bit-precision induction vectorization not "
8220                                "supported.\n");
8221       return false;
8222     }
8223   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8224 
8225   /* Check for backend support of PLUS/MINUS_EXPR. */
8226   if (!directly_supported_p (PLUS_EXPR, step_vectype)
8227       || !directly_supported_p (MINUS_EXPR, step_vectype))
8228     return false;
8229 
8230   if (!vec_stmt) /* transformation not required.  */
8231     {
8232       unsigned inside_cost = 0, prologue_cost = 0;
8233       if (slp_node)
8234           {
8235             /* We eventually need to set a vector type on invariant
8236                arguments.  */
8237             unsigned j;
8238             slp_tree child;
8239             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8240               if (!vect_maybe_update_slp_op_vectype
8241                     (child, SLP_TREE_VECTYPE (slp_node)))
8242                 {
8243                     if (dump_enabled_p ())
8244                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8245                                            "incompatible vector types for "
8246                                            "invariants\n");
8247                     return false;
8248                 }
8249             /* loop cost for vec_loop.  */
8250             inside_cost
8251               = record_stmt_cost (cost_vec,
8252                                         SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8253                                         vector_stmt, stmt_info, 0, vect_body);
8254             /* prologue cost for vec_init (if not nested) and step.  */
8255             prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8256                                                       scalar_to_vec,
8257                                                       stmt_info, 0, vect_prologue);
8258           }
8259       else /* if (!slp_node) */
8260           {
8261             /* loop cost for vec_loop.  */
8262             inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8263                                                     stmt_info, 0, vect_body);
8264             /* prologue cost for vec_init and vec_step.  */
8265             prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8266                                                       stmt_info, 0, vect_prologue);
8267           }
8268       if (dump_enabled_p ())
8269           dump_printf_loc (MSG_NOTE, vect_location,
8270                                "vect_model_induction_cost: inside_cost = %d, "
8271                                "prologue_cost = %d .\n", inside_cost,
8272                                prologue_cost);
8273 
8274       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8275       DUMP_VECT_SCOPE ("vectorizable_induction");
8276       return true;
8277     }
8278 
8279   /* Transform.  */
8280 
8281   /* Compute a vector variable, initialized with the first VF values of
8282      the induction variable.  E.g., for an iv with IV_PHI='X' and
8283      evolution S, for a vector of 4 units, we want to compute:
8284      [X, X + S, X + 2*S, X + 3*S].  */
8285 
8286   if (dump_enabled_p ())
8287     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8288 
8289   pe = loop_preheader_edge (iv_loop);
8290   /* Find the first insertion point in the BB.  */
8291   basic_block bb = gimple_bb (phi);
8292   si = gsi_after_labels (bb);
8293 
8294   /* For SLP induction we have to generate several IVs as for example
8295      with group size 3 we need
8296        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8297        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
8298   if (slp_node)
8299     {
8300       /* Enforced above.  */
8301       unsigned int const_nunits = nunits.to_constant ();
8302 
8303       /* The initial values are vectorized, but any lanes > group_size
8304            need adjustment.  */
8305       slp_tree init_node
8306           = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8307 
8308       /* Gather steps.  Since we do not vectorize inductions as
8309            cycles we have to reconstruct the step from SCEV data.  */
8310       unsigned group_size = SLP_TREE_LANES (slp_node);
8311       tree *steps = XALLOCAVEC (tree, group_size);
8312       tree *inits = XALLOCAVEC (tree, group_size);
8313       stmt_vec_info phi_info;
8314       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8315           {
8316             steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8317             if (!init_node)
8318               inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8319                                                      pe->dest_idx);
8320           }
8321 
8322       /* Now generate the IVs.  */
8323       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8324       gcc_assert ((const_nunits * nvects) % group_size == 0);
8325       unsigned nivs;
8326       if (nested_in_vect_loop)
8327           nivs = nvects;
8328       else
8329           {
8330             /* Compute the number of distinct IVs we need.  First reduce
8331                group_size if it is a multiple of const_nunits so we get
8332                one IV for a group_size of 4 but const_nunits 2.  */
8333             unsigned group_sizep = group_size;
8334             if (group_sizep % const_nunits == 0)
8335               group_sizep = group_sizep / const_nunits;
8336             nivs = least_common_multiple (group_sizep,
8337                                                   const_nunits) / const_nunits;
8338           }
8339       tree stept = TREE_TYPE (step_vectype);
8340       tree lupdate_mul = NULL_TREE;
8341       if (!nested_in_vect_loop)
8342           {
8343             /* The number of iterations covered in one vector iteration.  */
8344             unsigned lup_mul = (nvects * const_nunits) / group_size;
8345             lupdate_mul
8346               = build_vector_from_val (step_vectype,
8347                                              SCALAR_FLOAT_TYPE_P (stept)
8348                                              ? build_real_from_wide (stept, lup_mul,
8349                                                                            UNSIGNED)
8350                                              : build_int_cstu (stept, lup_mul));
8351           }
8352       tree peel_mul = NULL_TREE;
8353       gimple_seq init_stmts = NULL;
8354       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8355           {
8356             if (SCALAR_FLOAT_TYPE_P (stept))
8357               peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8358                                              LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8359             else
8360               peel_mul = gimple_convert (&init_stmts, stept,
8361                                                LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8362             peel_mul = gimple_build_vector_from_val (&init_stmts,
8363                                                                step_vectype, peel_mul);
8364           }
8365       unsigned ivn;
8366       auto_vec<tree> vec_steps;
8367       for (ivn = 0; ivn < nivs; ++ivn)
8368           {
8369             tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8370             tree_vector_builder init_elts (vectype, const_nunits, 1);
8371             tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8372             for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8373               {
8374                 /* The scalar steps of the IVs.  */
8375                 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8376                 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8377                 step_elts.quick_push (elt);
8378                 if (!init_node)
8379                     {
8380                       /* The scalar inits of the IVs if not vectorized.  */
8381                       elt = inits[(ivn*const_nunits + eltn) % group_size];
8382                       if (!useless_type_conversion_p (TREE_TYPE (vectype),
8383                                                               TREE_TYPE (elt)))
8384                         elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8385                                                   TREE_TYPE (vectype), elt);
8386                       init_elts.quick_push (elt);
8387                     }
8388                 /* The number of steps to add to the initial values.  */
8389                 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8390                 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8391                                            ? build_real_from_wide (stept,
8392                                                                          mul_elt, UNSIGNED)
8393                                            : build_int_cstu (stept, mul_elt));
8394               }
8395             vec_step = gimple_build_vector (&init_stmts, &step_elts);
8396             vec_steps.safe_push (vec_step);
8397             tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8398             if (peel_mul)
8399               step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8400                                              step_mul, peel_mul);
8401             if (!init_node)
8402               vec_init = gimple_build_vector (&init_stmts, &init_elts);
8403 
8404             /* Create the induction-phi that defines the induction-operand.  */
8405             vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8406                                                       "vec_iv_");
8407             induction_phi = create_phi_node (vec_dest, iv_loop->header);
8408             induc_def = PHI_RESULT (induction_phi);
8409 
8410             /* Create the iv update inside the loop  */
8411             tree up = vec_step;
8412             if (lupdate_mul)
8413               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8414                                      vec_step, lupdate_mul);
8415             gimple_seq stmts = NULL;
8416             vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8417             vec_def = gimple_build (&stmts,
8418                                           PLUS_EXPR, step_vectype, vec_def, up);
8419             vec_def = gimple_convert (&stmts, vectype, vec_def);
8420             gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8421             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8422                            UNKNOWN_LOCATION);
8423 
8424             if (init_node)
8425               vec_init = vect_get_slp_vect_def (init_node, ivn);
8426             if (!nested_in_vect_loop
8427                 && !integer_zerop (step_mul))
8428               {
8429                 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8430                 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8431                                          vec_step, step_mul);
8432                 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8433                                               vec_def, up);
8434                 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8435               }
8436 
8437             /* Set the arguments of the phi node:  */
8438             add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8439 
8440             SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8441           }
8442       if (!nested_in_vect_loop)
8443           {
8444             /* Fill up to the number of vectors we need for the whole group.  */
8445             nivs = least_common_multiple (group_size,
8446                                                   const_nunits) / const_nunits;
8447             vec_steps.reserve (nivs-ivn);
8448             for (; ivn < nivs; ++ivn)
8449               {
8450                 SLP_TREE_VEC_STMTS (slp_node)
8451                     .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8452                 vec_steps.quick_push (vec_steps[0]);
8453               }
8454           }
8455 
8456       /* Re-use IVs when we can.  We are generating further vector
8457            stmts by adding VF' * stride to the IVs generated above.  */
8458       if (ivn < nvects)
8459           {
8460             unsigned vfp
8461               = least_common_multiple (group_size, const_nunits) / group_size;
8462             tree lupdate_mul
8463               = build_vector_from_val (step_vectype,
8464                                              SCALAR_FLOAT_TYPE_P (stept)
8465                                              ? build_real_from_wide (stept,
8466                                                                            vfp, UNSIGNED)
8467                                              : build_int_cstu (stept, vfp));
8468             for (; ivn < nvects; ++ivn)
8469               {
8470                 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8471                 tree def = gimple_get_lhs (iv);
8472                 if (ivn < 2*nivs)
8473                     vec_steps[ivn - nivs]
8474                       = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8475                                           vec_steps[ivn - nivs], lupdate_mul);
8476                 gimple_seq stmts = NULL;
8477                 def = gimple_convert (&stmts, step_vectype, def);
8478                 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8479                                           def, vec_steps[ivn % nivs]);
8480                 def = gimple_convert (&stmts, vectype, def);
8481                 if (gimple_code (iv) == GIMPLE_PHI)
8482                     gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8483                 else
8484                     {
8485                       gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8486                       gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8487                     }
8488                 SLP_TREE_VEC_STMTS (slp_node)
8489                     .quick_push (SSA_NAME_DEF_STMT (def));
8490               }
8491           }
8492 
8493       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8494       gcc_assert (!new_bb);
8495 
8496       return true;
8497     }
8498 
8499   init_expr = vect_phi_initial_value (phi);
8500 
8501   gimple_seq stmts = NULL;
8502   if (!nested_in_vect_loop)
8503     {
8504       /* Convert the initial value to the IV update type.  */
8505       tree new_type = TREE_TYPE (step_expr);
8506       init_expr = gimple_convert (&stmts, new_type, init_expr);
8507 
8508       /* If we are using the loop mask to "peel" for alignment then we need
8509            to adjust the start value here.  */
8510       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8511       if (skip_niters != NULL_TREE)
8512           {
8513             if (FLOAT_TYPE_P (vectype))
8514               skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8515                                                   skip_niters);
8516             else
8517               skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8518             tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8519                                                    skip_niters, step_expr);
8520             init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8521                                             init_expr, skip_step);
8522           }
8523     }
8524 
8525   if (stmts)
8526     {
8527       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8528       gcc_assert (!new_bb);
8529     }
8530 
8531   /* Create the vector that holds the initial_value of the induction.  */
8532   if (nested_in_vect_loop)
8533     {
8534       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
8535            been created during vectorization of previous stmts.  We obtain it
8536            from the STMT_VINFO_VEC_STMT of the defining stmt.  */
8537       auto_vec<tree> vec_inits;
8538       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8539                                              init_expr, &vec_inits);
8540       vec_init = vec_inits[0];
8541       /* If the initial value is not of proper type, convert it.  */
8542       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8543           {
8544             new_stmt
8545               = gimple_build_assign (vect_get_new_ssa_name (vectype,
8546                                                                         vect_simple_var,
8547                                                                         "vec_iv_"),
8548                                            VIEW_CONVERT_EXPR,
8549                                            build1 (VIEW_CONVERT_EXPR, vectype,
8550                                                      vec_init));
8551             vec_init = gimple_assign_lhs (new_stmt);
8552             new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8553                                                              new_stmt);
8554             gcc_assert (!new_bb);
8555           }
8556     }
8557   else
8558     {
8559       /* iv_loop is the loop to be vectorized. Create:
8560            vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
8561       stmts = NULL;
8562       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8563 
8564       unsigned HOST_WIDE_INT const_nunits;
8565       if (nunits.is_constant (&const_nunits))
8566           {
8567             tree_vector_builder elts (step_vectype, const_nunits, 1);
8568             elts.quick_push (new_name);
8569             for (i = 1; i < const_nunits; i++)
8570               {
8571                 /* Create: new_name_i = new_name + step_expr  */
8572                 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8573                                                new_name, step_expr);
8574                 elts.quick_push (new_name);
8575               }
8576             /* Create a vector from [new_name_0, new_name_1, ...,
8577                new_name_nunits-1]  */
8578             vec_init = gimple_build_vector (&stmts, &elts);
8579           }
8580       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8581           /* Build the initial value directly from a VEC_SERIES_EXPR.  */
8582           vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8583                                          new_name, step_expr);
8584       else
8585           {
8586             /* Build:
8587                   [base, base, base, ...]
8588                     + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8589             gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8590             gcc_assert (flag_associative_math);
8591             tree index = build_index_vector (step_vectype, 0, 1);
8592             tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8593                                                                       new_name);
8594             tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8595                                                                       step_expr);
8596             vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8597             vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8598                                            vec_init, step_vec);
8599             vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8600                                            vec_init, base_vec);
8601           }
8602       vec_init = gimple_convert (&stmts, vectype, vec_init);
8603 
8604       if (stmts)
8605           {
8606             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8607             gcc_assert (!new_bb);
8608           }
8609     }
8610 
8611 
8612   /* Create the vector that holds the step of the induction.  */
8613   if (nested_in_vect_loop)
8614     /* iv_loop is nested in the loop to be vectorized. Generate:
8615        vec_step = [S, S, S, S]  */
8616     new_name = step_expr;
8617   else
8618     {
8619       /* iv_loop is the loop to be vectorized. Generate:
8620             vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8621       gimple_seq seq = NULL;
8622       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8623           {
8624             expr = build_int_cst (integer_type_node, vf);
8625             expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8626           }
8627       else
8628           expr = build_int_cst (TREE_TYPE (step_expr), vf);
8629       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8630                                      expr, step_expr);
8631       if (seq)
8632           {
8633             new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8634             gcc_assert (!new_bb);
8635           }
8636     }
8637 
8638   t = unshare_expr (new_name);
8639   gcc_assert (CONSTANT_CLASS_P (new_name)
8640                 || TREE_CODE (new_name) == SSA_NAME);
8641   new_vec = build_vector_from_val (step_vectype, t);
8642   vec_step = vect_init_vector (loop_vinfo, stmt_info,
8643                                      new_vec, step_vectype, NULL);
8644 
8645 
8646   /* Create the following def-use cycle:
8647      loop prolog:
8648          vec_init = ...
8649            vec_step = ...
8650      loop:
8651          vec_iv = PHI <vec_init, vec_loop>
8652          ...
8653          STMT
8654          ...
8655          vec_loop = vec_iv + vec_step;  */
8656 
8657   /* Create the induction-phi that defines the induction-operand.  */
8658   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8659   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8660   induc_def = PHI_RESULT (induction_phi);
8661 
8662   /* Create the iv update inside the loop  */
8663   stmts = NULL;
8664   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8665   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8666   vec_def = gimple_convert (&stmts, vectype, vec_def);
8667   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8668   new_stmt = SSA_NAME_DEF_STMT (vec_def);
8669 
8670   /* Set the arguments of the phi node:  */
8671   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8672   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8673                  UNKNOWN_LOCATION);
8674 
8675   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8676   *vec_stmt = induction_phi;
8677 
8678   /* In case that vectorization factor (VF) is bigger than the number
8679      of elements that we can fit in a vectype (nunits), we have to generate
8680      more than one vector stmt - i.e - we need to "unroll" the
8681      vector stmt by a factor VF/nunits.  For more details see documentation
8682      in vectorizable_operation.  */
8683 
8684   if (ncopies > 1)
8685     {
8686       gimple_seq seq = NULL;
8687       /* FORNOW. This restriction should be relaxed.  */
8688       gcc_assert (!nested_in_vect_loop);
8689 
8690       /* Create the vector that holds the step of the induction.  */
8691       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8692           {
8693             expr = build_int_cst (integer_type_node, nunits);
8694             expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8695           }
8696       else
8697           expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8698       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8699                                      expr, step_expr);
8700       if (seq)
8701           {
8702             new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8703             gcc_assert (!new_bb);
8704           }
8705 
8706       t = unshare_expr (new_name);
8707       gcc_assert (CONSTANT_CLASS_P (new_name)
8708                       || TREE_CODE (new_name) == SSA_NAME);
8709       new_vec = build_vector_from_val (step_vectype, t);
8710       vec_step = vect_init_vector (loop_vinfo, stmt_info,
8711                                            new_vec, step_vectype, NULL);
8712 
8713       vec_def = induc_def;
8714       for (i = 1; i < ncopies; i++)
8715           {
8716             /* vec_i = vec_prev + vec_step  */
8717             gimple_seq stmts = NULL;
8718             vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8719             vec_def = gimple_build (&stmts,
8720                                           PLUS_EXPR, step_vectype, vec_def, vec_step);
8721             vec_def = gimple_convert (&stmts, vectype, vec_def);
8722 
8723             gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8724             new_stmt = SSA_NAME_DEF_STMT (vec_def);
8725             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8726           }
8727     }
8728 
8729   if (dump_enabled_p ())
8730     dump_printf_loc (MSG_NOTE, vect_location,
8731                          "transform induction: created def-use cycle: %G%G",
8732                          induction_phi, SSA_NAME_DEF_STMT (vec_def));
8733 
8734   return true;
8735 }
8736 
8737 /* Function vectorizable_live_operation.
8738 
8739    STMT_INFO computes a value that is used outside the loop.  Check if it
8740    can be supported and, if VEC_STMT_P is true, generate its extraction.  */
8741 
8742 bool
vectorizable_live_operation(vec_info * vinfo,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,slp_tree slp_node,slp_instance slp_node_instance,int slp_index,bool vec_stmt_p,stmt_vector_for_cost * cost_vec)8743 vectorizable_live_operation (vec_info *vinfo,
8744                                    stmt_vec_info stmt_info,
8745                                    gimple_stmt_iterator *gsi,
8746                                    slp_tree slp_node, slp_instance slp_node_instance,
8747                                    int slp_index, bool vec_stmt_p,
8748                                    stmt_vector_for_cost *cost_vec)
8749 {
8750   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8751   imm_use_iterator imm_iter;
8752   tree lhs, lhs_type, bitsize;
8753   tree vectype = (slp_node
8754                       ? SLP_TREE_VECTYPE (slp_node)
8755                       : STMT_VINFO_VECTYPE (stmt_info));
8756   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8757   int ncopies;
8758   gimple *use_stmt;
8759   auto_vec<tree> vec_oprnds;
8760   int vec_entry = 0;
8761   poly_uint64 vec_index = 0;
8762 
8763   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8764 
8765   /* If a stmt of a reduction is live, vectorize it via
8766      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
8767      validity so just trigger the transform here.  */
8768   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8769     {
8770       if (!vec_stmt_p)
8771           return true;
8772       if (slp_node)
8773           {
8774             /* For reduction chains the meta-info is attached to
8775                the group leader.  */
8776             if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8777               stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8778             /* For SLP reductions we vectorize the epilogue for
8779                all involved stmts together.  */
8780             else if (slp_index != 0)
8781               return true;
8782           }
8783       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8784       gcc_assert (reduc_info->is_reduc_info);
8785       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8786             || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8787           return true;
8788       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8789                                                   slp_node_instance);
8790       return true;
8791     }
8792 
8793   /* If STMT is not relevant and it is a simple assignment and its inputs are
8794      invariant then it can remain in place, unvectorized.  The original last
8795      scalar value that it computes will be used.  */
8796   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8797     {
8798       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8799       if (dump_enabled_p ())
8800           dump_printf_loc (MSG_NOTE, vect_location,
8801                                "statement is simple and uses invariant.  Leaving in "
8802                                "place.\n");
8803       return true;
8804     }
8805 
8806   if (slp_node)
8807     ncopies = 1;
8808   else
8809     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8810 
8811   if (slp_node)
8812     {
8813       gcc_assert (slp_index >= 0);
8814 
8815       /* Get the last occurrence of the scalar index from the concatenation of
8816            all the slp vectors. Calculate which slp vector it is and the index
8817            within.  */
8818       int num_scalar = SLP_TREE_LANES (slp_node);
8819       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8820       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8821 
8822       /* Calculate which vector contains the result, and which lane of
8823            that vector we need.  */
8824       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8825           {
8826             if (dump_enabled_p ())
8827               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8828                                    "Cannot determine which vector holds the"
8829                                    " final result.\n");
8830             return false;
8831           }
8832     }
8833 
8834   if (!vec_stmt_p)
8835     {
8836       /* No transformation required.  */
8837       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8838           {
8839             if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8840                                                          OPTIMIZE_FOR_SPEED))
8841               {
8842                 if (dump_enabled_p ())
8843                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8844                                          "can't operate on partial vectors "
8845                                          "because the target doesn't support extract "
8846                                          "last reduction.\n");
8847                 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8848               }
8849             else if (slp_node)
8850               {
8851                 if (dump_enabled_p ())
8852                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8853                                          "can't operate on partial vectors "
8854                                          "because an SLP statement is live after "
8855                                          "the loop.\n");
8856                 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8857               }
8858             else if (ncopies > 1)
8859               {
8860                 if (dump_enabled_p ())
8861                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8862                                          "can't operate on partial vectors "
8863                                          "because ncopies is greater than 1.\n");
8864                 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8865               }
8866             else
8867               {
8868                 gcc_assert (ncopies == 1 && !slp_node);
8869                 vect_record_loop_mask (loop_vinfo,
8870                                              &LOOP_VINFO_MASKS (loop_vinfo),
8871                                              1, vectype, NULL);
8872               }
8873           }
8874       /* ???  Enable for loop costing as well.  */
8875       if (!loop_vinfo)
8876           record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8877                                 0, vect_epilogue);
8878       return true;
8879     }
8880 
8881   /* Use the lhs of the original scalar statement.  */
8882   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8883   if (dump_enabled_p ())
8884     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8885                          "stmt %G", stmt);
8886 
8887   lhs = gimple_get_lhs (stmt);
8888   lhs_type = TREE_TYPE (lhs);
8889 
8890   bitsize = vector_element_bits_tree (vectype);
8891 
8892   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8893   tree vec_lhs, bitstart;
8894   gimple *vec_stmt;
8895   if (slp_node)
8896     {
8897       gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8898 
8899       /* Get the correct slp vectorized stmt.  */
8900       vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8901       vec_lhs = gimple_get_lhs (vec_stmt);
8902 
8903       /* Get entry to use.  */
8904       bitstart = bitsize_int (vec_index);
8905       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8906     }
8907   else
8908     {
8909       /* For multiple copies, get the last copy.  */
8910       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8911       vec_lhs = gimple_get_lhs (vec_stmt);
8912 
8913       /* Get the last lane in the vector.  */
8914       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8915     }
8916 
8917   if (loop_vinfo)
8918     {
8919       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8920            requirement, insert one phi node for it.  It looks like:
8921              loop;
8922            BB:
8923              # lhs' = PHI <lhs>
8924            ==>
8925              loop;
8926            BB:
8927              # vec_lhs' = PHI <vec_lhs>
8928              new_tree = lane_extract <vec_lhs', ...>;
8929              lhs' = new_tree;  */
8930 
8931       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8932       basic_block exit_bb = single_exit (loop)->dest;
8933       gcc_assert (single_pred_p (exit_bb));
8934 
8935       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8936       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8937       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8938 
8939       gimple_seq stmts = NULL;
8940       tree new_tree;
8941       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8942           {
8943             /* Emit:
8944 
8945                  SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8946 
8947                where VEC_LHS is the vectorized live-out result and MASK is
8948                the loop mask for the final iteration.  */
8949             gcc_assert (ncopies == 1 && !slp_node);
8950             tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8951             tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8952                                                     1, vectype, 0);
8953             tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8954                                                     mask, vec_lhs_phi);
8955 
8956             /* Convert the extracted vector element to the scalar type.  */
8957             new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8958           }
8959       else
8960           {
8961             tree bftype = TREE_TYPE (vectype);
8962             if (VECTOR_BOOLEAN_TYPE_P (vectype))
8963               bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8964             new_tree = build3 (BIT_FIELD_REF, bftype,
8965                                    vec_lhs_phi, bitsize, bitstart);
8966             new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8967                                                      &stmts, true, NULL_TREE);
8968           }
8969 
8970       if (stmts)
8971           {
8972             gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8973             gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8974 
8975             /* Remove existing phi from lhs and create one copy from new_tree.  */
8976             tree lhs_phi = NULL_TREE;
8977             gimple_stmt_iterator gsi;
8978             for (gsi = gsi_start_phis (exit_bb);
8979                  !gsi_end_p (gsi); gsi_next (&gsi))
8980               {
8981                 gimple *phi = gsi_stmt (gsi);
8982                 if ((gimple_phi_arg_def (phi, 0) == lhs))
8983                     {
8984                       remove_phi_node (&gsi, false);
8985                       lhs_phi = gimple_phi_result (phi);
8986                       gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8987                       gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8988                       break;
8989                     }
8990               }
8991           }
8992 
8993       /* Replace use of lhs with newly computed result.  If the use stmt is a
8994            single arg PHI, just replace all uses of PHI result.  It's necessary
8995            because lcssa PHI defining lhs may be before newly inserted stmt.  */
8996       use_operand_p use_p;
8997       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8998           if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8999               && !is_gimple_debug (use_stmt))
9000             {
9001               if (gimple_code (use_stmt) == GIMPLE_PHI
9002                     && gimple_phi_num_args (use_stmt) == 1)
9003                 {
9004                     replace_uses_by (gimple_phi_result (use_stmt), new_tree);
9005                 }
9006               else
9007                 {
9008                     FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9009                         SET_USE (use_p, new_tree);
9010                 }
9011               update_stmt (use_stmt);
9012             }
9013     }
9014   else
9015     {
9016       /* For basic-block vectorization simply insert the lane-extraction.  */
9017       tree bftype = TREE_TYPE (vectype);
9018       if (VECTOR_BOOLEAN_TYPE_P (vectype))
9019           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9020       tree new_tree = build3 (BIT_FIELD_REF, bftype,
9021                                     vec_lhs, bitsize, bitstart);
9022       gimple_seq stmts = NULL;
9023       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9024                                                &stmts, true, NULL_TREE);
9025       if (TREE_CODE (new_tree) == SSA_NAME
9026             && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9027           SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9028       if (is_a <gphi *> (vec_stmt))
9029           {
9030             gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9031             gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9032           }
9033       else
9034           {
9035             gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9036             gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9037           }
9038 
9039       /* Replace use of lhs with newly computed result.  If the use stmt is a
9040            single arg PHI, just replace all uses of PHI result.  It's necessary
9041            because lcssa PHI defining lhs may be before newly inserted stmt.  */
9042       use_operand_p use_p;
9043       stmt_vec_info use_stmt_info;
9044       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9045           if (!is_gimple_debug (use_stmt)
9046               && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9047                     || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9048             {
9049               /* ???  This can happen when the live lane ends up being
9050                  used in a vector construction code-generated by an
9051                  external SLP node (and code-generation for that already
9052                  happened).  See gcc.dg/vect/bb-slp-47.c.
9053                  Doing this is what would happen if that vector CTOR
9054                  were not code-generated yet so it is not too bad.
9055                  ???  In fact we'd likely want to avoid this situation
9056                  in the first place.  */
9057               if (TREE_CODE (new_tree) == SSA_NAME
9058                     && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9059                     && gimple_code (use_stmt) != GIMPLE_PHI
9060                     && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9061                                                             use_stmt))
9062                 {
9063                     enum tree_code code = gimple_assign_rhs_code (use_stmt);
9064                     gcc_checking_assert (code == SSA_NAME
9065                                              || code == CONSTRUCTOR
9066                                              || code == VIEW_CONVERT_EXPR
9067                                              || CONVERT_EXPR_CODE_P (code));
9068                     if (dump_enabled_p ())
9069                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9070                                            "Using original scalar computation for "
9071                                            "live lane because use preceeds vector "
9072                                            "def\n");
9073                     continue;
9074                 }
9075               /* ???  It can also happen that we end up pulling a def into
9076                  a loop where replacing out-of-loop uses would require
9077                  a new LC SSA PHI node.  Retain the original scalar in
9078                  those cases as well.  PR98064.  */
9079               if (TREE_CODE (new_tree) == SSA_NAME
9080                     && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9081                     && (gimple_bb (use_stmt)->loop_father
9082                         != gimple_bb (vec_stmt)->loop_father)
9083                     && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9084                                                   gimple_bb (use_stmt)->loop_father))
9085                 {
9086                     if (dump_enabled_p ())
9087                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9088                                            "Using original scalar computation for "
9089                                            "live lane because there is an out-of-loop "
9090                                            "definition for it\n");
9091                     continue;
9092                 }
9093               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9094                 SET_USE (use_p, new_tree);
9095               update_stmt (use_stmt);
9096             }
9097     }
9098 
9099   return true;
9100 }
9101 
9102 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
9103 
9104 static void
vect_loop_kill_debug_uses(class loop * loop,stmt_vec_info stmt_info)9105 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9106 {
9107   ssa_op_iter op_iter;
9108   imm_use_iterator imm_iter;
9109   def_operand_p def_p;
9110   gimple *ustmt;
9111 
9112   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9113     {
9114       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9115           {
9116             basic_block bb;
9117 
9118             if (!is_gimple_debug (ustmt))
9119               continue;
9120 
9121             bb = gimple_bb (ustmt);
9122 
9123             if (!flow_bb_inside_loop_p (loop, bb))
9124               {
9125                 if (gimple_debug_bind_p (ustmt))
9126                     {
9127                       if (dump_enabled_p ())
9128                         dump_printf_loc (MSG_NOTE, vect_location,
9129                                      "killing debug use\n");
9130 
9131                       gimple_debug_bind_reset_value (ustmt);
9132                       update_stmt (ustmt);
9133                     }
9134                 else
9135                     gcc_unreachable ();
9136               }
9137           }
9138     }
9139 }
9140 
9141 /* Given loop represented by LOOP_VINFO, return true if computation of
9142    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9143    otherwise.  */
9144 
9145 static bool
loop_niters_no_overflow(loop_vec_info loop_vinfo)9146 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9147 {
9148   /* Constant case.  */
9149   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9150     {
9151       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9152       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9153 
9154       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9155       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9156       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9157           return true;
9158     }
9159 
9160   widest_int max;
9161   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9162   /* Check the upper bound of loop niters.  */
9163   if (get_max_loop_iterations (loop, &max))
9164     {
9165       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9166       signop sgn = TYPE_SIGN (type);
9167       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9168       if (max < type_max)
9169           return true;
9170     }
9171   return false;
9172 }
9173 
9174 /* Return a mask type with half the number of elements as OLD_TYPE,
9175    given that it should have mode NEW_MODE.  */
9176 
9177 tree
vect_halve_mask_nunits(tree old_type,machine_mode new_mode)9178 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
9179 {
9180   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
9181   return build_truth_vector_type_for_mode (nunits, new_mode);
9182 }
9183 
9184 /* Return a mask type with twice as many elements as OLD_TYPE,
9185    given that it should have mode NEW_MODE.  */
9186 
9187 tree
vect_double_mask_nunits(tree old_type,machine_mode new_mode)9188 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
9189 {
9190   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
9191   return build_truth_vector_type_for_mode (nunits, new_mode);
9192 }
9193 
9194 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9195    contain a sequence of NVECTORS masks that each control a vector of type
9196    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
9197    these vector masks with the vector version of SCALAR_MASK.  */
9198 
9199 void
vect_record_loop_mask(loop_vec_info loop_vinfo,vec_loop_masks * masks,unsigned int nvectors,tree vectype,tree scalar_mask)9200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9201                            unsigned int nvectors, tree vectype, tree scalar_mask)
9202 {
9203   gcc_assert (nvectors != 0);
9204   if (masks->length () < nvectors)
9205     masks->safe_grow_cleared (nvectors, true);
9206   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9207   /* The number of scalars per iteration and the number of vectors are
9208      both compile-time constants.  */
9209   unsigned int nscalars_per_iter
9210     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9211                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9212 
9213   if (scalar_mask)
9214     {
9215       scalar_cond_masked_key cond (scalar_mask, nvectors);
9216       loop_vinfo->scalar_cond_masked_set.add (cond);
9217     }
9218 
9219   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9220     {
9221       rgm->max_nscalars_per_iter = nscalars_per_iter;
9222       rgm->type = truth_type_for (vectype);
9223       rgm->factor = 1;
9224     }
9225 }
9226 
9227 /* Given a complete set of masks MASKS, extract mask number INDEX
9228    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9229    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
9230 
9231    See the comment above vec_loop_masks for more details about the mask
9232    arrangement.  */
9233 
9234 tree
vect_get_loop_mask(gimple_stmt_iterator * gsi,vec_loop_masks * masks,unsigned int nvectors,tree vectype,unsigned int index)9235 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9236                         unsigned int nvectors, tree vectype, unsigned int index)
9237 {
9238   rgroup_controls *rgm = &(*masks)[nvectors - 1];
9239   tree mask_type = rgm->type;
9240 
9241   /* Populate the rgroup's mask array, if this is the first time we've
9242      used it.  */
9243   if (rgm->controls.is_empty ())
9244     {
9245       rgm->controls.safe_grow_cleared (nvectors, true);
9246       for (unsigned int i = 0; i < nvectors; ++i)
9247           {
9248             tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9249             /* Provide a dummy definition until the real one is available.  */
9250             SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9251             rgm->controls[i] = mask;
9252           }
9253     }
9254 
9255   tree mask = rgm->controls[index];
9256   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9257                     TYPE_VECTOR_SUBPARTS (vectype)))
9258     {
9259       /* A loop mask for data type X can be reused for data type Y
9260            if X has N times more elements than Y and if Y's elements
9261            are N times bigger than X's.  In this case each sequence
9262            of N elements in the loop mask will be all-zero or all-one.
9263            We can then view-convert the mask so that each sequence of
9264            N elements is replaced by a single element.  */
9265       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9266                                     TYPE_VECTOR_SUBPARTS (vectype)));
9267       gimple_seq seq = NULL;
9268       mask_type = truth_type_for (vectype);
9269       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9270       if (seq)
9271           gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9272     }
9273   return mask;
9274 }
9275 
9276 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9277    lengths for controlling an operation on VECTYPE.  The operation splits
9278    each element of VECTYPE into FACTOR separate subelements, measuring the
9279    length as a number of these subelements.  */
9280 
9281 void
vect_record_loop_len(loop_vec_info loop_vinfo,vec_loop_lens * lens,unsigned int nvectors,tree vectype,unsigned int factor)9282 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9283                           unsigned int nvectors, tree vectype, unsigned int factor)
9284 {
9285   gcc_assert (nvectors != 0);
9286   if (lens->length () < nvectors)
9287     lens->safe_grow_cleared (nvectors, true);
9288   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9289 
9290   /* The number of scalars per iteration, scalar occupied bytes and
9291      the number of vectors are both compile-time constants.  */
9292   unsigned int nscalars_per_iter
9293     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9294                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9295 
9296   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9297     {
9298       /* For now, we only support cases in which all loads and stores fall back
9299            to VnQI or none do.  */
9300       gcc_assert (!rgl->max_nscalars_per_iter
9301                       || (rgl->factor == 1 && factor == 1)
9302                       || (rgl->max_nscalars_per_iter * rgl->factor
9303                           == nscalars_per_iter * factor));
9304       rgl->max_nscalars_per_iter = nscalars_per_iter;
9305       rgl->type = vectype;
9306       rgl->factor = factor;
9307     }
9308 }
9309 
9310 /* Given a complete set of length LENS, extract length number INDEX for an
9311    rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
9312 
9313 tree
vect_get_loop_len(loop_vec_info loop_vinfo,vec_loop_lens * lens,unsigned int nvectors,unsigned int index)9314 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9315                        unsigned int nvectors, unsigned int index)
9316 {
9317   rgroup_controls *rgl = &(*lens)[nvectors - 1];
9318   bool use_bias_adjusted_len =
9319     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
9320 
9321   /* Populate the rgroup's len array, if this is the first time we've
9322      used it.  */
9323   if (rgl->controls.is_empty ())
9324     {
9325       rgl->controls.safe_grow_cleared (nvectors, true);
9326       for (unsigned int i = 0; i < nvectors; ++i)
9327           {
9328             tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9329             gcc_assert (len_type != NULL_TREE);
9330 
9331             tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9332 
9333             /* Provide a dummy definition until the real one is available.  */
9334             SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9335             rgl->controls[i] = len;
9336 
9337             if (use_bias_adjusted_len)
9338               {
9339                 gcc_assert (i == 0);
9340                 tree adjusted_len =
9341                     make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
9342                 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
9343                 rgl->bias_adjusted_ctrl = adjusted_len;
9344               }
9345           }
9346     }
9347 
9348   if (use_bias_adjusted_len)
9349     return rgl->bias_adjusted_ctrl;
9350   else
9351     return rgl->controls[index];
9352 }
9353 
9354 /* Scale profiling counters by estimation for LOOP which is vectorized
9355    by factor VF.  */
9356 
9357 static void
scale_profile_for_vect_loop(class loop * loop,unsigned vf)9358 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9359 {
9360   edge preheader = loop_preheader_edge (loop);
9361   /* Reduce loop iterations by the vectorization factor.  */
9362   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9363   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9364 
9365   if (freq_h.nonzero_p ())
9366     {
9367       profile_probability p;
9368 
9369       /* Avoid dropping loop body profile counter to 0 because of zero count
9370            in loop's preheader.  */
9371       if (!(freq_e == profile_count::zero ()))
9372         freq_e = freq_e.force_nonzero ();
9373       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9374       scale_loop_frequencies (loop, p);
9375     }
9376 
9377   edge exit_e = single_exit (loop);
9378   exit_e->probability = profile_probability::always ()
9379                                          .apply_scale (1, new_est_niter + 1);
9380 
9381   edge exit_l = single_pred_edge (loop->latch);
9382   profile_probability prob = exit_l->probability;
9383   exit_l->probability = exit_e->probability.invert ();
9384   if (prob.initialized_p () && exit_l->probability.initialized_p ())
9385     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9386 }
9387 
9388 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9389    latch edge values originally defined by it.  */
9390 
9391 static void
maybe_set_vectorized_backedge_value(loop_vec_info loop_vinfo,stmt_vec_info def_stmt_info)9392 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9393                                              stmt_vec_info def_stmt_info)
9394 {
9395   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9396   if (!def || TREE_CODE (def) != SSA_NAME)
9397     return;
9398   stmt_vec_info phi_info;
9399   imm_use_iterator iter;
9400   use_operand_p use_p;
9401   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9402     if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9403       if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9404             && (phi_info = loop_vinfo->lookup_stmt (phi))
9405             && STMT_VINFO_RELEVANT_P (phi_info)
9406             && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9407             && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9408             && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9409           {
9410             loop_p loop = gimple_bb (phi)->loop_father;
9411             edge e = loop_latch_edge (loop);
9412             if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9413               {
9414                 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9415                 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9416                 gcc_assert (phi_defs.length () == latch_defs.length ());
9417                 for (unsigned i = 0; i < phi_defs.length (); ++i)
9418                     add_phi_arg (as_a <gphi *> (phi_defs[i]),
9419                                    gimple_get_lhs (latch_defs[i]), e,
9420                                    gimple_phi_arg_location (phi, e->dest_idx));
9421               }
9422           }
9423 }
9424 
9425 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9426    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9427    stmt_vec_info.  */
9428 
9429 static bool
vect_transform_loop_stmt(loop_vec_info loop_vinfo,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,stmt_vec_info * seen_store)9430 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9431                                 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9432 {
9433   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9434   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9435 
9436   if (dump_enabled_p ())
9437     dump_printf_loc (MSG_NOTE, vect_location,
9438                          "------>vectorizing statement: %G", stmt_info->stmt);
9439 
9440   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9441     vect_loop_kill_debug_uses (loop, stmt_info);
9442 
9443   if (!STMT_VINFO_RELEVANT_P (stmt_info)
9444       && !STMT_VINFO_LIVE_P (stmt_info))
9445     return false;
9446 
9447   if (STMT_VINFO_VECTYPE (stmt_info))
9448     {
9449       poly_uint64 nunits
9450           = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9451       if (!STMT_SLP_TYPE (stmt_info)
9452             && maybe_ne (nunits, vf)
9453             && dump_enabled_p ())
9454           /* For SLP VF is set according to unrolling factor, and not
9455              to vector size, hence for SLP this print is not valid.  */
9456           dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9457     }
9458 
9459   /* Pure SLP statements have already been vectorized.  We still need
9460      to apply loop vectorization to hybrid SLP statements.  */
9461   if (PURE_SLP_STMT (stmt_info))
9462     return false;
9463 
9464   if (dump_enabled_p ())
9465     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9466 
9467   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9468     *seen_store = stmt_info;
9469 
9470   return true;
9471 }
9472 
9473 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9474    in the hash_map with its corresponding values.  */
9475 
9476 static tree
find_in_mapping(tree t,void * context)9477 find_in_mapping (tree t, void *context)
9478 {
9479   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9480 
9481   tree *value = mapping->get (t);
9482   return value ? *value : t;
9483 }
9484 
9485 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
9486    original loop that has now been vectorized.
9487 
9488    The inits of the data_references need to be advanced with the number of
9489    iterations of the main loop.  This has been computed in vect_do_peeling and
9490    is stored in parameter ADVANCE.  We first restore the data_references
9491    initial offset with the values recored in ORIG_DRS_INIT.
9492 
9493    Since the loop_vec_info of this EPILOGUE was constructed for the original
9494    loop, its stmt_vec_infos all point to the original statements.  These need
9495    to be updated to point to their corresponding copies as well as the SSA_NAMES
9496    in their PATTERN_DEF_SEQs and RELATED_STMTs.
9497 
9498    The data_reference's connections also need to be updated.  Their
9499    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9500    stmt_vec_infos, their statements need to point to their corresponding copy,
9501    if they are gather loads or scatter stores then their reference needs to be
9502    updated to point to its corresponding copy.  */
9503 
9504 static void
update_epilogue_loop_vinfo(class loop * epilogue,tree advance)9505 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9506 {
9507   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9508   auto_vec<gimple *> stmt_worklist;
9509   hash_map<tree,tree> mapping;
9510   gimple *orig_stmt, *new_stmt;
9511   gimple_stmt_iterator epilogue_gsi;
9512   gphi_iterator epilogue_phi_gsi;
9513   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9514   basic_block *epilogue_bbs = get_loop_body (epilogue);
9515   unsigned i;
9516 
9517   free (LOOP_VINFO_BBS (epilogue_vinfo));
9518   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9519 
9520   /* Advance data_reference's with the number of iterations of the previous
9521      loop and its prologue.  */
9522   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9523 
9524 
9525   /* The EPILOGUE loop is a copy of the original loop so they share the same
9526      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
9527      point to the copied statements.  We also create a mapping of all LHS' in
9528      the original loop and all the LHS' in the EPILOGUE and create worklists to
9529      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
9530   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9531     {
9532       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9533              !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9534           {
9535             new_stmt = epilogue_phi_gsi.phi ();
9536 
9537             gcc_assert (gimple_uid (new_stmt) > 0);
9538             stmt_vinfo
9539               = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9540 
9541             orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9542             STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9543 
9544             mapping.put (gimple_phi_result (orig_stmt),
9545                            gimple_phi_result (new_stmt));
9546             /* PHI nodes can not have patterns or related statements.  */
9547             gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9548                           && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9549           }
9550 
9551       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9552              !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9553           {
9554             new_stmt = gsi_stmt (epilogue_gsi);
9555             if (is_gimple_debug (new_stmt))
9556               continue;
9557 
9558             gcc_assert (gimple_uid (new_stmt) > 0);
9559             stmt_vinfo
9560               = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9561 
9562             orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9563             STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9564 
9565             if (tree old_lhs = gimple_get_lhs (orig_stmt))
9566               mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9567 
9568             if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9569               {
9570                 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9571                 for (gimple_stmt_iterator gsi = gsi_start (seq);
9572                        !gsi_end_p (gsi); gsi_next (&gsi))
9573                     stmt_worklist.safe_push (gsi_stmt (gsi));
9574               }
9575 
9576             related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9577             if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9578               {
9579                 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9580                 stmt_worklist.safe_push (stmt);
9581                 /* Set BB such that the assert in
9582                     'get_initial_def_for_reduction' is able to determine that
9583                     the BB of the related stmt is inside this loop.  */
9584                 gimple_set_bb (stmt,
9585                                    gimple_bb (new_stmt));
9586                 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9587                 gcc_assert (related_vinfo == NULL
9588                                 || related_vinfo == stmt_vinfo);
9589               }
9590           }
9591     }
9592 
9593   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9594      using the original main loop and thus need to be updated to refer to the
9595      cloned variables used in the epilogue.  */
9596   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9597     {
9598       gimple *stmt = stmt_worklist[i];
9599       tree *new_op;
9600 
9601       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9602           {
9603             tree op = gimple_op (stmt, j);
9604             if ((new_op = mapping.get(op)))
9605               gimple_set_op (stmt, j, *new_op);
9606             else
9607               {
9608                 /* PR92429: The last argument of simplify_replace_tree disables
9609                      folding when replacing arguments.  This is required as
9610                      otherwise you might end up with different statements than the
9611                      ones analyzed in vect_loop_analyze, leading to different
9612                      vectorization.  */
9613                 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9614                                                     &find_in_mapping, &mapping, false);
9615                 gimple_set_op (stmt, j, op);
9616               }
9617           }
9618     }
9619 
9620   struct data_reference *dr;
9621   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9622   FOR_EACH_VEC_ELT (datarefs, i, dr)
9623     {
9624       orig_stmt = DR_STMT (dr);
9625       gcc_assert (gimple_uid (orig_stmt) > 0);
9626       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9627       /* Data references for gather loads and scatter stores do not use the
9628            updated offset we set using ADVANCE.  Instead we have to make sure the
9629            reference in the data references point to the corresponding copy of
9630            the original in the epilogue.  */
9631       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9632             == VMAT_GATHER_SCATTER)
9633           {
9634             DR_REF (dr)
9635               = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9636                                              &find_in_mapping, &mapping);
9637             DR_BASE_ADDRESS (dr)
9638               = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9639                                              &find_in_mapping, &mapping);
9640           }
9641       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9642       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9643     }
9644 
9645   epilogue_vinfo->shared->datarefs_copy.release ();
9646   epilogue_vinfo->shared->save_datarefs ();
9647 }
9648 
9649 /* Function vect_transform_loop.
9650 
   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns the scalar epilogue loop, if any.  */
9655 
9656 class loop *
vect_transform_loop(loop_vec_info loop_vinfo,gimple * loop_vectorized_call)9657 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9658 {
9659   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9660   class loop *epilogue = NULL;
9661   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9662   int nbbs = loop->num_nodes;
9663   int i;
9664   tree niters_vector = NULL_TREE;
9665   tree step_vector = NULL_TREE;
9666   tree niters_vector_mult_vf = NULL_TREE;
9667   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9668   unsigned int lowest_vf = constant_lower_bound (vf);
9669   gimple *stmt;
9670   bool check_profitability = false;
9671   unsigned int th;
9672 
9673   DUMP_VECT_SCOPE ("vec_transform_loop");
9674 
9675   loop_vinfo->shared->check_datarefs ();
9676 
9677   /* Use the more conservative vectorization threshold.  If the number
9678      of iterations is constant assume the cost check has been performed
9679      by our caller.  If the threshold makes all loops profitable that
9680      run at least the (estimated) vectorization factor number of times
9681      checking is pointless, too.  */
9682   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9683   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9684     {
9685       if (dump_enabled_p ())
9686           dump_printf_loc (MSG_NOTE, vect_location,
9687                                "Profitability threshold is %d loop iterations.\n",
9688                                th);
9689       check_profitability = true;
9690     }
9691 
9692   /* Make sure there exists a single-predecessor exit bb.  Do this before
9693      versioning.   */
9694   edge e = single_exit (loop);
9695   if (! single_pred_p (e->dest))
9696     {
9697       split_loop_exit_edge (e, true);
9698       if (dump_enabled_p ())
9699           dump_printf (MSG_NOTE, "split exit edge\n");
9700     }
9701 
9702   /* Version the loop first, if required, so the profitability check
9703      comes first.  */
9704 
9705   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9706     {
9707       class loop *sloop
9708           = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9709       sloop->force_vectorize = false;
9710       check_profitability = false;
9711     }
9712 
9713   /* Make sure there exists a single-predecessor exit bb also on the
9714      scalar loop copy.  Do this after versioning but before peeling
9715      so CFG structure is fine for both scalar and if-converted loop
9716      to make slpeel_duplicate_current_defs_from_edges face matched
9717      loop closed PHI nodes on the exit.  */
9718   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9719     {
9720       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9721       if (! single_pred_p (e->dest))
9722           {
9723             split_loop_exit_edge (e, true);
9724             if (dump_enabled_p ())
9725               dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9726           }
9727     }
9728 
9729   tree niters = vect_build_loop_niters (loop_vinfo);
9730   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9731   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9732   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9733   tree advance;
9734   drs_init_vec orig_drs_init;
9735 
9736   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9737                                     &step_vector, &niters_vector_mult_vf, th,
9738                                     check_profitability, niters_no_overflow,
9739                                     &advance);
9740 
9741   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9742       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9743     scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9744                                   LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9745 
9746   if (niters_vector == NULL_TREE)
9747     {
9748       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9749             && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9750             && known_eq (lowest_vf, vf))
9751           {
9752             niters_vector
9753               = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9754                                    LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9755             step_vector = build_one_cst (TREE_TYPE (niters));
9756           }
9757       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9758           vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9759                                              &step_vector, niters_no_overflow);
9760       else
9761           /* vect_do_peeling subtracted the number of peeled prologue
9762              iterations from LOOP_VINFO_NITERS.  */
9763           vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9764                                              &niters_vector, &step_vector,
9765                                              niters_no_overflow);
9766     }
9767 
9768   /* 1) Make sure the loop header has exactly two entries
9769      2) Make sure we have a preheader basic block.  */
9770 
9771   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9772 
9773   split_edge (loop_preheader_edge (loop));
9774 
9775   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9776     /* This will deal with any possible peeling.  */
9777     vect_prepare_for_masked_peels (loop_vinfo);
9778 
9779   /* Schedule the SLP instances first, then handle loop vectorization
9780      below.  */
9781   if (!loop_vinfo->slp_instances.is_empty ())
9782     {
9783       DUMP_VECT_SCOPE ("scheduling SLP instances");
9784       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9785     }
9786 
9787   /* FORNOW: the vectorizer supports only loops which body consist
9788      of one basic block (header + empty latch). When the vectorizer will
9789      support more involved loop forms, the order by which the BBs are
9790      traversed need to be reconsidered.  */
9791 
9792   for (i = 0; i < nbbs; i++)
9793     {
9794       basic_block bb = bbs[i];
9795       stmt_vec_info stmt_info;
9796 
9797       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9798              gsi_next (&si))
9799           {
9800             gphi *phi = si.phi ();
9801             if (dump_enabled_p ())
9802               dump_printf_loc (MSG_NOTE, vect_location,
9803                                    "------>vectorizing phi: %G", phi);
9804             stmt_info = loop_vinfo->lookup_stmt (phi);
9805             if (!stmt_info)
9806               continue;
9807 
9808             if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9809               vect_loop_kill_debug_uses (loop, stmt_info);
9810 
9811             if (!STMT_VINFO_RELEVANT_P (stmt_info)
9812                 && !STMT_VINFO_LIVE_P (stmt_info))
9813               continue;
9814 
9815             if (STMT_VINFO_VECTYPE (stmt_info)
9816                 && (maybe_ne
9817                       (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9818                 && dump_enabled_p ())
9819               dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9820 
9821             if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9822                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9823                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9824                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9825                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9826                 && ! PURE_SLP_STMT (stmt_info))
9827               {
9828                 if (dump_enabled_p ())
9829                     dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9830                 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9831               }
9832           }
9833 
9834       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9835              gsi_next (&si))
9836           {
9837             gphi *phi = si.phi ();
9838             stmt_info = loop_vinfo->lookup_stmt (phi);
9839             if (!stmt_info)
9840               continue;
9841 
9842             if (!STMT_VINFO_RELEVANT_P (stmt_info)
9843                 && !STMT_VINFO_LIVE_P (stmt_info))
9844               continue;
9845 
9846             if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9847                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9848                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9849                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9850                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9851                 && ! PURE_SLP_STMT (stmt_info))
9852               maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9853           }
9854 
9855       for (gimple_stmt_iterator si = gsi_start_bb (bb);
9856              !gsi_end_p (si);)
9857           {
9858             stmt = gsi_stmt (si);
9859             /* During vectorization remove existing clobber stmts.  */
9860             if (gimple_clobber_p (stmt))
9861               {
9862                 unlink_stmt_vdef (stmt);
9863                 gsi_remove (&si, true);
9864                 release_defs (stmt);
9865               }
9866             else
9867               {
9868                 /* Ignore vector stmts created in the outer loop.  */
9869                 stmt_info = loop_vinfo->lookup_stmt (stmt);
9870 
9871                 /* vector stmts created in the outer-loop during vectorization of
9872                      stmts in an inner-loop may not have a stmt_info, and do not
9873                      need to be vectorized.  */
9874                 stmt_vec_info seen_store = NULL;
9875                 if (stmt_info)
9876                     {
9877                       if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9878                         {
9879                           gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9880                           for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9881                                  !gsi_end_p (subsi); gsi_next (&subsi))
9882                               {
9883                                 stmt_vec_info pat_stmt_info
9884                                   = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9885                                 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9886                                                                 &si, &seen_store);
9887                               }
9888                           stmt_vec_info pat_stmt_info
9889                               = STMT_VINFO_RELATED_STMT (stmt_info);
9890                           if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9891                                                                 &si, &seen_store))
9892                               maybe_set_vectorized_backedge_value (loop_vinfo,
9893                                                                            pat_stmt_info);
9894                         }
9895                       else
9896                         {
9897                           if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9898                                                                 &seen_store))
9899                               maybe_set_vectorized_backedge_value (loop_vinfo,
9900                                                                            stmt_info);
9901                         }
9902                     }
9903                 gsi_next (&si);
9904                 if (seen_store)
9905                     {
9906                       if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9907                         /* Interleaving.  If IS_STORE is TRUE, the
9908                            vectorization of the interleaving chain was
9909                            completed - free all the stores in the chain.  */
9910                         vect_remove_stores (loop_vinfo,
9911                                                   DR_GROUP_FIRST_ELEMENT (seen_store));
9912                       else
9913                         /* Free the attached stmt_vec_info and remove the stmt.  */
9914                         loop_vinfo->remove_stmt (stmt_info);
9915                     }
9916               }
9917           }
9918 
9919       /* Stub out scalar statements that must not survive vectorization.
9920            Doing this here helps with grouped statements, or statements that
9921            are involved in patterns.  */
9922       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9923              !gsi_end_p (gsi); gsi_next (&gsi))
9924           {
9925             gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9926             if (!call || !gimple_call_internal_p (call))
9927               continue;
9928             internal_fn ifn = gimple_call_internal_fn (call);
9929             if (ifn == IFN_MASK_LOAD)
9930               {
9931                 tree lhs = gimple_get_lhs (call);
9932                 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9933                     {
9934                       tree zero = build_zero_cst (TREE_TYPE (lhs));
9935                       gimple *new_stmt = gimple_build_assign (lhs, zero);
9936                       gsi_replace (&gsi, new_stmt, true);
9937                     }
9938               }
9939             else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9940               {
9941                 tree lhs = gimple_get_lhs (call);
9942                 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9943                     {
9944                       tree else_arg
9945                         = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9946                       gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9947                       gsi_replace (&gsi, new_stmt, true);
9948                     }
9949               }
9950           }
9951     }                                   /* BBs in loop */
9952 
9953   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9954      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
9955   if (integer_onep (step_vector))
9956     niters_no_overflow = true;
9957   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9958                                  niters_vector_mult_vf, !niters_no_overflow);
9959 
9960   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9961   scale_profile_for_vect_loop (loop, assumed_vf);
9962 
9963   /* True if the final iteration might not handle a full vector's
9964      worth of scalar iterations.  */
9965   bool final_iter_may_be_partial
9966     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9967   /* The minimum number of iterations performed by the epilogue.  This
9968      is 1 when peeling for gaps because we always need a final scalar
9969      iteration.  */
9970   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9971   /* +1 to convert latch counts to loop iteration counts,
9972      -min_epilogue_iters to remove iterations that cannot be performed
9973        by the vector code.  */
9974   int bias_for_lowest = 1 - min_epilogue_iters;
9975   int bias_for_assumed = bias_for_lowest;
9976   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9977   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9978     {
9979       /* When the amount of peeling is known at compile time, the first
9980            iteration will have exactly alignment_npeels active elements.
9981            In the worst case it will have at least one.  */
9982       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9983       bias_for_lowest += lowest_vf - min_first_active;
9984       bias_for_assumed += assumed_vf - min_first_active;
9985     }
9986   /* In these calculations the "- 1" converts loop iteration counts
9987      back to latch counts.  */
9988   if (loop->any_upper_bound)
9989     {
9990       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9991       loop->nb_iterations_upper_bound
9992           = (final_iter_may_be_partial
9993              ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9994                                   lowest_vf) - 1
9995              : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9996                                    lowest_vf) - 1);
9997       if (main_vinfo
9998             /* Both peeling for alignment and peeling for gaps can end up
9999                with the scalar epilogue running for more than VF-1 iterations.  */
10000             && !main_vinfo->peeling_for_alignment
10001             && !main_vinfo->peeling_for_gaps)
10002           {
10003             unsigned int bound;
10004             poly_uint64 main_iters
10005               = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
10006                                  LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
10007             main_iters
10008               = upper_bound (main_iters,
10009                                  LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
10010             if (can_div_away_from_zero_p (main_iters,
10011                                                   LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10012                                                   &bound))
10013               loop->nb_iterations_upper_bound
10014                 = wi::umin ((widest_int) (bound - 1),
10015                                 loop->nb_iterations_upper_bound);
10016       }
10017   }
10018   if (loop->any_likely_upper_bound)
10019     loop->nb_iterations_likely_upper_bound
10020       = (final_iter_may_be_partial
10021            ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10022                                 + bias_for_lowest, lowest_vf) - 1
10023            : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10024                                  + bias_for_lowest, lowest_vf) - 1);
10025   if (loop->any_estimate)
10026     loop->nb_iterations_estimate
10027       = (final_iter_may_be_partial
10028            ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10029                                 assumed_vf) - 1
10030            : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10031                                  assumed_vf) - 1);
10032 
10033   if (dump_enabled_p ())
10034     {
10035       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10036           {
10037             dump_printf_loc (MSG_NOTE, vect_location,
10038                                  "LOOP VECTORIZED\n");
10039             if (loop->inner)
10040               dump_printf_loc (MSG_NOTE, vect_location,
10041                                    "OUTER LOOP VECTORIZED\n");
10042             dump_printf (MSG_NOTE, "\n");
10043           }
10044       else
10045           dump_printf_loc (MSG_NOTE, vect_location,
10046                                "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10047                                GET_MODE_NAME (loop_vinfo->vector_mode));
10048     }
10049 
10050   /* Loops vectorized with a variable factor won't benefit from
10051      unrolling/peeling.  */
10052   if (!vf.is_constant ())
10053     {
10054       loop->unroll = 1;
10055       if (dump_enabled_p ())
10056           dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10057                                " variable-length vectorization factor\n");
10058     }
10059   /* Free SLP instances here because otherwise stmt reference counting
10060      won't work.  */
10061   slp_instance instance;
10062   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10063     vect_free_slp_instance (instance);
10064   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10065   /* Clear-up safelen field since its value is invalid after vectorization
10066      since vectorized loop can have loop-carried dependencies.  */
10067   loop->safelen = 0;
10068 
10069   if (epilogue)
10070     {
10071       update_epilogue_loop_vinfo (epilogue, advance);
10072 
10073       epilogue->simduid = loop->simduid;
10074       epilogue->force_vectorize = loop->force_vectorize;
10075       epilogue->dont_vectorize = false;
10076     }
10077 
10078   return epilogue;
10079 }
10080 
10081 /* The code below is trying to perform simple optimization - revert
10082    if-conversion for masked stores, i.e. if the mask of a store is zero
10083    do not perform it and all stored value producers also if possible.
10084    For example,
10085      for (i=0; i<n; i++)
10086        if (c[i])
10087           {
10088             p1[i] += 1;
10089             p2[i] = p3[i] +2;
10090           }
10091    this transformation will produce the following semi-hammock:
10092 
10093    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10094      {
10095        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10096        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10097        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10098        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10099        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10100        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10101      }
10102 */
10103 
10104 void
optimize_mask_stores(class loop * loop)10105 optimize_mask_stores (class loop *loop)
10106 {
10107   basic_block *bbs = get_loop_body (loop);
10108   unsigned nbbs = loop->num_nodes;
10109   unsigned i;
10110   basic_block bb;
10111   class loop *bb_loop;
10112   gimple_stmt_iterator gsi;
10113   gimple *stmt;
10114   auto_vec<gimple *> worklist;
10115   auto_purge_vect_location sentinel;
10116 
10117   vect_location = find_loop_location (loop);
10118   /* Pick up all masked stores in loop if any.  */
10119   for (i = 0; i < nbbs; i++)
10120     {
10121       bb = bbs[i];
10122       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10123              gsi_next (&gsi))
10124           {
10125             stmt = gsi_stmt (gsi);
10126             if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10127               worklist.safe_push (stmt);
10128           }
10129     }
10130 
10131   free (bbs);
10132   if (worklist.is_empty ())
10133     return;
10134 
10135   /* Loop has masked stores.  */
10136   while (!worklist.is_empty ())
10137     {
10138       gimple *last, *last_store;
10139       edge e, efalse;
10140       tree mask;
10141       basic_block store_bb, join_bb;
10142       gimple_stmt_iterator gsi_to;
10143       tree vdef, new_vdef;
10144       gphi *phi;
10145       tree vectype;
10146       tree zero;
10147 
10148       last = worklist.pop ();
10149       mask = gimple_call_arg (last, 2);
10150       bb = gimple_bb (last);
10151       /* Create then_bb and if-then structure in CFG, then_bb belongs to
10152            the same loop as if_bb.  It could be different to LOOP when two
10153            level loop-nest is vectorized and mask_store belongs to the inner
10154            one.  */
10155       e = split_block (bb, last);
10156       bb_loop = bb->loop_father;
10157       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10158       join_bb = e->dest;
10159       store_bb = create_empty_bb (bb);
10160       add_bb_to_loop (store_bb, bb_loop);
10161       e->flags = EDGE_TRUE_VALUE;
10162       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10163       /* Put STORE_BB to likely part.  */
10164       efalse->probability = profile_probability::unlikely ();
10165       store_bb->count = efalse->count ();
10166       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10167       if (dom_info_available_p (CDI_DOMINATORS))
10168           set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10169       if (dump_enabled_p ())
10170           dump_printf_loc (MSG_NOTE, vect_location,
10171                                "Create new block %d to sink mask stores.",
10172                                store_bb->index);
10173       /* Create vector comparison with boolean result.  */
10174       vectype = TREE_TYPE (mask);
10175       zero = build_zero_cst (vectype);
10176       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
10177       gsi = gsi_last_bb (bb);
10178       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
10179       /* Create new PHI node for vdef of the last masked store:
10180            .MEM_2 = VDEF <.MEM_1>
10181            will be converted to
10182            .MEM.3 = VDEF <.MEM_1>
10183            and new PHI node will be created in join bb
10184            .MEM_2 = PHI <.MEM_1, .MEM_3>
10185       */
10186       vdef = gimple_vdef (last);
10187       new_vdef = make_ssa_name (gimple_vop (cfun), last);
10188       gimple_set_vdef (last, new_vdef);
10189       phi = create_phi_node (vdef, join_bb);
10190       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
10191 
10192       /* Put all masked stores with the same mask to STORE_BB if possible.  */
10193       while (true)
10194           {
10195             gimple_stmt_iterator gsi_from;
10196             gimple *stmt1 = NULL;
10197 
10198             /* Move masked store to STORE_BB.  */
10199             last_store = last;
10200             gsi = gsi_for_stmt (last);
10201             gsi_from = gsi;
10202             /* Shift GSI to the previous stmt for further traversal.  */
10203             gsi_prev (&gsi);
10204             gsi_to = gsi_start_bb (store_bb);
10205             gsi_move_before (&gsi_from, &gsi_to);
10206             /* Setup GSI_TO to the non-empty block start.  */
10207             gsi_to = gsi_start_bb (store_bb);
10208             if (dump_enabled_p ())
10209               dump_printf_loc (MSG_NOTE, vect_location,
10210                                    "Move stmt to created bb\n%G", last);
10211             /* Move all stored value producers if possible.  */
10212             while (!gsi_end_p (gsi))
10213               {
10214                 tree lhs;
10215                 imm_use_iterator imm_iter;
10216                 use_operand_p use_p;
10217                 bool res;
10218 
10219                 /* Skip debug statements.  */
10220                 if (is_gimple_debug (gsi_stmt (gsi)))
10221                     {
10222                       gsi_prev (&gsi);
10223                       continue;
10224                     }
10225                 stmt1 = gsi_stmt (gsi);
10226                 /* Do not consider statements writing to memory or having
10227                      volatile operand.  */
10228                 if (gimple_vdef (stmt1)
10229                       || gimple_has_volatile_ops (stmt1))
10230                     break;
10231                 gsi_from = gsi;
10232                 gsi_prev (&gsi);
10233                 lhs = gimple_get_lhs (stmt1);
10234                 if (!lhs)
10235                     break;
10236 
10237                 /* LHS of vectorized stmt must be SSA_NAME.  */
10238                 if (TREE_CODE (lhs) != SSA_NAME)
10239                     break;
10240 
10241                 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10242                     {
10243                       /* Remove dead scalar statement.  */
10244                       if (has_zero_uses (lhs))
10245                         {
10246                           gsi_remove (&gsi_from, true);
10247                           continue;
10248                         }
10249                     }
10250 
10251                 /* Check that LHS does not have uses outside of STORE_BB.  */
10252                 res = true;
10253                 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10254                     {
10255                       gimple *use_stmt;
10256                       use_stmt = USE_STMT (use_p);
10257                       if (is_gimple_debug (use_stmt))
10258                         continue;
10259                       if (gimple_bb (use_stmt) != store_bb)
10260                         {
10261                           res = false;
10262                           break;
10263                         }
10264                     }
10265                 if (!res)
10266                     break;
10267 
10268                 if (gimple_vuse (stmt1)
10269                       && gimple_vuse (stmt1) != gimple_vuse (last_store))
10270                     break;
10271 
10272                 /* Can move STMT1 to STORE_BB.  */
10273                 if (dump_enabled_p ())
10274                     dump_printf_loc (MSG_NOTE, vect_location,
10275                                          "Move stmt to created bb\n%G", stmt1);
10276                 gsi_move_before (&gsi_from, &gsi_to);
10277                 /* Shift GSI_TO for further insertion.  */
10278                 gsi_prev (&gsi_to);
10279               }
10280             /* Put other masked stores with the same mask to STORE_BB.  */
10281             if (worklist.is_empty ()
10282                 || gimple_call_arg (worklist.last (), 2) != mask
10283                 || worklist.last () != stmt1)
10284               break;
10285             last = worklist.pop ();
10286           }
10287       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10288     }
10289 }
10290 
10291 /* Decide whether it is possible to use a zero-based induction variable
10292    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
10293    the value that the induction variable must be able to hold in order
10294    to ensure that the rgroups eventually have no active vector elements.
10295    Return -1 otherwise.  */
10296 
10297 widest_int
vect_iv_limit_for_partial_vectors(loop_vec_info loop_vinfo)10298 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10299 {
10300   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10301   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10302   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10303 
10304   /* Calculate the value that the induction variable must be able
10305      to hit in order to ensure that we end the loop with an all-false mask.
10306      This involves adding the maximum number of inactive trailing scalar
10307      iterations.  */
10308   widest_int iv_limit = -1;
10309   if (max_loop_iterations (loop, &iv_limit))
10310     {
10311       if (niters_skip)
10312           {
10313             /* Add the maximum number of skipped iterations to the
10314                maximum iteration count.  */
10315             if (TREE_CODE (niters_skip) == INTEGER_CST)
10316               iv_limit += wi::to_widest (niters_skip);
10317             else
10318               iv_limit += max_vf - 1;
10319           }
10320       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10321           /* Make a conservatively-correct assumption.  */
10322           iv_limit += max_vf - 1;
10323 
10324       /* IV_LIMIT is the maximum number of latch iterations, which is also
10325            the maximum in-range IV value.  Round this value down to the previous
10326            vector alignment boundary and then add an extra full iteration.  */
10327       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10328       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10329     }
10330   return iv_limit;
10331 }
10332 
10333 /* For the given rgroup_controls RGC, check whether an induction variable
10334    would ever hit a value that produces a set of all-false masks or zero
10335    lengths before wrapping around.  Return true if it's possible to wrap
10336    around before hitting the desirable value, otherwise return false.  */
10337 
10338 bool
vect_rgroup_iv_might_wrap_p(loop_vec_info loop_vinfo,rgroup_controls * rgc)10339 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10340 {
10341   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10342 
10343   if (iv_limit == -1)
10344     return true;
10345 
10346   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10347   unsigned int compare_precision = TYPE_PRECISION (compare_type);
10348   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10349 
10350   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10351     return true;
10352 
10353   return false;
10354 }
10355