xref: /dragonfly/contrib/gcc-8.0/gcc/tree-vect-loop.c (revision 95059079af47f9a66a175f374f2da1a5020e3255)
1 /* Loop Vectorization
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4    Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 
58 /* Loop Vectorization Pass.
59 
60    This pass tries to vectorize loops.
61 
62    For example, the vectorizer transforms the following simple loop:
63 
64         short a[N]; short b[N]; short c[N]; int i;
65 
66         for (i=0; i<N; i++){
67           a[i] = b[i] + c[i];
68         }
69 
70    as if it was manually vectorized by rewriting the source code into:
71 
72         typedef int __attribute__((mode(V8HI))) v8hi;
73         short a[N];  short b[N]; short c[N];   int i;
74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75         v8hi va, vb, vc;
76 
77         for (i=0; i<N/8; i++){
78           vb = pb[i];
79           vc = pc[i];
80           va = vb + vc;
81           pa[i] = va;
82         }
83 
84         The main entry to this pass is vectorize_loops(), in which
85    the vectorizer applies a set of analyses on a given set of loops,
86    followed by the actual vectorization transformation for the loops that
87    had successfully passed the analysis phase.
88         Throughout this pass we make a distinction between two types of
89    data: scalars (which are represented by SSA_NAMES), and memory references
90    ("data-refs").  These two types of data require different handling both
91    during analysis and transformation.  The types of data-refs that the
92    vectorizer currently supports are ARRAY_REFs whose base is an array DECL
93    (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
94    accesses are required to have a simple (consecutive) access pattern.
95 
96    Analysis phase:
97    ===============
98         The driver for the analysis phase is vect_analyze_loop().
99    It applies a set of analyses, some of which rely on the scalar evolution
100    analyzer (scev) developed by Sebastian Pop.
101 
102         During the analysis phase the vectorizer records some information
103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104    loop, as well as general information about the loop as a whole, which is
105    recorded in a "loop_vec_info" struct attached to each loop.
106 
107    Transformation phase:
108    =====================
109         The loop transformation phase scans all the stmts in the loop, and
110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111    the loop that needs to be vectorized.  It inserts the vector code sequence
112    just before the scalar stmt S, and records a pointer to the vector code
113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114    attached to S).  This pointer will be used for the vectorization of following
115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116    otherwise, we rely on dead code elimination for removing it.
117 
118         For example, say stmt S1 was vectorized into stmt VS1:
119 
120    VS1: vb = px[i];
121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122    S2:  a = b;
123 
124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
127    resulting sequence would be:
128 
129    VS1: vb = px[i];
130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131    VS2: va = vb;
132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 
134         Operands that are not SSA_NAMEs, are data-refs that appear in
135    load/store operations (like 'x[i]' in S1), and are handled differently.
136 
137    Target modeling:
138    =================
139         Currently the only target specific information that is used is the
140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141    Targets that can support different sizes of vectors, for now will need
142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
143    flexibility will be added in the future.
144 
145         Since we only vectorize operations whose vector form can be
146    expressed using existing tree codes, to verify that an operation is
147    supported, the vectorizer checks the relevant optab at the relevant
148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
149    the value found is CODE_FOR_nothing, then there's no target support, and
150    we can't vectorize the stmt.
151 
152    For additional information on this project see:
153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155 
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 
158 /* Function vect_determine_vectorization_factor
159 
160    Determine the vectorization factor (VF).  VF is the number of data elements
161    that are operated upon in parallel in a single iteration of the vectorized
162    loop.  For example, when vectorizing a loop that operates on 4byte elements,
163    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
164    elements can fit in a single vector register.
165 
166    We currently support vectorization of loops in which all types operated upon
167    are of the same size.  Therefore this function currently sets VF according to
168    the size of the types operated upon, and fails if there are multiple sizes
169    in the loop.
170 
171    VF is also the factor by which the loop iterations are strip-mined, e.g.:
172    original loop:
173         for (i=0; i<N; i++){
174           a[i] = b[i] + c[i];
175         }
176 
177    vectorized loop:
178         for (i=0; i<N; i+=VF){
179           a[i:VF] = b[i:VF] + c[i:VF];
180         }
181 */
182 
183 static bool
vect_determine_vectorization_factor(loop_vec_info loop_vinfo)184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
185 {
186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188   unsigned nbbs = loop->num_nodes;
189   poly_uint64 vectorization_factor = 1;
190   tree scalar_type = NULL_TREE;
191   gphi *phi;
192   tree vectype;
193   stmt_vec_info stmt_info;
194   unsigned i;
195   HOST_WIDE_INT dummy;
196   gimple *stmt, *pattern_stmt = NULL;
197   gimple_seq pattern_def_seq = NULL;
198   gimple_stmt_iterator pattern_def_si = gsi_none ();
199   bool analyze_pattern_stmt = false;
200   bool bool_result;
201   auto_vec<stmt_vec_info> mask_producers;
202 
203   if (dump_enabled_p ())
204     dump_printf_loc (MSG_NOTE, vect_location,
205                      "=== vect_determine_vectorization_factor ===\n");
206 
207   for (i = 0; i < nbbs; i++)
208     {
209       basic_block bb = bbs[i];
210 
211       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212              gsi_next (&si))
213           {
214             phi = si.phi ();
215             stmt_info = vinfo_for_stmt (phi);
216             if (dump_enabled_p ())
217               {
218                 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220               }
221 
222             gcc_assert (stmt_info);
223 
224             if (STMT_VINFO_RELEVANT_P (stmt_info)
225                 || STMT_VINFO_LIVE_P (stmt_info))
226             {
227                 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
229 
230                 if (dump_enabled_p ())
231                     {
232                       dump_printf_loc (MSG_NOTE, vect_location,
233                                    "get vectype for scalar type:  ");
234                       dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235                   dump_printf (MSG_NOTE, "\n");
236                     }
237 
238                 vectype = get_vectype_for_scalar_type (scalar_type);
239                 if (!vectype)
240                     {
241                       if (dump_enabled_p ())
242                         {
243                           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244                                        "not vectorized: unsupported "
245                                        "data-type ");
246                           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247                                          scalar_type);
248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
249                         }
250                       return false;
251                     }
252                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
253 
254                 if (dump_enabled_p ())
255                     {
256                       dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257                       dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258                   dump_printf (MSG_NOTE, "\n");
259                     }
260 
261                 if (dump_enabled_p ())
262                     {
263                       dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264                       dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265                       dump_printf (MSG_NOTE, "\n");
266                     }
267 
268                 vect_update_max_nunits (&vectorization_factor, vectype);
269               }
270           }
271 
272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
273              !gsi_end_p (si) || analyze_pattern_stmt;)
274         {
275           tree vf_vectype;
276 
277           if (analyze_pattern_stmt)
278               stmt = pattern_stmt;
279           else
280             stmt = gsi_stmt (si);
281 
282           stmt_info = vinfo_for_stmt (stmt);
283 
284             if (dump_enabled_p ())
285               {
286                 dump_printf_loc (MSG_NOTE, vect_location,
287                                "==> examining statement: ");
288                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
289               }
290 
291             gcc_assert (stmt_info);
292 
293             /* Skip stmts which do not need to be vectorized.  */
294             if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295                  && !STMT_VINFO_LIVE_P (stmt_info))
296                 || gimple_clobber_p (stmt))
297             {
298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
302                 {
303                   stmt = pattern_stmt;
304                   stmt_info = vinfo_for_stmt (pattern_stmt);
305                   if (dump_enabled_p ())
306                     {
307                       dump_printf_loc (MSG_NOTE, vect_location,
308                                        "==> examining pattern statement: ");
309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310                     }
311                 }
312               else
313                   {
314                     if (dump_enabled_p ())
315                       dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316                   gsi_next (&si);
317                     continue;
318                 }
319               }
320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324             analyze_pattern_stmt = true;
325 
326             /* If a pattern statement has def stmts, analyze them too.  */
327             if (is_pattern_stmt_p (stmt_info))
328               {
329                 if (pattern_def_seq == NULL)
330                     {
331                       pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332                       pattern_def_si = gsi_start (pattern_def_seq);
333                     }
334                 else if (!gsi_end_p (pattern_def_si))
335                     gsi_next (&pattern_def_si);
336                 if (pattern_def_seq != NULL)
337                     {
338                       gimple *pattern_def_stmt = NULL;
339                       stmt_vec_info pattern_def_stmt_info = NULL;
340 
341                       while (!gsi_end_p (pattern_def_si))
342                         {
343                           pattern_def_stmt = gsi_stmt (pattern_def_si);
344                           pattern_def_stmt_info
345                               = vinfo_for_stmt (pattern_def_stmt);
346                           if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347                                 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348                               break;
349                           gsi_next (&pattern_def_si);
350                         }
351 
352                       if (!gsi_end_p (pattern_def_si))
353                         {
354                           if (dump_enabled_p ())
355                               {
356                                 dump_printf_loc (MSG_NOTE, vect_location,
357                                            "==> examining pattern def stmt: ");
358                                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359                                             pattern_def_stmt, 0);
360                               }
361 
362                           stmt = pattern_def_stmt;
363                           stmt_info = pattern_def_stmt_info;
364                         }
365                       else
366                         {
367                           pattern_def_si = gsi_none ();
368                           analyze_pattern_stmt = false;
369                         }
370                     }
371                 else
372                     analyze_pattern_stmt = false;
373               }
374 
375             if (gimple_get_lhs (stmt) == NULL_TREE
376                 /* MASK_STORE has no lhs, but is ok.  */
377                 && (!is_gimple_call (stmt)
378                       || !gimple_call_internal_p (stmt)
379                       || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
380               {
381                 if (is_gimple_call (stmt))
382                     {
383                       /* Ignore calls with no lhs.  These must be calls to
384                          #pragma omp simd functions, and what vectorization factor
385                          it really needs can't be determined until
386                          vectorizable_simd_clone_call.  */
387                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
388                         {
389                           pattern_def_seq = NULL;
390                           gsi_next (&si);
391                         }
392                       continue;
393                     }
394                 if (dump_enabled_p ())
395                     {
396                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397                                    "not vectorized: irregular stmt.");
398                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
399                                     0);
400                     }
401                 return false;
402               }
403 
404             if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
405               {
406                 if (dump_enabled_p ())
407                   {
408                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409                                    "not vectorized: vector stmt in loop:");
410                     dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
411                   }
412                 return false;
413               }
414 
415             bool_result = false;
416 
417             if (STMT_VINFO_VECTYPE (stmt_info))
418               {
419                 /* The only case when a vectype had been already set is for stmts
420                    that contain a dataref, or for "pattern-stmts" (stmts
421                      generated by the vectorizer to represent/replace a certain
422                      idiom).  */
423                 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424                                 || is_pattern_stmt_p (stmt_info)
425                                 || !gsi_end_p (pattern_def_si));
426                 vectype = STMT_VINFO_VECTYPE (stmt_info);
427               }
428             else
429               {
430                 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431                 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432                     scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433                 else
434                     scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
435 
436                 /* Bool ops don't participate in vectorization factor
437                      computation.  For comparison use compared types to
438                      compute a factor.  */
439                 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440                       && is_gimple_assign (stmt)
441                       && gimple_assign_rhs_code (stmt) != COND_EXPR)
442                     {
443                       if (STMT_VINFO_RELEVANT_P (stmt_info)
444                           || STMT_VINFO_LIVE_P (stmt_info))
445                         mask_producers.safe_push (stmt_info);
446                       bool_result = true;
447 
448                       if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449                           == tcc_comparison
450                           && !VECT_SCALAR_BOOLEAN_TYPE_P
451                                   (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452                         scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453                       else
454                         {
455                           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
456                               {
457                                 pattern_def_seq = NULL;
458                                 gsi_next (&si);
459                               }
460                           continue;
461                         }
462                     }
463 
464                 if (dump_enabled_p ())
465                     {
466                       dump_printf_loc (MSG_NOTE, vect_location,
467                                    "get vectype for scalar type:  ");
468                       dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469                   dump_printf (MSG_NOTE, "\n");
470                     }
471                 vectype = get_vectype_for_scalar_type (scalar_type);
472                 if (!vectype)
473                     {
474                       if (dump_enabled_p ())
475                         {
476                           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477                                        "not vectorized: unsupported "
478                                        "data-type ");
479                           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480                                          scalar_type);
481                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
482                         }
483                       return false;
484                     }
485 
486                 if (!bool_result)
487                     STMT_VINFO_VECTYPE (stmt_info) = vectype;
488 
489                 if (dump_enabled_p ())
490                     {
491                       dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492                       dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493                   dump_printf (MSG_NOTE, "\n");
494                     }
495             }
496 
497             /* Don't try to compute VF out scalar types if we stmt
498                produces boolean vector.  Use result vectype instead.  */
499             if (VECTOR_BOOLEAN_TYPE_P (vectype))
500               vf_vectype = vectype;
501             else
502               {
503                 /* The vectorization factor is according to the smallest
504                      scalar type (or the largest vector size, but we only
505                      support one vector size per loop).  */
506                 if (!bool_result)
507                     scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508                                                                            &dummy);
509                 if (dump_enabled_p ())
510                     {
511                       dump_printf_loc (MSG_NOTE, vect_location,
512                                            "get vectype for scalar type:  ");
513                       dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514                       dump_printf (MSG_NOTE, "\n");
515                     }
516                 vf_vectype = get_vectype_for_scalar_type (scalar_type);
517               }
518             if (!vf_vectype)
519               {
520                 if (dump_enabled_p ())
521                     {
522                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523                                    "not vectorized: unsupported data-type ");
524                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525                                      scalar_type);
526                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
527                     }
528                 return false;
529               }
530 
531             if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532                               GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
533               {
534                 if (dump_enabled_p ())
535                     {
536                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537                                    "not vectorized: different sized vector "
538                                    "types in statement, ");
539                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540                                      vectype);
541                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543                                      vf_vectype);
544                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
545                     }
546                 return false;
547               }
548 
549             if (dump_enabled_p ())
550               {
551                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552                 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553               dump_printf (MSG_NOTE, "\n");
554               }
555 
556             if (dump_enabled_p ())
557               {
558                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559                 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560                 dump_printf (MSG_NOTE, "\n");
561               }
562 
563             vect_update_max_nunits (&vectorization_factor, vf_vectype);
564 
565             if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
566               {
567                 pattern_def_seq = NULL;
568                 gsi_next (&si);
569               }
570         }
571     }
572 
573   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
574   if (dump_enabled_p ())
575     {
576       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577       dump_dec (MSG_NOTE, vectorization_factor);
578       dump_printf (MSG_NOTE, "\n");
579     }
580 
581   if (known_le (vectorization_factor, 1U))
582     {
583       if (dump_enabled_p ())
584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585                          "not vectorized: unsupported data-type\n");
586       return false;
587     }
588   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
589 
590   for (i = 0; i < mask_producers.length (); i++)
591     {
592       tree mask_type = NULL;
593 
594       stmt = STMT_VINFO_STMT (mask_producers[i]);
595 
596       if (is_gimple_assign (stmt)
597             && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598             && !VECT_SCALAR_BOOLEAN_TYPE_P
599                                               (TREE_TYPE (gimple_assign_rhs1 (stmt))))
600           {
601             scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602             mask_type = get_mask_type_for_scalar_type (scalar_type);
603 
604             if (!mask_type)
605               {
606                 if (dump_enabled_p ())
607                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608                                          "not vectorized: unsupported mask\n");
609                 return false;
610               }
611           }
612       else
613           {
614             tree rhs;
615             ssa_op_iter iter;
616             gimple *def_stmt;
617             enum vect_def_type dt;
618 
619             FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
620               {
621                 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622                                                &def_stmt, &dt, &vectype))
623                     {
624                       if (dump_enabled_p ())
625                         {
626                           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627                                                "not vectorized: can't compute mask type "
628                                                "for statement, ");
629                           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
630                                                   0);
631                         }
632                       return false;
633                     }
634 
635                 /* No vectype probably means external definition.
636                      Allow it in case there is another operand which
637                      allows to determine mask type.  */
638                 if (!vectype)
639                     continue;
640 
641                 if (!mask_type)
642                     mask_type = vectype;
643                 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644                                          TYPE_VECTOR_SUBPARTS (vectype)))
645                     {
646                       if (dump_enabled_p ())
647                         {
648                           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649                                                "not vectorized: different sized masks "
650                                                "types in statement, ");
651                           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652                                                    mask_type);
653                           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654                           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655                                                    vectype);
656                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
657                         }
658                       return false;
659                     }
660                 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661                            != VECTOR_BOOLEAN_TYPE_P (vectype))
662                     {
663                       if (dump_enabled_p ())
664                         {
665                           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666                                                "not vectorized: mixed mask and "
667                                                "nonmask vector types in statement, ");
668                           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669                                                    mask_type);
670                           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671                           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672                                                    vectype);
673                           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
674                         }
675                       return false;
676                     }
677               }
678 
679             /* We may compare boolean value loaded as vector of integers.
680                Fix mask_type in such case.  */
681             if (mask_type
682                 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683                 && gimple_code (stmt) == GIMPLE_ASSIGN
684                 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685               mask_type = build_same_sized_truth_vector_type (mask_type);
686           }
687 
688       /* No mask_type should mean loop invariant predicate.
689            This is probably a subject for optimization in
690            if-conversion.  */
691       if (!mask_type)
692           {
693             if (dump_enabled_p ())
694               {
695                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696                                      "not vectorized: can't compute mask type "
697                                      "for statement, ");
698                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
699                                         0);
700               }
701             return false;
702           }
703 
704       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
705     }
706 
707   return true;
708 }
709 
710 
711 /* Function vect_is_simple_iv_evolution.
712 
713    FORNOW: A simple evolution of an induction variables in the loop is
714    considered a polynomial evolution.  */
715 
716 static bool
vect_is_simple_iv_evolution(unsigned loop_nb,tree access_fn,tree * init,tree * step)717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718                              tree * step)
719 {
720   tree init_expr;
721   tree step_expr;
722   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723   basic_block bb;
724 
725   /* When there is no evolution in this loop, the evolution function
726      is not "simple".  */
727   if (evolution_part == NULL_TREE)
728     return false;
729 
730   /* When the evolution is a polynomial of degree >= 2
731      the evolution function is not "simple".  */
732   if (tree_is_chrec (evolution_part))
733     return false;
734 
735   step_expr = evolution_part;
736   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
737 
738   if (dump_enabled_p ())
739     {
740       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742       dump_printf (MSG_NOTE, ",  init: ");
743       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744       dump_printf (MSG_NOTE, "\n");
745     }
746 
747   *init = init_expr;
748   *step = step_expr;
749 
750   if (TREE_CODE (step_expr) != INTEGER_CST
751       && (TREE_CODE (step_expr) != SSA_NAME
752             || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753                 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754             || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755                 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756                       || !flag_associative_math)))
757       && (TREE_CODE (step_expr) != REAL_CST
758             || !flag_associative_math))
759     {
760       if (dump_enabled_p ())
761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762                          "step unknown.\n");
763       return false;
764     }
765 
766   return true;
767 }
768 
769 /* Function vect_analyze_scalar_cycles_1.
770 
771    Examine the cross iteration def-use cycles of scalar variables
772    in LOOP.  LOOP_VINFO represents the loop that is now being
773    considered for vectorization (can be LOOP, or an outer-loop
774    enclosing LOOP).  */
775 
776 static void
vect_analyze_scalar_cycles_1(loop_vec_info loop_vinfo,struct loop * loop)777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
778 {
779   basic_block bb = loop->header;
780   tree init, step;
781   auto_vec<gimple *, 64> worklist;
782   gphi_iterator gsi;
783   bool double_reduc;
784 
785   if (dump_enabled_p ())
786     dump_printf_loc (MSG_NOTE, vect_location,
787                      "=== vect_analyze_scalar_cycles ===\n");
788 
789   /* First - identify all inductions.  Reduction detection assumes that all the
790      inductions have been identified, therefore, this order must not be
791      changed.  */
792   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
793     {
794       gphi *phi = gsi.phi ();
795       tree access_fn = NULL;
796       tree def = PHI_RESULT (phi);
797       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
798 
799       if (dump_enabled_p ())
800           {
801             dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
803           }
804 
805       /* Skip virtual phi's.  The data dependences that are associated with
806          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
807       if (virtual_operand_p (def))
808           continue;
809 
810       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
811 
812       /* Analyze the evolution function.  */
813       access_fn = analyze_scalar_evolution (loop, def);
814       if (access_fn)
815           {
816             STRIP_NOPS (access_fn);
817             if (dump_enabled_p ())
818               {
819                 dump_printf_loc (MSG_NOTE, vect_location,
820                                "Access function of PHI: ");
821                 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822               dump_printf (MSG_NOTE, "\n");
823               }
824             STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825               = initial_condition_in_loop_num (access_fn, loop->num);
826             STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827               = evolution_part_in_loop_num (access_fn, loop->num);
828           }
829 
830       if (!access_fn
831             || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832             || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833                 && TREE_CODE (step) != INTEGER_CST))
834           {
835             worklist.safe_push (phi);
836             continue;
837           }
838 
839       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840                       != NULL_TREE);
841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
842 
843       if (dump_enabled_p ())
844           dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
846     }
847 
848 
849   /* Second - identify all reductions and nested cycles.  */
850   while (worklist.length () > 0)
851     {
852       gimple *phi = worklist.pop ();
853       tree def = PHI_RESULT (phi);
854       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855       gimple *reduc_stmt;
856 
857       if (dump_enabled_p ())
858         {
859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
861         }
862 
863       gcc_assert (!virtual_operand_p (def)
864                       && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
865 
866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867                                                             &double_reduc, false);
868       if (reduc_stmt)
869         {
870           if (double_reduc)
871             {
872               if (dump_enabled_p ())
873                 dump_printf_loc (MSG_NOTE, vect_location,
874                                          "Detected double reduction.\n");
875 
876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878                                                     vect_double_reduction_def;
879             }
880           else
881             {
882               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
883                 {
884                   if (dump_enabled_p ())
885                     dump_printf_loc (MSG_NOTE, vect_location,
886                                              "Detected vectorizable nested cycle.\n");
887 
888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890                                                              vect_nested_cycle;
891                 }
892               else
893                 {
894                   if (dump_enabled_p ())
895                     dump_printf_loc (MSG_NOTE, vect_location,
896                                              "Detected reduction.\n");
897 
898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900                                                            vect_reduction_def;
901                   /* Store the reduction cycles for possible vectorization in
902                      loop-aware SLP if it was not detected as reduction
903                          chain.  */
904                       if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905                         LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
906                 }
907             }
908         }
909       else
910         if (dump_enabled_p ())
911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912                                  "Unknown def-use cycle pattern.\n");
913     }
914 }
915 
916 
917 /* Function vect_analyze_scalar_cycles.
918 
919    Examine the cross iteration def-use cycles of scalar variables, by
920    analyzing the loop-header PHIs of scalar variables.  Classify each
921    cycle as one of the following: invariant, induction, reduction, unknown.
922    We do that for the loop represented by LOOP_VINFO, and also to its
923    inner-loop, if exists.
924    Examples for scalar cycles:
925 
926    Example1: reduction:
927 
928               loop1:
929               for (i=0; i<N; i++)
930                  sum += a[i];
931 
932    Example2: induction:
933 
934               loop2:
935               for (i=0; i<N; i++)
936                  a[i] = i;  */
937 
938 static void
vect_analyze_scalar_cycles(loop_vec_info loop_vinfo)939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
940 {
941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
942 
943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
944 
945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946      Reductions in such inner-loop therefore have different properties than
947      the reductions in the nest that gets vectorized:
948      1. When vectorized, they are executed in the same order as in the original
949         scalar loop, so we can't change the order of computation when
950         vectorizing them.
951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952         current checks are too strict.  */
953 
954   if (loop->inner)
955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
956 }
957 
958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
959 
960 static void
vect_fixup_reduc_chain(gimple * stmt)961 vect_fixup_reduc_chain (gimple *stmt)
962 {
963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964   gimple *stmtp;
965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966                 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
968   do
969     {
970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973       if (stmt)
974           GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975             = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
976     }
977   while (stmt);
978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
979 }
980 
981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
982 
983 static void
vect_fixup_scalar_cycles_with_patterns(loop_vec_info loop_vinfo)984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
985 {
986   gimple *first;
987   unsigned i;
988 
989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
991       {
992           gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993           while (next)
994             {
995               if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996                 break;
997               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
998             }
999           /* If not all stmt in the chain are patterns try to handle
1000              the chain without patterns.  */
1001           if (! next)
1002             {
1003               vect_fixup_reduc_chain (first);
1004               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005                 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006             }
1007       }
1008 }
1009 
1010 /* Function vect_get_loop_niters.
1011 
1012    Determine how many iterations the loop is executed and place it
1013    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1014    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1015    niter information holds in ASSUMPTIONS.
1016 
1017    Return the loop exit condition.  */
1018 
1019 
1020 static gcond *
vect_get_loop_niters(struct loop * loop,tree * assumptions,tree * number_of_iterations,tree * number_of_iterationsm1)1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022                           tree *number_of_iterations, tree *number_of_iterationsm1)
1023 {
1024   edge exit = single_exit (loop);
1025   struct tree_niter_desc niter_desc;
1026   tree niter_assumptions, niter, may_be_zero;
1027   gcond *cond = get_loop_exit_condition (loop);
1028 
1029   *assumptions = boolean_true_node;
1030   *number_of_iterationsm1 = chrec_dont_know;
1031   *number_of_iterations = chrec_dont_know;
1032   if (dump_enabled_p ())
1033     dump_printf_loc (MSG_NOTE, vect_location,
1034                          "=== get_loop_niters ===\n");
1035 
1036   if (!exit)
1037     return cond;
1038 
1039   niter = chrec_dont_know;
1040   may_be_zero = NULL_TREE;
1041   niter_assumptions = boolean_true_node;
1042   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043       || chrec_contains_undetermined (niter_desc.niter))
1044     return cond;
1045 
1046   niter_assumptions = niter_desc.assumptions;
1047   may_be_zero = niter_desc.may_be_zero;
1048   niter = niter_desc.niter;
1049 
1050   if (may_be_zero && integer_zerop (may_be_zero))
1051     may_be_zero = NULL_TREE;
1052 
1053   if (may_be_zero)
1054     {
1055       if (COMPARISON_CLASS_P (may_be_zero))
1056           {
1057             /* Try to combine may_be_zero with assumptions, this can simplify
1058                computation of niter expression.  */
1059             if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060               niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061                                                        niter_assumptions,
1062                                                        fold_build1 (TRUTH_NOT_EXPR,
1063                                                                         boolean_type_node,
1064                                                                         may_be_zero));
1065             else
1066               niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067                                          build_int_cst (TREE_TYPE (niter), 0),
1068                                          rewrite_to_non_trapping_overflow (niter));
1069 
1070             may_be_zero = NULL_TREE;
1071           }
1072       else if (integer_nonzerop (may_be_zero))
1073           {
1074             *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075             *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076             return cond;
1077           }
1078       else
1079           return cond;
1080     }
1081 
1082   *assumptions = niter_assumptions;
1083   *number_of_iterationsm1 = niter;
1084 
1085   /* We want the number of loop header executions which is the number
1086      of latch executions plus one.
1087      ???  For UINT_MAX latch executions this number overflows to zero
1088      for loops like do { n++; } while (n != 0);  */
1089   if (niter && !chrec_contains_undetermined (niter))
1090     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091                                 build_int_cst (TREE_TYPE (niter), 1));
1092   *number_of_iterations = niter;
1093 
1094   return cond;
1095 }
1096 
1097 /* Function bb_in_loop_p
1098 
1099    Used as predicate for dfs order traversal of the loop bbs.  */
1100 
1101 static bool
bb_in_loop_p(const_basic_block bb,const void * data)1102 bb_in_loop_p (const_basic_block bb, const void *data)
1103 {
1104   const struct loop *const loop = (const struct loop *)data;
1105   if (flow_bb_inside_loop_p (loop, bb))
1106     return true;
1107   return false;
1108 }
1109 
1110 
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1113 
_loop_vec_info(struct loop * loop_in)1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115   : vec_info (vec_info::loop, init_cost (loop_in)),
1116     loop (loop_in),
1117     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118     num_itersm1 (NULL_TREE),
1119     num_iters (NULL_TREE),
1120     num_iters_unchanged (NULL_TREE),
1121     num_iters_assumptions (NULL_TREE),
1122     th (0),
1123     versioning_threshold (0),
1124     vectorization_factor (0),
1125     max_vectorization_factor (0),
1126     mask_skip_niters (NULL_TREE),
1127     mask_compare_type (NULL_TREE),
1128     unaligned_dr (NULL),
1129     peeling_for_alignment (0),
1130     ptr_mask (0),
1131     ivexpr_map (NULL),
1132     slp_unrolling_factor (1),
1133     single_scalar_iteration_cost (0),
1134     vectorizable (false),
1135     can_fully_mask_p (true),
1136     fully_masked_p (false),
1137     peeling_for_gaps (false),
1138     peeling_for_niter (false),
1139     operands_swapped (false),
1140     no_data_dependencies (false),
1141     has_mask_store (false),
1142     scalar_loop (NULL),
1143     orig_loop_info (NULL)
1144 {
1145   /* Create/Update stmt_info for all stmts in the loop.  */
1146   basic_block *body = get_loop_body (loop);
1147   for (unsigned int i = 0; i < loop->num_nodes; i++)
1148     {
1149       basic_block bb = body[i];
1150       gimple_stmt_iterator si;
1151 
1152       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1153           {
1154             gimple *phi = gsi_stmt (si);
1155             gimple_set_uid (phi, 0);
1156             set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1157           }
1158 
1159       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1160           {
1161             gimple *stmt = gsi_stmt (si);
1162             gimple_set_uid (stmt, 0);
1163             set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1164           }
1165     }
1166   free (body);
1167 
1168   /* CHECKME: We want to visit all BBs before their successors (except for
1169      latch blocks, for which this assertion wouldn't hold).  In the simple
1170      case of the loop forms we allow, a dfs order of the BBs would the same
1171      as reversed postorder traversal, so we are safe.  */
1172 
1173   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174                                                     bbs, loop->num_nodes, loop);
1175   gcc_assert (nbbs == loop->num_nodes);
1176 }
1177 
1178 /* Free all levels of MASKS.  */
1179 
1180 void
release_vec_loop_masks(vec_loop_masks * masks)1181 release_vec_loop_masks (vec_loop_masks *masks)
1182 {
1183   rgroup_masks *rgm;
1184   unsigned int i;
1185   FOR_EACH_VEC_ELT (*masks, i, rgm)
1186     rgm->masks.release ();
1187   masks->release ();
1188 }
1189 
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191    stmt_vec_info structs of all the stmts in the loop.  */
1192 
~_loop_vec_info()1193 _loop_vec_info::~_loop_vec_info ()
1194 {
1195   int nbbs;
1196   gimple_stmt_iterator si;
1197   int j;
1198 
1199   nbbs = loop->num_nodes;
1200   for (j = 0; j < nbbs; j++)
1201     {
1202       basic_block bb = bbs[j];
1203       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204         free_stmt_vec_info (gsi_stmt (si));
1205 
1206       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1207         {
1208             gimple *stmt = gsi_stmt (si);
1209 
1210             /* We may have broken canonical form by moving a constant
1211                into RHS1 of a commutative op.  Fix such occurrences.  */
1212             if (operands_swapped && is_gimple_assign (stmt))
1213               {
1214                 enum tree_code code = gimple_assign_rhs_code (stmt);
1215 
1216                 if ((code == PLUS_EXPR
1217                        || code == POINTER_PLUS_EXPR
1218                        || code == MULT_EXPR)
1219                       && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220                     swap_ssa_operands (stmt,
1221                                            gimple_assign_rhs1_ptr (stmt),
1222                                            gimple_assign_rhs2_ptr (stmt));
1223                 else if (code == COND_EXPR
1224                            && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1225                     {
1226                       tree cond_expr = gimple_assign_rhs1 (stmt);
1227                       enum tree_code cond_code = TREE_CODE (cond_expr);
1228 
1229                       if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1230                         {
1231                           bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232                                                                                   0));
1233                           cond_code = invert_tree_comparison (cond_code,
1234                                                                         honor_nans);
1235                           if (cond_code != ERROR_MARK)
1236                               {
1237                                 TREE_SET_CODE (cond_expr, cond_code);
1238                                 swap_ssa_operands (stmt,
1239                                                        gimple_assign_rhs2_ptr (stmt),
1240                                                        gimple_assign_rhs3_ptr (stmt));
1241                               }
1242                         }
1243                     }
1244               }
1245 
1246             /* Free stmt_vec_info.  */
1247             free_stmt_vec_info (stmt);
1248           gsi_next (&si);
1249         }
1250     }
1251 
1252   free (bbs);
1253 
1254   release_vec_loop_masks (&masks);
1255   delete ivexpr_map;
1256 
1257   loop->aux = NULL;
1258 }
1259 
1260 /* Return an invariant or register for EXPR and emit necessary
1261    computations in the LOOP_VINFO loop preheader.  */
1262 
1263 tree
cse_and_gimplify_to_preheader(loop_vec_info loop_vinfo,tree expr)1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1265 {
1266   if (is_gimple_reg (expr)
1267       || is_gimple_min_invariant (expr))
1268     return expr;
1269 
1270   if (! loop_vinfo->ivexpr_map)
1271     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273   if (! cached)
1274     {
1275       gimple_seq stmts = NULL;
1276       cached = force_gimple_operand (unshare_expr (expr),
1277                                              &stmts, true, NULL_TREE);
1278       if (stmts)
1279           {
1280             edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281             gsi_insert_seq_on_edge_immediate (e, stmts);
1282           }
1283     }
1284   return cached;
1285 }
1286 
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288    all masks required to mask LOOP_VINFO.  */
1289 
1290 static bool
can_produce_all_loop_masks_p(loop_vec_info loop_vinfo,tree cmp_type)1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1292 {
1293   rgroup_masks *rgm;
1294   unsigned int i;
1295   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296     if (rgm->mask_type != NULL_TREE
1297           && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298                                                       cmp_type, rgm->mask_type,
1299                                                       OPTIMIZE_FOR_SPEED))
1300       return false;
1301   return true;
1302 }
1303 
1304 /* Calculate the maximum number of scalars per iteration for every
1305    rgroup in LOOP_VINFO.  */
1306 
1307 static unsigned int
vect_get_max_nscalars_per_iter(loop_vec_info loop_vinfo)1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1309 {
1310   unsigned int res = 1;
1311   unsigned int i;
1312   rgroup_masks *rgm;
1313   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314     res = MAX (res, rgm->max_nscalars_per_iter);
1315   return res;
1316 }
1317 
1318 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1319    whether we can actually generate the masks required.  Return true if so,
1320    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1321 
1322 static bool
vect_verify_full_masking(loop_vec_info loop_vinfo)1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1324 {
1325   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326   unsigned int min_ni_width;
1327 
1328   /* Use a normal loop if there are no statements that need masking.
1329      This only happens in rare degenerate cases: it means that the loop
1330      has no loads, no stores, and no live-out values.  */
1331   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332     return false;
1333 
1334   /* Get the maximum number of iterations that is representable
1335      in the counter type.  */
1336   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1338 
1339   /* Get a more refined estimate for the number of iterations.  */
1340   widest_int max_back_edges;
1341   if (max_loop_iterations (loop, &max_back_edges))
1342     max_ni = wi::smin (max_ni, max_back_edges + 1);
1343 
1344   /* Account for rgroup masks, in which each bit is replicated N times.  */
1345   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1346 
1347   /* Work out how many bits we need to represent the limit.  */
1348   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1349 
1350   /* Find a scalar mode for which WHILE_ULT is supported.  */
1351   opt_scalar_int_mode cmp_mode_iter;
1352   tree cmp_type = NULL_TREE;
1353   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1354     {
1355       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356       if (cmp_bits >= min_ni_width
1357             && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1358           {
1359             tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360             if (this_type
1361                 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1362               {
1363                 /* Although we could stop as soon as we find a valid mode,
1364                      it's often better to continue until we hit Pmode, since the
1365                      operands to the WHILE are more likely to be reusable in
1366                      address calculations.  */
1367                 cmp_type = this_type;
1368                 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369                     break;
1370               }
1371           }
1372     }
1373 
1374   if (!cmp_type)
1375     return false;
1376 
1377   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378   return true;
1379 }
1380 
1381 /* Calculate the cost of one scalar iteration of the loop.  */
1382 static void
vect_compute_single_scalar_iteration_cost(loop_vec_info loop_vinfo)1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1384 {
1385   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387   int nbbs = loop->num_nodes, factor;
1388   int innerloop_iters, i;
1389 
1390   /* Gather costs for statements in the scalar loop.  */
1391 
1392   /* FORNOW.  */
1393   innerloop_iters = 1;
1394   if (loop->inner)
1395     innerloop_iters = 50; /* FIXME */
1396 
1397   for (i = 0; i < nbbs; i++)
1398     {
1399       gimple_stmt_iterator si;
1400       basic_block bb = bbs[i];
1401 
1402       if (bb->loop_father == loop->inner)
1403         factor = innerloop_iters;
1404       else
1405         factor = 1;
1406 
1407       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1408         {
1409             gimple *stmt = gsi_stmt (si);
1410           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 
1412           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413             continue;
1414 
1415           /* Skip stmts that are not vectorized inside the loop.  */
1416           if (stmt_info
1417               && !STMT_VINFO_RELEVANT_P (stmt_info)
1418               && (!STMT_VINFO_LIVE_P (stmt_info)
1419                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420                 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421             continue;
1422 
1423             vect_cost_for_stmt kind;
1424           if (STMT_VINFO_DATA_REF (stmt_info))
1425             {
1426               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427                kind = scalar_load;
1428              else
1429                kind = scalar_store;
1430             }
1431           else
1432             kind = scalar_stmt;
1433 
1434             record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435                                   factor, kind, stmt_info, 0, vect_prologue);
1436         }
1437     }
1438 
1439   /* Now accumulate cost.  */
1440   void *target_cost_data = init_cost (loop);
1441   stmt_info_for_cost *si;
1442   int j;
1443   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444                         j, si)
1445     {
1446       struct _stmt_vec_info *stmt_info
1447           = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448       (void) add_stmt_cost (target_cost_data, si->count,
1449                                   si->kind, stmt_info, si->misalign,
1450                                   vect_body);
1451     }
1452   unsigned dummy, body_cost = 0;
1453   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454   destroy_cost_data (target_cost_data);
1455   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1456 }
1457 
1458 
1459 /* Function vect_analyze_loop_form_1.
1460 
1461    Verify that certain CFG restrictions hold, including:
1462    - the loop has a pre-header
1463    - the loop has a single entry and exit
1464    - the loop exit condition is simple enough
1465    - the number of iterations can be analyzed, i.e, a countable loop.  The
1466      niter could be analyzed under some assumptions.  */
1467 
1468 bool
vect_analyze_loop_form_1(struct loop * loop,gcond ** loop_cond,tree * assumptions,tree * number_of_iterationsm1,tree * number_of_iterations,gcond ** inner_loop_cond)1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470                                 tree *assumptions, tree *number_of_iterationsm1,
1471                                 tree *number_of_iterations, gcond **inner_loop_cond)
1472 {
1473   if (dump_enabled_p ())
1474     dump_printf_loc (MSG_NOTE, vect_location,
1475                          "=== vect_analyze_loop_form ===\n");
1476 
1477   /* Different restrictions apply when we are considering an inner-most loop,
1478      vs. an outer (nested) loop.
1479      (FORNOW. May want to relax some of these restrictions in the future).  */
1480 
1481   if (!loop->inner)
1482     {
1483       /* Inner-most loop.  We currently require that the number of BBs is
1484            exactly 2 (the header and latch).  Vectorizable inner-most loops
1485            look like this:
1486 
1487                         (pre-header)
1488                            |
1489                           header <--------+
1490                            | |            |
1491                            | +--> latch --+
1492                            |
1493                         (exit-bb)  */
1494 
1495       if (loop->num_nodes != 2)
1496         {
1497           if (dump_enabled_p ())
1498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499                                    "not vectorized: control flow in loop.\n");
1500           return false;
1501         }
1502 
1503       if (empty_block_p (loop->header))
1504           {
1505             if (dump_enabled_p ())
1506               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                                    "not vectorized: empty loop.\n");
1508             return false;
1509           }
1510     }
1511   else
1512     {
1513       struct loop *innerloop = loop->inner;
1514       edge entryedge;
1515 
1516       /* Nested loop. We currently require that the loop is doubly-nested,
1517            contains a single inner loop, and the number of BBs is exactly 5.
1518            Vectorizable outer-loops look like this:
1519 
1520                               (pre-header)
1521                                  |
1522                                 header <---+
1523                                  |         |
1524                               inner-loop |
1525                                  |         |
1526                                 tail ------+
1527                                  |
1528                             (exit-bb)
1529 
1530            The inner-loop has the properties expected of inner-most loops
1531            as described above.  */
1532 
1533       if ((loop->inner)->inner || (loop->inner)->next)
1534           {
1535             if (dump_enabled_p ())
1536               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537                                    "not vectorized: multiple nested loops.\n");
1538             return false;
1539           }
1540 
1541       if (loop->num_nodes != 5)
1542         {
1543             if (dump_enabled_p ())
1544               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545                                    "not vectorized: control flow in loop.\n");
1546             return false;
1547         }
1548 
1549       entryedge = loop_preheader_edge (innerloop);
1550       if (entryedge->src != loop->header
1551             || !single_exit (innerloop)
1552             || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1553           {
1554             if (dump_enabled_p ())
1555               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556                                    "not vectorized: unsupported outerloop form.\n");
1557             return false;
1558           }
1559 
1560       /* Analyze the inner-loop.  */
1561       tree inner_niterm1, inner_niter, inner_assumptions;
1562       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563                                               &inner_assumptions, &inner_niterm1,
1564                                               &inner_niter, NULL)
1565             /* Don't support analyzing niter under assumptions for inner
1566                loop.  */
1567             || !integer_onep (inner_assumptions))
1568           {
1569             if (dump_enabled_p ())
1570             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                                    "not vectorized: Bad inner loop.\n");
1572             return false;
1573           }
1574 
1575       if (!expr_invariant_in_loop_p (loop, inner_niter))
1576           {
1577             if (dump_enabled_p ())
1578               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579                                    "not vectorized: inner-loop count not"
1580                              " invariant.\n");
1581             return false;
1582           }
1583 
1584       if (dump_enabled_p ())
1585         dump_printf_loc (MSG_NOTE, vect_location,
1586                                "Considering outer-loop vectorization.\n");
1587     }
1588 
1589   if (!single_exit (loop)
1590       || EDGE_COUNT (loop->header->preds) != 2)
1591     {
1592       if (dump_enabled_p ())
1593         {
1594           if (!single_exit (loop))
1595               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596                                    "not vectorized: multiple exits.\n");
1597           else if (EDGE_COUNT (loop->header->preds) != 2)
1598               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599                                    "not vectorized: too many incoming edges.\n");
1600         }
1601       return false;
1602     }
1603 
1604   /* We assume that the loop exit condition is at the end of the loop. i.e,
1605      that the loop is represented as a do-while (with a proper if-guard
1606      before the loop if needed), where the loop header contains all the
1607      executable statements, and the latch is empty.  */
1608   if (!empty_block_p (loop->latch)
1609       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1610     {
1611       if (dump_enabled_p ())
1612           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613                                "not vectorized: latch block not empty.\n");
1614       return false;
1615     }
1616 
1617   /* Make sure the exit is not abnormal.  */
1618   edge e = single_exit (loop);
1619   if (e->flags & EDGE_ABNORMAL)
1620     {
1621       if (dump_enabled_p ())
1622           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623                                "not vectorized: abnormal loop exit edge.\n");
1624       return false;
1625     }
1626 
1627   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628                                              number_of_iterationsm1);
1629   if (!*loop_cond)
1630     {
1631       if (dump_enabled_p ())
1632           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633                                "not vectorized: complicated exit condition.\n");
1634       return false;
1635     }
1636 
1637   if (integer_zerop (*assumptions)
1638       || !*number_of_iterations
1639       || chrec_contains_undetermined (*number_of_iterations))
1640     {
1641       if (dump_enabled_p ())
1642           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643                                "not vectorized: number of iterations cannot be "
1644                                "computed.\n");
1645       return false;
1646     }
1647 
1648   if (integer_zerop (*number_of_iterations))
1649     {
1650       if (dump_enabled_p ())
1651           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652                                "not vectorized: number of iterations = 0.\n");
1653       return false;
1654     }
1655 
1656   return true;
1657 }
1658 
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1660 
1661 loop_vec_info
vect_analyze_loop_form(struct loop * loop)1662 vect_analyze_loop_form (struct loop *loop)
1663 {
1664   tree assumptions, number_of_iterations, number_of_iterationsm1;
1665   gcond *loop_cond, *inner_loop_cond = NULL;
1666 
1667   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668                                           &assumptions, &number_of_iterationsm1,
1669                                           &number_of_iterations, &inner_loop_cond))
1670     return NULL;
1671 
1672   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676   if (!integer_onep (assumptions))
1677     {
1678       /* We consider to vectorize this loop by versioning it under
1679            some assumptions.  In order to do this, we need to clear
1680            existing information computed by scev and niter analyzer.  */
1681       scev_reset_htab ();
1682       free_numbers_of_iterations_estimates (loop);
1683       /* Also set flag for this loop so that following scev and niter
1684            analysis are done under the assumptions.  */
1685       loop_constraint_set (loop, LOOP_C_FINITE);
1686       /* Also record the assumptions for versioning.  */
1687       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1688     }
1689 
1690   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1691     {
1692       if (dump_enabled_p ())
1693         {
1694           dump_printf_loc (MSG_NOTE, vect_location,
1695                                  "Symbolic number of iterations is ");
1696             dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697           dump_printf (MSG_NOTE, "\n");
1698         }
1699     }
1700 
1701   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702   if (inner_loop_cond)
1703     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704       = loop_exit_ctrl_vec_info_type;
1705 
1706   gcc_assert (!loop->aux);
1707   loop->aux = loop_vinfo;
1708   return loop_vinfo;
1709 }
1710 
1711 
1712 
1713 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1714    statements update the vectorization factor.  */
1715 
1716 static void
vect_update_vf_for_slp(loop_vec_info loop_vinfo)1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1718 {
1719   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721   int nbbs = loop->num_nodes;
1722   poly_uint64 vectorization_factor;
1723   int i;
1724 
1725   if (dump_enabled_p ())
1726     dump_printf_loc (MSG_NOTE, vect_location,
1727                          "=== vect_update_vf_for_slp ===\n");
1728 
1729   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730   gcc_assert (known_ne (vectorization_factor, 0U));
1731 
1732   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733      vectorization factor of the loop is the unrolling factor required by
1734      the SLP instances.  If that unrolling factor is 1, we say, that we
1735      perform pure SLP on loop - cross iteration parallelism is not
1736      exploited.  */
1737   bool only_slp_in_loop = true;
1738   for (i = 0; i < nbbs; i++)
1739     {
1740       basic_block bb = bbs[i];
1741       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742              gsi_next (&si))
1743           {
1744             gimple *stmt = gsi_stmt (si);
1745             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746             if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747                 && STMT_VINFO_RELATED_STMT (stmt_info))
1748               {
1749                 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750                 stmt_info = vinfo_for_stmt (stmt);
1751               }
1752             if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753                  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754                 && !PURE_SLP_STMT (stmt_info))
1755               /* STMT needs both SLP and loop-based vectorization.  */
1756               only_slp_in_loop = false;
1757           }
1758     }
1759 
1760   if (only_slp_in_loop)
1761     {
1762       dump_printf_loc (MSG_NOTE, vect_location,
1763                            "Loop contains only SLP stmts\n");
1764       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1765     }
1766   else
1767     {
1768       dump_printf_loc (MSG_NOTE, vect_location,
1769                            "Loop contains SLP and non-SLP stmts\n");
1770       /* Both the vectorization factor and unroll factor have the form
1771            current_vector_size * X for some rational X, so they must have
1772            a common multiple.  */
1773       vectorization_factor
1774           = force_common_multiple (vectorization_factor,
1775                                          LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1776     }
1777 
1778   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779   if (dump_enabled_p ())
1780     {
1781       dump_printf_loc (MSG_NOTE, vect_location,
1782                            "Updating vectorization factor to ");
1783       dump_dec (MSG_NOTE, vectorization_factor);
1784       dump_printf (MSG_NOTE, ".\n");
1785     }
1786 }
1787 
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789    the other phi in the reduction is also relevant for vectorization.
1790    This rejects cases such as:
1791 
1792       outer1:
1793           x_1 = PHI <x_3(outer2), ...>;
1794           ...
1795 
1796       inner:
1797           x_2 = ...;
1798           ...
1799 
1800       outer2:
1801           x_3 = PHI <x_2(inner)>;
1802 
1803    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1804 
1805 static bool
vect_active_double_reduction_p(stmt_vec_info stmt_info)1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1807 {
1808   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809     return false;
1810 
1811   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1813 }
1814 
1815 /* Function vect_analyze_loop_operations.
1816 
1817    Scan the loop stmts and make sure they are all vectorizable.  */
1818 
1819 static bool
vect_analyze_loop_operations(loop_vec_info loop_vinfo)1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1821 {
1822   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824   int nbbs = loop->num_nodes;
1825   int i;
1826   stmt_vec_info stmt_info;
1827   bool need_to_vectorize = false;
1828   bool ok;
1829 
1830   if (dump_enabled_p ())
1831     dump_printf_loc (MSG_NOTE, vect_location,
1832                          "=== vect_analyze_loop_operations ===\n");
1833 
1834   for (i = 0; i < nbbs; i++)
1835     {
1836       basic_block bb = bbs[i];
1837 
1838       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839              gsi_next (&si))
1840         {
1841           gphi *phi = si.phi ();
1842           ok = true;
1843 
1844           stmt_info = vinfo_for_stmt (phi);
1845           if (dump_enabled_p ())
1846             {
1847               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1849             }
1850             if (virtual_operand_p (gimple_phi_result (phi)))
1851               continue;
1852 
1853           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854              (i.e., a phi in the tail of the outer-loop).  */
1855           if (! is_loop_header_bb_p (bb))
1856             {
1857               /* FORNOW: we currently don't support the case that these phis
1858                  are not used in the outerloop (unless it is double reduction,
1859                  i.e., this phi is vect_reduction_def), cause this case
1860                  requires to actually do something here.  */
1861               if (STMT_VINFO_LIVE_P (stmt_info)
1862                       && !vect_active_double_reduction_p (stmt_info))
1863                 {
1864                   if (dump_enabled_p ())
1865                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866                                              "Unsupported loop-closed phi in "
1867                                              "outer-loop.\n");
1868                   return false;
1869                 }
1870 
1871               /* If PHI is used in the outer loop, we check that its operand
1872                  is defined in the inner loop.  */
1873               if (STMT_VINFO_RELEVANT_P (stmt_info))
1874                 {
1875                   tree phi_op;
1876                       gimple *op_def_stmt;
1877 
1878                   if (gimple_phi_num_args (phi) != 1)
1879                     return false;
1880 
1881                   phi_op = PHI_ARG_DEF (phi, 0);
1882                   if (TREE_CODE (phi_op) != SSA_NAME)
1883                     return false;
1884 
1885                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886                       if (gimple_nop_p (op_def_stmt)
1887                           || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888                           || !vinfo_for_stmt (op_def_stmt))
1889                     return false;
1890 
1891                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892                         != vect_used_in_outer
1893                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894                            != vect_used_in_outer_by_reduction)
1895                     return false;
1896                 }
1897 
1898               continue;
1899             }
1900 
1901           gcc_assert (stmt_info);
1902 
1903           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904                || STMT_VINFO_LIVE_P (stmt_info))
1905               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1906             {
1907               /* A scalar-dependence cycle that we don't support.  */
1908               if (dump_enabled_p ())
1909                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910                                          "not vectorized: scalar dependence cycle.\n");
1911               return false;
1912             }
1913 
1914           if (STMT_VINFO_RELEVANT_P (stmt_info))
1915             {
1916               need_to_vectorize = true;
1917               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918                       && ! PURE_SLP_STMT (stmt_info))
1919                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920                 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921                               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922                            && ! PURE_SLP_STMT (stmt_info))
1923                     ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1924             }
1925 
1926             /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1927             if (ok
1928                 && STMT_VINFO_LIVE_P (stmt_info)
1929                 && !PURE_SLP_STMT (stmt_info))
1930               ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1931 
1932           if (!ok)
1933             {
1934               if (dump_enabled_p ())
1935                 {
1936                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937                                            "not vectorized: relevant phi not "
1938                                            "supported: ");
1939                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1940                 }
1941                 return false;
1942             }
1943         }
1944 
1945       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946              gsi_next (&si))
1947         {
1948             gimple *stmt = gsi_stmt (si);
1949             if (!gimple_clobber_p (stmt)
1950                 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951               return false;
1952         }
1953     } /* bbs */
1954 
1955   /* All operations in the loop are either irrelevant (deal with loop
1956      control, or dead), or only used outside the loop and can be moved
1957      out of the loop (e.g. invariants, inductions).  The loop can be
1958      optimized away by scalar optimizations.  We're better off not
1959      touching this loop.  */
1960   if (!need_to_vectorize)
1961     {
1962       if (dump_enabled_p ())
1963         dump_printf_loc (MSG_NOTE, vect_location,
1964                                "All the computation can be taken out of the loop.\n");
1965       if (dump_enabled_p ())
1966           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967                                "not vectorized: redundant loop. no profit to "
1968                                "vectorize.\n");
1969       return false;
1970     }
1971 
1972   return true;
1973 }
1974 
1975 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1976    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1977    definitely no, or -1 if it's worth retrying.  */
1978 
1979 static int
vect_analyze_loop_costing(loop_vec_info loop_vinfo)1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1981 {
1982   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1984 
1985   /* Only fully-masked loops can have iteration counts less than the
1986      vectorization factor.  */
1987   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988     {
1989       HOST_WIDE_INT max_niter;
1990 
1991       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992           max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993       else
1994           max_niter = max_stmt_executions_int (loop);
1995 
1996       if (max_niter != -1
1997             && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1998           {
1999             if (dump_enabled_p ())
2000               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001                                    "not vectorized: iteration count smaller than "
2002                                    "vectorization factor.\n");
2003             return 0;
2004           }
2005     }
2006 
2007   int min_profitable_iters, min_profitable_estimate;
2008   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009                                               &min_profitable_estimate);
2010 
2011   if (min_profitable_iters < 0)
2012     {
2013       if (dump_enabled_p ())
2014           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015                                "not vectorized: vectorization not profitable.\n");
2016       if (dump_enabled_p ())
2017           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                                "not vectorized: vector version will never be "
2019                                "profitable.\n");
2020       return -1;
2021     }
2022 
2023   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024                                      * assumed_vf);
2025 
2026   /* Use the cost model only if it is more conservative than user specified
2027      threshold.  */
2028   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029                                             min_profitable_iters);
2030 
2031   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2032 
2033   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2035     {
2036       if (dump_enabled_p ())
2037           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038                                "not vectorized: vectorization not profitable.\n");
2039       if (dump_enabled_p ())
2040           dump_printf_loc (MSG_NOTE, vect_location,
2041                                "not vectorized: iteration count smaller than user "
2042                                "specified loop bound parameter or minimum profitable "
2043                                "iterations (whichever is more conservative).\n");
2044       return 0;
2045     }
2046 
2047   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048   if (estimated_niter == -1)
2049     estimated_niter = likely_max_stmt_executions_int (loop);
2050   if (estimated_niter != -1
2051       && ((unsigned HOST_WIDE_INT) estimated_niter
2052             < MAX (th, (unsigned) min_profitable_estimate)))
2053     {
2054       if (dump_enabled_p ())
2055           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056                                "not vectorized: estimated iteration count too "
2057                                "small.\n");
2058       if (dump_enabled_p ())
2059           dump_printf_loc (MSG_NOTE, vect_location,
2060                                "not vectorized: estimated iteration count smaller "
2061                                "than specified loop bound parameter or minimum "
2062                                "profitable iterations (whichever is more "
2063                                "conservative).\n");
2064       return -1;
2065     }
2066 
2067   return 1;
2068 }
2069 
2070 
2071 /* Function vect_analyze_loop_2.
2072 
2073    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074    for it.  The different analyses will record information in the
2075    loop_vec_info struct.  */
2076 static bool
vect_analyze_loop_2(loop_vec_info loop_vinfo,bool & fatal)2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2078 {
2079   bool ok;
2080   int res;
2081   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082   poly_uint64 min_vf = 2;
2083   unsigned int n_stmts = 0;
2084 
2085   /* The first group of checks is independent of the vector size.  */
2086   fatal = true;
2087 
2088   /* Find all data references in the loop (which correspond to vdefs/vuses)
2089      and analyze their evolution in the loop.  */
2090 
2091   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2092 
2093   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2095     {
2096       if (dump_enabled_p ())
2097           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                                "not vectorized: loop nest containing two "
2099                                "or more consecutive inner loops cannot be "
2100                                "vectorized\n");
2101       return false;
2102     }
2103 
2104   for (unsigned i = 0; i < loop->num_nodes; i++)
2105     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106            !gsi_end_p (gsi); gsi_next (&gsi))
2107       {
2108           gimple *stmt = gsi_stmt (gsi);
2109           if (is_gimple_debug (stmt))
2110             continue;
2111           ++n_stmts;
2112           if (!find_data_references_in_stmt (loop, stmt,
2113                                                      &LOOP_VINFO_DATAREFS (loop_vinfo)))
2114             {
2115               if (is_gimple_call (stmt) && loop->safelen)
2116                 {
2117                     tree fndecl = gimple_call_fndecl (stmt), op;
2118                     if (fndecl != NULL_TREE)
2119                       {
2120                         cgraph_node *node = cgraph_node::get (fndecl);
2121                         if (node != NULL && node->simd_clones != NULL)
2122                           {
2123                               unsigned int j, n = gimple_call_num_args (stmt);
2124                               for (j = 0; j < n; j++)
2125                                 {
2126                                   op = gimple_call_arg (stmt, j);
2127                                   if (DECL_P (op)
2128                                         || (REFERENCE_CLASS_P (op)
2129                                             && get_base_address (op)))
2130                                     break;
2131                                 }
2132                               op = gimple_call_lhs (stmt);
2133                               /* Ignore #pragma omp declare simd functions
2134                                  if they don't have data references in the
2135                                  call stmt itself.  */
2136                               if (j == n
2137                                   && !(op
2138                                          && (DECL_P (op)
2139                                              || (REFERENCE_CLASS_P (op)
2140                                                    && get_base_address (op)))))
2141                                 continue;
2142                           }
2143                       }
2144                 }
2145               if (dump_enabled_p ())
2146                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                                      "not vectorized: loop contains function "
2148                                      "calls or data references that cannot "
2149                                      "be analyzed\n");
2150               return false;
2151             }
2152       }
2153 
2154   /* Analyze the data references and also adjust the minimal
2155      vectorization factor according to the loads and stores.  */
2156 
2157   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158   if (!ok)
2159     {
2160       if (dump_enabled_p ())
2161           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162                                "bad data references.\n");
2163       return false;
2164     }
2165 
2166   /* Classify all cross-iteration scalar data-flow cycles.
2167      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2168   vect_analyze_scalar_cycles (loop_vinfo);
2169 
2170   vect_pattern_recog (loop_vinfo);
2171 
2172   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2173 
2174   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2176 
2177   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178   if (!ok)
2179     {
2180       if (dump_enabled_p ())
2181           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182                                "bad data access.\n");
2183       return false;
2184     }
2185 
2186   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2187 
2188   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189   if (!ok)
2190     {
2191       if (dump_enabled_p ())
2192           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193                                "unexpected pattern.\n");
2194       return false;
2195     }
2196 
2197   /* While the rest of the analysis below depends on it in some way.  */
2198   fatal = false;
2199 
2200   /* Analyze data dependences between the data-refs in the loop
2201      and adjust the maximum vectorization factor according to
2202      the dependences.
2203      FORNOW: fail at the first data dependence that we encounter.  */
2204 
2205   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206   if (!ok
2207       || (max_vf != MAX_VECTORIZATION_FACTOR
2208             && maybe_lt (max_vf, min_vf)))
2209     {
2210       if (dump_enabled_p ())
2211               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212                                    "bad data dependence.\n");
2213       return false;
2214     }
2215   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2216 
2217   ok = vect_determine_vectorization_factor (loop_vinfo);
2218   if (!ok)
2219     {
2220       if (dump_enabled_p ())
2221           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222                                "can't determine vectorization factor.\n");
2223       return false;
2224     }
2225   if (max_vf != MAX_VECTORIZATION_FACTOR
2226       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2227     {
2228       if (dump_enabled_p ())
2229           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230                                "bad data dependence.\n");
2231       return false;
2232     }
2233 
2234   /* Compute the scalar iteration cost.  */
2235   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2236 
2237   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238   unsigned th;
2239 
2240   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2241   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242   if (!ok)
2243     return false;
2244 
2245   /* If there are any SLP instances mark them as pure_slp.  */
2246   bool slp = vect_make_slp_decision (loop_vinfo);
2247   if (slp)
2248     {
2249       /* Find stmts that need to be both vectorized and SLPed.  */
2250       vect_detect_hybrid_slp (loop_vinfo);
2251 
2252       /* Update the vectorization factor based on the SLP decision.  */
2253       vect_update_vf_for_slp (loop_vinfo);
2254     }
2255 
2256   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2257 
2258   /* We don't expect to have to roll back to anything other than an empty
2259      set of rgroups.  */
2260   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2261 
2262   /* This is the point where we can re-start analysis with SLP forced off.  */
2263 start_over:
2264 
2265   /* Now the vectorization factor is final.  */
2266   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267   gcc_assert (known_ne (vectorization_factor, 0U));
2268 
2269   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2270     {
2271       dump_printf_loc (MSG_NOTE, vect_location,
2272                            "vectorization_factor = ");
2273       dump_dec (MSG_NOTE, vectorization_factor);
2274       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275                        LOOP_VINFO_INT_NITERS (loop_vinfo));
2276     }
2277 
2278   HOST_WIDE_INT max_niter
2279     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2280 
2281   /* Analyze the alignment of the data-refs in the loop.
2282      Fail if a data reference is found that cannot be vectorized.  */
2283 
2284   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289                                "bad data alignment.\n");
2290       return false;
2291     }
2292 
2293   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294      It is important to call pruning after vect_analyze_data_ref_accesses,
2295      since we use grouping information gathered by interleaving analysis.  */
2296   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297   if (!ok)
2298     return false;
2299 
2300   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2301      vectorization.  */
2302   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2303     {
2304     /* This pass will decide on using loop versioning and/or loop peeling in
2305        order to enhance the alignment of data references in the loop.  */
2306     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307     if (!ok)
2308       {
2309           if (dump_enabled_p ())
2310             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311                                  "bad data alignment.\n");
2312         return false;
2313       }
2314     }
2315 
2316   if (slp)
2317     {
2318       /* Analyze operations in the SLP instances.  Note this may
2319            remove unsupported SLP instances which makes the above
2320            SLP kind detection invalid.  */
2321       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322       vect_slp_analyze_operations (loop_vinfo);
2323       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324           goto again;
2325     }
2326 
2327   /* Scan all the remaining operations in the loop that are not subject
2328      to SLP and make sure they are vectorizable.  */
2329   ok = vect_analyze_loop_operations (loop_vinfo);
2330   if (!ok)
2331     {
2332       if (dump_enabled_p ())
2333           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334                                "bad operation or unsupported loop bound.\n");
2335       return false;
2336     }
2337 
2338   /* Decide whether to use a fully-masked loop for this vectorization
2339      factor.  */
2340   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342        && vect_verify_full_masking (loop_vinfo));
2343   if (dump_enabled_p ())
2344     {
2345       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346           dump_printf_loc (MSG_NOTE, vect_location,
2347                                "using a fully-masked loop.\n");
2348       else
2349           dump_printf_loc (MSG_NOTE, vect_location,
2350                                "not using a fully-masked loop.\n");
2351     }
2352 
2353   /* If epilog loop is required because of data accesses with gaps,
2354      one additional iteration needs to be peeled.  Check if there is
2355      enough iterations for vectorization.  */
2356   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2359     {
2360       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2362 
2363       if (known_lt (wi::to_widest (scalar_niters), vf))
2364           {
2365             if (dump_enabled_p ())
2366               dump_printf_loc (MSG_NOTE, vect_location,
2367                                    "loop has no enough iterations to support"
2368                                    " peeling for gaps.\n");
2369             return false;
2370           }
2371     }
2372 
2373   /* Check the costings of the loop make vectorizing worthwhile.  */
2374   res = vect_analyze_loop_costing (loop_vinfo);
2375   if (res < 0)
2376     goto again;
2377   if (!res)
2378     {
2379       if (dump_enabled_p ())
2380           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381                                "Loop costings not worthwhile.\n");
2382       return false;
2383     }
2384 
2385   /* Decide whether we need to create an epilogue loop to handle
2386      remaining scalar iterations.  */
2387   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2388 
2389   unsigned HOST_WIDE_INT const_vf;
2390   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391     /* The main loop handles all iterations.  */
2392     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394              && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2395     {
2396       /* Work out the (constant) number of iterations that need to be
2397            peeled for reasons other than niters.  */
2398       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2399       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2400           peel_niter += 1;
2401       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2402                            LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2403           LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2404     }
2405   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2406              /* ??? When peeling for gaps but not alignment, we could
2407                 try to check whether the (variable) niters is known to be
2408                 VF * N + 1.  That's something of a niche case though.  */
2409              || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2410              || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2411              || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2412                     < (unsigned) exact_log2 (const_vf))
2413                  /* In case of versioning, check if the maximum number of
2414                       iterations is greater than th.  If they are identical,
2415                       the epilogue is unnecessary.  */
2416                  && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2417                        || ((unsigned HOST_WIDE_INT) max_niter
2418                            > (th / const_vf) * const_vf))))
2419     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2420 
2421   /* If an epilogue loop is required make sure we can create one.  */
2422   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2423       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2424     {
2425       if (dump_enabled_p ())
2426         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2427       if (!vect_can_advance_ivs_p (loop_vinfo)
2428             || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2429                                                      single_exit (LOOP_VINFO_LOOP
2430                                                                        (loop_vinfo))))
2431         {
2432           if (dump_enabled_p ())
2433               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434                                    "not vectorized: can't create required "
2435                                    "epilog loop\n");
2436           goto again;
2437         }
2438     }
2439 
2440   /* During peeling, we need to check if number of loop iterations is
2441      enough for both peeled prolog loop and vector loop.  This check
2442      can be merged along with threshold check of loop versioning, so
2443      increase threshold for this case if necessary.  */
2444   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2445     {
2446       poly_uint64 niters_th = 0;
2447 
2448       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2449           {
2450             /* Niters for peeled prolog loop.  */
2451             if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2452               {
2453                 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2454                 tree vectype
2455                     = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2456                 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2457               }
2458             else
2459               niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2460           }
2461 
2462       /* Niters for at least one iteration of vectorized loop.  */
2463       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2464           niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2465       /* One additional iteration because of peeling for gap.  */
2466       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2467           niters_th += 1;
2468       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2469     }
2470 
2471   gcc_assert (known_eq (vectorization_factor,
2472                               LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2473 
2474   /* Ok to vectorize!  */
2475   return true;
2476 
2477 again:
2478   /* Try again with SLP forced off but if we didn't do any SLP there is
2479      no point in re-trying.  */
2480   if (!slp)
2481     return false;
2482 
2483   /* If there are reduction chains re-trying will fail anyway.  */
2484   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2485     return false;
2486 
2487   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2488      via interleaving or lane instructions.  */
2489   slp_instance instance;
2490   slp_tree node;
2491   unsigned i, j;
2492   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2493     {
2494       stmt_vec_info vinfo;
2495       vinfo = vinfo_for_stmt
2496             (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2497       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2498           continue;
2499       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2500       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2501       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2502       if (! vect_store_lanes_supported (vectype, size, false)
2503            && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2504            && ! vect_grouped_store_supported (vectype, size))
2505        return false;
2506       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2507           {
2508             vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2509             vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2510             bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2511             size = STMT_VINFO_GROUP_SIZE (vinfo);
2512             vectype = STMT_VINFO_VECTYPE (vinfo);
2513             if (! vect_load_lanes_supported (vectype, size, false)
2514                 && ! vect_grouped_load_supported (vectype, single_element_p,
2515                                                             size))
2516               return false;
2517           }
2518     }
2519 
2520   if (dump_enabled_p ())
2521     dump_printf_loc (MSG_NOTE, vect_location,
2522                          "re-trying with SLP disabled\n");
2523 
2524   /* Roll back state appropriately.  No SLP this time.  */
2525   slp = false;
2526   /* Restore vectorization factor as it were without SLP.  */
2527   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2528   /* Free the SLP instances.  */
2529   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2530     vect_free_slp_instance (instance);
2531   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2532   /* Reset SLP type to loop_vect on all stmts.  */
2533   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2534     {
2535       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2536       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2537              !gsi_end_p (si); gsi_next (&si))
2538           {
2539             stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2540             STMT_SLP_TYPE (stmt_info) = loop_vect;
2541           }
2542       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2543              !gsi_end_p (si); gsi_next (&si))
2544           {
2545             stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2546             STMT_SLP_TYPE (stmt_info) = loop_vect;
2547             if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2548               {
2549                 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2550                 STMT_SLP_TYPE (stmt_info) = loop_vect;
2551                 for (gimple_stmt_iterator pi
2552                          = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2553                        !gsi_end_p (pi); gsi_next (&pi))
2554                     {
2555                       gimple *pstmt = gsi_stmt (pi);
2556                       STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2557                     }
2558               }
2559           }
2560     }
2561   /* Free optimized alias test DDRS.  */
2562   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2563   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2564   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2565   /* Reset target cost data.  */
2566   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2567   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2568     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2569   /* Reset accumulated rgroup information.  */
2570   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2571   /* Reset assorted flags.  */
2572   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2573   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2574   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2575   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2576   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2577 
2578   goto start_over;
2579 }
2580 
2581 /* Function vect_analyze_loop.
2582 
2583    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2584    for it.  The different analyses will record information in the
2585    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2586    be vectorized.  */
2587 loop_vec_info
vect_analyze_loop(struct loop * loop,loop_vec_info orig_loop_vinfo)2588 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2589 {
2590   loop_vec_info loop_vinfo;
2591   auto_vector_sizes vector_sizes;
2592 
2593   /* Autodetect first vector size we try.  */
2594   current_vector_size = 0;
2595   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2596   unsigned int next_size = 0;
2597 
2598   if (dump_enabled_p ())
2599     dump_printf_loc (MSG_NOTE, vect_location,
2600                          "===== analyze_loop_nest =====\n");
2601 
2602   if (loop_outer (loop)
2603       && loop_vec_info_for_loop (loop_outer (loop))
2604       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2605     {
2606       if (dump_enabled_p ())
2607           dump_printf_loc (MSG_NOTE, vect_location,
2608                                "outer-loop already vectorized.\n");
2609       return NULL;
2610     }
2611 
2612   poly_uint64 autodetected_vector_size = 0;
2613   while (1)
2614     {
2615       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2616       loop_vinfo = vect_analyze_loop_form (loop);
2617       if (!loop_vinfo)
2618           {
2619             if (dump_enabled_p ())
2620               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2621                                    "bad loop form.\n");
2622             return NULL;
2623           }
2624 
2625       bool fatal = false;
2626 
2627       if (orig_loop_vinfo)
2628           LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2629 
2630       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2631           {
2632             LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2633 
2634             return loop_vinfo;
2635           }
2636 
2637       delete loop_vinfo;
2638 
2639       if (next_size == 0)
2640           autodetected_vector_size = current_vector_size;
2641 
2642       if (next_size < vector_sizes.length ()
2643             && known_eq (vector_sizes[next_size], autodetected_vector_size))
2644           next_size += 1;
2645 
2646       if (fatal
2647             || next_size == vector_sizes.length ()
2648             || known_eq (current_vector_size, 0U))
2649           return NULL;
2650 
2651       /* Try the next biggest vector size.  */
2652       current_vector_size = vector_sizes[next_size++];
2653       if (dump_enabled_p ())
2654           {
2655             dump_printf_loc (MSG_NOTE, vect_location,
2656                                  "***** Re-trying analysis with "
2657                                  "vector size ");
2658             dump_dec (MSG_NOTE, current_vector_size);
2659             dump_printf (MSG_NOTE, "\n");
2660           }
2661     }
2662 }
2663 
2664 /* Return true if there is an in-order reduction function for CODE, storing
2665    it in *REDUC_FN if so.  */
2666 
2667 static bool
fold_left_reduction_fn(tree_code code,internal_fn * reduc_fn)2668 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2669 {
2670   switch (code)
2671     {
2672     case PLUS_EXPR:
2673       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2674       return true;
2675 
2676     default:
2677       return false;
2678     }
2679 }
2680 
2681 /* Function reduction_fn_for_scalar_code
2682 
2683    Input:
2684    CODE - tree_code of a reduction operations.
2685 
2686    Output:
2687    REDUC_FN - the corresponding internal function to be used to reduce the
2688       vector of partial results into a single scalar result, or IFN_LAST
2689       if the operation is a supported reduction operation, but does not have
2690       such an internal function.
2691 
2692    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2693 
2694 static bool
reduction_fn_for_scalar_code(enum tree_code code,internal_fn * reduc_fn)2695 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2696 {
2697   switch (code)
2698     {
2699       case MAX_EXPR:
2700         *reduc_fn = IFN_REDUC_MAX;
2701         return true;
2702 
2703       case MIN_EXPR:
2704         *reduc_fn = IFN_REDUC_MIN;
2705         return true;
2706 
2707       case PLUS_EXPR:
2708         *reduc_fn = IFN_REDUC_PLUS;
2709         return true;
2710 
2711       case BIT_AND_EXPR:
2712           *reduc_fn = IFN_REDUC_AND;
2713           return true;
2714 
2715       case BIT_IOR_EXPR:
2716           *reduc_fn = IFN_REDUC_IOR;
2717           return true;
2718 
2719       case BIT_XOR_EXPR:
2720           *reduc_fn = IFN_REDUC_XOR;
2721           return true;
2722 
2723       case MULT_EXPR:
2724       case MINUS_EXPR:
2725         *reduc_fn = IFN_LAST;
2726         return true;
2727 
2728       default:
2729        return false;
2730     }
2731 }
2732 
2733 /* If there is a neutral value X such that SLP reduction NODE would not
2734    be affected by the introduction of additional X elements, return that X,
2735    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2736    is true if the SLP statements perform a single reduction, false if each
2737    statement performs an independent reduction.  */
2738 
2739 static tree
neutral_op_for_slp_reduction(slp_tree slp_node,tree_code code,bool reduc_chain)2740 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2741                                     bool reduc_chain)
2742 {
2743   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2744   gimple *stmt = stmts[0];
2745   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2746   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2747   tree scalar_type = TREE_TYPE (vector_type);
2748   struct loop *loop = gimple_bb (stmt)->loop_father;
2749   gcc_assert (loop);
2750 
2751   switch (code)
2752     {
2753     case WIDEN_SUM_EXPR:
2754     case DOT_PROD_EXPR:
2755     case SAD_EXPR:
2756     case PLUS_EXPR:
2757     case MINUS_EXPR:
2758     case BIT_IOR_EXPR:
2759     case BIT_XOR_EXPR:
2760       return build_zero_cst (scalar_type);
2761 
2762     case MULT_EXPR:
2763       return build_one_cst (scalar_type);
2764 
2765     case BIT_AND_EXPR:
2766       return build_all_ones_cst (scalar_type);
2767 
2768     case MAX_EXPR:
2769     case MIN_EXPR:
2770       /* For MIN/MAX the initial values are neutral.  A reduction chain
2771            has only a single initial value, so that value is neutral for
2772            all statements.  */
2773       if (reduc_chain)
2774           return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2775       return NULL_TREE;
2776 
2777     default:
2778       return NULL_TREE;
2779     }
2780 }
2781 
2782 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2783    STMT is printed with a message MSG. */
2784 
2785 static void
report_vect_op(dump_flags_t msg_type,gimple * stmt,const char * msg)2786 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2787 {
2788   dump_printf_loc (msg_type, vect_location, "%s", msg);
2789   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2790 }
2791 
2792 
2793 /* Detect SLP reduction of the form:
2794 
2795    #a1 = phi <a5, a0>
2796    a2 = operation (a1)
2797    a3 = operation (a2)
2798    a4 = operation (a3)
2799    a5 = operation (a4)
2800 
2801    #a = phi <a5>
2802 
2803    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2804    FIRST_STMT is the first reduction stmt in the chain
2805    (a2 = operation (a1)).
2806 
2807    Return TRUE if a reduction chain was detected.  */
2808 
2809 static bool
vect_is_slp_reduction(loop_vec_info loop_info,gimple * phi,gimple * first_stmt)2810 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2811                            gimple *first_stmt)
2812 {
2813   struct loop *loop = (gimple_bb (phi))->loop_father;
2814   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2815   enum tree_code code;
2816   gimple *loop_use_stmt = NULL;
2817   stmt_vec_info use_stmt_info;
2818   tree lhs;
2819   imm_use_iterator imm_iter;
2820   use_operand_p use_p;
2821   int nloop_uses, size = 0, n_out_of_loop_uses;
2822   bool found = false;
2823 
2824   if (loop != vect_loop)
2825     return false;
2826 
2827   auto_vec<stmt_vec_info, 8> reduc_chain;
2828   lhs = PHI_RESULT (phi);
2829   code = gimple_assign_rhs_code (first_stmt);
2830   while (1)
2831     {
2832       nloop_uses = 0;
2833       n_out_of_loop_uses = 0;
2834       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2835         {
2836             gimple *use_stmt = USE_STMT (use_p);
2837             if (is_gimple_debug (use_stmt))
2838               continue;
2839 
2840           /* Check if we got back to the reduction phi.  */
2841             if (use_stmt == phi)
2842             {
2843                 loop_use_stmt = use_stmt;
2844               found = true;
2845               break;
2846             }
2847 
2848           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2849             {
2850                 loop_use_stmt = use_stmt;
2851                 nloop_uses++;
2852             }
2853            else
2854              n_out_of_loop_uses++;
2855 
2856            /* There are can be either a single use in the loop or two uses in
2857               phi nodes.  */
2858            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2859              return false;
2860         }
2861 
2862       if (found)
2863         break;
2864 
2865       /* We reached a statement with no loop uses.  */
2866       if (nloop_uses == 0)
2867           return false;
2868 
2869       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2870       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2871         return false;
2872 
2873       if (!is_gimple_assign (loop_use_stmt)
2874             || code != gimple_assign_rhs_code (loop_use_stmt)
2875             || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2876         return false;
2877 
2878       /* Insert USE_STMT into reduction chain.  */
2879       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2880       reduc_chain.safe_push (use_stmt_info);
2881 
2882       lhs = gimple_assign_lhs (loop_use_stmt);
2883       size++;
2884    }
2885 
2886   if (!found || loop_use_stmt != phi || size < 2)
2887     return false;
2888 
2889   /* Swap the operands, if needed, to make the reduction operand be the second
2890      operand.  */
2891   lhs = PHI_RESULT (phi);
2892   for (unsigned i = 0; i < reduc_chain.length (); ++i)
2893     {
2894       gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2895       if (gimple_assign_rhs2 (next_stmt) == lhs)
2896           {
2897             tree op = gimple_assign_rhs1 (next_stmt);
2898             gimple *def_stmt = NULL;
2899 
2900           if (TREE_CODE (op) == SSA_NAME)
2901             def_stmt = SSA_NAME_DEF_STMT (op);
2902 
2903             /* Check that the other def is either defined in the loop
2904                ("vect_internal_def"), or it's an induction (defined by a
2905                loop-header phi-node).  */
2906           if (def_stmt
2907               && gimple_bb (def_stmt)
2908                 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2909               && (is_gimple_assign (def_stmt)
2910                   || is_gimple_call (def_stmt)
2911                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2912                            == vect_induction_def
2913                   || (gimple_code (def_stmt) == GIMPLE_PHI
2914                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2915                                   == vect_internal_def
2916                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2917               {
2918                 lhs = gimple_assign_lhs (next_stmt);
2919                 continue;
2920               }
2921 
2922             return false;
2923           }
2924       else
2925           {
2926           tree op = gimple_assign_rhs2 (next_stmt);
2927             gimple *def_stmt = NULL;
2928 
2929           if (TREE_CODE (op) == SSA_NAME)
2930             def_stmt = SSA_NAME_DEF_STMT (op);
2931 
2932           /* Check that the other def is either defined in the loop
2933             ("vect_internal_def"), or it's an induction (defined by a
2934             loop-header phi-node).  */
2935           if (def_stmt
2936               && gimple_bb (def_stmt)
2937                 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2938               && (is_gimple_assign (def_stmt)
2939                   || is_gimple_call (def_stmt)
2940                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2941                               == vect_induction_def
2942                   || (gimple_code (def_stmt) == GIMPLE_PHI
2943                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2944                                   == vect_internal_def
2945                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2946               {
2947                 if (dump_enabled_p ())
2948                     {
2949                       dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2950                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2951                     }
2952 
2953                 swap_ssa_operands (next_stmt,
2954                                        gimple_assign_rhs1_ptr (next_stmt),
2955                                  gimple_assign_rhs2_ptr (next_stmt));
2956                 update_stmt (next_stmt);
2957 
2958                 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2959                     LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2960               }
2961             else
2962               return false;
2963         }
2964 
2965       lhs = gimple_assign_lhs (next_stmt);
2966     }
2967 
2968   /* Build up the actual chain.  */
2969   for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2970     {
2971       GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]->stmt;
2972       GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]->stmt;
2973     }
2974   GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]->stmt;
2975   GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2976 
2977   /* Save the chain for further analysis in SLP detection.  */
2978   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]->stmt);
2979   GROUP_SIZE (reduc_chain[0]) = size;
2980 
2981   return true;
2982 }
2983 
2984 /* Return true if we need an in-order reduction for operation CODE
2985    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2986    overflow must wrap.  */
2987 
2988 static bool
needs_fold_left_reduction_p(tree type,tree_code code,bool need_wrapping_integral_overflow)2989 needs_fold_left_reduction_p (tree type, tree_code code,
2990                                    bool need_wrapping_integral_overflow)
2991 {
2992   /* CHECKME: check for !flag_finite_math_only too?  */
2993   if (SCALAR_FLOAT_TYPE_P (type))
2994     switch (code)
2995       {
2996       case MIN_EXPR:
2997       case MAX_EXPR:
2998           return false;
2999 
3000       default:
3001           return !flag_associative_math;
3002       }
3003 
3004   if (INTEGRAL_TYPE_P (type))
3005     {
3006       if (!operation_no_trapping_overflow (type, code))
3007           return true;
3008       if (need_wrapping_integral_overflow
3009             && !TYPE_OVERFLOW_WRAPS (type)
3010             && operation_can_overflow (code))
3011           return true;
3012       return false;
3013     }
3014 
3015   if (SAT_FIXED_POINT_TYPE_P (type))
3016     return true;
3017 
3018   return false;
3019 }
3020 
3021 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3022    reduction operation CODE has a handled computation expression.  */
3023 
3024 bool
check_reduction_path(location_t loc,loop_p loop,gphi * phi,tree loop_arg,enum tree_code code)3025 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3026                           enum tree_code code)
3027 {
3028   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3029   auto_bitmap visited;
3030   tree lookfor = PHI_RESULT (phi);
3031   ssa_op_iter curri;
3032   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3033   while (USE_FROM_PTR (curr) != loop_arg)
3034     curr = op_iter_next_use (&curri);
3035   curri.i = curri.numops;
3036   do
3037     {
3038       path.safe_push (std::make_pair (curri, curr));
3039       tree use = USE_FROM_PTR (curr);
3040       if (use == lookfor)
3041           break;
3042       gimple *def = SSA_NAME_DEF_STMT (use);
3043       if (gimple_nop_p (def)
3044             || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3045           {
3046 pop:
3047             do
3048               {
3049                 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3050                 curri = x.first;
3051                 curr = x.second;
3052                 do
3053                     curr = op_iter_next_use (&curri);
3054                 /* Skip already visited or non-SSA operands (from iterating
3055                    over PHI args).  */
3056                 while (curr != NULL_USE_OPERAND_P
3057                          && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3058                                || ! bitmap_set_bit (visited,
3059                                                         SSA_NAME_VERSION
3060                                                           (USE_FROM_PTR (curr)))));
3061               }
3062             while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3063             if (curr == NULL_USE_OPERAND_P)
3064               break;
3065           }
3066       else
3067           {
3068             if (gimple_code (def) == GIMPLE_PHI)
3069               curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3070             else
3071               curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3072             while (curr != NULL_USE_OPERAND_P
3073                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3074                          || ! bitmap_set_bit (visited,
3075                                                     SSA_NAME_VERSION
3076                                                       (USE_FROM_PTR (curr)))))
3077               curr = op_iter_next_use (&curri);
3078             if (curr == NULL_USE_OPERAND_P)
3079               goto pop;
3080           }
3081     }
3082   while (1);
3083   if (dump_file && (dump_flags & TDF_DETAILS))
3084     {
3085       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3086       unsigned i;
3087       std::pair<ssa_op_iter, use_operand_p> *x;
3088       FOR_EACH_VEC_ELT (path, i, x)
3089           {
3090             dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3091             dump_printf (MSG_NOTE, " ");
3092           }
3093       dump_printf (MSG_NOTE, "\n");
3094     }
3095 
3096   /* Check whether the reduction path detected is valid.  */
3097   bool fail = path.length () == 0;
3098   bool neg = false;
3099   for (unsigned i = 1; i < path.length (); ++i)
3100     {
3101       gimple *use_stmt = USE_STMT (path[i].second);
3102       tree op = USE_FROM_PTR (path[i].second);
3103       if (! has_single_use (op)
3104             || ! is_gimple_assign (use_stmt))
3105           {
3106             fail = true;
3107             break;
3108           }
3109       if (gimple_assign_rhs_code (use_stmt) != code)
3110           {
3111             if (code == PLUS_EXPR
3112                 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3113               {
3114                 /* Track whether we negate the reduction value each iteration.  */
3115                 if (gimple_assign_rhs2 (use_stmt) == op)
3116                     neg = ! neg;
3117               }
3118             else
3119               {
3120                 fail = true;
3121                 break;
3122               }
3123           }
3124     }
3125   return ! fail && ! neg;
3126 }
3127 
3128 
3129 /* Function vect_is_simple_reduction
3130 
3131    (1) Detect a cross-iteration def-use cycle that represents a simple
3132    reduction computation.  We look for the following pattern:
3133 
3134    loop_header:
3135      a1 = phi < a0, a2 >
3136      a3 = ...
3137      a2 = operation (a3, a1)
3138 
3139    or
3140 
3141    a3 = ...
3142    loop_header:
3143      a1 = phi < a0, a2 >
3144      a2 = operation (a3, a1)
3145 
3146    such that:
3147    1. operation is commutative and associative and it is safe to
3148       change the order of the computation
3149    2. no uses for a2 in the loop (a2 is used out of the loop)
3150    3. no uses of a1 in the loop besides the reduction operation
3151    4. no uses of a1 outside the loop.
3152 
3153    Conditions 1,4 are tested here.
3154    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3155 
3156    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3157    nested cycles.
3158 
3159    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3160    reductions:
3161 
3162      a1 = phi < a0, a2 >
3163      inner loop (def of a3)
3164      a2 = phi < a3 >
3165 
3166    (4) Detect condition expressions, ie:
3167      for (int i = 0; i < N; i++)
3168        if (a[i] < val)
3169           ret_val = a[i];
3170 
3171 */
3172 
3173 static gimple *
vect_is_simple_reduction(loop_vec_info loop_info,gimple * phi,bool * double_reduc,bool need_wrapping_integral_overflow,enum vect_reduction_type * v_reduc_type)3174 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3175                                 bool *double_reduc,
3176                                 bool need_wrapping_integral_overflow,
3177                                 enum vect_reduction_type *v_reduc_type)
3178 {
3179   struct loop *loop = (gimple_bb (phi))->loop_father;
3180   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3181   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3182   enum tree_code orig_code, code;
3183   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3184   tree type;
3185   int nloop_uses;
3186   tree name;
3187   imm_use_iterator imm_iter;
3188   use_operand_p use_p;
3189   bool phi_def;
3190 
3191   *double_reduc = false;
3192   *v_reduc_type = TREE_CODE_REDUCTION;
3193 
3194   tree phi_name = PHI_RESULT (phi);
3195   /* ???  If there are no uses of the PHI result the inner loop reduction
3196      won't be detected as possibly double-reduction by vectorizable_reduction
3197      because that tries to walk the PHI arg from the preheader edge which
3198      can be constant.  See PR60382.  */
3199   if (has_zero_uses (phi_name))
3200     return NULL;
3201   nloop_uses = 0;
3202   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3203     {
3204       gimple *use_stmt = USE_STMT (use_p);
3205       if (is_gimple_debug (use_stmt))
3206           continue;
3207 
3208       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3209         {
3210           if (dump_enabled_p ())
3211               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3212                                    "intermediate value used outside loop.\n");
3213 
3214           return NULL;
3215         }
3216 
3217       nloop_uses++;
3218       if (nloop_uses > 1)
3219         {
3220           if (dump_enabled_p ())
3221               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3222                                    "reduction value used in loop.\n");
3223           return NULL;
3224         }
3225 
3226       phi_use_stmt = use_stmt;
3227     }
3228 
3229   edge latch_e = loop_latch_edge (loop);
3230   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3231   if (TREE_CODE (loop_arg) != SSA_NAME)
3232     {
3233       if (dump_enabled_p ())
3234           {
3235             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236                                  "reduction: not ssa_name: ");
3237             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3238           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3239           }
3240       return NULL;
3241     }
3242 
3243   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3244   if (is_gimple_assign (def_stmt))
3245     {
3246       name = gimple_assign_lhs (def_stmt);
3247       phi_def = false;
3248     }
3249   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3250     {
3251       name = PHI_RESULT (def_stmt);
3252       phi_def = true;
3253     }
3254   else
3255     {
3256       if (dump_enabled_p ())
3257           {
3258             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3259                                  "reduction: unhandled reduction operation: ");
3260             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3261           }
3262       return NULL;
3263     }
3264 
3265   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3266     return NULL;
3267 
3268   nloop_uses = 0;
3269   auto_vec<gphi *, 3> lcphis;
3270   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3271     {
3272       gimple *use_stmt = USE_STMT (use_p);
3273       if (is_gimple_debug (use_stmt))
3274           continue;
3275       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3276           nloop_uses++;
3277       else
3278           /* We can have more than one loop-closed PHI.  */
3279           lcphis.safe_push (as_a <gphi *> (use_stmt));
3280       if (nloop_uses > 1)
3281           {
3282             if (dump_enabled_p ())
3283               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3284                                    "reduction used in loop.\n");
3285             return NULL;
3286           }
3287     }
3288 
3289   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3290      defined in the inner loop.  */
3291   if (phi_def)
3292     {
3293       op1 = PHI_ARG_DEF (def_stmt, 0);
3294 
3295       if (gimple_phi_num_args (def_stmt) != 1
3296           || TREE_CODE (op1) != SSA_NAME)
3297         {
3298           if (dump_enabled_p ())
3299               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3300                                    "unsupported phi node definition.\n");
3301 
3302           return NULL;
3303         }
3304 
3305       def1 = SSA_NAME_DEF_STMT (op1);
3306       if (gimple_bb (def1)
3307             && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3308           && loop->inner
3309           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3310           && is_gimple_assign (def1)
3311             && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3312         {
3313           if (dump_enabled_p ())
3314             report_vect_op (MSG_NOTE, def_stmt,
3315                                   "detected double reduction: ");
3316 
3317           *double_reduc = true;
3318           return def_stmt;
3319         }
3320 
3321       return NULL;
3322     }
3323 
3324   /* If we are vectorizing an inner reduction we are executing that
3325      in the original order only in case we are not dealing with a
3326      double reduction.  */
3327   bool check_reduction = true;
3328   if (flow_loop_nested_p (vect_loop, loop))
3329     {
3330       gphi *lcphi;
3331       unsigned i;
3332       check_reduction = false;
3333       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3334           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3335             {
3336               gimple *use_stmt = USE_STMT (use_p);
3337               if (is_gimple_debug (use_stmt))
3338                 continue;
3339               if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3340                 check_reduction = true;
3341             }
3342     }
3343 
3344   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3345   code = orig_code = gimple_assign_rhs_code (def_stmt);
3346 
3347   /* We can handle "res -= x[i]", which is non-associative by
3348      simply rewriting this into "res += -x[i]".  Avoid changing
3349      gimple instruction for the first simple tests and only do this
3350      if we're allowed to change code at all.  */
3351   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3352     code = PLUS_EXPR;
3353 
3354   if (code == COND_EXPR)
3355     {
3356       if (! nested_in_vect_loop)
3357           *v_reduc_type = COND_REDUCTION;
3358 
3359       op3 = gimple_assign_rhs1 (def_stmt);
3360       if (COMPARISON_CLASS_P (op3))
3361         {
3362           op4 = TREE_OPERAND (op3, 1);
3363           op3 = TREE_OPERAND (op3, 0);
3364         }
3365       if (op3 == phi_name || op4 == phi_name)
3366           {
3367             if (dump_enabled_p ())
3368               report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3369                                   "reduction: condition depends on previous"
3370                                   " iteration: ");
3371             return NULL;
3372           }
3373 
3374       op1 = gimple_assign_rhs2 (def_stmt);
3375       op2 = gimple_assign_rhs3 (def_stmt);
3376     }
3377   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3378     {
3379       if (dump_enabled_p ())
3380           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3381                               "reduction: not commutative/associative: ");
3382       return NULL;
3383     }
3384   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3385     {
3386       op1 = gimple_assign_rhs1 (def_stmt);
3387       op2 = gimple_assign_rhs2 (def_stmt);
3388     }
3389   else
3390     {
3391       if (dump_enabled_p ())
3392           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3393                               "reduction: not handled operation: ");
3394       return NULL;
3395     }
3396 
3397   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3398     {
3399       if (dump_enabled_p ())
3400           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3401                               "reduction: both uses not ssa_names: ");
3402 
3403       return NULL;
3404     }
3405 
3406   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3407   if ((TREE_CODE (op1) == SSA_NAME
3408        && !types_compatible_p (type,TREE_TYPE (op1)))
3409       || (TREE_CODE (op2) == SSA_NAME
3410           && !types_compatible_p (type, TREE_TYPE (op2)))
3411       || (op3 && TREE_CODE (op3) == SSA_NAME
3412           && !types_compatible_p (type, TREE_TYPE (op3)))
3413       || (op4 && TREE_CODE (op4) == SSA_NAME
3414           && !types_compatible_p (type, TREE_TYPE (op4))))
3415     {
3416       if (dump_enabled_p ())
3417         {
3418           dump_printf_loc (MSG_NOTE, vect_location,
3419                                  "reduction: multiple types: operation type: ");
3420           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3421           dump_printf (MSG_NOTE, ", operands types: ");
3422           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3423                                    TREE_TYPE (op1));
3424           dump_printf (MSG_NOTE, ",");
3425           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3426                                    TREE_TYPE (op2));
3427           if (op3)
3428             {
3429               dump_printf (MSG_NOTE, ",");
3430               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3431                                          TREE_TYPE (op3));
3432             }
3433 
3434           if (op4)
3435             {
3436               dump_printf (MSG_NOTE, ",");
3437               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3438                                          TREE_TYPE (op4));
3439             }
3440           dump_printf (MSG_NOTE, "\n");
3441         }
3442 
3443       return NULL;
3444     }
3445 
3446   /* Check whether it's ok to change the order of the computation.
3447      Generally, when vectorizing a reduction we change the order of the
3448      computation.  This may change the behavior of the program in some
3449      cases, so we need to check that this is ok.  One exception is when
3450      vectorizing an outer-loop: the inner-loop is executed sequentially,
3451      and therefore vectorizing reductions in the inner-loop during
3452      outer-loop vectorization is safe.  */
3453   if (check_reduction
3454       && *v_reduc_type == TREE_CODE_REDUCTION
3455       && needs_fold_left_reduction_p (type, code,
3456                                               need_wrapping_integral_overflow))
3457     *v_reduc_type = FOLD_LEFT_REDUCTION;
3458 
3459   /* Reduction is safe. We're dealing with one of the following:
3460      1) integer arithmetic and no trapv
3461      2) floating point arithmetic, and special flags permit this optimization
3462      3) nested cycle (i.e., outer loop vectorization).  */
3463   if (TREE_CODE (op1) == SSA_NAME)
3464     def1 = SSA_NAME_DEF_STMT (op1);
3465 
3466   if (TREE_CODE (op2) == SSA_NAME)
3467     def2 = SSA_NAME_DEF_STMT (op2);
3468 
3469   if (code != COND_EXPR
3470       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3471     {
3472       if (dump_enabled_p ())
3473           report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3474       return NULL;
3475     }
3476 
3477   /* Check that one def is the reduction def, defined by PHI,
3478      the other def is either defined in the loop ("vect_internal_def"),
3479      or it's an induction (defined by a loop-header phi-node).  */
3480 
3481   if (def2 && def2 == phi
3482       && (code == COND_EXPR
3483             || !def1 || gimple_nop_p (def1)
3484             || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3485           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3486               && (is_gimple_assign (def1)
3487                       || is_gimple_call (def1)
3488                     || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3489                       == vect_induction_def
3490                     || (gimple_code (def1) == GIMPLE_PHI
3491                         && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3492                           == vect_internal_def
3493                         && !is_loop_header_bb_p (gimple_bb (def1)))))))
3494     {
3495       if (dump_enabled_p ())
3496           report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3497       return def_stmt;
3498     }
3499 
3500   if (def1 && def1 == phi
3501       && (code == COND_EXPR
3502             || !def2 || gimple_nop_p (def2)
3503             || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3504             || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3505                 && (is_gimple_assign (def2)
3506                       || is_gimple_call (def2)
3507                       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3508                            == vect_induction_def
3509                       || (gimple_code (def2) == GIMPLE_PHI
3510                           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3511                                  == vect_internal_def
3512                           && !is_loop_header_bb_p (gimple_bb (def2)))))))
3513     {
3514       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3515           {
3516             /* Check if we can swap operands (just for simplicity - so that
3517                the rest of the code can assume that the reduction variable
3518                is always the last (second) argument).  */
3519             if (code == COND_EXPR)
3520               {
3521                 /* Swap cond_expr by inverting the condition.  */
3522                 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3523                 enum tree_code invert_code = ERROR_MARK;
3524                 enum tree_code cond_code = TREE_CODE (cond_expr);
3525 
3526                 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3527                     {
3528                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3529                       invert_code = invert_tree_comparison (cond_code, honor_nans);
3530                     }
3531                 if (invert_code != ERROR_MARK)
3532                     {
3533                       TREE_SET_CODE (cond_expr, invert_code);
3534                       swap_ssa_operands (def_stmt,
3535                                              gimple_assign_rhs2_ptr (def_stmt),
3536                                              gimple_assign_rhs3_ptr (def_stmt));
3537                     }
3538                 else
3539                     {
3540                       if (dump_enabled_p ())
3541                         report_vect_op (MSG_NOTE, def_stmt,
3542                                             "detected reduction: cannot swap operands "
3543                                             "for cond_expr");
3544                       return NULL;
3545                     }
3546               }
3547             else
3548               swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3549                                      gimple_assign_rhs2_ptr (def_stmt));
3550 
3551             if (dump_enabled_p ())
3552               report_vect_op (MSG_NOTE, def_stmt,
3553                                   "detected reduction: need to swap operands: ");
3554 
3555             if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3556               LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3557         }
3558       else
3559         {
3560           if (dump_enabled_p ())
3561             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3562         }
3563 
3564       return def_stmt;
3565     }
3566 
3567   /* Try to find SLP reduction chain.  */
3568   if (! nested_in_vect_loop
3569       && code != COND_EXPR
3570       && orig_code != MINUS_EXPR
3571       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3572     {
3573       if (dump_enabled_p ())
3574         report_vect_op (MSG_NOTE, def_stmt,
3575                               "reduction: detected reduction chain: ");
3576 
3577       return def_stmt;
3578     }
3579 
3580   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3581   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3582   while (first)
3583     {
3584       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3585       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3586       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3587       first = next;
3588     }
3589 
3590   /* Look for the expression computing loop_arg from loop PHI result.  */
3591   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3592                                   code))
3593     return def_stmt;
3594 
3595   if (dump_enabled_p ())
3596     {
3597       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3598                           "reduction: unknown pattern: ");
3599     }
3600 
3601   return NULL;
3602 }
3603 
3604 /* Wrapper around vect_is_simple_reduction, which will modify code
3605    in-place if it enables detection of more reductions.  Arguments
3606    as there.  */
3607 
3608 gimple *
vect_force_simple_reduction(loop_vec_info loop_info,gimple * phi,bool * double_reduc,bool need_wrapping_integral_overflow)3609 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3610                                    bool *double_reduc,
3611                                    bool need_wrapping_integral_overflow)
3612 {
3613   enum vect_reduction_type v_reduc_type;
3614   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3615                                                     need_wrapping_integral_overflow,
3616                                                     &v_reduc_type);
3617   if (def)
3618     {
3619       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3620       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3621       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3622       reduc_def_info = vinfo_for_stmt (def);
3623       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3624       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3625     }
3626   return def;
3627 }
3628 
3629 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3630 int
vect_get_known_peeling_cost(loop_vec_info loop_vinfo,int peel_iters_prologue,int * peel_iters_epilogue,stmt_vector_for_cost * scalar_cost_vec,stmt_vector_for_cost * prologue_cost_vec,stmt_vector_for_cost * epilogue_cost_vec)3631 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3632                              int *peel_iters_epilogue,
3633                              stmt_vector_for_cost *scalar_cost_vec,
3634                                    stmt_vector_for_cost *prologue_cost_vec,
3635                                    stmt_vector_for_cost *epilogue_cost_vec)
3636 {
3637   int retval = 0;
3638   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3639 
3640   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3641     {
3642       *peel_iters_epilogue = assumed_vf / 2;
3643       if (dump_enabled_p ())
3644         dump_printf_loc (MSG_NOTE, vect_location,
3645                                "cost model: epilogue peel iters set to vf/2 "
3646                                "because loop iterations are unknown .\n");
3647 
3648       /* If peeled iterations are known but number of scalar loop
3649          iterations are unknown, count a taken branch per peeled loop.  */
3650       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3651                                          NULL, 0, vect_prologue);
3652       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3653                                          NULL, 0, vect_epilogue);
3654     }
3655   else
3656     {
3657       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3658       peel_iters_prologue = niters < peel_iters_prologue ?
3659                             niters : peel_iters_prologue;
3660       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3661       /* If we need to peel for gaps, but no peeling is required, we have to
3662            peel VF iterations.  */
3663       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3664           *peel_iters_epilogue = assumed_vf;
3665     }
3666 
3667   stmt_info_for_cost *si;
3668   int j;
3669   if (peel_iters_prologue)
3670     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3671           {
3672             stmt_vec_info stmt_info
3673               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3674             retval += record_stmt_cost (prologue_cost_vec,
3675                                               si->count * peel_iters_prologue,
3676                                               si->kind, stmt_info, si->misalign,
3677                                               vect_prologue);
3678           }
3679   if (*peel_iters_epilogue)
3680     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3681           {
3682             stmt_vec_info stmt_info
3683               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3684             retval += record_stmt_cost (epilogue_cost_vec,
3685                                               si->count * *peel_iters_epilogue,
3686                                               si->kind, stmt_info, si->misalign,
3687                                               vect_epilogue);
3688           }
3689 
3690   return retval;
3691 }
3692 
3693 /* Function vect_estimate_min_profitable_iters
3694 
3695    Return the number of iterations required for the vector version of the
3696    loop to be profitable relative to the cost of the scalar version of the
3697    loop.
3698 
3699    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3700    of iterations for vectorization.  -1 value means loop vectorization
3701    is not profitable.  This returned value may be used for dynamic
3702    profitability check.
3703 
3704    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3705    for static check against estimated number of iterations.  */
3706 
3707 static void
vect_estimate_min_profitable_iters(loop_vec_info loop_vinfo,int * ret_min_profitable_niters,int * ret_min_profitable_estimate)3708 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3709                                             int *ret_min_profitable_niters,
3710                                             int *ret_min_profitable_estimate)
3711 {
3712   int min_profitable_iters;
3713   int min_profitable_estimate;
3714   int peel_iters_prologue;
3715   int peel_iters_epilogue;
3716   unsigned vec_inside_cost = 0;
3717   int vec_outside_cost = 0;
3718   unsigned vec_prologue_cost = 0;
3719   unsigned vec_epilogue_cost = 0;
3720   int scalar_single_iter_cost = 0;
3721   int scalar_outside_cost = 0;
3722   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3723   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3724   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3725 
3726   /* Cost model disabled.  */
3727   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3728     {
3729       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3730       *ret_min_profitable_niters = 0;
3731       *ret_min_profitable_estimate = 0;
3732       return;
3733     }
3734 
3735   /* Requires loop versioning tests to handle misalignment.  */
3736   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3737     {
3738       /*  FIXME: Make cost depend on complexity of individual check.  */
3739       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3740       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3741                                   vect_prologue);
3742       dump_printf (MSG_NOTE,
3743                    "cost model: Adding cost of checks for loop "
3744                    "versioning to treat misalignment.\n");
3745     }
3746 
3747   /* Requires loop versioning with alias checks.  */
3748   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3749     {
3750       /*  FIXME: Make cost depend on complexity of individual check.  */
3751       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3752       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3753                                   vect_prologue);
3754       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3755       if (len)
3756           /* Count LEN - 1 ANDs and LEN comparisons.  */
3757           (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3758                                     NULL, 0, vect_prologue);
3759       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3760       if (len)
3761           {
3762             /* Count LEN - 1 ANDs and LEN comparisons.  */
3763             unsigned int nstmts = len * 2 - 1;
3764             /* +1 for each bias that needs adding.  */
3765             for (unsigned int i = 0; i < len; ++i)
3766               if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3767                 nstmts += 1;
3768             (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3769                                         NULL, 0, vect_prologue);
3770           }
3771       dump_printf (MSG_NOTE,
3772                    "cost model: Adding cost of checks for loop "
3773                    "versioning aliasing.\n");
3774     }
3775 
3776   /* Requires loop versioning with niter checks.  */
3777   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3778     {
3779       /*  FIXME: Make cost depend on complexity of individual check.  */
3780       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3781                                   vect_prologue);
3782       dump_printf (MSG_NOTE,
3783                        "cost model: Adding cost of checks for loop "
3784                        "versioning niters.\n");
3785     }
3786 
3787   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3788     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3789                                 vect_prologue);
3790 
3791   /* Count statements in scalar loop.  Using this as scalar cost for a single
3792      iteration for now.
3793 
3794      TODO: Add outer loop support.
3795 
3796      TODO: Consider assigning different costs to different scalar
3797      statements.  */
3798 
3799   scalar_single_iter_cost
3800     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3801 
3802   /* Add additional cost for the peeled instructions in prologue and epilogue
3803      loop.  (For fully-masked loops there will be no peeling.)
3804 
3805      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3806      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3807 
3808      TODO: Build an expression that represents peel_iters for prologue and
3809      epilogue to be used in a run-time test.  */
3810 
3811   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3812     {
3813       peel_iters_prologue = 0;
3814       peel_iters_epilogue = 0;
3815 
3816       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3817           {
3818             /* We need to peel exactly one iteration.  */
3819             peel_iters_epilogue += 1;
3820             stmt_info_for_cost *si;
3821             int j;
3822             FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3823                                   j, si)
3824               {
3825                 struct _stmt_vec_info *stmt_info
3826                     = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3827                 (void) add_stmt_cost (target_cost_data, si->count,
3828                                             si->kind, stmt_info, si->misalign,
3829                                             vect_epilogue);
3830               }
3831           }
3832     }
3833   else if (npeel < 0)
3834     {
3835       peel_iters_prologue = assumed_vf / 2;
3836       dump_printf (MSG_NOTE, "cost model: "
3837                    "prologue peel iters set to vf/2.\n");
3838 
3839       /* If peeling for alignment is unknown, loop bound of main loop becomes
3840          unknown.  */
3841       peel_iters_epilogue = assumed_vf / 2;
3842       dump_printf (MSG_NOTE, "cost model: "
3843                    "epilogue peel iters set to vf/2 because "
3844                    "peeling for alignment is unknown.\n");
3845 
3846       /* If peeled iterations are unknown, count a taken branch and a not taken
3847          branch per peeled loop. Even if scalar loop iterations are known,
3848          vector iterations are not known since peeled prologue iterations are
3849          not known. Hence guards remain the same.  */
3850       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3851                                   NULL, 0, vect_prologue);
3852       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3853                                   NULL, 0, vect_prologue);
3854       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3855                                   NULL, 0, vect_epilogue);
3856       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3857                                   NULL, 0, vect_epilogue);
3858       stmt_info_for_cost *si;
3859       int j;
3860       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3861           {
3862             struct _stmt_vec_info *stmt_info
3863               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3864             (void) add_stmt_cost (target_cost_data,
3865                                         si->count * peel_iters_prologue,
3866                                         si->kind, stmt_info, si->misalign,
3867                                         vect_prologue);
3868             (void) add_stmt_cost (target_cost_data,
3869                                         si->count * peel_iters_epilogue,
3870                                         si->kind, stmt_info, si->misalign,
3871                                         vect_epilogue);
3872           }
3873     }
3874   else
3875     {
3876       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3877       stmt_info_for_cost *si;
3878       int j;
3879       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3880 
3881       prologue_cost_vec.create (2);
3882       epilogue_cost_vec.create (2);
3883       peel_iters_prologue = npeel;
3884 
3885       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3886                                                     &peel_iters_epilogue,
3887                                                     &LOOP_VINFO_SCALAR_ITERATION_COST
3888                                                       (loop_vinfo),
3889                                                     &prologue_cost_vec,
3890                                                     &epilogue_cost_vec);
3891 
3892       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3893           {
3894             struct _stmt_vec_info *stmt_info
3895               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3896             (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3897                                         si->misalign, vect_prologue);
3898           }
3899 
3900       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3901           {
3902             struct _stmt_vec_info *stmt_info
3903               = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3904             (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3905                                         si->misalign, vect_epilogue);
3906           }
3907 
3908       prologue_cost_vec.release ();
3909       epilogue_cost_vec.release ();
3910     }
3911 
3912   /* FORNOW: The scalar outside cost is incremented in one of the
3913      following ways:
3914 
3915      1. The vectorizer checks for alignment and aliasing and generates
3916      a condition that allows dynamic vectorization.  A cost model
3917      check is ANDED with the versioning condition.  Hence scalar code
3918      path now has the added cost of the versioning check.
3919 
3920        if (cost > th & versioning_check)
3921          jmp to vector code
3922 
3923      Hence run-time scalar is incremented by not-taken branch cost.
3924 
3925      2. The vectorizer then checks if a prologue is required.  If the
3926      cost model check was not done before during versioning, it has to
3927      be done before the prologue check.
3928 
3929        if (cost <= th)
3930          prologue = scalar_iters
3931        if (prologue == 0)
3932          jmp to vector code
3933        else
3934          execute prologue
3935        if (prologue == num_iters)
3936            go to exit
3937 
3938      Hence the run-time scalar cost is incremented by a taken branch,
3939      plus a not-taken branch, plus a taken branch cost.
3940 
3941      3. The vectorizer then checks if an epilogue is required.  If the
3942      cost model check was not done before during prologue check, it
3943      has to be done with the epilogue check.
3944 
3945        if (prologue == 0)
3946          jmp to vector code
3947        else
3948          execute prologue
3949        if (prologue == num_iters)
3950            go to exit
3951        vector code:
3952          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3953            jmp to epilogue
3954 
3955      Hence the run-time scalar cost should be incremented by 2 taken
3956      branches.
3957 
3958      TODO: The back end may reorder the BBS's differently and reverse
3959      conditions/branch directions.  Change the estimates below to
3960      something more reasonable.  */
3961 
3962   /* If the number of iterations is known and we do not do versioning, we can
3963      decide whether to vectorize at compile time.  Hence the scalar version
3964      do not carry cost model guard costs.  */
3965   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3966       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3967     {
3968       /* Cost model check occurs at versioning.  */
3969       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3970           scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3971       else
3972           {
3973             /* Cost model check occurs at prologue generation.  */
3974             if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3975               scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3976                 + vect_get_stmt_cost (cond_branch_not_taken);
3977             /* Cost model check occurs at epilogue generation.  */
3978             else
3979               scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3980           }
3981     }
3982 
3983   /* Complete the target-specific cost calculations.  */
3984   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3985                  &vec_inside_cost, &vec_epilogue_cost);
3986 
3987   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3988 
3989   if (dump_enabled_p ())
3990     {
3991       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3992       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3993                    vec_inside_cost);
3994       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3995                    vec_prologue_cost);
3996       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3997                    vec_epilogue_cost);
3998       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3999                    scalar_single_iter_cost);
4000       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4001                    scalar_outside_cost);
4002       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4003                    vec_outside_cost);
4004       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4005                    peel_iters_prologue);
4006       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4007                    peel_iters_epilogue);
4008     }
4009 
4010   /* Calculate number of iterations required to make the vector version
4011      profitable, relative to the loop bodies only.  The following condition
4012      must hold true:
4013      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4014      where
4015      SIC = scalar iteration cost, VIC = vector iteration cost,
4016      VOC = vector outside cost, VF = vectorization factor,
4017      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4018      SOC = scalar outside cost for run time cost model check.  */
4019 
4020   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4021     {
4022       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4023                                     * assumed_vf
4024                                     - vec_inside_cost * peel_iters_prologue
4025                                     - vec_inside_cost * peel_iters_epilogue);
4026       if (min_profitable_iters <= 0)
4027         min_profitable_iters = 0;
4028       else
4029           {
4030             min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4031                                            - vec_inside_cost);
4032 
4033             if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4034                 <= (((int) vec_inside_cost * min_profitable_iters)
4035                       + (((int) vec_outside_cost - scalar_outside_cost)
4036                          * assumed_vf)))
4037               min_profitable_iters++;
4038           }
4039     }
4040   /* vector version will never be profitable.  */
4041   else
4042     {
4043       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4044           warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4045                         "did not happen for a simd loop");
4046 
4047       if (dump_enabled_p ())
4048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4049                                "cost model: the vector iteration cost = %d "
4050                                "divided by the scalar iteration cost = %d "
4051                                "is greater or equal to the vectorization factor = %d"
4052                          ".\n",
4053                                vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4054       *ret_min_profitable_niters = -1;
4055       *ret_min_profitable_estimate = -1;
4056       return;
4057     }
4058 
4059   dump_printf (MSG_NOTE,
4060                  "  Calculated minimum iters for profitability: %d\n",
4061                  min_profitable_iters);
4062 
4063   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4064       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4065     /* We want the vectorized loop to execute at least once.  */
4066     min_profitable_iters = assumed_vf + peel_iters_prologue;
4067 
4068   if (dump_enabled_p ())
4069     dump_printf_loc (MSG_NOTE, vect_location,
4070                      "  Runtime profitability threshold = %d\n",
4071                      min_profitable_iters);
4072 
4073   *ret_min_profitable_niters = min_profitable_iters;
4074 
4075   /* Calculate number of iterations required to make the vector version
4076      profitable, relative to the loop bodies only.
4077 
4078      Non-vectorized variant is SIC * niters and it must win over vector
4079      variant on the expected loop trip count.  The following condition must hold true:
4080      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4081 
4082   if (vec_outside_cost <= 0)
4083     min_profitable_estimate = 0;
4084   else
4085     {
4086       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4087                                          * assumed_vf
4088                                          - vec_inside_cost * peel_iters_prologue
4089                                          - vec_inside_cost * peel_iters_epilogue)
4090                                          / ((scalar_single_iter_cost * assumed_vf)
4091                                            - vec_inside_cost);
4092     }
4093   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4094   if (dump_enabled_p ())
4095     dump_printf_loc (MSG_NOTE, vect_location,
4096                          "  Static estimate profitability threshold = %d\n",
4097                          min_profitable_estimate);
4098 
4099   *ret_min_profitable_estimate = min_profitable_estimate;
4100 }
4101 
4102 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4103    vector elements (not bits) for a vector with NELT elements.  */
4104 static void
calc_vec_perm_mask_for_shift(unsigned int offset,unsigned int nelt,vec_perm_builder * sel)4105 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4106                                     vec_perm_builder *sel)
4107 {
4108   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4109      by vec_perm_indices.  */
4110   sel->new_vector (nelt, 1, 3);
4111   for (unsigned int i = 0; i < 3; i++)
4112     sel->quick_push (i + offset);
4113 }
4114 
4115 /* Checks whether the target supports whole-vector shifts for vectors of mode
4116    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4117    it supports vec_perm_const with masks for all necessary shift amounts.  */
4118 static bool
have_whole_vector_shift(machine_mode mode)4119 have_whole_vector_shift (machine_mode mode)
4120 {
4121   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4122     return true;
4123 
4124   /* Variable-length vectors should be handled via the optab.  */
4125   unsigned int nelt;
4126   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4127     return false;
4128 
4129   vec_perm_builder sel;
4130   vec_perm_indices indices;
4131   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4132     {
4133       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4134       indices.new_vector (sel, 2, nelt);
4135       if (!can_vec_perm_const_p (mode, indices, false))
4136           return false;
4137     }
4138   return true;
4139 }
4140 
/* TODO: The vect_model_*_cost functions depend closely on the
   vectorizable_* functions.  Design this better to avoid maintenance
   issues.  */
4143 
4144 /* Function vect_model_reduction_cost.
4145 
4146    Models cost for a reduction operation, including the vector ops
4147    generated within the strip-mine loop, the initial definition before
4148    the loop, and the epilogue code that must be generated.  */
4149 
4150 static void
vect_model_reduction_cost(stmt_vec_info stmt_info,internal_fn reduc_fn,int ncopies)4151 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4152                                  int ncopies)
4153 {
4154   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4155   enum tree_code code;
4156   optab optab;
4157   tree vectype;
4158   gimple *orig_stmt;
4159   machine_mode mode;
4160   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4161   struct loop *loop = NULL;
4162   void *target_cost_data;
4163 
4164   if (loop_vinfo)
4165     {
4166       loop = LOOP_VINFO_LOOP (loop_vinfo);
4167       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4168     }
4169   else
4170     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4171 
4172   /* Condition reductions generate two reductions in the loop.  */
4173   vect_reduction_type reduction_type
4174     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4175   if (reduction_type == COND_REDUCTION)
4176     ncopies *= 2;
4177 
4178   vectype = STMT_VINFO_VECTYPE (stmt_info);
4179   mode = TYPE_MODE (vectype);
4180   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4181 
4182   if (!orig_stmt)
4183     orig_stmt = STMT_VINFO_STMT (stmt_info);
4184 
4185   code = gimple_assign_rhs_code (orig_stmt);
4186 
4187   if (reduction_type == EXTRACT_LAST_REDUCTION
4188       || reduction_type == FOLD_LEFT_REDUCTION)
4189     {
4190       /* No extra instructions needed in the prologue.  */
4191       prologue_cost = 0;
4192 
4193       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4194           /* Count one reduction-like operation per vector.  */
4195           inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4196                                              stmt_info, 0, vect_body);
4197       else
4198           {
4199             /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4200             unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4201             inside_cost = add_stmt_cost (target_cost_data,  nelements,
4202                                                vec_to_scalar, stmt_info, 0,
4203                                                vect_body);
4204             inside_cost += add_stmt_cost (target_cost_data,  nelements,
4205                                                   scalar_stmt, stmt_info, 0,
4206                                                   vect_body);
4207           }
4208     }
4209   else
4210     {
4211       /* Add in cost for initial definition.
4212            For cond reduction we have four vectors: initial index, step,
4213            initial result of the data reduction, initial value of the index
4214            reduction.  */
4215       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4216       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4217                                               scalar_to_vec, stmt_info, 0,
4218                                               vect_prologue);
4219 
4220       /* Cost of reduction op inside loop.  */
4221       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4222                                            stmt_info, 0, vect_body);
4223     }
4224 
4225   /* Determine cost of epilogue code.
4226 
4227      We have a reduction operator that will reduce the vector in one statement.
4228      Also requires scalar extract.  */
4229 
4230   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4231     {
4232       if (reduc_fn != IFN_LAST)
4233           {
4234             if (reduction_type == COND_REDUCTION)
4235               {
4236                 /* An EQ stmt and an COND_EXPR stmt.  */
4237                 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4238                                                         vector_stmt, stmt_info, 0,
4239                                                         vect_epilogue);
4240                 /* Reduction of the max index and a reduction of the found
4241                      values.  */
4242                 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4243                                                         vec_to_scalar, stmt_info, 0,
4244                                                         vect_epilogue);
4245                 /* A broadcast of the max value.  */
4246                 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4247                                                         scalar_to_vec, stmt_info, 0,
4248                                                         vect_epilogue);
4249               }
4250             else
4251               {
4252                 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4253                                                         stmt_info, 0, vect_epilogue);
4254                 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4255                                                         vec_to_scalar, stmt_info, 0,
4256                                                         vect_epilogue);
4257               }
4258           }
4259       else if (reduction_type == COND_REDUCTION)
4260           {
4261             unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4262             /* Extraction of scalar elements.  */
4263             epilogue_cost += add_stmt_cost (target_cost_data,
4264                                                     2 * estimated_nunits,
4265                                                     vec_to_scalar, stmt_info, 0,
4266                                                     vect_epilogue);
4267             /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4268             epilogue_cost += add_stmt_cost (target_cost_data,
4269                                                     2 * estimated_nunits - 3,
4270                                                     scalar_stmt, stmt_info, 0,
4271                                                     vect_epilogue);
4272           }
4273       else if (reduction_type == EXTRACT_LAST_REDUCTION
4274                  || reduction_type == FOLD_LEFT_REDUCTION)
4275           /* No extra instructions need in the epilogue.  */
4276           ;
4277       else
4278           {
4279             int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4280             tree bitsize =
4281               TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4282             int element_bitsize = tree_to_uhwi (bitsize);
4283             int nelements = vec_size_in_bits / element_bitsize;
4284 
4285             if (code == COND_EXPR)
4286               code = MAX_EXPR;
4287 
4288             optab = optab_for_tree_code (code, vectype, optab_default);
4289 
4290             /* We have a whole vector shift available.  */
4291             if (optab != unknown_optab
4292                 && VECTOR_MODE_P (mode)
4293                 && optab_handler (optab, mode) != CODE_FOR_nothing
4294                 && have_whole_vector_shift (mode))
4295               {
4296                 /* Final reduction via vector shifts and the reduction operator.
4297                      Also requires scalar extract.  */
4298                 epilogue_cost += add_stmt_cost (target_cost_data,
4299                                                         exact_log2 (nelements) * 2,
4300                                                         vector_stmt, stmt_info, 0,
4301                                                         vect_epilogue);
4302                 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4303                                                         vec_to_scalar, stmt_info, 0,
4304                                                         vect_epilogue);
4305               }
4306             else
4307               /* Use extracts and reduction op for final reduction.  For N
4308                  elements, we have N extracts and N-1 reduction ops.  */
4309               epilogue_cost += add_stmt_cost (target_cost_data,
4310                                                       nelements + nelements - 1,
4311                                                       vector_stmt, stmt_info, 0,
4312                                                       vect_epilogue);
4313           }
4314     }
4315 
4316   if (dump_enabled_p ())
4317     dump_printf (MSG_NOTE,
4318                  "vect_model_reduction_cost: inside_cost = %d, "
4319                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4320                  prologue_cost, epilogue_cost);
4321 }
4322 
4323 
4324 /* Function vect_model_induction_cost.
4325 
4326    Models cost for induction operations.  */
4327 
4328 static void
vect_model_induction_cost(stmt_vec_info stmt_info,int ncopies)4329 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4330 {
4331   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4332   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4333   unsigned inside_cost, prologue_cost;
4334 
4335   if (PURE_SLP_STMT (stmt_info))
4336     return;
4337 
4338   /* loop cost for vec_loop.  */
4339   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4340                                      stmt_info, 0, vect_body);
4341 
4342   /* prologue cost for vec_init and vec_step.  */
4343   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4344                                          stmt_info, 0, vect_prologue);
4345 
4346   if (dump_enabled_p ())
4347     dump_printf_loc (MSG_NOTE, vect_location,
4348                      "vect_model_induction_cost: inside_cost = %d, "
4349                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4350 }
4351 
4352 
4353 
4354 /* Function get_initial_def_for_reduction
4355 
4356    Input:
4357    STMT - a stmt that performs a reduction operation in the loop.
4358    INIT_VAL - the initial value of the reduction variable
4359 
4360    Output:
4361    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4362         of the reduction (used for adjusting the epilog - see below).
4363    Return a vector variable, initialized according to the operation that STMT
4364         performs. This vector will be used as the initial value of the
4365         vector of partial results.
4366 
4367    Option1 (adjust in epilog): Initialize the vector as follows:
4368      add/bit or/xor:    [0,0,...,0,0]
4369      mult/bit and:      [1,1,...,1,1]
4370      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4371    and when necessary (e.g. add/mult case) let the caller know
4372    that it needs to adjust the result by init_val.
4373 
4374    Option2: Initialize the vector as follows:
4375      add/bit or/xor:    [init_val,0,0,...,0]
4376      mult/bit and:      [init_val,1,1,...,1]
4377      min/max/cond_expr: [init_val,init_val,...,init_val]
4378    and no adjustments are needed.
4379 
4380    For example, for the following code:
4381 
4382    s = init_val;
4383    for (i=0;i<n;i++)
4384      s = s + a[i];
4385 
4386    STMT is 's = s + a[i]', and the reduction variable is 's'.
4387    For a vector of 4 units, we want to return either [0,0,0,init_val],
4388    or [0,0,0,0] and let the caller know that it needs to adjust
4389    the result at the end by 'init_val'.
4390 
4391    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4392    initialization vector is simpler (same element in all entries), if
4393    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4394 
4395    A cost model should help decide between these two schemes.  */
4396 
4397 tree
get_initial_def_for_reduction(gimple * stmt,tree init_val,tree * adjustment_def)4398 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4399                                tree *adjustment_def)
4400 {
4401   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4402   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4403   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4404   tree scalar_type = TREE_TYPE (init_val);
4405   tree vectype = get_vectype_for_scalar_type (scalar_type);
4406   enum tree_code code = gimple_assign_rhs_code (stmt);
4407   tree def_for_init;
4408   tree init_def;
4409   bool nested_in_vect_loop = false;
4410   REAL_VALUE_TYPE real_init_val = dconst0;
4411   int int_init_val = 0;
4412   gimple *def_stmt = NULL;
4413   gimple_seq stmts = NULL;
4414 
4415   gcc_assert (vectype);
4416 
4417   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4418                 || SCALAR_FLOAT_TYPE_P (scalar_type));
4419 
4420   if (nested_in_vect_loop_p (loop, stmt))
4421     nested_in_vect_loop = true;
4422   else
4423     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4424 
4425   /* In case of double reduction we only create a vector variable to be put
4426      in the reduction phi node.  The actual statement creation is done in
4427      vect_create_epilog_for_reduction.  */
4428   if (adjustment_def && nested_in_vect_loop
4429       && TREE_CODE (init_val) == SSA_NAME
4430       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4431       && gimple_code (def_stmt) == GIMPLE_PHI
4432       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4433       && vinfo_for_stmt (def_stmt)
4434       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4435           == vect_double_reduction_def)
4436     {
4437       *adjustment_def = NULL;
4438       return vect_create_destination_var (init_val, vectype);
4439     }
4440 
4441   vect_reduction_type reduction_type
4442     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4443 
4444   /* In case of a nested reduction do not use an adjustment def as
4445      that case is not supported by the epilogue generation correctly
4446      if ncopies is not one.  */
4447   if (adjustment_def && nested_in_vect_loop)
4448     {
4449       *adjustment_def = NULL;
4450       return vect_get_vec_def_for_operand (init_val, stmt);
4451     }
4452 
4453   switch (code)
4454     {
4455     case WIDEN_SUM_EXPR:
4456     case DOT_PROD_EXPR:
4457     case SAD_EXPR:
4458     case PLUS_EXPR:
4459     case MINUS_EXPR:
4460     case BIT_IOR_EXPR:
4461     case BIT_XOR_EXPR:
4462     case MULT_EXPR:
4463     case BIT_AND_EXPR:
4464       {
4465         /* ADJUSTMENT_DEF is NULL when called from
4466            vect_create_epilog_for_reduction to vectorize double reduction.  */
4467         if (adjustment_def)
4468             *adjustment_def = init_val;
4469 
4470         if (code == MULT_EXPR)
4471           {
4472             real_init_val = dconst1;
4473             int_init_val = 1;
4474           }
4475 
4476         if (code == BIT_AND_EXPR)
4477           int_init_val = -1;
4478 
4479         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4480           def_for_init = build_real (scalar_type, real_init_val);
4481         else
4482           def_for_init = build_int_cst (scalar_type, int_init_val);
4483 
4484           if (adjustment_def)
4485             /* Option1: the first element is '0' or '1' as well.  */
4486             init_def = gimple_build_vector_from_val (&stmts, vectype,
4487                                                                def_for_init);
4488           else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4489             {
4490               /* Option2 (variable length): the first element is INIT_VAL.  */
4491               init_def = build_vector_from_val (vectype, def_for_init);
4492               gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4493                                                                   2, init_def, init_val);
4494               init_def = make_ssa_name (vectype);
4495               gimple_call_set_lhs (call, init_def);
4496               gimple_seq_add_stmt (&stmts, call);
4497             }
4498           else
4499             {
4500               /* Option2: the first element is INIT_VAL.  */
4501               tree_vector_builder elts (vectype, 1, 2);
4502               elts.quick_push (init_val);
4503               elts.quick_push (def_for_init);
4504               init_def = gimple_build_vector (&stmts, &elts);
4505             }
4506       }
4507       break;
4508 
4509     case MIN_EXPR:
4510     case MAX_EXPR:
4511     case COND_EXPR:
4512       {
4513           if (adjustment_def)
4514           {
4515               *adjustment_def = NULL_TREE;
4516               if (reduction_type != COND_REDUCTION
4517                     && reduction_type != EXTRACT_LAST_REDUCTION)
4518                 {
4519                     init_def = vect_get_vec_def_for_operand (init_val, stmt);
4520                     break;
4521                 }
4522             }
4523           init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4524           init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4525       }
4526       break;
4527 
4528     default:
4529       gcc_unreachable ();
4530     }
4531 
4532   if (stmts)
4533     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4534   return init_def;
4535 }
4536 
4537 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4538    NUMBER_OF_VECTORS is the number of vector defs to create.
4539    If NEUTRAL_OP is nonnull, introducing extra elements of that
4540    value will not change the result.  */
4541 
4542 static void
get_initial_defs_for_reduction(slp_tree slp_node,vec<tree> * vec_oprnds,unsigned int number_of_vectors,bool reduc_chain,tree neutral_op)4543 get_initial_defs_for_reduction (slp_tree slp_node,
4544                                         vec<tree> *vec_oprnds,
4545                                         unsigned int number_of_vectors,
4546                                         bool reduc_chain, tree neutral_op)
4547 {
4548   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4549   gimple *stmt = stmts[0];
4550   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4551   unsigned HOST_WIDE_INT nunits;
4552   unsigned j, number_of_places_left_in_vector;
4553   tree vector_type;
4554   tree vop;
4555   int group_size = stmts.length ();
4556   unsigned int vec_num, i;
4557   unsigned number_of_copies = 1;
4558   vec<tree> voprnds;
4559   voprnds.create (number_of_vectors);
4560   struct loop *loop;
4561   auto_vec<tree, 16> permute_results;
4562 
4563   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4564 
4565   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4566 
4567   loop = (gimple_bb (stmt))->loop_father;
4568   gcc_assert (loop);
4569   edge pe = loop_preheader_edge (loop);
4570 
4571   gcc_assert (!reduc_chain || neutral_op);
4572 
4573   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4574      created vectors. It is greater than 1 if unrolling is performed.
4575 
4576      For example, we have two scalar operands, s1 and s2 (e.g., group of
4577      strided accesses of size two), while NUNITS is four (i.e., four scalars
4578      of this type can be packed in a vector).  The output vector will contain
4579      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4580      will be 2).
4581 
4582      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4583      containing the operands.
4584 
4585      For example, NUNITS is four as before, and the group size is 8
4586      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4587      {s5, s6, s7, s8}.  */
4588 
4589   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4590     nunits = group_size;
4591 
4592   number_of_copies = nunits * number_of_vectors / group_size;
4593 
4594   number_of_places_left_in_vector = nunits;
4595   bool constant_p = true;
4596   tree_vector_builder elts (vector_type, nunits, 1);
4597   elts.quick_grow (nunits);
4598   for (j = 0; j < number_of_copies; j++)
4599     {
4600       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4601         {
4602             tree op;
4603             /* Get the def before the loop.  In reduction chain we have only
4604                one initial value.  */
4605             if ((j != (number_of_copies - 1)
4606                  || (reduc_chain && i != 0))
4607                 && neutral_op)
4608               op = neutral_op;
4609             else
4610               op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4611 
4612           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4613           number_of_places_left_in_vector--;
4614             elts[number_of_places_left_in_vector] = op;
4615             if (!CONSTANT_CLASS_P (op))
4616               constant_p = false;
4617 
4618           if (number_of_places_left_in_vector == 0)
4619             {
4620                 gimple_seq ctor_seq = NULL;
4621                 tree init;
4622                 if (constant_p && !neutral_op
4623                       ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4624                       : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4625                     /* Build the vector directly from ELTS.  */
4626                     init = gimple_build_vector (&ctor_seq, &elts);
4627                 else if (neutral_op)
4628                     {
4629                       /* Build a vector of the neutral value and shift the
4630                          other elements into place.  */
4631                       init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4632                                                                    neutral_op);
4633                       int k = nunits;
4634                       while (k > 0 && elts[k - 1] == neutral_op)
4635                         k -= 1;
4636                       while (k > 0)
4637                         {
4638                           k -= 1;
4639                           gcall *call = gimple_build_call_internal
4640                               (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4641                           init = make_ssa_name (vector_type);
4642                           gimple_call_set_lhs (call, init);
4643                           gimple_seq_add_stmt (&ctor_seq, call);
4644                         }
4645                     }
4646                 else
4647                     {
4648                       /* First time round, duplicate ELTS to fill the
4649                          required number of vectors, then cherry pick the
4650                          appropriate result for each iteration.  */
4651                       if (vec_oprnds->is_empty ())
4652                         duplicate_and_interleave (&ctor_seq, vector_type, elts,
4653                                                         number_of_vectors,
4654                                                         permute_results);
4655                       init = permute_results[number_of_vectors - j - 1];
4656                     }
4657                 if (ctor_seq != NULL)
4658                     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4659                 voprnds.quick_push (init);
4660 
4661               number_of_places_left_in_vector = nunits;
4662                 elts.new_vector (vector_type, nunits, 1);
4663                 elts.quick_grow (nunits);
4664                 constant_p = true;
4665             }
4666         }
4667     }
4668 
4669   /* Since the vectors are created in the reverse order, we should invert
4670      them.  */
4671   vec_num = voprnds.length ();
4672   for (j = vec_num; j != 0; j--)
4673     {
4674       vop = voprnds[j - 1];
4675       vec_oprnds->quick_push (vop);
4676     }
4677 
4678   voprnds.release ();
4679 
4680   /* In case that VF is greater than the unrolling factor needed for the SLP
4681      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4682      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4683      to replicate the vectors.  */
4684   tree neutral_vec = NULL;
4685   while (number_of_vectors > vec_oprnds->length ())
4686     {
4687       if (neutral_op)
4688         {
4689           if (!neutral_vec)
4690               {
4691                 gimple_seq ctor_seq = NULL;
4692                 neutral_vec = gimple_build_vector_from_val
4693                     (&ctor_seq, vector_type, neutral_op);
4694                 if (ctor_seq != NULL)
4695                     gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4696               }
4697           vec_oprnds->quick_push (neutral_vec);
4698         }
4699       else
4700         {
4701           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4702             vec_oprnds->quick_push (vop);
4703         }
4704     }
4705 }
4706 
4707 
4708 /* Function vect_create_epilog_for_reduction
4709 
4710    Create code at the loop-epilog to finalize the result of a reduction
4711    computation.
4712 
   VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of the
     vector reduction statements.
4715    STMT is the scalar reduction stmt that is being vectorized.
4716    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4717      number of elements that we can fit in a vectype (nunits).  In this case
4718      we have to generate more than one vector stmt - i.e - we need to "unroll"
4719      the vector stmt by a factor VF/nunits.  For more details see documentation
4720      in vectorizable_operation.
4721    REDUC_FN is the internal function for the epilog reduction.
4722    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4723      computation.
4724    REDUC_INDEX is the index of the operand in the right hand side of the
4725      statement that is defined by REDUCTION_PHI.
4726    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4727    SLP_NODE is an SLP node containing a group of reduction statements. The
4728      first one in this group is STMT.
4729    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4730      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4731      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4732      any value of the IV in the loop.
4733    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4734    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4735      null if this is not an SLP reduction
4736 
4737    This function:
4738    1. Creates the reduction def-use cycles: sets the arguments for
4739       REDUCTION_PHIS:
4740       The loop-entry argument is the vectorized initial-value of the reduction.
4741       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4742       sums.
4743    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4744       by calling the function specified by REDUC_FN if available, or by
4745       other means (whole-vector shifts or a scalar loop).
4746       The function also creates a new phi node at the loop exit to preserve
4747       loop-closed form, as illustrated below.
4748 
4749      The flow at the entry to this function:
4750 
4751         loop:
4752           vec_def = phi <null, null>            # REDUCTION_PHI
4753           VECT_DEF = vector_stmt                # vectorized form of STMT
4754           s_loop = scalar_stmt                  # (scalar) STMT
4755         loop_exit:
4756           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4757           use <s_out0>
4758           use <s_out0>
4759 
4760      The above is transformed by this function into:
4761 
4762         loop:
4763           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4764           VECT_DEF = vector_stmt                # vectorized form of STMT
4765           s_loop = scalar_stmt                  # (scalar) STMT
4766         loop_exit:
4767           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4768           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4769           v_out2 = reduce <v_out1>
4770           s_out3 = extract_field <v_out2, 0>
4771           s_out4 = adjust_result <s_out3>
4772           use <s_out4>
4773           use <s_out4>
4774 */
4775 
4776 static void
vect_create_epilog_for_reduction(vec<tree> vect_defs,gimple * stmt,gimple * reduc_def_stmt,int ncopies,internal_fn reduc_fn,vec<gimple * > reduction_phis,bool double_reduc,slp_tree slp_node,slp_instance slp_node_instance,tree induc_val,enum tree_code induc_code,tree neutral_op)4777 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4778                                           gimple *reduc_def_stmt,
4779                                           int ncopies, internal_fn reduc_fn,
4780                                           vec<gimple *> reduction_phis,
4781                                   bool double_reduc,
4782                                           slp_tree slp_node,
4783                                           slp_instance slp_node_instance,
4784                                           tree induc_val, enum tree_code induc_code,
4785                                           tree neutral_op)
4786 {
4787   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4788   stmt_vec_info prev_phi_info;
4789   tree vectype;
4790   machine_mode mode;
4791   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4792   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4793   basic_block exit_bb;
4794   tree scalar_dest;
4795   tree scalar_type;
4796   gimple *new_phi = NULL, *phi;
4797   gimple_stmt_iterator exit_gsi;
4798   tree vec_dest;
4799   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4800   gimple *epilog_stmt = NULL;
4801   enum tree_code code = gimple_assign_rhs_code (stmt);
4802   gimple *exit_phi;
4803   tree bitsize;
4804   tree adjustment_def = NULL;
4805   tree vec_initial_def = NULL;
4806   tree expr, def, initial_def = NULL;
4807   tree orig_name, scalar_result;
4808   imm_use_iterator imm_iter, phi_imm_iter;
4809   use_operand_p use_p, phi_use_p;
4810   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4811   bool nested_in_vect_loop = false;
4812   auto_vec<gimple *> new_phis;
4813   auto_vec<gimple *> inner_phis;
4814   enum vect_def_type dt = vect_unknown_def_type;
4815   int j, i;
4816   auto_vec<tree> scalar_results;
4817   unsigned int group_size = 1, k, ratio;
4818   auto_vec<tree> vec_initial_defs;
4819   auto_vec<gimple *> phis;
4820   bool slp_reduc = false;
4821   bool direct_slp_reduc;
4822   tree new_phi_result;
4823   gimple *inner_phi = NULL;
4824   tree induction_index = NULL_TREE;
4825 
4826   if (slp_node)
4827     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4828 
4829   if (nested_in_vect_loop_p (loop, stmt))
4830     {
4831       outer_loop = loop;
4832       loop = loop->inner;
4833       nested_in_vect_loop = true;
4834       gcc_assert (!slp_node);
4835     }
4836 
4837   vectype = STMT_VINFO_VECTYPE (stmt_info);
4838   gcc_assert (vectype);
4839   mode = TYPE_MODE (vectype);
4840 
4841   /* 1. Create the reduction def-use cycle:
4842      Set the arguments of REDUCTION_PHIS, i.e., transform
4843 
4844         loop:
4845           vec_def = phi <null, null>            # REDUCTION_PHI
4846           VECT_DEF = vector_stmt                # vectorized form of STMT
4847           ...
4848 
4849      into:
4850 
4851         loop:
4852           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4853           VECT_DEF = vector_stmt                # vectorized form of STMT
4854           ...
4855 
4856      (in case of SLP, do it for all the phis). */
4857 
4858   /* Get the loop-entry arguments.  */
4859   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4860   if (slp_node)
4861     {
4862       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4863       vec_initial_defs.reserve (vec_num);
4864       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4865                                               &vec_initial_defs, vec_num,
4866                                               GROUP_FIRST_ELEMENT (stmt_info),
4867                                               neutral_op);
4868     }
4869   else
4870     {
4871       /* Get at the scalar def before the loop, that defines the initial value
4872            of the reduction variable.  */
4873       gimple *def_stmt;
4874       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4875                                                      loop_preheader_edge (loop));
4876       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4877            and we can't use zero for induc_val, use initial_def.  Similarly
4878            for REDUC_MIN and initial_def larger than the base.  */
4879       if (TREE_CODE (initial_def) == INTEGER_CST
4880             && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4881                 == INTEGER_INDUC_COND_REDUCTION)
4882             && !integer_zerop (induc_val)
4883             && ((induc_code == MAX_EXPR
4884                  && tree_int_cst_lt (initial_def, induc_val))
4885                 || (induc_code == MIN_EXPR
4886                       && tree_int_cst_lt (induc_val, initial_def))))
4887           induc_val = initial_def;
4888       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4889       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4890                                                                    &adjustment_def);
4891       vec_initial_defs.create (1);
4892       vec_initial_defs.quick_push (vec_initial_def);
4893     }
4894 
4895   /* Set phi nodes arguments.  */
4896   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4897     {
4898       tree vec_init_def = vec_initial_defs[i];
4899       tree def = vect_defs[i];
4900       for (j = 0; j < ncopies; j++)
4901         {
4902             if (j != 0)
4903               {
4904                 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4905                 if (nested_in_vect_loop)
4906                     vec_init_def
4907                       = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4908                                                                 vec_init_def);
4909               }
4910 
4911             /* Set the loop-entry arg of the reduction-phi.  */
4912 
4913             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4914                 == INTEGER_INDUC_COND_REDUCTION)
4915               {
4916                 /* Initialise the reduction phi to zero.  This prevents initial
4917                      values of non-zero interferring with the reduction op.  */
4918                 gcc_assert (ncopies == 1);
4919                 gcc_assert (i == 0);
4920 
4921                 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4922                 tree induc_val_vec
4923                     = build_vector_from_val (vec_init_def_type, induc_val);
4924 
4925                 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4926                                  loop_preheader_edge (loop), UNKNOWN_LOCATION);
4927               }
4928             else
4929               add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4930                                loop_preheader_edge (loop), UNKNOWN_LOCATION);
4931 
4932           /* Set the loop-latch arg for the reduction-phi.  */
4933           if (j > 0)
4934             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4935 
4936           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4937                            UNKNOWN_LOCATION);
4938 
4939           if (dump_enabled_p ())
4940             {
4941               dump_printf_loc (MSG_NOTE, vect_location,
4942                                      "transform reduction: created def-use cycle: ");
4943               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4944               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4945             }
4946         }
4947     }
4948 
4949   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4950      which is updated with the current index of the loop for every match of
4951      the original loop's cond_expr (VEC_STMT).  This results in a vector
4952      containing the last time the condition passed for that vector lane.
4953      The first match will be a 1 to allow 0 to be used for non-matching
4954      indexes.  If there are no matches at all then the vector will be all
4955      zeroes.  */
4956   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4957     {
4958       tree indx_before_incr, indx_after_incr;
4959       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4960 
4961       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4962       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4963 
4964       int scalar_precision
4965           = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4966       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4967       tree cr_index_vector_type = build_vector_type
4968           (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4969 
4970       /* First we create a simple vector induction variable which starts
4971            with the values {1,2,3,...} (SERIES_VECT) and increments by the
4972            vector size (STEP).  */
4973 
4974       /* Create a {1,2,3,...} vector.  */
4975       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4976 
4977       /* Create a vector of the step value.  */
4978       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4979       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4980 
4981       /* Create an induction variable.  */
4982       gimple_stmt_iterator incr_gsi;
4983       bool insert_after;
4984       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4985       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4986                      insert_after, &indx_before_incr, &indx_after_incr);
4987 
4988       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4989            filled with zeros (VEC_ZERO).  */
4990 
4991       /* Create a vector of 0s.  */
4992       tree zero = build_zero_cst (cr_index_scalar_type);
4993       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4994 
4995       /* Create a vector phi node.  */
4996       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4997       new_phi = create_phi_node (new_phi_tree, loop->header);
4998       set_vinfo_for_stmt (new_phi,
4999                                 new_stmt_vec_info (new_phi, loop_vinfo));
5000       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5001                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
5002 
5003       /* Now take the condition from the loops original cond_expr
5004            (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
5005            every match uses values from the induction variable
5006            (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5007            (NEW_PHI_TREE).
5008            Finally, we update the phi (NEW_PHI_TREE) to take the value of
5009            the new cond_expr (INDEX_COND_EXPR).  */
5010 
5011       /* Duplicate the condition from vec_stmt.  */
5012       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5013 
5014       /* Create a conditional, where the condition is taken from vec_stmt
5015            (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5016            else is the phi (NEW_PHI_TREE).  */
5017       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5018                                              ccompare, indx_before_incr,
5019                                              new_phi_tree);
5020       induction_index = make_ssa_name (cr_index_vector_type);
5021       gimple *index_condition = gimple_build_assign (induction_index,
5022                                                                  index_cond_expr);
5023       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5024       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5025                                                                       loop_vinfo);
5026       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5027       set_vinfo_for_stmt (index_condition, index_vec_info);
5028 
5029       /* Update the phi with the vec cond.  */
5030       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5031                        loop_latch_edge (loop), UNKNOWN_LOCATION);
5032     }
5033 
5034   /* 2. Create epilog code.
5035         The reduction epilog code operates across the elements of the vector
5036         of partial results computed by the vectorized loop.
5037         The reduction epilog code consists of:
5038 
5039         step 1: compute the scalar result in a vector (v_out2)
5040         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5041         step 3: adjust the scalar result (s_out3) if needed.
5042 
5043         Step 1 can be accomplished using one the following three schemes:
5044           (scheme 1) using reduc_fn, if available.
5045           (scheme 2) using whole-vector shifts, if available.
5046           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5047                      combined.
5048 
5049           The overall epilog code looks like this:
5050 
5051           s_out0 = phi <s_loop>         # original EXIT_PHI
5052           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5053           v_out2 = reduce <v_out1>              # step 1
5054           s_out3 = extract_field <v_out2, 0>    # step 2
5055           s_out4 = adjust_result <s_out3>       # step 3
5056 
5057           (step 3 is optional, and steps 1 and 2 may be combined).
5058           Lastly, the uses of s_out0 are replaced by s_out4.  */
5059 
5060 
5061   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5062          v_out1 = phi <VECT_DEF>
5063          Store them in NEW_PHIS.  */
5064 
5065   exit_bb = single_exit (loop)->dest;
5066   prev_phi_info = NULL;
5067   new_phis.create (vect_defs.length ());
5068   FOR_EACH_VEC_ELT (vect_defs, i, def)
5069     {
5070       for (j = 0; j < ncopies; j++)
5071         {
5072             tree new_def = copy_ssa_name (def);
5073           phi = create_phi_node (new_def, exit_bb);
5074           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5075           if (j == 0)
5076             new_phis.quick_push (phi);
5077           else
5078               {
5079                 def = vect_get_vec_def_for_stmt_copy (dt, def);
5080                 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5081               }
5082 
5083           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5084           prev_phi_info = vinfo_for_stmt (phi);
5085         }
5086     }
5087 
5088   /* The epilogue is created for the outer-loop, i.e., for the loop being
5089      vectorized.  Create exit phis for the outer loop.  */
5090   if (double_reduc)
5091     {
5092       loop = outer_loop;
5093       exit_bb = single_exit (loop)->dest;
5094       inner_phis.create (vect_defs.length ());
5095       FOR_EACH_VEC_ELT (new_phis, i, phi)
5096           {
5097             tree new_result = copy_ssa_name (PHI_RESULT (phi));
5098             gphi *outer_phi = create_phi_node (new_result, exit_bb);
5099             SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5100                                  PHI_RESULT (phi));
5101             set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5102                                                                           loop_vinfo));
5103             inner_phis.quick_push (phi);
5104             new_phis[i] = outer_phi;
5105             prev_phi_info = vinfo_for_stmt (outer_phi);
5106           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5107             {
5108                 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5109                 new_result = copy_ssa_name (PHI_RESULT (phi));
5110                 outer_phi = create_phi_node (new_result, exit_bb);
5111                 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5112                                      PHI_RESULT (phi));
5113                 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5114                                                                                 loop_vinfo));
5115                 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5116                 prev_phi_info = vinfo_for_stmt (outer_phi);
5117               }
5118           }
5119     }
5120 
5121   exit_gsi = gsi_after_labels (exit_bb);
5122 
5123   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5124          (i.e. when reduc_fn is not available) and in the final adjustment
5125            code (if needed).  Also get the original scalar reduction variable as
5126          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5127          represents a reduction pattern), the tree-code and scalar-def are
5128          taken from the original stmt that the pattern-stmt (STMT) replaces.
5129          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5130          are taken from STMT.  */
5131 
5132   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5133   if (!orig_stmt)
5134     {
5135       /* Regular reduction  */
5136       orig_stmt = stmt;
5137     }
5138   else
5139     {
5140       /* Reduction pattern  */
5141       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5142       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5143       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5144     }
5145 
5146   code = gimple_assign_rhs_code (orig_stmt);
5147   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5148      partial results are added and not subtracted.  */
5149   if (code == MINUS_EXPR)
5150     code = PLUS_EXPR;
5151 
5152   scalar_dest = gimple_assign_lhs (orig_stmt);
5153   scalar_type = TREE_TYPE (scalar_dest);
5154   scalar_results.create (group_size);
5155   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5156   bitsize = TYPE_SIZE (scalar_type);
5157 
5158   /* In case this is a reduction in an inner-loop while vectorizing an outer
5159      loop - we don't need to extract a single scalar result at the end of the
5160      inner-loop (unless it is double reduction, i.e., the use of reduction is
5161      outside the outer-loop).  The final vector of partial results will be used
5162      in the vectorized outer-loop, or reduced to a scalar result at the end of
5163      the outer-loop.  */
5164   if (nested_in_vect_loop && !double_reduc)
5165     goto vect_finalize_reduction;
5166 
5167   /* SLP reduction without reduction chain, e.g.,
5168      # a1 = phi <a2, a0>
5169      # b1 = phi <b2, b0>
5170      a2 = operation (a1)
5171      b2 = operation (b1)  */
5172   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5173 
5174   /* True if we should implement SLP_REDUC using native reduction operations
5175      instead of scalar operations.  */
5176   direct_slp_reduc = (reduc_fn != IFN_LAST
5177                           && slp_reduc
5178                           && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5179 
5180   /* In case of reduction chain, e.g.,
5181      # a1 = phi <a3, a0>
5182      a2 = operation (a1)
5183      a3 = operation (a2),
5184 
5185      we may end up with more than one vector result.  Here we reduce them to
5186      one vector.  */
5187   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5188     {
5189       tree first_vect = PHI_RESULT (new_phis[0]);
5190       gassign *new_vec_stmt = NULL;
5191       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5192       for (k = 1; k < new_phis.length (); k++)
5193         {
5194             gimple *next_phi = new_phis[k];
5195           tree second_vect = PHI_RESULT (next_phi);
5196           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5197           new_vec_stmt = gimple_build_assign (tem, code,
5198                                                         first_vect, second_vect);
5199           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5200             first_vect = tem;
5201         }
5202 
5203       new_phi_result = first_vect;
5204       if (new_vec_stmt)
5205         {
5206           new_phis.truncate (0);
5207           new_phis.safe_push (new_vec_stmt);
5208         }
5209     }
5210   /* Likewise if we couldn't use a single defuse cycle.  */
5211   else if (ncopies > 1)
5212     {
5213       gcc_assert (new_phis.length () == 1);
5214       tree first_vect = PHI_RESULT (new_phis[0]);
5215       gassign *new_vec_stmt = NULL;
5216       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5217       gimple *next_phi = new_phis[0];
5218       for (int k = 1; k < ncopies; ++k)
5219           {
5220             next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5221             tree second_vect = PHI_RESULT (next_phi);
5222           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5223           new_vec_stmt = gimple_build_assign (tem, code,
5224                                                         first_vect, second_vect);
5225           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5226             first_vect = tem;
5227           }
5228       new_phi_result = first_vect;
5229       new_phis.truncate (0);
5230       new_phis.safe_push (new_vec_stmt);
5231     }
5232   else
5233     new_phi_result = PHI_RESULT (new_phis[0]);
5234 
5235   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5236       && reduc_fn != IFN_LAST)
5237     {
5238       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5239            various data values where the condition matched and another vector
5240            (INDUCTION_INDEX) containing all the indexes of those matches.  We
5241            need to extract the last matching index (which will be the index with
5242            highest value) and use this to index into the data vector.
5243            For the case where there were no matches, the data vector will contain
5244            all default values and the index vector will be all zeros.  */
5245 
5246       /* Get various versions of the type of the vector of indexes.  */
5247       tree index_vec_type = TREE_TYPE (induction_index);
5248       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5249       tree index_scalar_type = TREE_TYPE (index_vec_type);
5250       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5251           (index_vec_type);
5252 
5253       /* Get an unsigned integer version of the type of the data vector.  */
5254       int scalar_precision
5255           = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5256       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5257       tree vectype_unsigned = build_vector_type
5258           (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5259 
5260       /* First we need to create a vector (ZERO_VEC) of zeros and another
5261            vector (MAX_INDEX_VEC) filled with the last matching index, which we
5262            can create using a MAX reduction and then expanding.
5263            In the case where the loop never made any matches, the max index will
5264            be zero.  */
5265 
5266       /* Vector of {0, 0, 0,...}.  */
5267       tree zero_vec = make_ssa_name (vectype);
5268       tree zero_vec_rhs = build_zero_cst (vectype);
5269       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5270       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5271 
5272       /* Find maximum value from the vector of found indexes.  */
5273       tree max_index = make_ssa_name (index_scalar_type);
5274       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5275                                                                         1, induction_index);
5276       gimple_call_set_lhs (max_index_stmt, max_index);
5277       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5278 
5279       /* Vector of {max_index, max_index, max_index,...}.  */
5280       tree max_index_vec = make_ssa_name (index_vec_type);
5281       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5282                                                                   max_index);
5283       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5284                                                                       max_index_vec_rhs);
5285       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5286 
5287       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5288            with the vector (INDUCTION_INDEX) of found indexes, choosing values
5289            from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5290            otherwise.  Only one value should match, resulting in a vector
5291            (VEC_COND) with one data value and the rest zeros.
5292            In the case where the loop never made any matches, every index will
5293            match, resulting in a vector with all data values (which will all be
5294            the default value).  */
5295 
5296       /* Compare the max index vector to the vector of found indexes to find
5297            the position of the max value.  */
5298       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5299       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5300                                                                   induction_index,
5301                                                                   max_index_vec);
5302       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5303 
5304       /* Use the compare to choose either values from the data vector or
5305            zero.  */
5306       tree vec_cond = make_ssa_name (vectype);
5307       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5308                                                                vec_compare, new_phi_result,
5309                                                                zero_vec);
5310       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5311 
5312       /* Finally we need to extract the data value from the vector (VEC_COND)
5313            into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5314            reduction, but because this doesn't exist, we can use a MAX reduction
5315            instead.  The data value might be signed or a float so we need to cast
5316            it first.
5317            In the case where the loop never made any matches, the data values are
5318            all identical, and so will reduce down correctly.  */
5319 
5320       /* Make the matched data values unsigned.  */
5321       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5322       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5323                                                vec_cond);
5324       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5325                                                                       VIEW_CONVERT_EXPR,
5326                                                                       vec_cond_cast_rhs);
5327       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5328 
5329       /* Reduce down to a scalar value.  */
5330       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5331       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5332                                                                          1, vec_cond_cast);
5333       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5334       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5335 
5336       /* Convert the reduced value back to the result type and set as the
5337            result.  */
5338       gimple_seq stmts = NULL;
5339       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5340                                      data_reduc);
5341       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5342       scalar_results.safe_push (new_temp);
5343     }
5344   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5345              && reduc_fn == IFN_LAST)
5346     {
5347       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5348            idx = 0;
5349          idx_val = induction_index[0];
5350            val = data_reduc[0];
5351          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5352              if (induction_index[i] > idx_val)
5353                val = data_reduc[i], idx_val = induction_index[i];
5354            return val;  */
5355 
5356       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5357       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5358       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5359       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5360       /* Enforced by vectorizable_reduction, which ensures we have target
5361            support before allowing a conditional reduction on variable-length
5362            vectors.  */
5363       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5364       tree idx_val = NULL_TREE, val = NULL_TREE;
5365       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5366           {
5367             tree old_idx_val = idx_val;
5368             tree old_val = val;
5369             idx_val = make_ssa_name (idx_eltype);
5370             epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5371                                                        build3 (BIT_FIELD_REF, idx_eltype,
5372                                                                  induction_index,
5373                                                                  bitsize_int (el_size),
5374                                                                  bitsize_int (off)));
5375             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5376             val = make_ssa_name (data_eltype);
5377             epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5378                                                        build3 (BIT_FIELD_REF,
5379                                                                  data_eltype,
5380                                                                  new_phi_result,
5381                                                                  bitsize_int (el_size),
5382                                                                  bitsize_int (off)));
5383             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5384             if (off != 0)
5385               {
5386                 tree new_idx_val = idx_val;
5387                 tree new_val = val;
5388                 if (off != v_size - el_size)
5389                     {
5390                       new_idx_val = make_ssa_name (idx_eltype);
5391                       epilog_stmt = gimple_build_assign (new_idx_val,
5392                                                                  MAX_EXPR, idx_val,
5393                                                                  old_idx_val);
5394                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5395                     }
5396                 new_val = make_ssa_name (data_eltype);
5397                 epilog_stmt = gimple_build_assign (new_val,
5398                                                              COND_EXPR,
5399                                                              build2 (GT_EXPR,
5400                                                                        boolean_type_node,
5401                                                                        idx_val,
5402                                                                        old_idx_val),
5403                                                              val, old_val);
5404                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5405                 idx_val = new_idx_val;
5406                 val = new_val;
5407               }
5408           }
5409       /* Convert the reduced value back to the result type and set as the
5410            result.  */
5411       gimple_seq stmts = NULL;
5412       val = gimple_convert (&stmts, scalar_type, val);
5413       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5414       scalar_results.safe_push (val);
5415     }
5416 
5417   /* 2.3 Create the reduction code, using one of the three schemes described
5418          above. In SLP we simply need to extract all the elements from the
5419          vector (without reducing them), so we use scalar shifts.  */
5420   else if (reduc_fn != IFN_LAST && !slp_reduc)
5421     {
5422       tree tmp;
5423       tree vec_elem_type;
5424 
5425       /* Case 1:  Create:
5426          v_out2 = reduc_expr <v_out1>  */
5427 
5428       if (dump_enabled_p ())
5429         dump_printf_loc (MSG_NOTE, vect_location,
5430                                "Reduce using direct vector reduction.\n");
5431 
5432       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5433       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5434           {
5435             tree tmp_dest
5436               = vect_create_destination_var (scalar_dest, vec_elem_type);
5437             epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5438                                                                 new_phi_result);
5439             gimple_set_lhs (epilog_stmt, tmp_dest);
5440             new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5441             gimple_set_lhs (epilog_stmt, new_temp);
5442             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5443 
5444             epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5445                                                        new_temp);
5446           }
5447       else
5448           {
5449             epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5450                                                                 new_phi_result);
5451             gimple_set_lhs (epilog_stmt, new_scalar_dest);
5452           }
5453 
5454       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5455       gimple_set_lhs (epilog_stmt, new_temp);
5456       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 
5458       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5459              == INTEGER_INDUC_COND_REDUCTION)
5460             && !operand_equal_p (initial_def, induc_val, 0))
5461           {
5462             /* Earlier we set the initial value to be a vector of induc_val
5463                values.  Check the result and if it is induc_val then replace
5464                with the original initial value, unless induc_val is
5465                the same as initial_def already.  */
5466             tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5467                                           induc_val);
5468 
5469             tmp = make_ssa_name (new_scalar_dest);
5470             epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5471                                                        initial_def, new_temp);
5472             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5473             new_temp = tmp;
5474           }
5475 
5476       scalar_results.safe_push (new_temp);
5477     }
5478   else if (direct_slp_reduc)
5479     {
5480       /* Here we create one vector for each of the GROUP_SIZE results,
5481            with the elements for other SLP statements replaced with the
5482            neutral value.  We can then do a normal reduction on each vector.  */
5483 
5484       /* Enforced by vectorizable_reduction.  */
5485       gcc_assert (new_phis.length () == 1);
5486       gcc_assert (pow2p_hwi (group_size));
5487 
5488       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5489       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5490       gimple_seq seq = NULL;
5491 
5492       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5493            and the same element size as VECTYPE.  */
5494       tree index = build_index_vector (vectype, 0, 1);
5495       tree index_type = TREE_TYPE (index);
5496       tree index_elt_type = TREE_TYPE (index_type);
5497       tree mask_type = build_same_sized_truth_vector_type (index_type);
5498 
5499       /* Create a vector that, for each element, identifies which of
5500            the GROUP_SIZE results should use it.  */
5501       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5502       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5503                                   build_vector_from_val (index_type, index_mask));
5504 
5505       /* Get a neutral vector value.  This is simply a splat of the neutral
5506            scalar value if we have one, otherwise the initial scalar value
5507            is itself a neutral value.  */
5508       tree vector_identity = NULL_TREE;
5509       if (neutral_op)
5510           vector_identity = gimple_build_vector_from_val (&seq, vectype,
5511                                                                       neutral_op);
5512       for (unsigned int i = 0; i < group_size; ++i)
5513           {
5514             /* If there's no universal neutral value, we can use the
5515                initial scalar value from the original PHI.  This is used
5516                for MIN and MAX reduction, for example.  */
5517             if (!neutral_op)
5518               {
5519                 tree scalar_value
5520                     = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5521                                                    loop_preheader_edge (loop));
5522                 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5523                                                                             scalar_value);
5524               }
5525 
5526             /* Calculate the equivalent of:
5527 
5528                sel[j] = (index[j] == i);
5529 
5530                which selects the elements of NEW_PHI_RESULT that should
5531                be included in the result.  */
5532             tree compare_val = build_int_cst (index_elt_type, i);
5533             compare_val = build_vector_from_val (index_type, compare_val);
5534             tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5535                                            index, compare_val);
5536 
5537             /* Calculate the equivalent of:
5538 
5539                vec = sel ? new_phi_result : vector_identity;
5540 
5541                VEC is now suitable for a full vector reduction.  */
5542             tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5543                                            sel, new_phi_result, vector_identity);
5544 
5545             /* Do the reduction and convert it to the appropriate type.  */
5546             gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5547             tree scalar = make_ssa_name (TREE_TYPE (vectype));
5548             gimple_call_set_lhs (call, scalar);
5549             gimple_seq_add_stmt (&seq, call);
5550             scalar = gimple_convert (&seq, scalar_type, scalar);
5551             scalar_results.safe_push (scalar);
5552           }
5553       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5554     }
5555   else
5556     {
5557       bool reduce_with_shift;
5558       tree vec_temp;
5559 
5560       /* COND reductions all do the final reduction with MAX_EXPR
5561            or MIN_EXPR.  */
5562       if (code == COND_EXPR)
5563           {
5564             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5565                 == INTEGER_INDUC_COND_REDUCTION)
5566               code = induc_code;
5567             else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5568                        == CONST_COND_REDUCTION)
5569               code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5570             else
5571               code = MAX_EXPR;
5572           }
5573 
5574       /* See if the target wants to do the final (shift) reduction
5575            in a vector mode of smaller size and first reduce upper/lower
5576            halves against each other.  */
5577       enum machine_mode mode1 = mode;
5578       tree vectype1 = vectype;
5579       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5580       unsigned sz1 = sz;
5581       if (!slp_reduc
5582             && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5583           sz1 = GET_MODE_SIZE (mode1).to_constant ();
5584 
5585       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5586       reduce_with_shift = have_whole_vector_shift (mode1);
5587       if (!VECTOR_MODE_P (mode1))
5588           reduce_with_shift = false;
5589       else
5590           {
5591             optab optab = optab_for_tree_code (code, vectype1, optab_default);
5592             if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5593               reduce_with_shift = false;
5594           }
5595 
5596       /* First reduce the vector to the desired vector size we should
5597            do shift reduction on by combining upper and lower halves.  */
5598       new_temp = new_phi_result;
5599       while (sz > sz1)
5600           {
5601             gcc_assert (!slp_reduc);
5602             sz /= 2;
5603             vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5604 
5605             /* The target has to make sure we support lowpart/highpart
5606                extraction, either via direct vector extract or through
5607                an integer mode punning.  */
5608             tree dst1, dst2;
5609             if (convert_optab_handler (vec_extract_optab,
5610                                              TYPE_MODE (TREE_TYPE (new_temp)),
5611                                              TYPE_MODE (vectype1))
5612                 != CODE_FOR_nothing)
5613               {
5614                 /* Extract sub-vectors directly once vec_extract becomes
5615                      a conversion optab.  */
5616                 dst1 = make_ssa_name (vectype1);
5617                 epilog_stmt
5618                       = gimple_build_assign (dst1, BIT_FIELD_REF,
5619                                                    build3 (BIT_FIELD_REF, vectype1,
5620                                                              new_temp, TYPE_SIZE (vectype1),
5621                                                              bitsize_int (0)));
5622                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5623                 dst2 =  make_ssa_name (vectype1);
5624                 epilog_stmt
5625                       = gimple_build_assign (dst2, BIT_FIELD_REF,
5626                                                    build3 (BIT_FIELD_REF, vectype1,
5627                                                              new_temp, TYPE_SIZE (vectype1),
5628                                                              bitsize_int (sz * BITS_PER_UNIT)));
5629                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5630               }
5631             else
5632               {
5633                 /* Extract via punning to appropriately sized integer mode
5634                      vector.  */
5635                 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5636                                                                           1);
5637                 tree etype = build_vector_type (eltype, 2);
5638                 gcc_assert (convert_optab_handler (vec_extract_optab,
5639                                                              TYPE_MODE (etype),
5640                                                              TYPE_MODE (eltype))
5641                                 != CODE_FOR_nothing);
5642                 tree tem = make_ssa_name (etype);
5643                 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5644                                                              build1 (VIEW_CONVERT_EXPR,
5645                                                                        etype, new_temp));
5646                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5647                 new_temp = tem;
5648                 tem = make_ssa_name (eltype);
5649                 epilog_stmt
5650                       = gimple_build_assign (tem, BIT_FIELD_REF,
5651                                                    build3 (BIT_FIELD_REF, eltype,
5652                                                              new_temp, TYPE_SIZE (eltype),
5653                                                              bitsize_int (0)));
5654                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5655                 dst1 = make_ssa_name (vectype1);
5656                 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5657                                                              build1 (VIEW_CONVERT_EXPR,
5658                                                                        vectype1, tem));
5659                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5660                 tem = make_ssa_name (eltype);
5661                 epilog_stmt
5662                       = gimple_build_assign (tem, BIT_FIELD_REF,
5663                                                    build3 (BIT_FIELD_REF, eltype,
5664                                                              new_temp, TYPE_SIZE (eltype),
5665                                                              bitsize_int (sz * BITS_PER_UNIT)));
5666                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5667                 dst2 =  make_ssa_name (vectype1);
5668                 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5669                                                              build1 (VIEW_CONVERT_EXPR,
5670                                                                        vectype1, tem));
5671                 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5672               }
5673 
5674             new_temp = make_ssa_name (vectype1);
5675             epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5676             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5677           }
5678 
5679       if (reduce_with_shift && !slp_reduc)
5680           {
5681             int element_bitsize = tree_to_uhwi (bitsize);
5682             /* Enforced by vectorizable_reduction, which disallows SLP reductions
5683                for variable-length vectors and also requires direct target support
5684                for loop reductions.  */
5685             int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5686             int nelements = vec_size_in_bits / element_bitsize;
5687             vec_perm_builder sel;
5688             vec_perm_indices indices;
5689 
5690           int elt_offset;
5691 
5692           tree zero_vec = build_zero_cst (vectype1);
5693           /* Case 2: Create:
5694              for (offset = nelements/2; offset >= 1; offset/=2)
5695                 {
5696                   Create:  va' = vec_shift <va, offset>
5697                   Create:  va = vop <va, va'>
5698                 }  */
5699 
5700           tree rhs;
5701 
5702           if (dump_enabled_p ())
5703             dump_printf_loc (MSG_NOTE, vect_location,
5704                                    "Reduce using vector shifts\n");
5705 
5706             mode1 = TYPE_MODE (vectype1);
5707           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5708           for (elt_offset = nelements / 2;
5709                elt_offset >= 1;
5710                elt_offset /= 2)
5711             {
5712                 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5713                 indices.new_vector (sel, 2, nelements);
5714                 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5715                 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5716                                                              new_temp, zero_vec, mask);
5717               new_name = make_ssa_name (vec_dest, epilog_stmt);
5718               gimple_assign_set_lhs (epilog_stmt, new_name);
5719               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5720 
5721                 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5722                                                              new_temp);
5723               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5724               gimple_assign_set_lhs (epilog_stmt, new_temp);
5725               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5726             }
5727 
5728             /* 2.4  Extract the final scalar result.  Create:
5729                s_out3 = extract_field <v_out2, bitpos>  */
5730 
5731             if (dump_enabled_p ())
5732               dump_printf_loc (MSG_NOTE, vect_location,
5733                                    "extract scalar result\n");
5734 
5735             rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5736                               bitsize, bitsize_zero_node);
5737             epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5738             new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5739             gimple_assign_set_lhs (epilog_stmt, new_temp);
5740             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5741             scalar_results.safe_push (new_temp);
5742         }
5743       else
5744         {
5745           /* Case 3: Create:
5746              s = extract_field <v_out2, 0>
5747              for (offset = element_size;
5748                   offset < vector_size;
5749                   offset += element_size;)
5750                {
5751                  Create:  s' = extract_field <v_out2, offset>
5752                  Create:  s = op <s, s'>  // For non SLP cases
5753                }  */
5754 
5755           if (dump_enabled_p ())
5756             dump_printf_loc (MSG_NOTE, vect_location,
5757                                    "Reduce using scalar code.\n");
5758 
5759             int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5760             int element_bitsize = tree_to_uhwi (bitsize);
5761           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5762             {
5763               int bit_offset;
5764               if (gimple_code (new_phi) == GIMPLE_PHI)
5765                 vec_temp = PHI_RESULT (new_phi);
5766               else
5767                 vec_temp = gimple_assign_lhs (new_phi);
5768               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5769                                          bitsize_zero_node);
5770               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5771               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5772               gimple_assign_set_lhs (epilog_stmt, new_temp);
5773               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5774 
5775               /* In SLP we don't need to apply reduction operation, so we just
5776                  collect s' values in SCALAR_RESULTS.  */
5777               if (slp_reduc)
5778                 scalar_results.safe_push (new_temp);
5779 
5780               for (bit_offset = element_bitsize;
5781                    bit_offset < vec_size_in_bits;
5782                    bit_offset += element_bitsize)
5783                 {
5784                   tree bitpos = bitsize_int (bit_offset);
5785                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5786                                      bitsize, bitpos);
5787 
5788                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5789                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5790                   gimple_assign_set_lhs (epilog_stmt, new_name);
5791                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5792 
5793                   if (slp_reduc)
5794                     {
5795                       /* In SLP we don't need to apply reduction operation, so
5796                          we just collect s' values in SCALAR_RESULTS.  */
5797                       new_temp = new_name;
5798                       scalar_results.safe_push (new_name);
5799                     }
5800                   else
5801                     {
5802                           epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5803                                                                        new_name, new_temp);
5804                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5805                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5806                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5807                     }
5808                 }
5809             }
5810 
5811           /* The only case where we need to reduce scalar results in SLP, is
5812              unrolling.  If the size of SCALAR_RESULTS is greater than
5813              GROUP_SIZE, we reduce them combining elements modulo
5814              GROUP_SIZE.  */
5815           if (slp_reduc)
5816             {
5817               tree res, first_res, new_res;
5818                 gimple *new_stmt;
5819 
5820               /* Reduce multiple scalar results in case of SLP unrolling.  */
5821               for (j = group_size; scalar_results.iterate (j, &res);
5822                    j++)
5823                 {
5824                   first_res = scalar_results[j % group_size];
5825                       new_stmt = gimple_build_assign (new_scalar_dest, code,
5826                                                               first_res, res);
5827                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5828                   gimple_assign_set_lhs (new_stmt, new_res);
5829                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5830                   scalar_results[j % group_size] = new_res;
5831                 }
5832             }
5833           else
5834             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5835             scalar_results.safe_push (new_temp);
5836         }
5837 
5838       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5839              == INTEGER_INDUC_COND_REDUCTION)
5840             && !operand_equal_p (initial_def, induc_val, 0))
5841           {
5842             /* Earlier we set the initial value to be a vector of induc_val
5843                values.  Check the result and if it is induc_val then replace
5844                with the original initial value, unless induc_val is
5845                the same as initial_def already.  */
5846             tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5847                                           induc_val);
5848 
5849             tree tmp = make_ssa_name (new_scalar_dest);
5850             epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5851                                                        initial_def, new_temp);
5852             gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5853             scalar_results[0] = tmp;
5854           }
5855     }
5856 
5857 vect_finalize_reduction:
5858 
5859   if (double_reduc)
5860     loop = loop->inner;
5861 
5862   /* 2.5 Adjust the final result by the initial value of the reduction
5863            variable. (When such adjustment is not needed, then
5864            'adjustment_def' is zero).  For example, if code is PLUS we create:
5865            new_temp = loop_exit_def + adjustment_def  */
5866 
5867   if (adjustment_def)
5868     {
5869       gcc_assert (!slp_reduc);
5870       if (nested_in_vect_loop)
5871           {
5872           new_phi = new_phis[0];
5873             gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5874             expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5875             new_dest = vect_create_destination_var (scalar_dest, vectype);
5876           }
5877       else
5878           {
5879           new_temp = scalar_results[0];
5880             gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5881             expr = build2 (code, scalar_type, new_temp, adjustment_def);
5882             new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5883           }
5884 
5885       epilog_stmt = gimple_build_assign (new_dest, expr);
5886       new_temp = make_ssa_name (new_dest, epilog_stmt);
5887       gimple_assign_set_lhs (epilog_stmt, new_temp);
5888       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5889       if (nested_in_vect_loop)
5890         {
5891           set_vinfo_for_stmt (epilog_stmt,
5892                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5893           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5894                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5895 
5896           if (!double_reduc)
5897             scalar_results.quick_push (new_temp);
5898           else
5899             scalar_results[0] = new_temp;
5900         }
5901       else
5902         scalar_results[0] = new_temp;
5903 
5904       new_phis[0] = epilog_stmt;
5905     }
5906 
5907   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5908           phis with new adjusted scalar results, i.e., replace use <s_out0>
5909           with use <s_out4>.
5910 
5911      Transform:
5912         loop_exit:
5913           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5914           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5915           v_out2 = reduce <v_out1>
5916           s_out3 = extract_field <v_out2, 0>
5917           s_out4 = adjust_result <s_out3>
5918           use <s_out0>
5919           use <s_out0>
5920 
5921      into:
5922 
5923         loop_exit:
5924           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5925           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5926           v_out2 = reduce <v_out1>
5927           s_out3 = extract_field <v_out2, 0>
5928           s_out4 = adjust_result <s_out3>
5929           use <s_out4>
5930           use <s_out4> */
5931 
5932 
5933   /* In SLP reduction chain we reduce vector results into one vector if
5934      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5935      the last stmt in the reduction chain, since we are looking for the loop
5936      exit phi node.  */
5937   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5938     {
5939       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5940       /* Handle reduction patterns.  */
5941       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5942           dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5943 
5944       scalar_dest = gimple_assign_lhs (dest_stmt);
5945       group_size = 1;
5946     }
5947 
5948   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5949      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5950      need to match SCALAR_RESULTS with corresponding statements.  The first
5951      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5952      the first vector stmt, etc.
5953      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5954   if (group_size > new_phis.length ())
5955     {
5956       ratio = group_size / new_phis.length ();
5957       gcc_assert (!(group_size % new_phis.length ()));
5958     }
5959   else
5960     ratio = 1;
5961 
5962   for (k = 0; k < group_size; k++)
5963     {
5964       if (k % ratio == 0)
5965         {
5966           epilog_stmt = new_phis[k / ratio];
5967           reduction_phi = reduction_phis[k / ratio];
5968             if (double_reduc)
5969               inner_phi = inner_phis[k / ratio];
5970         }
5971 
5972       if (slp_reduc)
5973         {
5974             gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5975 
5976           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5977           /* SLP statements can't participate in patterns.  */
5978           gcc_assert (!orig_stmt);
5979           scalar_dest = gimple_assign_lhs (current_stmt);
5980         }
5981 
5982       phis.create (3);
5983       /* Find the loop-closed-use at the loop exit of the original scalar
5984          result.  (The reduction result is expected to have two immediate uses -
5985          one at the latch block, and one at the loop exit).  */
5986       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5987         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5988               && !is_gimple_debug (USE_STMT (use_p)))
5989           phis.safe_push (USE_STMT (use_p));
5990 
5991       /* While we expect to have found an exit_phi because of loop-closed-ssa
5992          form we can end up without one if the scalar cycle is dead.  */
5993 
5994       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5995         {
5996           if (outer_loop)
5997             {
5998               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5999               gphi *vect_phi;
6000 
6001               /* FORNOW. Currently not supporting the case that an inner-loop
6002                  reduction is not used in the outer-loop (but only outside the
6003                  outer-loop), unless it is double reduction.  */
6004               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6005                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
6006                           || double_reduc);
6007 
6008                 if (double_reduc)
6009                     STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6010                 else
6011                     STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6012               if (!double_reduc
6013                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6014                       != vect_double_reduction_def)
6015                 continue;
6016 
6017               /* Handle double reduction:
6018 
6019                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
6020                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6021                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
6022                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
6023 
6024                  At that point the regular reduction (stmt2 and stmt3) is
6025                  already vectorized, as well as the exit phi node, stmt4.
6026                  Here we vectorize the phi node of double reduction, stmt1, and
6027                  update all relevant statements.  */
6028 
6029               /* Go through all the uses of s2 to find double reduction phi
6030                  node, i.e., stmt1 above.  */
6031               orig_name = PHI_RESULT (exit_phi);
6032               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6033                 {
6034                   stmt_vec_info use_stmt_vinfo;
6035                   stmt_vec_info new_phi_vinfo;
6036                   tree vect_phi_init, preheader_arg, vect_phi_res;
6037                   basic_block bb = gimple_bb (use_stmt);
6038                       gimple *use;
6039 
6040                   /* Check that USE_STMT is really double reduction phi
6041                      node.  */
6042                   if (gimple_code (use_stmt) != GIMPLE_PHI
6043                       || gimple_phi_num_args (use_stmt) != 2
6044                       || bb->loop_father != outer_loop)
6045                     continue;
6046                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6047                   if (!use_stmt_vinfo
6048                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6049                           != vect_double_reduction_def)
6050                         continue;
6051 
6052                   /* Create vector phi node for double reduction:
6053                      vs1 = phi <vs0, vs2>
6054                      vs1 was created previously in this function by a call to
6055                        vect_get_vec_def_for_operand and is stored in
6056                        vec_initial_def;
6057                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6058                      vs0 is created here.  */
6059 
6060                   /* Create vector phi node.  */
6061                   vect_phi = create_phi_node (vec_initial_def, bb);
6062                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6063                                     loop_vec_info_for_loop (outer_loop));
6064                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6065 
6066                   /* Create vs0 - initial def of the double reduction phi.  */
6067                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6068                                              loop_preheader_edge (outer_loop));
6069                   vect_phi_init = get_initial_def_for_reduction
6070                         (stmt, preheader_arg, NULL);
6071 
6072                   /* Update phi node arguments with vs0 and vs2.  */
6073                   add_phi_arg (vect_phi, vect_phi_init,
6074                                loop_preheader_edge (outer_loop),
6075                                UNKNOWN_LOCATION);
6076                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6077                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6078                   if (dump_enabled_p ())
6079                     {
6080                       dump_printf_loc (MSG_NOTE, vect_location,
6081                                                "created double reduction phi node: ");
6082                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6083                     }
6084 
6085                   vect_phi_res = PHI_RESULT (vect_phi);
6086 
6087                   /* Replace the use, i.e., set the correct vs1 in the regular
6088                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6089                      loop is redundant.  */
6090                   use = reduction_phi;
6091                   for (j = 0; j < ncopies; j++)
6092                     {
6093                       edge pr_edge = loop_preheader_edge (loop);
6094                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6095                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6096                     }
6097                 }
6098             }
6099         }
6100 
6101       phis.release ();
6102       if (nested_in_vect_loop)
6103         {
6104           if (double_reduc)
6105             loop = outer_loop;
6106           else
6107             continue;
6108         }
6109 
6110       phis.create (3);
6111       /* Find the loop-closed-use at the loop exit of the original scalar
6112          result.  (The reduction result is expected to have two immediate uses,
6113          one at the latch block, and one at the loop exit).  For double
6114          reductions we are looking for exit phis of the outer loop.  */
6115       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6116         {
6117           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6118               {
6119                 if (!is_gimple_debug (USE_STMT (use_p)))
6120                     phis.safe_push (USE_STMT (use_p));
6121               }
6122           else
6123             {
6124               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6125                 {
6126                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6127 
6128                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6129                     {
6130                       if (!flow_bb_inside_loop_p (loop,
6131                                              gimple_bb (USE_STMT (phi_use_p)))
6132                                 && !is_gimple_debug (USE_STMT (phi_use_p)))
6133                         phis.safe_push (USE_STMT (phi_use_p));
6134                     }
6135                 }
6136             }
6137         }
6138 
6139       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6140         {
6141           /* Replace the uses:  */
6142           orig_name = PHI_RESULT (exit_phi);
6143           scalar_result = scalar_results[k];
6144           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6145             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6146               SET_USE (use_p, scalar_result);
6147         }
6148 
6149       phis.release ();
6150     }
6151 }
6152 
6153 /* Return a vector of type VECTYPE that is equal to the vector select
6154    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6155    before GSI.  */
6156 
6157 static tree
merge_with_identity(gimple_stmt_iterator * gsi,tree mask,tree vectype,tree vec,tree identity)6158 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6159                          tree vec, tree identity)
6160 {
6161   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6162   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6163                                                     mask, vec, identity);
6164   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6165   return cond;
6166 }
6167 
6168 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6169    order, starting with LHS.  Insert the extraction statements before GSI and
6170    associate the new scalar SSA names with variable SCALAR_DEST.
6171    Return the SSA name for the result.  */
6172 
6173 static tree
vect_expand_fold_left(gimple_stmt_iterator * gsi,tree scalar_dest,tree_code code,tree lhs,tree vector_rhs)6174 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6175                            tree_code code, tree lhs, tree vector_rhs)
6176 {
6177   tree vectype = TREE_TYPE (vector_rhs);
6178   tree scalar_type = TREE_TYPE (vectype);
6179   tree bitsize = TYPE_SIZE (scalar_type);
6180   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6181   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6182 
6183   for (unsigned HOST_WIDE_INT bit_offset = 0;
6184        bit_offset < vec_size_in_bits;
6185        bit_offset += element_bitsize)
6186     {
6187       tree bitpos = bitsize_int (bit_offset);
6188       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6189                                bitsize, bitpos);
6190 
6191       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6192       rhs = make_ssa_name (scalar_dest, stmt);
6193       gimple_assign_set_lhs (stmt, rhs);
6194       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6195 
6196       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6197       tree new_name = make_ssa_name (scalar_dest, stmt);
6198       gimple_assign_set_lhs (stmt, new_name);
6199       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6200       lhs = new_name;
6201     }
6202   return lhs;
6203 }
6204 
6205 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6206    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6207    statement.  CODE is the operation performed by STMT and OPS are
6208    its scalar operands.  REDUC_INDEX is the index of the operand in
6209    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6210    implements in-order reduction, or IFN_LAST if we should open-code it.
6211    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6212    that should be used to control the operation in a fully-masked loop.  */
6213 
6214 static bool
vectorize_fold_left_reduction(gimple * stmt,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node,gimple * reduc_def_stmt,tree_code code,internal_fn reduc_fn,tree ops[3],tree vectype_in,int reduc_index,vec_loop_masks * masks)6215 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6216                                      gimple **vec_stmt, slp_tree slp_node,
6217                                      gimple *reduc_def_stmt,
6218                                      tree_code code, internal_fn reduc_fn,
6219                                      tree ops[3], tree vectype_in,
6220                                      int reduc_index, vec_loop_masks *masks)
6221 {
6222   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6223   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6224   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6225   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6226   gimple *new_stmt = NULL;
6227 
6228   int ncopies;
6229   if (slp_node)
6230     ncopies = 1;
6231   else
6232     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6233 
6234   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6235   gcc_assert (ncopies == 1);
6236   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6237   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6238   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6239                 == FOLD_LEFT_REDUCTION);
6240 
6241   if (slp_node)
6242     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6243                                 TYPE_VECTOR_SUBPARTS (vectype_in)));
6244 
6245   tree op0 = ops[1 - reduc_index];
6246 
6247   int group_size = 1;
6248   gimple *scalar_dest_def;
6249   auto_vec<tree> vec_oprnds0;
6250   if (slp_node)
6251     {
6252       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6253       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6254       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6255     }
6256   else
6257     {
6258       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6259       vec_oprnds0.create (1);
6260       vec_oprnds0.quick_push (loop_vec_def0);
6261       scalar_dest_def = stmt;
6262     }
6263 
6264   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6265   tree scalar_type = TREE_TYPE (scalar_dest);
6266   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6267 
6268   int vec_num = vec_oprnds0.length ();
6269   gcc_assert (vec_num == 1 || slp_node);
6270   tree vec_elem_type = TREE_TYPE (vectype_out);
6271   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6272 
6273   tree vector_identity = NULL_TREE;
6274   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6275     vector_identity = build_zero_cst (vectype_out);
6276 
6277   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6278   int i;
6279   tree def0;
6280   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6281     {
6282       tree mask = NULL_TREE;
6283       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6284           mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6285 
6286       /* Handle MINUS by adding the negative.  */
6287       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6288           {
6289             tree negated = make_ssa_name (vectype_out);
6290             new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6291             gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6292             def0 = negated;
6293           }
6294 
6295       if (mask)
6296           def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6297                                             vector_identity);
6298 
6299       /* On the first iteration the input is simply the scalar phi
6300            result, and for subsequent iterations it is the output of
6301            the preceding operation.  */
6302       if (reduc_fn != IFN_LAST)
6303           {
6304             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6305             /* For chained SLP reductions the output of the previous reduction
6306                operation serves as the input of the next. For the final statement
6307                the output cannot be a temporary - we reuse the original
6308                scalar destination of the last statement.  */
6309             if (i != vec_num - 1)
6310               {
6311                 gimple_set_lhs (new_stmt, scalar_dest_var);
6312                 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6313                 gimple_set_lhs (new_stmt, reduc_var);
6314               }
6315           }
6316       else
6317           {
6318             reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6319                                                        reduc_var, def0);
6320             new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6321             /* Remove the statement, so that we can use the same code paths
6322                as for statements that we've just created.  */
6323             gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6324             gsi_remove (&tmp_gsi, true);
6325           }
6326 
6327       if (i == vec_num - 1)
6328           {
6329             gimple_set_lhs (new_stmt, scalar_dest);
6330             vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6331           }
6332       else
6333           vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6334 
6335       if (slp_node)
6336           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6337     }
6338 
6339   if (!slp_node)
6340     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6341 
6342   return true;
6343 }
6344 
6345 /* Function is_nonwrapping_integer_induction.
6346 
6347    Check if STMT (which is part of loop LOOP) both increments and
6348    does not cause overflow.  */
6349 
6350 static bool
is_nonwrapping_integer_induction(gimple * stmt,struct loop * loop)6351 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6352 {
6353   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6354   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6355   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6356   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6357   widest_int ni, max_loop_value, lhs_max;
6358   bool overflow = false;
6359 
6360   /* Make sure the loop is integer based.  */
6361   if (TREE_CODE (base) != INTEGER_CST
6362       || TREE_CODE (step) != INTEGER_CST)
6363     return false;
6364 
6365   /* Check that the max size of the loop will not wrap.  */
6366 
6367   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6368     return true;
6369 
6370   if (! max_stmt_executions (loop, &ni))
6371     return false;
6372 
6373   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6374                                   &overflow);
6375   if (overflow)
6376     return false;
6377 
6378   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6379                                   TYPE_SIGN (lhs_type), &overflow);
6380   if (overflow)
6381     return false;
6382 
6383   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6384             <= TYPE_PRECISION (lhs_type));
6385 }
6386 
6387 /* Function vectorizable_reduction.
6388 
6389    Check if STMT performs a reduction operation that can be vectorized.
6390    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6391    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6392    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6393 
6394    This function also handles reduction idioms (patterns) that have been
6395    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6396    of this form:
6397      X = pattern_expr (arg0, arg1, ..., X)
6398    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6399    sequence that had been detected and replaced by the pattern-stmt (STMT).
6400 
6401    This function also handles reduction of condition expressions, for example:
6402      for (int i = 0; i < N; i++)
6403        if (a[i] < value)
6404            last = a[i];
6405    This is handled by vectorising the loop and creating an additional vector
6406    containing the loop indexes for which "a[i] < value" was true.  In the
6407    function epilogue this is reduced to a single max value and then used to
6408    index into the vector of results.
6409 
6410    In some cases of reduction patterns, the type of the reduction variable X is
6411    different than the type of the other arguments of STMT.
6412    In such cases, the vectype that is used when transforming STMT into a vector
6413    stmt is different than the vectype that is used to determine the
6414    vectorization factor, because it consists of a different number of elements
6415    than the actual number of elements that are being operated upon in parallel.
6416 
6417    For example, consider an accumulation of shorts into an int accumulator.
6418    On some targets it's possible to vectorize this pattern operating on 8
6419    shorts at a time (hence, the vectype for purposes of determining the
6420    vectorization factor should be V8HI); on the other hand, the vectype that
6421    is used to create the vector form is actually V4SI (the type of the result).
6422 
6423    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6424    indicates what is the actual level of parallelism (V8HI in the example), so
6425    that the right vectorization factor would be derived.  This vectype
6426    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6427    be used to create the vectorized stmt.  The right vectype for the vectorized
6428    stmt is obtained from the type of the result X:
6429         get_vectype_for_scalar_type (TREE_TYPE (X))
6430 
6431    This means that, contrary to "regular" reductions (or "regular" stmts in
6432    general), the following equation:
6433       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6434    does *NOT* necessarily hold for reduction patterns.  */
6435 
6436 bool
vectorizable_reduction(gimple * stmt,gimple_stmt_iterator * gsi,gimple ** vec_stmt,slp_tree slp_node,slp_instance slp_node_instance)6437 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6438                               gimple **vec_stmt, slp_tree slp_node,
6439                               slp_instance slp_node_instance)
6440 {
6441   tree vec_dest;
6442   tree scalar_dest;
6443   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6444   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6445   tree vectype_in = NULL_TREE;
6446   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6447   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6448   enum tree_code code, orig_code;
6449   internal_fn reduc_fn;
6450   machine_mode vec_mode;
6451   int op_type;
6452   optab optab;
6453   tree new_temp = NULL_TREE;
6454   gimple *def_stmt;
6455   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6456   gimple *cond_reduc_def_stmt = NULL;
6457   enum tree_code cond_reduc_op_code = ERROR_MARK;
6458   tree scalar_type;
6459   bool is_simple_use;
6460   gimple *orig_stmt;
6461   stmt_vec_info orig_stmt_info = NULL;
6462   int i;
6463   int ncopies;
6464   int epilog_copies;
6465   stmt_vec_info prev_stmt_info, prev_phi_info;
6466   bool single_defuse_cycle = false;
6467   gimple *new_stmt = NULL;
6468   int j;
6469   tree ops[3];
6470   enum vect_def_type dts[3];
6471   bool nested_cycle = false, found_nested_cycle_def = false;
6472   bool double_reduc = false;
6473   basic_block def_bb;
6474   struct loop * def_stmt_loop, *outer_loop = NULL;
6475   tree def_arg;
6476   gimple *def_arg_stmt;
6477   auto_vec<tree> vec_oprnds0;
6478   auto_vec<tree> vec_oprnds1;
6479   auto_vec<tree> vec_oprnds2;
6480   auto_vec<tree> vect_defs;
6481   auto_vec<gimple *> phis;
6482   int vec_num;
6483   tree def0, tem;
6484   bool first_p = true;
6485   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6486   tree cond_reduc_val = NULL_TREE;
6487 
6488   /* Make sure it was already recognized as a reduction computation.  */
6489   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6490       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6491     return false;
6492 
6493   if (nested_in_vect_loop_p (loop, stmt))
6494     {
6495       outer_loop = loop;
6496       loop = loop->inner;
6497       nested_cycle = true;
6498     }
6499 
6500   /* In case of reduction chain we switch to the first stmt in the chain, but
6501      we don't update STMT_INFO, since only the last stmt is marked as reduction
6502      and has reduction properties.  */
6503   if (GROUP_FIRST_ELEMENT (stmt_info)
6504       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6505     {
6506       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6507       first_p = false;
6508     }
6509 
6510   if (gimple_code (stmt) == GIMPLE_PHI)
6511     {
6512       /* Analysis is fully done on the reduction stmt invocation.  */
6513       if (! vec_stmt)
6514           {
6515             if (slp_node)
6516               slp_node_instance->reduc_phis = slp_node;
6517 
6518             STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6519             return true;
6520           }
6521 
6522       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6523           /* Leave the scalar phi in place.  Note that checking
6524              STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6525              for reductions involving a single statement.  */
6526           return true;
6527 
6528       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6529       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6530           reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6531 
6532       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6533             == EXTRACT_LAST_REDUCTION)
6534           /* Leave the scalar phi in place.  */
6535           return true;
6536 
6537       gcc_assert (is_gimple_assign (reduc_stmt));
6538       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6539           {
6540             tree op = gimple_op (reduc_stmt, k);
6541             if (op == gimple_phi_result (stmt))
6542               continue;
6543             if (k == 1
6544                 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6545               continue;
6546             if (!vectype_in
6547                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6548                       < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6549               vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6550             break;
6551           }
6552       gcc_assert (vectype_in);
6553 
6554       if (slp_node)
6555           ncopies = 1;
6556       else
6557           ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6558 
6559       use_operand_p use_p;
6560       gimple *use_stmt;
6561       if (ncopies > 1
6562             && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6563                 <= vect_used_only_live)
6564             && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6565             && (use_stmt == reduc_stmt
6566                 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6567                       == reduc_stmt)))
6568           single_defuse_cycle = true;
6569 
6570       /* Create the destination vector  */
6571       scalar_dest = gimple_assign_lhs (reduc_stmt);
6572       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6573 
6574       if (slp_node)
6575           /* The size vect_schedule_slp_instance computes is off for us.  */
6576           vec_num = vect_get_num_vectors
6577             (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6578              * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6579              vectype_in);
6580       else
6581           vec_num = 1;
6582 
6583       /* Generate the reduction PHIs upfront.  */
6584       prev_phi_info = NULL;
6585       for (j = 0; j < ncopies; j++)
6586           {
6587             if (j == 0 || !single_defuse_cycle)
6588               {
6589                 for (i = 0; i < vec_num; i++)
6590                     {
6591                       /* Create the reduction-phi that defines the reduction
6592                          operand.  */
6593                       gimple *new_phi = create_phi_node (vec_dest, loop->header);
6594                       set_vinfo_for_stmt (new_phi,
6595                                               new_stmt_vec_info (new_phi, loop_vinfo));
6596 
6597                       if (slp_node)
6598                         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6599                       else
6600                         {
6601                           if (j == 0)
6602                               STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6603                           else
6604                               STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6605                           prev_phi_info = vinfo_for_stmt (new_phi);
6606                         }
6607                     }
6608               }
6609           }
6610 
6611       return true;
6612     }
6613 
6614   /* 1. Is vectorizable reduction?  */
6615   /* Not supportable if the reduction variable is used in the loop, unless
6616      it's a reduction chain.  */
6617   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6618       && !GROUP_FIRST_ELEMENT (stmt_info))
6619     return false;
6620 
6621   /* Reductions that are not used even in an enclosing outer-loop,
6622      are expected to be "live" (used out of the loop).  */
6623   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6624       && !STMT_VINFO_LIVE_P (stmt_info))
6625     return false;
6626 
6627   /* 2. Has this been recognized as a reduction pattern?
6628 
6629      Check if STMT represents a pattern that has been recognized
6630      in earlier analysis stages.  For stmts that represent a pattern,
6631      the STMT_VINFO_RELATED_STMT field records the last stmt in
6632      the original sequence that constitutes the pattern.  */
6633 
6634   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6635   if (orig_stmt)
6636     {
6637       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6638       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6639       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6640     }
6641 
6642   /* 3. Check the operands of the operation.  The first operands are defined
6643         inside the loop body. The last operand is the reduction variable,
6644         which is defined by the loop-header-phi.  */
6645 
6646   gcc_assert (is_gimple_assign (stmt));
6647 
6648   /* Flatten RHS.  */
6649   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6650     {
6651     case GIMPLE_BINARY_RHS:
6652       code = gimple_assign_rhs_code (stmt);
6653       op_type = TREE_CODE_LENGTH (code);
6654       gcc_assert (op_type == binary_op);
6655       ops[0] = gimple_assign_rhs1 (stmt);
6656       ops[1] = gimple_assign_rhs2 (stmt);
6657       break;
6658 
6659     case GIMPLE_TERNARY_RHS:
6660       code = gimple_assign_rhs_code (stmt);
6661       op_type = TREE_CODE_LENGTH (code);
6662       gcc_assert (op_type == ternary_op);
6663       ops[0] = gimple_assign_rhs1 (stmt);
6664       ops[1] = gimple_assign_rhs2 (stmt);
6665       ops[2] = gimple_assign_rhs3 (stmt);
6666       break;
6667 
6668     case GIMPLE_UNARY_RHS:
6669       return false;
6670 
6671     default:
6672       gcc_unreachable ();
6673     }
6674 
6675   if (code == COND_EXPR && slp_node)
6676     return false;
6677 
6678   scalar_dest = gimple_assign_lhs (stmt);
6679   scalar_type = TREE_TYPE (scalar_dest);
6680   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6681       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6682     return false;
6683 
6684   /* Do not try to vectorize bit-precision reductions.  */
6685   if (!type_has_mode_precision_p (scalar_type))
6686     return false;
6687 
6688   /* All uses but the last are expected to be defined in the loop.
6689      The last use is the reduction variable.  In case of nested cycle this
6690      assumption is not true: we use reduc_index to record the index of the
6691      reduction variable.  */
6692   gimple *reduc_def_stmt = NULL;
6693   int reduc_index = -1;
6694   for (i = 0; i < op_type; i++)
6695     {
6696       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6697       if (i == 0 && code == COND_EXPR)
6698         continue;
6699 
6700       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6701                                                     &def_stmt, &dts[i], &tem);
6702       dt = dts[i];
6703       gcc_assert (is_simple_use);
6704       if (dt == vect_reduction_def)
6705           {
6706           reduc_def_stmt = def_stmt;
6707             reduc_index = i;
6708             continue;
6709           }
6710       else if (tem)
6711           {
6712             /* To properly compute ncopies we are interested in the widest
6713                input type in case we're looking at a widening accumulation.  */
6714             if (!vectype_in
6715                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6716                       < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6717               vectype_in = tem;
6718           }
6719 
6720       if (dt != vect_internal_def
6721             && dt != vect_external_def
6722             && dt != vect_constant_def
6723             && dt != vect_induction_def
6724           && !(dt == vect_nested_cycle && nested_cycle))
6725           return false;
6726 
6727       if (dt == vect_nested_cycle)
6728         {
6729           found_nested_cycle_def = true;
6730           reduc_def_stmt = def_stmt;
6731           reduc_index = i;
6732         }
6733 
6734       if (i == 1 && code == COND_EXPR)
6735           {
6736             /* Record how value of COND_EXPR is defined.  */
6737             if (dt == vect_constant_def)
6738               {
6739                 cond_reduc_dt = dt;
6740                 cond_reduc_val = ops[i];
6741               }
6742             if (dt == vect_induction_def
6743                 && def_stmt != NULL
6744                 && is_nonwrapping_integer_induction (def_stmt, loop))
6745               {
6746                 cond_reduc_dt = dt;
6747                 cond_reduc_def_stmt = def_stmt;
6748               }
6749           }
6750     }
6751 
6752   if (!vectype_in)
6753     vectype_in = vectype_out;
6754 
6755   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6756      directly used in stmt.  */
6757   if (reduc_index == -1)
6758     {
6759       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6760           {
6761             if (dump_enabled_p ())
6762               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6763                                    "in-order reduction chain without SLP.\n");
6764             return false;
6765           }
6766 
6767       if (orig_stmt)
6768           reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6769       else
6770           reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6771     }
6772 
6773   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6774     return false;
6775 
6776   if (!(reduc_index == -1
6777           || dts[reduc_index] == vect_reduction_def
6778           || dts[reduc_index] == vect_nested_cycle
6779           || ((dts[reduc_index] == vect_internal_def
6780                || dts[reduc_index] == vect_external_def
6781                || dts[reduc_index] == vect_constant_def
6782                || dts[reduc_index] == vect_induction_def)
6783               && nested_cycle && found_nested_cycle_def)))
6784     {
6785       /* For pattern recognized stmts, orig_stmt might be a reduction,
6786            but some helper statements for the pattern might not, or
6787            might be COND_EXPRs with reduction uses in the condition.  */
6788       gcc_assert (orig_stmt);
6789       return false;
6790     }
6791 
6792   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6793   enum vect_reduction_type v_reduc_type
6794     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6795   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6796 
6797   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6798   /* If we have a condition reduction, see if we can simplify it further.  */
6799   if (v_reduc_type == COND_REDUCTION)
6800     {
6801       /* TODO: We can't yet handle reduction chains, since we need to treat
6802            each COND_EXPR in the chain specially, not just the last one.
6803            E.g. for:
6804 
6805               x_1 = PHI <x_3, ...>
6806               x_2 = a_2 ? ... : x_1;
6807               x_3 = a_3 ? ... : x_2;
6808 
6809            we're interested in the last element in x_3 for which a_2 || a_3
6810            is true, whereas the current reduction chain handling would
6811            vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6812            as a reduction operation.  */
6813       if (reduc_index == -1)
6814           {
6815             if (dump_enabled_p ())
6816               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6817                                    "conditional reduction chains not supported\n");
6818             return false;
6819           }
6820 
6821       /* vect_is_simple_reduction ensured that operand 2 is the
6822            loop-carried operand.  */
6823       gcc_assert (reduc_index == 2);
6824 
6825       /* Loop peeling modifies initial value of reduction PHI, which
6826            makes the reduction stmt to be transformed different to the
6827            original stmt analyzed.  We need to record reduction code for
6828            CONST_COND_REDUCTION type reduction at analyzing stage, thus
6829            it can be used directly at transform stage.  */
6830       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6831             || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6832           {
6833             /* Also set the reduction type to CONST_COND_REDUCTION.  */
6834             gcc_assert (cond_reduc_dt == vect_constant_def);
6835             STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6836           }
6837       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6838                                                          vectype_in, OPTIMIZE_FOR_SPEED))
6839           {
6840             if (dump_enabled_p ())
6841               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842                                    "optimizing condition reduction with"
6843                                    " FOLD_EXTRACT_LAST.\n");
6844             STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6845           }
6846       else if (cond_reduc_dt == vect_induction_def)
6847           {
6848             stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6849             tree base
6850               = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6851             tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6852 
6853             gcc_assert (TREE_CODE (base) == INTEGER_CST
6854                           && TREE_CODE (step) == INTEGER_CST);
6855             cond_reduc_val = NULL_TREE;
6856             /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6857                above base; punt if base is the minimum value of the type for
6858                MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6859             if (tree_int_cst_sgn (step) == -1)
6860               {
6861                 cond_reduc_op_code = MIN_EXPR;
6862                 if (tree_int_cst_sgn (base) == -1)
6863                     cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6864                 else if (tree_int_cst_lt (base,
6865                                                   TYPE_MAX_VALUE (TREE_TYPE (base))))
6866                     cond_reduc_val
6867                       = int_const_binop (PLUS_EXPR, base, integer_one_node);
6868               }
6869             else
6870               {
6871                 cond_reduc_op_code = MAX_EXPR;
6872                 if (tree_int_cst_sgn (base) == 1)
6873                     cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6874                 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6875                                                   base))
6876                     cond_reduc_val
6877                       = int_const_binop (MINUS_EXPR, base, integer_one_node);
6878               }
6879             if (cond_reduc_val)
6880               {
6881                 if (dump_enabled_p ())
6882                     dump_printf_loc (MSG_NOTE, vect_location,
6883                                          "condition expression based on "
6884                                          "integer induction.\n");
6885                 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6886                     = INTEGER_INDUC_COND_REDUCTION;
6887               }
6888           }
6889       else if (cond_reduc_dt == vect_constant_def)
6890           {
6891             enum vect_def_type cond_initial_dt;
6892             gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6893             tree cond_initial_val
6894               = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6895 
6896             gcc_assert (cond_reduc_val != NULL_TREE);
6897             vect_is_simple_use (cond_initial_val, loop_vinfo,
6898                                     &def_stmt, &cond_initial_dt);
6899             if (cond_initial_dt == vect_constant_def
6900                 && types_compatible_p (TREE_TYPE (cond_initial_val),
6901                                              TREE_TYPE (cond_reduc_val)))
6902               {
6903                 tree e = fold_binary (LE_EXPR, boolean_type_node,
6904                                             cond_initial_val, cond_reduc_val);
6905                 if (e && (integer_onep (e) || integer_zerop (e)))
6906                     {
6907                       if (dump_enabled_p ())
6908                         dump_printf_loc (MSG_NOTE, vect_location,
6909                                              "condition expression based on "
6910                                              "compile time constant.\n");
6911                       /* Record reduction code at analysis stage.  */
6912                       STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6913                         = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6914                       STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6915                         = CONST_COND_REDUCTION;
6916                     }
6917               }
6918           }
6919     }
6920 
6921   if (orig_stmt)
6922     gcc_assert (tmp == orig_stmt
6923                     || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6924   else
6925     /* We changed STMT to be the first stmt in reduction chain, hence we
6926        check that in this case the first element in the chain is STMT.  */
6927     gcc_assert (stmt == tmp
6928                     || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6929 
6930   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6931     return false;
6932 
6933   if (slp_node)
6934     ncopies = 1;
6935   else
6936     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6937 
6938   gcc_assert (ncopies >= 1);
6939 
6940   vec_mode = TYPE_MODE (vectype_in);
6941   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6942 
6943   if (code == COND_EXPR)
6944     {
6945       /* Only call during the analysis stage, otherwise we'll lose
6946            STMT_VINFO_TYPE.  */
6947       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6948                                                             ops[reduc_index], 0, NULL))
6949         {
6950           if (dump_enabled_p ())
6951               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6952                                    "unsupported condition in reduction\n");
6953             return false;
6954         }
6955     }
6956   else
6957     {
6958       /* 4. Supportable by target?  */
6959 
6960       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6961             || code == LROTATE_EXPR || code == RROTATE_EXPR)
6962           {
6963             /* Shifts and rotates are only supported by vectorizable_shifts,
6964                not vectorizable_reduction.  */
6965           if (dump_enabled_p ())
6966               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967                                    "unsupported shift or rotation.\n");
6968             return false;
6969           }
6970 
6971       /* 4.1. check support for the operation in the loop  */
6972       optab = optab_for_tree_code (code, vectype_in, optab_default);
6973       if (!optab)
6974         {
6975           if (dump_enabled_p ())
6976               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6977                                    "no optab.\n");
6978 
6979           return false;
6980         }
6981 
6982       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6983         {
6984           if (dump_enabled_p ())
6985             dump_printf (MSG_NOTE, "op not supported by target.\n");
6986 
6987             if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6988                 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6989             return false;
6990 
6991           if (dump_enabled_p ())
6992               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6993         }
6994 
6995       /* Worthwhile without SIMD support?  */
6996       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6997             && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6998         {
6999           if (dump_enabled_p ())
7000               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001                                    "not worthwhile without SIMD support.\n");
7002 
7003           return false;
7004         }
7005     }
7006 
7007   /* 4.2. Check support for the epilog operation.
7008 
7009           If STMT represents a reduction pattern, then the type of the
7010           reduction variable may be different than the type of the rest
7011           of the arguments.  For example, consider the case of accumulation
7012           of shorts into an int accumulator; The original code:
7013                         S1: int_a = (int) short_a;
7014           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7015 
7016           was replaced with:
7017                         STMT: int_acc = widen_sum <short_a, int_acc>
7018 
7019           This means that:
7020           1. The tree-code that is used to create the vector operation in the
7021              epilog code (that reduces the partial results) is not the
7022              tree-code of STMT, but is rather the tree-code of the original
7023              stmt from the pattern that STMT is replacing.  I.e, in the example
7024              above we want to use 'widen_sum' in the loop, but 'plus' in the
7025              epilog.
7026           2. The type (mode) we use to check available target support
7027              for the vector operation to be created in the *epilog*, is
7028              determined by the type of the reduction variable (in the example
7029              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7030              However the type (mode) we use to check available target support
7031              for the vector operation to be created *inside the loop*, is
7032              determined by the type of the other arguments to STMT (in the
7033              example we'd check this: optab_handler (widen_sum_optab,
7034                vect_short_mode)).
7035 
7036           This is contrary to "regular" reductions, in which the types of all
7037           the arguments are the same as the type of the reduction variable.
7038           For "regular" reductions we can therefore use the same vector type
7039           (and also the same tree-code) when generating the epilog code and
7040           when generating the code inside the loop.  */
7041 
7042   vect_reduction_type reduction_type
7043     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7044   if (orig_stmt
7045       && (reduction_type == TREE_CODE_REDUCTION
7046             || reduction_type == FOLD_LEFT_REDUCTION))
7047     {
7048       /* This is a reduction pattern: get the vectype from the type of the
7049          reduction variable, and get the tree-code from orig_stmt.  */
7050       orig_code = gimple_assign_rhs_code (orig_stmt);
7051       gcc_assert (vectype_out);
7052       vec_mode = TYPE_MODE (vectype_out);
7053     }
7054   else
7055     {
7056       /* Regular reduction: the same vectype and tree-code as used for
7057          the vector code inside the loop can be used for the epilog code. */
7058       orig_code = code;
7059 
7060       if (code == MINUS_EXPR)
7061           orig_code = PLUS_EXPR;
7062 
7063       /* For simple condition reductions, replace with the actual expression
7064            we want to base our reduction around.  */
7065       if (reduction_type == CONST_COND_REDUCTION)
7066           {
7067             orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7068             gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7069           }
7070       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7071           orig_code = cond_reduc_op_code;
7072     }
7073 
7074   if (nested_cycle)
7075     {
7076       def_bb = gimple_bb (reduc_def_stmt);
7077       def_stmt_loop = def_bb->loop_father;
7078       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7079                                        loop_preheader_edge (def_stmt_loop));
7080       if (TREE_CODE (def_arg) == SSA_NAME
7081           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7082           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7083           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7084           && vinfo_for_stmt (def_arg_stmt)
7085           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7086               == vect_double_reduction_def)
7087         double_reduc = true;
7088     }
7089 
7090   reduc_fn = IFN_LAST;
7091 
7092   if (reduction_type == TREE_CODE_REDUCTION
7093       || reduction_type == FOLD_LEFT_REDUCTION
7094       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7095       || reduction_type == CONST_COND_REDUCTION)
7096     {
7097       if (reduction_type == FOLD_LEFT_REDUCTION
7098             ? fold_left_reduction_fn (orig_code, &reduc_fn)
7099             : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7100           {
7101             if (reduc_fn != IFN_LAST
7102                 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7103                                                               OPTIMIZE_FOR_SPEED))
7104               {
7105                 if (dump_enabled_p ())
7106                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7107                                          "reduc op not supported by target.\n");
7108 
7109                 reduc_fn = IFN_LAST;
7110               }
7111           }
7112       else
7113           {
7114             if (!nested_cycle || double_reduc)
7115               {
7116                 if (dump_enabled_p ())
7117                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7118                                          "no reduc code for scalar code.\n");
7119 
7120                 return false;
7121               }
7122           }
7123     }
7124   else if (reduction_type == COND_REDUCTION)
7125     {
7126       int scalar_precision
7127           = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7128       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7129       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7130                                                             nunits_out);
7131 
7132       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7133                                                     OPTIMIZE_FOR_SPEED))
7134           reduc_fn = IFN_REDUC_MAX;
7135     }
7136 
7137   if (reduction_type != EXTRACT_LAST_REDUCTION
7138       && reduc_fn == IFN_LAST
7139       && !nunits_out.is_constant ())
7140     {
7141       if (dump_enabled_p ())
7142           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143                                "missing target support for reduction on"
7144                                " variable-length vectors.\n");
7145       return false;
7146     }
7147 
7148   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7149       && ncopies > 1)
7150     {
7151       if (dump_enabled_p ())
7152           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7153                                "multiple types in double reduction or condition "
7154                                "reduction.\n");
7155       return false;
7156     }
7157 
7158   /* For SLP reductions, see if there is a neutral value we can use.  */
7159   tree neutral_op = NULL_TREE;
7160   if (slp_node)
7161     neutral_op
7162       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7163                                               GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7164 
7165   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7166     {
7167       /* We can't support in-order reductions of code such as this:
7168 
7169              for (int i = 0; i < n1; ++i)
7170                for (int j = 0; j < n2; ++j)
7171                  l += a[j];
7172 
7173            since GCC effectively transforms the loop when vectorizing:
7174 
7175              for (int i = 0; i < n1 / VF; ++i)
7176                for (int j = 0; j < n2; ++j)
7177                  for (int k = 0; k < VF; ++k)
7178                      l += a[j];
7179 
7180            which is a reassociation of the original operation.  */
7181       if (dump_enabled_p ())
7182           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7183                                "in-order double reduction not supported.\n");
7184 
7185       return false;
7186     }
7187 
7188   if (reduction_type == FOLD_LEFT_REDUCTION
7189       && slp_node
7190       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7191     {
7192       /* We cannot use in-order reductions in this case because there is
7193            an implicit reassociation of the operations involved.  */
7194       if (dump_enabled_p ())
7195           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7196                                "in-order unchained SLP reductions not supported.\n");
7197       return false;
7198     }
7199 
7200   /* For double reductions, and for SLP reductions with a neutral value,
7201      we construct a variable-length initial vector by loading a vector
7202      full of the neutral value and then shift-and-inserting the start
7203      values into the low-numbered elements.  */
7204   if ((double_reduc || neutral_op)
7205       && !nunits_out.is_constant ()
7206       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7207                                                     vectype_out, OPTIMIZE_FOR_SPEED))
7208     {
7209       if (dump_enabled_p ())
7210           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7211                                "reduction on variable-length vectors requires"
7212                                " target support for a vector-shift-and-insert"
7213                                " operation.\n");
7214       return false;
7215     }
7216 
7217   /* Check extra constraints for variable-length unchained SLP reductions.  */
7218   if (STMT_SLP_TYPE (stmt_info)
7219       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7220       && !nunits_out.is_constant ())
7221     {
7222       /* We checked above that we could build the initial vector when
7223            there's a neutral element value.  Check here for the case in
7224            which each SLP statement has its own initial value and in which
7225            that value needs to be repeated for every instance of the
7226            statement within the initial vector.  */
7227       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7228       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7229       if (!neutral_op
7230             && !can_duplicate_and_interleave_p (group_size, elt_mode))
7231           {
7232             if (dump_enabled_p ())
7233               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7234                                    "unsupported form of SLP reduction for"
7235                                    " variable-length vectors: cannot build"
7236                                    " initial vector.\n");
7237             return false;
7238           }
7239       /* The epilogue code relies on the number of elements being a multiple
7240            of the group size.  The duplicate-and-interleave approach to setting
7241            up the initial vector does too.  */
7242       if (!multiple_p (nunits_out, group_size))
7243           {
7244             if (dump_enabled_p ())
7245               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7246                                    "unsupported form of SLP reduction for"
7247                                    " variable-length vectors: the vector size"
7248                                    " is not a multiple of the number of results.\n");
7249             return false;
7250           }
7251     }
7252 
7253   /* In case of widening multiplication by a constant, we update the type
7254      of the constant to be the type of the other operand.  We check that the
7255      constant fits the type in the pattern recognition pass.  */
7256   if (code == DOT_PROD_EXPR
7257       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7258     {
7259       if (TREE_CODE (ops[0]) == INTEGER_CST)
7260         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7261       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7262         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7263       else
7264         {
7265           if (dump_enabled_p ())
7266               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7267                                    "invalid types in dot-prod\n");
7268 
7269           return false;
7270         }
7271     }
7272 
7273   if (reduction_type == COND_REDUCTION)
7274     {
7275       widest_int ni;
7276 
7277       if (! max_loop_iterations (loop, &ni))
7278           {
7279             if (dump_enabled_p ())
7280               dump_printf_loc (MSG_NOTE, vect_location,
7281                                    "loop count not known, cannot create cond "
7282                                    "reduction.\n");
7283             return false;
7284           }
7285       /* Convert backedges to iterations.  */
7286       ni += 1;
7287 
7288       /* The additional index will be the same type as the condition.  Check
7289            that the loop can fit into this less one (because we'll use up the
7290            zero slot for when there are no matches).  */
7291       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7292       if (wi::geu_p (ni, wi::to_widest (max_index)))
7293           {
7294             if (dump_enabled_p ())
7295               dump_printf_loc (MSG_NOTE, vect_location,
7296                                    "loop size is greater than data size.\n");
7297             return false;
7298           }
7299     }
7300 
7301   /* In case the vectorization factor (VF) is bigger than the number
7302      of elements that we can fit in a vectype (nunits), we have to generate
7303      more than one vector stmt - i.e - we need to "unroll" the
7304      vector stmt by a factor VF/nunits.  For more details see documentation
7305      in vectorizable_operation.  */
7306 
7307   /* If the reduction is used in an outer loop we need to generate
7308      VF intermediate results, like so (e.g. for ncopies=2):
7309           r0 = phi (init, r0)
7310           r1 = phi (init, r1)
7311           r0 = x0 + r0;
7312         r1 = x1 + r1;
7313     (i.e. we generate VF results in 2 registers).
7314     In this case we have a separate def-use cycle for each copy, and therefore
7315     for each copy we get the vector def for the reduction variable from the
7316     respective phi node created for this copy.
7317 
7318     Otherwise (the reduction is unused in the loop nest), we can combine
7319     together intermediate results, like so (e.g. for ncopies=2):
7320           r = phi (init, r)
7321           r = x0 + r;
7322           r = x1 + r;
7323    (i.e. we generate VF/2 results in a single register).
7324    In this case for each copy we get the vector def for the reduction variable
7325    from the vectorized reduction operation generated in the previous iteration.
7326 
7327    This only works when we see both the reduction PHI and its only consumer
7328    in vectorizable_reduction and there are no intermediate stmts
7329    participating.  */
7330   use_operand_p use_p;
7331   gimple *use_stmt;
7332   if (ncopies > 1
7333       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7334       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7335       && (use_stmt == stmt
7336             || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7337     {
7338       single_defuse_cycle = true;
7339       epilog_copies = 1;
7340     }
7341   else
7342     epilog_copies = ncopies;
7343 
7344   /* If the reduction stmt is one of the patterns that have lane
7345      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7346   if ((ncopies > 1
7347        && ! single_defuse_cycle)
7348       && (code == DOT_PROD_EXPR
7349             || code == WIDEN_SUM_EXPR
7350             || code == SAD_EXPR))
7351     {
7352       if (dump_enabled_p ())
7353           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7354                                "multi def-use cycle not possible for lane-reducing "
7355                                "reduction operation\n");
7356       return false;
7357     }
7358 
7359   if (slp_node)
7360     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7361   else
7362     vec_num = 1;
7363 
7364   internal_fn cond_fn = get_conditional_internal_fn (code);
7365   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7366 
7367   if (!vec_stmt) /* transformation not required.  */
7368     {
7369       if (first_p)
7370           vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7371       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7372           {
7373             if (reduction_type != FOLD_LEFT_REDUCTION
7374                 && (cond_fn == IFN_LAST
7375                       || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7376                                                                   OPTIMIZE_FOR_SPEED)))
7377               {
7378                 if (dump_enabled_p ())
7379                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7380                                          "can't use a fully-masked loop because no"
7381                                          " conditional operation is available.\n");
7382                 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7383               }
7384             else if (reduc_index == -1)
7385               {
7386                 if (dump_enabled_p ())
7387                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7388                                          "can't use a fully-masked loop for chained"
7389                                          " reductions.\n");
7390                 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7391               }
7392             else
7393               vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7394                                            vectype_in);
7395           }
7396       if (dump_enabled_p ()
7397             && reduction_type == FOLD_LEFT_REDUCTION)
7398           dump_printf_loc (MSG_NOTE, vect_location,
7399                                "using an in-order (fold-left) reduction.\n");
7400       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7401       return true;
7402     }
7403 
7404   /* Transform.  */
7405 
7406   if (dump_enabled_p ())
7407     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7408 
7409   /* FORNOW: Multiple types are not supported for condition.  */
7410   if (code == COND_EXPR)
7411     gcc_assert (ncopies == 1);
7412 
7413   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7414 
7415   if (reduction_type == FOLD_LEFT_REDUCTION)
7416     return vectorize_fold_left_reduction
7417       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7418        reduc_fn, ops, vectype_in, reduc_index, masks);
7419 
7420   if (reduction_type == EXTRACT_LAST_REDUCTION)
7421     {
7422       gcc_assert (!slp_node);
7423       return vectorizable_condition (stmt, gsi, vec_stmt,
7424                                              NULL, reduc_index, NULL);
7425     }
7426 
7427   /* Create the destination vector  */
7428   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7429 
7430   prev_stmt_info = NULL;
7431   prev_phi_info = NULL;
7432   if (!slp_node)
7433     {
7434       vec_oprnds0.create (1);
7435       vec_oprnds1.create (1);
7436       if (op_type == ternary_op)
7437         vec_oprnds2.create (1);
7438     }
7439 
7440   phis.create (vec_num);
7441   vect_defs.create (vec_num);
7442   if (!slp_node)
7443     vect_defs.quick_push (NULL_TREE);
7444 
7445   if (slp_node)
7446     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7447   else
7448     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7449 
7450   for (j = 0; j < ncopies; j++)
7451     {
7452       if (code == COND_EXPR)
7453         {
7454           gcc_assert (!slp_node);
7455           vectorizable_condition (stmt, gsi, vec_stmt,
7456                                   PHI_RESULT (phis[0]),
7457                                   reduc_index, NULL);
7458           /* Multiple types are not supported for condition.  */
7459           break;
7460         }
7461 
7462       /* Handle uses.  */
7463       if (j == 0)
7464         {
7465             if (slp_node)
7466               {
7467                 /* Get vec defs for all the operands except the reduction index,
7468                      ensuring the ordering of the ops in the vector is kept.  */
7469                 auto_vec<tree, 3> slp_ops;
7470                 auto_vec<vec<tree>, 3> vec_defs;
7471 
7472                 slp_ops.quick_push (ops[0]);
7473                 slp_ops.quick_push (ops[1]);
7474                 if (op_type == ternary_op)
7475                     slp_ops.quick_push (ops[2]);
7476 
7477                 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7478 
7479                 vec_oprnds0.safe_splice (vec_defs[0]);
7480                 vec_defs[0].release ();
7481                 vec_oprnds1.safe_splice (vec_defs[1]);
7482                 vec_defs[1].release ();
7483                 if (op_type == ternary_op)
7484                     {
7485                       vec_oprnds2.safe_splice (vec_defs[2]);
7486                       vec_defs[2].release ();
7487                     }
7488               }
7489           else
7490               {
7491               vec_oprnds0.quick_push
7492                     (vect_get_vec_def_for_operand (ops[0], stmt));
7493               vec_oprnds1.quick_push
7494                     (vect_get_vec_def_for_operand (ops[1], stmt));
7495               if (op_type == ternary_op)
7496                     vec_oprnds2.quick_push
7497                       (vect_get_vec_def_for_operand (ops[2], stmt));
7498               }
7499         }
7500       else
7501         {
7502           if (!slp_node)
7503             {
7504                 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7505 
7506                 if (single_defuse_cycle && reduc_index == 0)
7507                     vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7508                 else
7509                     vec_oprnds0[0]
7510                       = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7511                 if (single_defuse_cycle && reduc_index == 1)
7512                     vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7513                 else
7514                     vec_oprnds1[0]
7515                       = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7516                 if (op_type == ternary_op)
7517                     {
7518                       if (single_defuse_cycle && reduc_index == 2)
7519                         vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7520                       else
7521                         vec_oprnds2[0]
7522                           = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7523                     }
7524             }
7525         }
7526 
7527       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7528         {
7529             tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7530             if (masked_loop_p)
7531               {
7532                 /* Make sure that the reduction accumulator is vop[0].  */
7533                 if (reduc_index == 1)
7534                     {
7535                       gcc_assert (commutative_tree_code (code));
7536                       std::swap (vop[0], vop[1]);
7537                     }
7538                 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7539                                                         vectype_in, i * ncopies + j);
7540                 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7541                                                                       vop[0], vop[1]);
7542                 new_temp = make_ssa_name (vec_dest, call);
7543                 gimple_call_set_lhs (call, new_temp);
7544                 gimple_call_set_nothrow (call, true);
7545                 new_stmt = call;
7546               }
7547             else
7548               {
7549                 if (op_type == ternary_op)
7550                     vop[2] = vec_oprnds2[i];
7551 
7552                 new_temp = make_ssa_name (vec_dest, new_stmt);
7553                 new_stmt = gimple_build_assign (new_temp, code,
7554                                                         vop[0], vop[1], vop[2]);
7555               }
7556             vect_finish_stmt_generation (stmt, new_stmt, gsi);
7557 
7558           if (slp_node)
7559             {
7560               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7561               vect_defs.quick_push (new_temp);
7562             }
7563           else
7564             vect_defs[0] = new_temp;
7565         }
7566 
7567       if (slp_node)
7568         continue;
7569 
7570       if (j == 0)
7571           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7572       else
7573           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7574 
7575       prev_stmt_info = vinfo_for_stmt (new_stmt);
7576     }
7577 
7578   /* Finalize the reduction-phi (set its arguments) and create the
7579      epilog reduction code.  */
7580   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7581     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7582 
7583   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7584                                             epilog_copies, reduc_fn, phis,
7585                                             double_reduc, slp_node, slp_node_instance,
7586                                             cond_reduc_val, cond_reduc_op_code,
7587                                             neutral_op);
7588 
7589   return true;
7590 }
7591 
7592 /* Function vect_min_worthwhile_factor.
7593 
7594    For a loop where we could vectorize the operation indicated by CODE,
7595    return the minimum vectorization factor that makes it worthwhile
7596    to use generic vectors.  */
7597 static unsigned int
vect_min_worthwhile_factor(enum tree_code code)7598 vect_min_worthwhile_factor (enum tree_code code)
7599 {
7600   switch (code)
7601     {
7602     case PLUS_EXPR:
7603     case MINUS_EXPR:
7604     case NEGATE_EXPR:
7605       return 4;
7606 
7607     case BIT_AND_EXPR:
7608     case BIT_IOR_EXPR:
7609     case BIT_XOR_EXPR:
7610     case BIT_NOT_EXPR:
7611       return 2;
7612 
7613     default:
7614       return INT_MAX;
7615     }
7616 }
7617 
7618 /* Return true if VINFO indicates we are doing loop vectorization and if
7619    it is worth decomposing CODE operations into scalar operations for
7620    that loop's vectorization factor.  */
7621 
7622 bool
vect_worthwhile_without_simd_p(vec_info * vinfo,tree_code code)7623 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7624 {
7625   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7626   unsigned HOST_WIDE_INT value;
7627   return (loop_vinfo
7628             && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7629             && value >= vect_min_worthwhile_factor (code));
7630 }
7631 
7632 /* Function vectorizable_induction
7633 
7634    Check if PHI performs an induction computation that can be vectorized.
7635    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7636    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7637    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7638 
7639 bool
vectorizable_induction(gimple * phi,gimple_stmt_iterator * gsi ATTRIBUTE_UNUSED,gimple ** vec_stmt,slp_tree slp_node)7640 vectorizable_induction (gimple *phi,
7641                               gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7642                               gimple **vec_stmt, slp_tree slp_node)
7643 {
7644   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7645   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7646   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7647   unsigned ncopies;
7648   bool nested_in_vect_loop = false;
7649   struct loop *iv_loop;
7650   tree vec_def;
7651   edge pe = loop_preheader_edge (loop);
7652   basic_block new_bb;
7653   tree new_vec, vec_init, vec_step, t;
7654   tree new_name;
7655   gimple *new_stmt;
7656   gphi *induction_phi;
7657   tree induc_def, vec_dest;
7658   tree init_expr, step_expr;
7659   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7660   unsigned i;
7661   tree expr;
7662   gimple_seq stmts;
7663   imm_use_iterator imm_iter;
7664   use_operand_p use_p;
7665   gimple *exit_phi;
7666   edge latch_e;
7667   tree loop_arg;
7668   gimple_stmt_iterator si;
7669   basic_block bb = gimple_bb (phi);
7670 
7671   if (gimple_code (phi) != GIMPLE_PHI)
7672     return false;
7673 
7674   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7675     return false;
7676 
7677   /* Make sure it was recognized as induction computation.  */
7678   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7679     return false;
7680 
7681   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7682   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7683 
7684   if (slp_node)
7685     ncopies = 1;
7686   else
7687     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7688   gcc_assert (ncopies >= 1);
7689 
7690   /* FORNOW. These restrictions should be relaxed.  */
7691   if (nested_in_vect_loop_p (loop, phi))
7692     {
7693       imm_use_iterator imm_iter;
7694       use_operand_p use_p;
7695       gimple *exit_phi;
7696       edge latch_e;
7697       tree loop_arg;
7698 
7699       if (ncopies > 1)
7700           {
7701             if (dump_enabled_p ())
7702               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7703                                    "multiple types in nested loop.\n");
7704             return false;
7705           }
7706 
7707       /* FORNOW: outer loop induction with SLP not supported.  */
7708       if (STMT_SLP_TYPE (stmt_info))
7709           return false;
7710 
7711       exit_phi = NULL;
7712       latch_e = loop_latch_edge (loop->inner);
7713       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7714       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7715           {
7716             gimple *use_stmt = USE_STMT (use_p);
7717             if (is_gimple_debug (use_stmt))
7718               continue;
7719 
7720             if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7721               {
7722                 exit_phi = use_stmt;
7723                 break;
7724               }
7725           }
7726       if (exit_phi)
7727           {
7728             stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7729             if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7730                     && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7731               {
7732                 if (dump_enabled_p ())
7733                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7734                                          "inner-loop induction only used outside "
7735                                          "of the outer vectorized loop.\n");
7736                 return false;
7737               }
7738           }
7739 
7740       nested_in_vect_loop = true;
7741       iv_loop = loop->inner;
7742     }
7743   else
7744     iv_loop = loop;
7745   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7746 
7747   if (slp_node && !nunits.is_constant ())
7748     {
7749       /* The current SLP code creates the initial value element-by-element.  */
7750       if (dump_enabled_p ())
7751           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7752                                "SLP induction not supported for variable-length"
7753                                " vectors.\n");
7754       return false;
7755     }
7756 
7757   if (!vec_stmt) /* transformation not required.  */
7758     {
7759       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7760       if (dump_enabled_p ())
7761         dump_printf_loc (MSG_NOTE, vect_location,
7762                          "=== vectorizable_induction ===\n");
7763       vect_model_induction_cost (stmt_info, ncopies);
7764       return true;
7765     }
7766 
7767   /* Transform.  */
7768 
7769   /* Compute a vector variable, initialized with the first VF values of
7770      the induction variable.  E.g., for an iv with IV_PHI='X' and
7771      evolution S, for a vector of 4 units, we want to compute:
7772      [X, X + S, X + 2*S, X + 3*S].  */
7773 
7774   if (dump_enabled_p ())
7775     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7776 
7777   latch_e = loop_latch_edge (iv_loop);
7778   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7779 
7780   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7781   gcc_assert (step_expr != NULL_TREE);
7782 
7783   pe = loop_preheader_edge (iv_loop);
7784   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7785                                              loop_preheader_edge (iv_loop));
7786 
7787   stmts = NULL;
7788   if (!nested_in_vect_loop)
7789     {
7790       /* Convert the initial value to the desired type.  */
7791       tree new_type = TREE_TYPE (vectype);
7792       init_expr = gimple_convert (&stmts, new_type, init_expr);
7793 
7794       /* If we are using the loop mask to "peel" for alignment then we need
7795            to adjust the start value here.  */
7796       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7797       if (skip_niters != NULL_TREE)
7798           {
7799             if (FLOAT_TYPE_P (vectype))
7800               skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7801                                                   skip_niters);
7802             else
7803               skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7804             tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7805                                                    skip_niters, step_expr);
7806             init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7807                                             init_expr, skip_step);
7808           }
7809     }
7810 
7811   /* Convert the step to the desired type.  */
7812   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7813 
7814   if (stmts)
7815     {
7816       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7817       gcc_assert (!new_bb);
7818     }
7819 
7820   /* Find the first insertion point in the BB.  */
7821   si = gsi_after_labels (bb);
7822 
7823   /* For SLP induction we have to generate several IVs as for example
7824      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7825      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7826      [VF*S, VF*S, VF*S, VF*S] for all.  */
7827   if (slp_node)
7828     {
7829       /* Enforced above.  */
7830       unsigned int const_nunits = nunits.to_constant ();
7831 
7832       /* Generate [VF*S, VF*S, ... ].  */
7833       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7834           {
7835             expr = build_int_cst (integer_type_node, vf);
7836             expr = fold_convert (TREE_TYPE (step_expr), expr);
7837           }
7838       else
7839           expr = build_int_cst (TREE_TYPE (step_expr), vf);
7840       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7841                                     expr, step_expr);
7842       if (! CONSTANT_CLASS_P (new_name))
7843           new_name = vect_init_vector (phi, new_name,
7844                                              TREE_TYPE (step_expr), NULL);
7845       new_vec = build_vector_from_val (vectype, new_name);
7846       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7847 
7848       /* Now generate the IVs.  */
7849       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7850       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7851       unsigned elts = const_nunits * nvects;
7852       unsigned nivs = least_common_multiple (group_size,
7853                                                        const_nunits) / const_nunits;
7854       gcc_assert (elts % group_size == 0);
7855       tree elt = init_expr;
7856       unsigned ivn;
7857       for (ivn = 0; ivn < nivs; ++ivn)
7858           {
7859             tree_vector_builder elts (vectype, const_nunits, 1);
7860             stmts = NULL;
7861             for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7862               {
7863                 if (ivn*const_nunits + eltn >= group_size
7864                       && (ivn * const_nunits + eltn) % group_size == 0)
7865                     elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7866                                             elt, step_expr);
7867                 elts.quick_push (elt);
7868               }
7869             vec_init = gimple_build_vector (&stmts, &elts);
7870             if (stmts)
7871               {
7872                 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7873                 gcc_assert (!new_bb);
7874               }
7875 
7876             /* Create the induction-phi that defines the induction-operand.  */
7877             vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7878             induction_phi = create_phi_node (vec_dest, iv_loop->header);
7879             set_vinfo_for_stmt (induction_phi,
7880                                     new_stmt_vec_info (induction_phi, loop_vinfo));
7881             induc_def = PHI_RESULT (induction_phi);
7882 
7883             /* Create the iv update inside the loop  */
7884             vec_def = make_ssa_name (vec_dest);
7885             new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7886             gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7887             set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7888 
7889             /* Set the arguments of the phi node:  */
7890             add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7891             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7892                            UNKNOWN_LOCATION);
7893 
7894             SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7895           }
7896 
7897       /* Re-use IVs when we can.  */
7898       if (ivn < nvects)
7899           {
7900             unsigned vfp
7901               = least_common_multiple (group_size, const_nunits) / group_size;
7902             /* Generate [VF'*S, VF'*S, ... ].  */
7903             if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7904               {
7905                 expr = build_int_cst (integer_type_node, vfp);
7906                 expr = fold_convert (TREE_TYPE (step_expr), expr);
7907               }
7908             else
7909               expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7910             new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7911                                           expr, step_expr);
7912             if (! CONSTANT_CLASS_P (new_name))
7913               new_name = vect_init_vector (phi, new_name,
7914                                                    TREE_TYPE (step_expr), NULL);
7915             new_vec = build_vector_from_val (vectype, new_name);
7916             vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7917             for (; ivn < nvects; ++ivn)
7918               {
7919                 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7920                 tree def;
7921                 if (gimple_code (iv) == GIMPLE_PHI)
7922                     def = gimple_phi_result (iv);
7923                 else
7924                     def = gimple_assign_lhs (iv);
7925                 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7926                                                         PLUS_EXPR,
7927                                                         def, vec_step);
7928                 if (gimple_code (iv) == GIMPLE_PHI)
7929                     gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7930                 else
7931                     {
7932                       gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7933                       gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7934                     }
7935                 set_vinfo_for_stmt (new_stmt,
7936                                           new_stmt_vec_info (new_stmt, loop_vinfo));
7937                 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7938               }
7939           }
7940 
7941       return true;
7942     }
7943 
7944   /* Create the vector that holds the initial_value of the induction.  */
7945   if (nested_in_vect_loop)
7946     {
7947       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7948            been created during vectorization of previous stmts.  We obtain it
7949            from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7950       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7951       /* If the initial value is not of proper type, convert it.  */
7952       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7953           {
7954             new_stmt
7955               = gimple_build_assign (vect_get_new_ssa_name (vectype,
7956                                                                         vect_simple_var,
7957                                                                         "vec_iv_"),
7958                                            VIEW_CONVERT_EXPR,
7959                                            build1 (VIEW_CONVERT_EXPR, vectype,
7960                                                      vec_init));
7961             vec_init = gimple_assign_lhs (new_stmt);
7962             new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7963                                                              new_stmt);
7964             gcc_assert (!new_bb);
7965             set_vinfo_for_stmt (new_stmt,
7966                                     new_stmt_vec_info (new_stmt, loop_vinfo));
7967           }
7968     }
7969   else
7970     {
7971       /* iv_loop is the loop to be vectorized. Create:
7972            vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7973       stmts = NULL;
7974       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7975 
7976       unsigned HOST_WIDE_INT const_nunits;
7977       if (nunits.is_constant (&const_nunits))
7978           {
7979             tree_vector_builder elts (vectype, const_nunits, 1);
7980             elts.quick_push (new_name);
7981             for (i = 1; i < const_nunits; i++)
7982               {
7983                 /* Create: new_name_i = new_name + step_expr  */
7984                 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7985                                                new_name, step_expr);
7986                 elts.quick_push (new_name);
7987               }
7988             /* Create a vector from [new_name_0, new_name_1, ...,
7989                new_name_nunits-1]  */
7990             vec_init = gimple_build_vector (&stmts, &elts);
7991           }
7992       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7993           /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7994           vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7995                                          new_name, step_expr);
7996       else
7997           {
7998             /* Build:
7999                   [base, base, base, ...]
8000                     + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
8001             gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8002             gcc_assert (flag_associative_math);
8003             tree index = build_index_vector (vectype, 0, 1);
8004             tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
8005                                                                       new_name);
8006             tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
8007                                                                       step_expr);
8008             vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
8009             vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8010                                            vec_init, step_vec);
8011             vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8012                                            vec_init, base_vec);
8013           }
8014 
8015       if (stmts)
8016           {
8017             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8018             gcc_assert (!new_bb);
8019           }
8020     }
8021 
8022 
8023   /* Create the vector that holds the step of the induction.  */
8024   if (nested_in_vect_loop)
8025     /* iv_loop is nested in the loop to be vectorized. Generate:
8026        vec_step = [S, S, S, S]  */
8027     new_name = step_expr;
8028   else
8029     {
8030       /* iv_loop is the loop to be vectorized. Generate:
8031             vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8032       gimple_seq seq = NULL;
8033       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8034           {
8035             expr = build_int_cst (integer_type_node, vf);
8036             expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8037           }
8038       else
8039           expr = build_int_cst (TREE_TYPE (step_expr), vf);
8040       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8041                                      expr, step_expr);
8042       if (seq)
8043           {
8044             new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8045             gcc_assert (!new_bb);
8046           }
8047     }
8048 
8049   t = unshare_expr (new_name);
8050   gcc_assert (CONSTANT_CLASS_P (new_name)
8051                 || TREE_CODE (new_name) == SSA_NAME);
8052   new_vec = build_vector_from_val (vectype, t);
8053   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8054 
8055 
8056   /* Create the following def-use cycle:
8057      loop prolog:
8058          vec_init = ...
8059            vec_step = ...
8060      loop:
8061          vec_iv = PHI <vec_init, vec_loop>
8062          ...
8063          STMT
8064          ...
8065          vec_loop = vec_iv + vec_step;  */
8066 
8067   /* Create the induction-phi that defines the induction-operand.  */
8068   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8069   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8070   set_vinfo_for_stmt (induction_phi,
8071                           new_stmt_vec_info (induction_phi, loop_vinfo));
8072   induc_def = PHI_RESULT (induction_phi);
8073 
8074   /* Create the iv update inside the loop  */
8075   vec_def = make_ssa_name (vec_dest);
8076   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8077   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8078   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8079 
8080   /* Set the arguments of the phi node:  */
8081   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8082   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8083                  UNKNOWN_LOCATION);
8084 
8085   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8086 
8087   /* In case that vectorization factor (VF) is bigger than the number
8088      of elements that we can fit in a vectype (nunits), we have to generate
8089      more than one vector stmt - i.e - we need to "unroll" the
8090      vector stmt by a factor VF/nunits.  For more details see documentation
8091      in vectorizable_operation.  */
8092 
8093   if (ncopies > 1)
8094     {
8095       gimple_seq seq = NULL;
8096       stmt_vec_info prev_stmt_vinfo;
8097       /* FORNOW. This restriction should be relaxed.  */
8098       gcc_assert (!nested_in_vect_loop);
8099 
8100       /* Create the vector that holds the step of the induction.  */
8101       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8102           {
8103             expr = build_int_cst (integer_type_node, nunits);
8104             expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8105           }
8106       else
8107           expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8108       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8109                                      expr, step_expr);
8110       if (seq)
8111           {
8112             new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8113             gcc_assert (!new_bb);
8114           }
8115 
8116       t = unshare_expr (new_name);
8117       gcc_assert (CONSTANT_CLASS_P (new_name)
8118                       || TREE_CODE (new_name) == SSA_NAME);
8119       new_vec = build_vector_from_val (vectype, t);
8120       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8121 
8122       vec_def = induc_def;
8123       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8124       for (i = 1; i < ncopies; i++)
8125           {
8126             /* vec_i = vec_prev + vec_step  */
8127             new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8128                                                     vec_def, vec_step);
8129             vec_def = make_ssa_name (vec_dest, new_stmt);
8130             gimple_assign_set_lhs (new_stmt, vec_def);
8131 
8132             gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8133             set_vinfo_for_stmt (new_stmt,
8134                                     new_stmt_vec_info (new_stmt, loop_vinfo));
8135             STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8136             prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8137           }
8138     }
8139 
8140   if (nested_in_vect_loop)
8141     {
8142       /* Find the loop-closed exit-phi of the induction, and record
8143          the final vector of induction results:  */
8144       exit_phi = NULL;
8145       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8146         {
8147             gimple *use_stmt = USE_STMT (use_p);
8148             if (is_gimple_debug (use_stmt))
8149               continue;
8150 
8151             if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8152               {
8153                 exit_phi = use_stmt;
8154                 break;
8155               }
8156         }
8157       if (exit_phi)
8158           {
8159             stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8160             /* FORNOW. Currently not supporting the case that an inner-loop induction
8161                is not used in the outer-loop (i.e. only outside the outer-loop).  */
8162             gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8163                           && !STMT_VINFO_LIVE_P (stmt_vinfo));
8164 
8165             STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8166             if (dump_enabled_p ())
8167               {
8168                 dump_printf_loc (MSG_NOTE, vect_location,
8169                                      "vector of inductions after inner-loop:");
8170                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8171               }
8172           }
8173     }
8174 
8175 
8176   if (dump_enabled_p ())
8177     {
8178       dump_printf_loc (MSG_NOTE, vect_location,
8179                            "transform induction: created def-use cycle: ");
8180       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8181       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8182                               SSA_NAME_DEF_STMT (vec_def), 0);
8183     }
8184 
8185   return true;
8186 }
8187 
8188 /* Function vectorizable_live_operation.
8189 
8190    STMT computes a value that is used outside the loop.  Check if
8191    it can be supported.  */
8192 
8193 bool
vectorizable_live_operation(gimple * stmt,gimple_stmt_iterator * gsi ATTRIBUTE_UNUSED,slp_tree slp_node,int slp_index,gimple ** vec_stmt)8194 vectorizable_live_operation (gimple *stmt,
8195                                    gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8196                                    slp_tree slp_node, int slp_index,
8197                                    gimple **vec_stmt)
8198 {
8199   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8200   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8201   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8202   imm_use_iterator imm_iter;
8203   tree lhs, lhs_type, bitsize, vec_bitsize;
8204   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8205   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8206   int ncopies;
8207   gimple *use_stmt;
8208   auto_vec<tree> vec_oprnds;
8209   int vec_entry = 0;
8210   poly_uint64 vec_index = 0;
8211 
8212   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8213 
8214   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8215     return false;
8216 
8217   /* FORNOW.  CHECKME.  */
8218   if (nested_in_vect_loop_p (loop, stmt))
8219     return false;
8220 
8221   /* If STMT is not relevant and it is a simple assignment and its inputs are
8222      invariant then it can remain in place, unvectorized.  The original last
8223      scalar value that it computes will be used.  */
8224   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8225     {
8226       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8227       if (dump_enabled_p ())
8228           dump_printf_loc (MSG_NOTE, vect_location,
8229                                "statement is simple and uses invariant.  Leaving in "
8230                                "place.\n");
8231       return true;
8232     }
8233 
8234   if (slp_node)
8235     ncopies = 1;
8236   else
8237     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8238 
8239   if (slp_node)
8240     {
8241       gcc_assert (slp_index >= 0);
8242 
8243       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8244       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8245 
8246       /* Get the last occurrence of the scalar index from the concatenation of
8247            all the slp vectors. Calculate which slp vector it is and the index
8248            within.  */
8249       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8250 
8251       /* Calculate which vector contains the result, and which lane of
8252            that vector we need.  */
8253       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8254           {
8255             if (dump_enabled_p ())
8256               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8257                                    "Cannot determine which vector holds the"
8258                                    " final result.\n");
8259             return false;
8260           }
8261     }
8262 
8263   if (!vec_stmt)
8264     {
8265       /* No transformation required.  */
8266       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8267           {
8268             if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8269                                                          OPTIMIZE_FOR_SPEED))
8270               {
8271                 if (dump_enabled_p ())
8272                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273                                          "can't use a fully-masked loop because "
8274                                          "the target doesn't support extract last "
8275                                          "reduction.\n");
8276                 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8277               }
8278             else if (slp_node)
8279               {
8280                 if (dump_enabled_p ())
8281                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8282                                          "can't use a fully-masked loop because an "
8283                                          "SLP statement is live after the loop.\n");
8284                 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8285               }
8286             else if (ncopies > 1)
8287               {
8288                 if (dump_enabled_p ())
8289                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8290                                          "can't use a fully-masked loop because"
8291                                          " ncopies is greater than 1.\n");
8292                 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8293               }
8294             else
8295               {
8296                 gcc_assert (ncopies == 1 && !slp_node);
8297                 vect_record_loop_mask (loop_vinfo,
8298                                              &LOOP_VINFO_MASKS (loop_vinfo),
8299                                              1, vectype);
8300               }
8301           }
8302       return true;
8303     }
8304 
8305   /* If stmt has a related stmt, then use that for getting the lhs.  */
8306   if (is_pattern_stmt_p (stmt_info))
8307     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8308 
8309   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8310           : gimple_get_lhs (stmt);
8311   lhs_type = TREE_TYPE (lhs);
8312 
8313   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8314                ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8315                : TYPE_SIZE (TREE_TYPE (vectype)));
8316   vec_bitsize = TYPE_SIZE (vectype);
8317 
8318   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8319   tree vec_lhs, bitstart;
8320   if (slp_node)
8321     {
8322       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8323 
8324       /* Get the correct slp vectorized stmt.  */
8325       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8326       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8327           vec_lhs = gimple_phi_result (phi);
8328       else
8329           vec_lhs = gimple_get_lhs (vec_stmt);
8330 
8331       /* Get entry to use.  */
8332       bitstart = bitsize_int (vec_index);
8333       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8334     }
8335   else
8336     {
8337       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8338       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8339       gcc_checking_assert (ncopies == 1
8340                                  || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8341 
8342       /* For multiple copies, get the last copy.  */
8343       for (int i = 1; i < ncopies; ++i)
8344           vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8345                                                               vec_lhs);
8346 
8347       /* Get the last lane in the vector.  */
8348       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8349     }
8350 
8351   gimple_seq stmts = NULL;
8352   tree new_tree;
8353   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8354     {
8355       /* Emit:
8356 
8357              SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8358 
8359            where VEC_LHS is the vectorized live-out result and MASK is
8360            the loop mask for the final iteration.  */
8361       gcc_assert (ncopies == 1 && !slp_node);
8362       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8363       tree scalar_res = make_ssa_name (scalar_type);
8364       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8365                                               1, vectype, 0);
8366       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8367                                                                 2, mask, vec_lhs);
8368       gimple_call_set_lhs (new_stmt, scalar_res);
8369       gimple_seq_add_stmt (&stmts, new_stmt);
8370 
8371       /* Convert the extracted vector element to the required scalar type.  */
8372       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8373     }
8374   else
8375     {
8376       tree bftype = TREE_TYPE (vectype);
8377       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8378           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8379       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8380       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8381                                                &stmts, true, NULL_TREE);
8382     }
8383 
8384   if (stmts)
8385     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8386 
8387   /* Replace use of lhs with newly computed result.  If the use stmt is a
8388      single arg PHI, just replace all uses of PHI result.  It's necessary
8389      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8390   use_operand_p use_p;
8391   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8392     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8393           && !is_gimple_debug (use_stmt))
8394     {
8395       if (gimple_code (use_stmt) == GIMPLE_PHI
8396             && gimple_phi_num_args (use_stmt) == 1)
8397           {
8398             replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8399           }
8400       else
8401           {
8402             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8403               SET_USE (use_p, new_tree);
8404           }
8405       update_stmt (use_stmt);
8406     }
8407 
8408   return true;
8409 }
8410 
8411 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8412 
8413 static void
vect_loop_kill_debug_uses(struct loop * loop,gimple * stmt)8414 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8415 {
8416   ssa_op_iter op_iter;
8417   imm_use_iterator imm_iter;
8418   def_operand_p def_p;
8419   gimple *ustmt;
8420 
8421   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8422     {
8423       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8424           {
8425             basic_block bb;
8426 
8427             if (!is_gimple_debug (ustmt))
8428               continue;
8429 
8430             bb = gimple_bb (ustmt);
8431 
8432             if (!flow_bb_inside_loop_p (loop, bb))
8433               {
8434                 if (gimple_debug_bind_p (ustmt))
8435                     {
8436                       if (dump_enabled_p ())
8437                         dump_printf_loc (MSG_NOTE, vect_location,
8438                                      "killing debug use\n");
8439 
8440                       gimple_debug_bind_reset_value (ustmt);
8441                       update_stmt (ustmt);
8442                     }
8443                 else
8444                     gcc_unreachable ();
8445               }
8446           }
8447     }
8448 }
8449 
8450 /* Given loop represented by LOOP_VINFO, return true if computation of
8451    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8452    otherwise.  */
8453 
8454 static bool
loop_niters_no_overflow(loop_vec_info loop_vinfo)8455 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8456 {
8457   /* Constant case.  */
8458   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8459     {
8460       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8461       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8462 
8463       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8464       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8465       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8466           return true;
8467     }
8468 
8469   widest_int max;
8470   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8471   /* Check the upper bound of loop niters.  */
8472   if (get_max_loop_iterations (loop, &max))
8473     {
8474       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8475       signop sgn = TYPE_SIGN (type);
8476       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8477       if (max < type_max)
8478           return true;
8479     }
8480   return false;
8481 }
8482 
8483 /* Return a mask type with half the number of elements as TYPE.  */
8484 
8485 tree
vect_halve_mask_nunits(tree type)8486 vect_halve_mask_nunits (tree type)
8487 {
8488   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8489   return build_truth_vector_type (nunits, current_vector_size);
8490 }
8491 
8492 /* Return a mask type with twice as many elements as TYPE.  */
8493 
8494 tree
vect_double_mask_nunits(tree type)8495 vect_double_mask_nunits (tree type)
8496 {
8497   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8498   return build_truth_vector_type (nunits, current_vector_size);
8499 }
8500 
8501 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8502    contain a sequence of NVECTORS masks that each control a vector of type
8503    VECTYPE.  */
8504 
8505 void
vect_record_loop_mask(loop_vec_info loop_vinfo,vec_loop_masks * masks,unsigned int nvectors,tree vectype)8506 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8507                            unsigned int nvectors, tree vectype)
8508 {
8509   gcc_assert (nvectors != 0);
8510   if (masks->length () < nvectors)
8511     masks->safe_grow_cleared (nvectors);
8512   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8513   /* The number of scalars per iteration and the number of vectors are
8514      both compile-time constants.  */
8515   unsigned int nscalars_per_iter
8516     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8517                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8518   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8519     {
8520       rgm->max_nscalars_per_iter = nscalars_per_iter;
8521       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8522     }
8523 }
8524 
8525 /* Given a complete set of masks MASKS, extract mask number INDEX
8526    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8527    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8528 
8529    See the comment above vec_loop_masks for more details about the mask
8530    arrangement.  */
8531 
8532 tree
vect_get_loop_mask(gimple_stmt_iterator * gsi,vec_loop_masks * masks,unsigned int nvectors,tree vectype,unsigned int index)8533 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8534                         unsigned int nvectors, tree vectype, unsigned int index)
8535 {
8536   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8537   tree mask_type = rgm->mask_type;
8538 
8539   /* Populate the rgroup's mask array, if this is the first time we've
8540      used it.  */
8541   if (rgm->masks.is_empty ())
8542     {
8543       rgm->masks.safe_grow_cleared (nvectors);
8544       for (unsigned int i = 0; i < nvectors; ++i)
8545           {
8546             tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8547             /* Provide a dummy definition until the real one is available.  */
8548             SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8549             rgm->masks[i] = mask;
8550           }
8551     }
8552 
8553   tree mask = rgm->masks[index];
8554   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8555                     TYPE_VECTOR_SUBPARTS (vectype)))
8556     {
8557       /* A loop mask for data type X can be reused for data type Y
8558            if X has N times more elements than Y and if Y's elements
8559            are N times bigger than X's.  In this case each sequence
8560            of N elements in the loop mask will be all-zero or all-one.
8561            We can then view-convert the mask so that each sequence of
8562            N elements is replaced by a single element.  */
8563       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8564                                     TYPE_VECTOR_SUBPARTS (vectype)));
8565       gimple_seq seq = NULL;
8566       mask_type = build_same_sized_truth_vector_type (vectype);
8567       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8568       if (seq)
8569           gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8570     }
8571   return mask;
8572 }
8573 
8574 /* Scale profiling counters by estimation for LOOP which is vectorized
8575    by factor VF.  */
8576 
8577 static void
scale_profile_for_vect_loop(struct loop * loop,unsigned vf)8578 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8579 {
8580   edge preheader = loop_preheader_edge (loop);
8581   /* Reduce loop iterations by the vectorization factor.  */
8582   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8583   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8584 
8585   if (freq_h.nonzero_p ())
8586     {
8587       profile_probability p;
8588 
8589       /* Avoid dropping loop body profile counter to 0 because of zero count
8590            in loop's preheader.  */
8591       if (!(freq_e == profile_count::zero ()))
8592         freq_e = freq_e.force_nonzero ();
8593       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8594       scale_loop_frequencies (loop, p);
8595     }
8596 
8597   edge exit_e = single_exit (loop);
8598   exit_e->probability = profile_probability::always ()
8599                                          .apply_scale (1, new_est_niter + 1);
8600 
8601   edge exit_l = single_pred_edge (loop->latch);
8602   profile_probability prob = exit_l->probability;
8603   exit_l->probability = exit_e->probability.invert ();
8604   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8605     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8606 }
8607 
8608 /* Function vect_transform_loop.
8609 
8610    The analysis phase has determined that the loop is vectorizable.
8611    Vectorize the loop - create vectorized stmts to replace the scalar
8612    stmts in the loop, and update the loop exit condition.
8613    Returns scalar epilogue loop if any.  */
8614 
8615 struct loop *
vect_transform_loop(loop_vec_info loop_vinfo)8616 vect_transform_loop (loop_vec_info loop_vinfo)
8617 {
8618   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8619   struct loop *epilogue = NULL;
8620   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8621   int nbbs = loop->num_nodes;
8622   int i;
8623   tree niters_vector = NULL_TREE;
8624   tree step_vector = NULL_TREE;
8625   tree niters_vector_mult_vf = NULL_TREE;
8626   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8627   unsigned int lowest_vf = constant_lower_bound (vf);
8628   bool grouped_store;
8629   bool slp_scheduled = false;
8630   gimple *stmt, *pattern_stmt;
8631   gimple_seq pattern_def_seq = NULL;
8632   gimple_stmt_iterator pattern_def_si = gsi_none ();
8633   bool transform_pattern_stmt = false;
8634   bool check_profitability = false;
8635   unsigned int th;
8636 
8637   if (dump_enabled_p ())
8638     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8639 
8640   /* Use the more conservative vectorization threshold.  If the number
8641      of iterations is constant assume the cost check has been performed
8642      by our caller.  If the threshold makes all loops profitable that
8643      run at least the (estimated) vectorization factor number of times
8644      checking is pointless, too.  */
8645   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8646   if (th >= vect_vf_for_cost (loop_vinfo)
8647       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8648     {
8649       if (dump_enabled_p ())
8650           dump_printf_loc (MSG_NOTE, vect_location,
8651                                "Profitability threshold is %d loop iterations.\n",
8652                          th);
8653       check_profitability = true;
8654     }
8655 
8656   /* Make sure there exists a single-predecessor exit bb.  Do this before
8657      versioning.   */
8658   edge e = single_exit (loop);
8659   if (! single_pred_p (e->dest))
8660     {
8661       split_loop_exit_edge (e);
8662       if (dump_enabled_p ())
8663           dump_printf (MSG_NOTE, "split exit edge\n");
8664     }
8665 
8666   /* Version the loop first, if required, so the profitability check
8667      comes first.  */
8668 
8669   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8670     {
8671       poly_uint64 versioning_threshold
8672           = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8673       if (check_profitability
8674             && ordered_p (poly_uint64 (th), versioning_threshold))
8675           {
8676             versioning_threshold = ordered_max (poly_uint64 (th),
8677                                                         versioning_threshold);
8678             check_profitability = false;
8679           }
8680       vect_loop_versioning (loop_vinfo, th, check_profitability,
8681                                   versioning_threshold);
8682       check_profitability = false;
8683     }
8684 
8685   /* Make sure there exists a single-predecessor exit bb also on the
8686      scalar loop copy.  Do this after versioning but before peeling
8687      so CFG structure is fine for both scalar and if-converted loop
8688      to make slpeel_duplicate_current_defs_from_edges face matched
8689      loop closed PHI nodes on the exit.  */
8690   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8691     {
8692       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8693       if (! single_pred_p (e->dest))
8694           {
8695             split_loop_exit_edge (e);
8696             if (dump_enabled_p ())
8697               dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8698           }
8699     }
8700 
8701   tree niters = vect_build_loop_niters (loop_vinfo);
8702   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8703   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8704   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8705   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8706                                     &step_vector, &niters_vector_mult_vf, th,
8707                                     check_profitability, niters_no_overflow);
8708 
8709   if (niters_vector == NULL_TREE)
8710     {
8711       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8712             && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8713             && known_eq (lowest_vf, vf))
8714           {
8715             niters_vector
8716               = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8717                                    LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8718             step_vector = build_one_cst (TREE_TYPE (niters));
8719           }
8720       else
8721           vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8722                                              &step_vector, niters_no_overflow);
8723     }
8724 
8725   /* 1) Make sure the loop header has exactly two entries
8726      2) Make sure we have a preheader basic block.  */
8727 
8728   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8729 
8730   split_edge (loop_preheader_edge (loop));
8731 
8732   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8733       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8734     /* This will deal with any possible peeling.  */
8735     vect_prepare_for_masked_peels (loop_vinfo);
8736 
8737   /* FORNOW: the vectorizer supports only loops which body consist
8738      of one basic block (header + empty latch). When the vectorizer will
8739      support more involved loop forms, the order by which the BBs are
8740      traversed need to be reconsidered.  */
8741 
8742   for (i = 0; i < nbbs; i++)
8743     {
8744       basic_block bb = bbs[i];
8745       stmt_vec_info stmt_info;
8746 
8747       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8748              gsi_next (&si))
8749         {
8750             gphi *phi = si.phi ();
8751             if (dump_enabled_p ())
8752               {
8753                 dump_printf_loc (MSG_NOTE, vect_location,
8754                                "------>vectorizing phi: ");
8755                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8756               }
8757             stmt_info = vinfo_for_stmt (phi);
8758             if (!stmt_info)
8759               continue;
8760 
8761             if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8762               vect_loop_kill_debug_uses (loop, phi);
8763 
8764             if (!STMT_VINFO_RELEVANT_P (stmt_info)
8765                 && !STMT_VINFO_LIVE_P (stmt_info))
8766               continue;
8767 
8768             if (STMT_VINFO_VECTYPE (stmt_info)
8769                 && (maybe_ne
8770                       (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8771                 && dump_enabled_p ())
8772               dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8773 
8774             if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8775                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8776                  || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8777                 && ! PURE_SLP_STMT (stmt_info))
8778               {
8779                 if (dump_enabled_p ())
8780                     dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8781                 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8782               }
8783           }
8784 
8785       pattern_stmt = NULL;
8786       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8787              !gsi_end_p (si) || transform_pattern_stmt;)
8788           {
8789             bool is_store;
8790 
8791           if (transform_pattern_stmt)
8792               stmt = pattern_stmt;
8793           else
8794               {
8795                 stmt = gsi_stmt (si);
8796                 /* During vectorization remove existing clobber stmts.  */
8797                 if (gimple_clobber_p (stmt))
8798                     {
8799                       unlink_stmt_vdef (stmt);
8800                       gsi_remove (&si, true);
8801                       release_defs (stmt);
8802                       continue;
8803                     }
8804               }
8805 
8806             if (dump_enabled_p ())
8807               {
8808                 dump_printf_loc (MSG_NOTE, vect_location,
8809                                      "------>vectorizing statement: ");
8810                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8811               }
8812 
8813             stmt_info = vinfo_for_stmt (stmt);
8814 
8815             /* vector stmts created in the outer-loop during vectorization of
8816                stmts in an inner-loop may not have a stmt_info, and do not
8817                need to be vectorized.  */
8818             if (!stmt_info)
8819               {
8820                 gsi_next (&si);
8821                 continue;
8822               }
8823 
8824             if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8825               vect_loop_kill_debug_uses (loop, stmt);
8826 
8827             if (!STMT_VINFO_RELEVANT_P (stmt_info)
8828                 && !STMT_VINFO_LIVE_P (stmt_info))
8829             {
8830               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8831                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8832                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8833                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8834                 {
8835                   stmt = pattern_stmt;
8836                   stmt_info = vinfo_for_stmt (stmt);
8837                 }
8838               else
8839                   {
8840                     gsi_next (&si);
8841                     continue;
8842                 }
8843               }
8844           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8845                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8846                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8847                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8848             transform_pattern_stmt = true;
8849 
8850             /* If pattern statement has def stmts, vectorize them too.  */
8851             if (is_pattern_stmt_p (stmt_info))
8852               {
8853                 if (pattern_def_seq == NULL)
8854                     {
8855                       pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8856                       pattern_def_si = gsi_start (pattern_def_seq);
8857                     }
8858                 else if (!gsi_end_p (pattern_def_si))
8859                     gsi_next (&pattern_def_si);
8860                 if (pattern_def_seq != NULL)
8861                     {
8862                       gimple *pattern_def_stmt = NULL;
8863                       stmt_vec_info pattern_def_stmt_info = NULL;
8864 
8865                       while (!gsi_end_p (pattern_def_si))
8866                         {
8867                           pattern_def_stmt = gsi_stmt (pattern_def_si);
8868                           pattern_def_stmt_info
8869                               = vinfo_for_stmt (pattern_def_stmt);
8870                           if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8871                                 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8872                               break;
8873                           gsi_next (&pattern_def_si);
8874                         }
8875 
8876                       if (!gsi_end_p (pattern_def_si))
8877                         {
8878                           if (dump_enabled_p ())
8879                               {
8880                                 dump_printf_loc (MSG_NOTE, vect_location,
8881                                                      "==> vectorizing pattern def "
8882                                                      "stmt: ");
8883                                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8884                                                       pattern_def_stmt, 0);
8885                               }
8886 
8887                           stmt = pattern_def_stmt;
8888                           stmt_info = pattern_def_stmt_info;
8889                         }
8890                       else
8891                         {
8892                           pattern_def_si = gsi_none ();
8893                           transform_pattern_stmt = false;
8894                         }
8895                     }
8896                 else
8897                     transform_pattern_stmt = false;
8898             }
8899 
8900             if (STMT_VINFO_VECTYPE (stmt_info))
8901               {
8902                 poly_uint64 nunits
8903                     = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8904                 if (!STMT_SLP_TYPE (stmt_info)
8905                       && maybe_ne (nunits, vf)
8906                       && dump_enabled_p ())
8907                       /* For SLP VF is set according to unrolling factor, and not
8908                          to vector size, hence for SLP this print is not valid.  */
8909                     dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8910               }
8911 
8912             /* SLP. Schedule all the SLP instances when the first SLP stmt is
8913                reached.  */
8914             if (STMT_SLP_TYPE (stmt_info))
8915               {
8916                 if (!slp_scheduled)
8917                     {
8918                       slp_scheduled = true;
8919 
8920                       if (dump_enabled_p ())
8921                         dump_printf_loc (MSG_NOTE, vect_location,
8922                                              "=== scheduling SLP instances ===\n");
8923 
8924                       vect_schedule_slp (loop_vinfo);
8925                     }
8926 
8927                 /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8928                 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8929                     {
8930                       if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8931                         {
8932                           pattern_def_seq = NULL;
8933                           gsi_next (&si);
8934                         }
8935                       continue;
8936                     }
8937               }
8938 
8939             /* -------- vectorize statement ------------ */
8940             if (dump_enabled_p ())
8941               dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8942 
8943             grouped_store = false;
8944             is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8945           if (is_store)
8946             {
8947                 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8948                     {
8949                       /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8950                          interleaving chain was completed - free all the stores in
8951                          the chain.  */
8952                       gsi_next (&si);
8953                       vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8954                     }
8955                 else
8956                     {
8957                       /* Free the attached stmt_vec_info and remove the stmt.  */
8958                       gimple *store = gsi_stmt (si);
8959                       free_stmt_vec_info (store);
8960                       unlink_stmt_vdef (store);
8961                       gsi_remove (&si, true);
8962                       release_defs (store);
8963                     }
8964 
8965                 /* Stores can only appear at the end of pattern statements.  */
8966                 gcc_assert (!transform_pattern_stmt);
8967                 pattern_def_seq = NULL;
8968               }
8969             else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8970               {
8971                 pattern_def_seq = NULL;
8972                 gsi_next (&si);
8973               }
8974           }                           /* stmts in BB */
8975 
8976       /* Stub out scalar statements that must not survive vectorization.
8977            Doing this here helps with grouped statements, or statements that
8978            are involved in patterns.  */
8979       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8980              !gsi_end_p (gsi); gsi_next (&gsi))
8981           {
8982             gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8983             if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8984               {
8985                 tree lhs = gimple_get_lhs (call);
8986                 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8987                     {
8988                       tree zero = build_zero_cst (TREE_TYPE (lhs));
8989                       gimple *new_stmt = gimple_build_assign (lhs, zero);
8990                       gsi_replace (&gsi, new_stmt, true);
8991                     }
8992               }
8993           }
8994     }                                   /* BBs in loop */
8995 
8996   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8997      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8998   if (integer_onep (step_vector))
8999     niters_no_overflow = true;
9000   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9001                                  niters_vector_mult_vf, !niters_no_overflow);
9002 
9003   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9004   scale_profile_for_vect_loop (loop, assumed_vf);
9005 
9006   /* True if the final iteration might not handle a full vector's
9007      worth of scalar iterations.  */
9008   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
9009   /* The minimum number of iterations performed by the epilogue.  This
9010      is 1 when peeling for gaps because we always need a final scalar
9011      iteration.  */
9012   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9013   /* +1 to convert latch counts to loop iteration counts,
9014      -min_epilogue_iters to remove iterations that cannot be performed
9015        by the vector code.  */
9016   int bias_for_lowest = 1 - min_epilogue_iters;
9017   int bias_for_assumed = bias_for_lowest;
9018   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9019   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9020     {
9021       /* When the amount of peeling is known at compile time, the first
9022            iteration will have exactly alignment_npeels active elements.
9023            In the worst case it will have at least one.  */
9024       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9025       bias_for_lowest += lowest_vf - min_first_active;
9026       bias_for_assumed += assumed_vf - min_first_active;
9027     }
9028   /* In these calculations the "- 1" converts loop iteration counts
9029      back to latch counts.  */
9030   if (loop->any_upper_bound)
9031     loop->nb_iterations_upper_bound
9032       = (final_iter_may_be_partial
9033            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9034                                 lowest_vf) - 1
9035            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9036                                  lowest_vf) - 1);
9037   if (loop->any_likely_upper_bound)
9038     loop->nb_iterations_likely_upper_bound
9039       = (final_iter_may_be_partial
9040            ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9041                                 + bias_for_lowest, lowest_vf) - 1
9042            : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9043                                  + bias_for_lowest, lowest_vf) - 1);
9044   if (loop->any_estimate)
9045     loop->nb_iterations_estimate
9046       = (final_iter_may_be_partial
9047            ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9048                                 assumed_vf) - 1
9049            : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9050                                  assumed_vf) - 1);
9051 
9052   if (dump_enabled_p ())
9053     {
9054       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9055           {
9056             dump_printf_loc (MSG_NOTE, vect_location,
9057                                  "LOOP VECTORIZED\n");
9058             if (loop->inner)
9059               dump_printf_loc (MSG_NOTE, vect_location,
9060                                    "OUTER LOOP VECTORIZED\n");
9061             dump_printf (MSG_NOTE, "\n");
9062           }
9063       else
9064           {
9065             dump_printf_loc (MSG_NOTE, vect_location,
9066                                  "LOOP EPILOGUE VECTORIZED (VS=");
9067             dump_dec (MSG_NOTE, current_vector_size);
9068             dump_printf (MSG_NOTE, ")\n");
9069           }
9070     }
9071 
9072   /* Free SLP instances here because otherwise stmt reference counting
9073      won't work.  */
9074   slp_instance instance;
9075   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9076     vect_free_slp_instance (instance);
9077   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9078   /* Clear-up safelen field since its value is invalid after vectorization
9079      since vectorized loop can have loop-carried dependencies.  */
9080   loop->safelen = 0;
9081 
9082   /* Don't vectorize epilogue for epilogue.  */
9083   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9084     epilogue = NULL;
9085 
9086   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9087     epilogue = NULL;
9088 
9089   if (epilogue)
9090     {
9091       auto_vector_sizes vector_sizes;
9092       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9093       unsigned int next_size = 0;
9094 
9095       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9096             && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9097             && known_eq (vf, lowest_vf))
9098           {
9099             unsigned int eiters
9100               = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9101                  - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9102             eiters = eiters % lowest_vf;
9103             epilogue->nb_iterations_upper_bound = eiters - 1;
9104 
9105             unsigned int ratio;
9106             while (next_size < vector_sizes.length ()
9107                      && !(constant_multiple_p (current_vector_size,
9108                                                      vector_sizes[next_size], &ratio)
9109                           && eiters >= lowest_vf / ratio))
9110               next_size += 1;
9111           }
9112       else
9113           while (next_size < vector_sizes.length ()
9114                  && maybe_lt (current_vector_size, vector_sizes[next_size]))
9115             next_size += 1;
9116 
9117       if (next_size == vector_sizes.length ())
9118           epilogue = NULL;
9119     }
9120 
9121   if (epilogue)
9122     {
9123       epilogue->force_vectorize = loop->force_vectorize;
9124       epilogue->safelen = loop->safelen;
9125       epilogue->dont_vectorize = false;
9126 
9127       /* We may need to if-convert epilogue to vectorize it.  */
9128       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9129           tree_if_conversion (epilogue);
9130     }
9131 
9132   return epilogue;
9133 }
9134 
9135 /* The code below is trying to perform simple optimization - revert
9136    if-conversion for masked stores, i.e. if the mask of a store is zero
9137    do not perform it and all stored value producers also if possible.
9138    For example,
9139      for (i=0; i<n; i++)
9140        if (c[i])
9141           {
9142             p1[i] += 1;
9143             p2[i] = p3[i] +2;
9144           }
9145    this transformation will produce the following semi-hammock:
9146 
9147    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9148      {
9149        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9150        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9151        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9152        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9153        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9154        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9155      }
9156 */
9157 
9158 void
optimize_mask_stores(struct loop * loop)9159 optimize_mask_stores (struct loop *loop)
9160 {
9161   basic_block *bbs = get_loop_body (loop);
9162   unsigned nbbs = loop->num_nodes;
9163   unsigned i;
9164   basic_block bb;
9165   struct loop *bb_loop;
9166   gimple_stmt_iterator gsi;
9167   gimple *stmt;
9168   auto_vec<gimple *> worklist;
9169 
9170   vect_location = find_loop_location (loop);
9171   /* Pick up all masked stores in loop if any.  */
9172   for (i = 0; i < nbbs; i++)
9173     {
9174       bb = bbs[i];
9175       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9176              gsi_next (&gsi))
9177           {
9178             stmt = gsi_stmt (gsi);
9179             if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9180               worklist.safe_push (stmt);
9181           }
9182     }
9183 
9184   free (bbs);
9185   if (worklist.is_empty ())
9186     return;
9187 
9188   /* Loop has masked stores.  */
9189   while (!worklist.is_empty ())
9190     {
9191       gimple *last, *last_store;
9192       edge e, efalse;
9193       tree mask;
9194       basic_block store_bb, join_bb;
9195       gimple_stmt_iterator gsi_to;
9196       tree vdef, new_vdef;
9197       gphi *phi;
9198       tree vectype;
9199       tree zero;
9200 
9201       last = worklist.pop ();
9202       mask = gimple_call_arg (last, 2);
9203       bb = gimple_bb (last);
9204       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9205            the same loop as if_bb.  It could be different to LOOP when two
9206            level loop-nest is vectorized and mask_store belongs to the inner
9207            one.  */
9208       e = split_block (bb, last);
9209       bb_loop = bb->loop_father;
9210       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9211       join_bb = e->dest;
9212       store_bb = create_empty_bb (bb);
9213       add_bb_to_loop (store_bb, bb_loop);
9214       e->flags = EDGE_TRUE_VALUE;
9215       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9216       /* Put STORE_BB to likely part.  */
9217       efalse->probability = profile_probability::unlikely ();
9218       store_bb->count = efalse->count ();
9219       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9220       if (dom_info_available_p (CDI_DOMINATORS))
9221           set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9222       if (dump_enabled_p ())
9223           dump_printf_loc (MSG_NOTE, vect_location,
9224                                "Create new block %d to sink mask stores.",
9225                                store_bb->index);
9226       /* Create vector comparison with boolean result.  */
9227       vectype = TREE_TYPE (mask);
9228       zero = build_zero_cst (vectype);
9229       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9230       gsi = gsi_last_bb (bb);
9231       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9232       /* Create new PHI node for vdef of the last masked store:
9233            .MEM_2 = VDEF <.MEM_1>
9234            will be converted to
9235            .MEM.3 = VDEF <.MEM_1>
9236            and new PHI node will be created in join bb
9237            .MEM_2 = PHI <.MEM_1, .MEM_3>
9238       */
9239       vdef = gimple_vdef (last);
9240       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9241       gimple_set_vdef (last, new_vdef);
9242       phi = create_phi_node (vdef, join_bb);
9243       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9244 
9245       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9246       while (true)
9247           {
9248             gimple_stmt_iterator gsi_from;
9249             gimple *stmt1 = NULL;
9250 
9251             /* Move masked store to STORE_BB.  */
9252             last_store = last;
9253             gsi = gsi_for_stmt (last);
9254             gsi_from = gsi;
9255             /* Shift GSI to the previous stmt for further traversal.  */
9256             gsi_prev (&gsi);
9257             gsi_to = gsi_start_bb (store_bb);
9258             gsi_move_before (&gsi_from, &gsi_to);
9259             /* Setup GSI_TO to the non-empty block start.  */
9260             gsi_to = gsi_start_bb (store_bb);
9261             if (dump_enabled_p ())
9262               {
9263                 dump_printf_loc (MSG_NOTE, vect_location,
9264                                      "Move stmt to created bb\n");
9265                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9266               }
9267             /* Move all stored value producers if possible.  */
9268             while (!gsi_end_p (gsi))
9269               {
9270                 tree lhs;
9271                 imm_use_iterator imm_iter;
9272                 use_operand_p use_p;
9273                 bool res;
9274 
9275                 /* Skip debug statements.  */
9276                 if (is_gimple_debug (gsi_stmt (gsi)))
9277                     {
9278                       gsi_prev (&gsi);
9279                       continue;
9280                     }
9281                 stmt1 = gsi_stmt (gsi);
9282                 /* Do not consider statements writing to memory or having
9283                      volatile operand.  */
9284                 if (gimple_vdef (stmt1)
9285                       || gimple_has_volatile_ops (stmt1))
9286                     break;
9287                 gsi_from = gsi;
9288                 gsi_prev (&gsi);
9289                 lhs = gimple_get_lhs (stmt1);
9290                 if (!lhs)
9291                     break;
9292 
9293                 /* LHS of vectorized stmt must be SSA_NAME.  */
9294                 if (TREE_CODE (lhs) != SSA_NAME)
9295                     break;
9296 
9297                 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9298                     {
9299                       /* Remove dead scalar statement.  */
9300                       if (has_zero_uses (lhs))
9301                         {
9302                           gsi_remove (&gsi_from, true);
9303                           continue;
9304                         }
9305                     }
9306 
9307                 /* Check that LHS does not have uses outside of STORE_BB.  */
9308                 res = true;
9309                 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9310                     {
9311                       gimple *use_stmt;
9312                       use_stmt = USE_STMT (use_p);
9313                       if (is_gimple_debug (use_stmt))
9314                         continue;
9315                       if (gimple_bb (use_stmt) != store_bb)
9316                         {
9317                           res = false;
9318                           break;
9319                         }
9320                     }
9321                 if (!res)
9322                     break;
9323 
9324                 if (gimple_vuse (stmt1)
9325                       && gimple_vuse (stmt1) != gimple_vuse (last_store))
9326                     break;
9327 
9328                 /* Can move STMT1 to STORE_BB.  */
9329                 if (dump_enabled_p ())
9330                     {
9331                       dump_printf_loc (MSG_NOTE, vect_location,
9332                                            "Move stmt to created bb\n");
9333                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9334                     }
9335                 gsi_move_before (&gsi_from, &gsi_to);
9336                 /* Shift GSI_TO for further insertion.  */
9337                 gsi_prev (&gsi_to);
9338               }
9339             /* Put other masked stores with the same mask to STORE_BB.  */
9340             if (worklist.is_empty ()
9341                 || gimple_call_arg (worklist.last (), 2) != mask
9342                 || worklist.last () != stmt1)
9343               break;
9344             last = worklist.pop ();
9345           }
9346       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9347     }
9348 }
9349