1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h"            /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52 
/* Forward declarations of file-local helpers.  */
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

/* Pool from which every _slp_tree is allocated (see _slp_tree::operator
   new/delete).  Created by vect_slp_init, destroyed by vect_slp_fini.  */
static object_allocator<_slp_tree> *slp_tree_pool;
/* Head of the doubly-linked list of all live SLP nodes.  The _slp_tree
   constructor pushes onto it and the destructor unlinks from it.  */
static slp_tree slp_first_node;
59 
/* Create the allocation pool for SLP nodes.  _slp_tree::operator new
   allocates from this pool, so it has to exist before any node is
   created.  */

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}
65 
66 void
vect_slp_fini(void)67 vect_slp_fini (void)
68 {
69   while (slp_first_node)
70     delete slp_first_node;
71   delete slp_tree_pool;
72   slp_tree_pool = NULL;
73 }
74 
75 void *
operator new(size_t n)76 _slp_tree::operator new (size_t n)
77 {
78   gcc_assert (n == sizeof (_slp_tree));
79   return slp_tree_pool->allocate_raw ();
80 }
81 
82 void
operator delete(void * node,size_t n)83 _slp_tree::operator delete (void *node, size_t n)
84 {
85   gcc_assert (n == sizeof (_slp_tree));
86   slp_tree_pool->remove_raw (node);
87 }
88 
89 
90 /* Initialize a SLP node.  */
91 
_slp_tree()92 _slp_tree::_slp_tree ()
93 {
94   this->prev_node = NULL;
95   if (slp_first_node)
96     slp_first_node->prev_node = this;
97   this->next_node = slp_first_node;
98   slp_first_node = this;
99   SLP_TREE_SCALAR_STMTS (this) = vNULL;
100   SLP_TREE_SCALAR_OPS (this) = vNULL;
101   SLP_TREE_VEC_STMTS (this) = vNULL;
102   SLP_TREE_VEC_DEFS (this) = vNULL;
103   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104   SLP_TREE_CHILDREN (this) = vNULL;
105   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108   SLP_TREE_CODE (this) = ERROR_MARK;
109   SLP_TREE_VECTYPE (this) = NULL_TREE;
110   SLP_TREE_REPRESENTATIVE (this) = NULL;
111   SLP_TREE_REF_COUNT (this) = 1;
112   this->failed = NULL;
113   this->max_nunits = 1;
114   this->lanes = 0;
115 }
116 
117 /* Tear down a SLP node.  */
118 
~_slp_tree()119 _slp_tree::~_slp_tree ()
120 {
121   if (this->prev_node)
122     this->prev_node->next_node = this->next_node;
123   else
124     slp_first_node = this->next_node;
125   if (this->next_node)
126     this->next_node->prev_node = this->prev_node;
127   SLP_TREE_CHILDREN (this).release ();
128   SLP_TREE_SCALAR_STMTS (this).release ();
129   SLP_TREE_SCALAR_OPS (this).release ();
130   SLP_TREE_VEC_STMTS (this).release ();
131   SLP_TREE_VEC_DEFS (this).release ();
132   SLP_TREE_LOAD_PERMUTATION (this).release ();
133   SLP_TREE_LANE_PERMUTATION (this).release ();
134   if (this->failed)
135     free (failed);
136 }
137 
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
139 
140 void
vect_free_slp_tree(slp_tree node)141 vect_free_slp_tree (slp_tree node)
142 {
143   int i;
144   slp_tree child;
145 
146   if (--SLP_TREE_REF_COUNT (node) != 0)
147     return;
148 
149   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150     if (child)
151       vect_free_slp_tree (child);
152 
153   /* If the node defines any SLP only patterns then those patterns are no
154      longer valid and should be removed.  */
155   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
157     {
158       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
161     }
162 
163   delete node;
164 }
165 
166 /* Return a location suitable for dumpings related to the SLP instance.  */
167 
168 dump_user_location_t
location() const169 _slp_instance::location () const
170 {
171   if (!root_stmts.is_empty ())
172     return root_stmts[0]->stmt;
173   else
174     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176 
177 
178 /* Free the memory allocated for the SLP instance.  */
179 
180 void
vect_free_slp_instance(slp_instance instance)181 vect_free_slp_instance (slp_instance instance)
182 {
183   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184   SLP_INSTANCE_LOADS (instance).release ();
185   SLP_INSTANCE_ROOT_STMTS (instance).release ();
186   instance->subgraph_entries.release ();
187   instance->cost_vec.release ();
188   free (instance);
189 }
190 
191 
192 /* Create an SLP node for SCALAR_STMTS.  */
193 
194 slp_tree
vect_create_new_slp_node(unsigned nops,tree_code code)195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197   slp_tree node = new _slp_tree;
198   SLP_TREE_SCALAR_STMTS (node) = vNULL;
199   SLP_TREE_CHILDREN (node).create (nops);
200   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201   SLP_TREE_CODE (node) = code;
202   return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS.  */
205 
206 static slp_tree
vect_create_new_slp_node(slp_tree node,vec<stmt_vec_info> scalar_stmts,unsigned nops)207 vect_create_new_slp_node (slp_tree node,
208                                 vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211   SLP_TREE_CHILDREN (node).create (nops);
212   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214   SLP_TREE_LANES (node) = scalar_stmts.length ();
215   return node;
216 }
217 
218 /* Create an SLP node for SCALAR_STMTS.  */
219 
220 static slp_tree
vect_create_new_slp_node(vec<stmt_vec_info> scalar_stmts,unsigned nops)221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
222 {
223   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
224 }
225 
226 /* Create an SLP node for OPS.  */
227 
228 static slp_tree
vect_create_new_slp_node(slp_tree node,vec<tree> ops)229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231   SLP_TREE_SCALAR_OPS (node) = ops;
232   SLP_TREE_DEF_TYPE (node) = vect_external_def;
233   SLP_TREE_LANES (node) = ops.length ();
234   return node;
235 }
236 
237 /* Create an SLP node for OPS.  */
238 
239 static slp_tree
vect_create_new_slp_node(vec<tree> ops)240 vect_create_new_slp_node (vec<tree> ops)
241 {
242   return vect_create_new_slp_node (new _slp_tree, ops);
243 }
244 
245 
246 /* This structure is used in creation of an SLP tree.  Each instance
247    corresponds to the same operand in a group of scalar stmts in an SLP
248    node.  */
249 typedef struct _slp_oprnd_info
250 {
251   /* Def-stmts for the operands.  */
252   vec<stmt_vec_info> def_stmts;
253   /* Operands.  */
254   vec<tree> ops;
255   /* Information about the first statement, its vector def-type, type, the
256      operand itself in case it's constant, and an indication if it's a pattern
257      stmt.  */
258   tree first_op_type;
259   enum vect_def_type first_dt;
260   bool any_pattern;
261 } *slp_oprnd_info;
262 
263 
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265    operand.  */
266 static vec<slp_oprnd_info>
vect_create_oprnd_info(int nops,int group_size)267 vect_create_oprnd_info (int nops, int group_size)
268 {
269   int i;
270   slp_oprnd_info oprnd_info;
271   vec<slp_oprnd_info> oprnds_info;
272 
273   oprnds_info.create (nops);
274   for (i = 0; i < nops; i++)
275     {
276       oprnd_info = XNEW (struct _slp_oprnd_info);
277       oprnd_info->def_stmts.create (group_size);
278       oprnd_info->ops.create (group_size);
279       oprnd_info->first_dt = vect_uninitialized_def;
280       oprnd_info->first_op_type = NULL_TREE;
281       oprnd_info->any_pattern = false;
282       oprnds_info.quick_push (oprnd_info);
283     }
284 
285   return oprnds_info;
286 }
287 
288 
289 /* Free operands info.  */
290 
291 static void
vect_free_oprnd_info(vec<slp_oprnd_info> & oprnds_info)292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294   int i;
295   slp_oprnd_info oprnd_info;
296 
297   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298     {
299       oprnd_info->def_stmts.release ();
300       oprnd_info->ops.release ();
301       XDELETE (oprnd_info);
302     }
303 
304   oprnds_info.release ();
305 }
306 
307 
308 /* Return true if STMTS contains a pattern statement.  */
309 
310 static bool
vect_contains_pattern_stmt_p(vec<stmt_vec_info> stmts)311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313   stmt_vec_info stmt_info;
314   unsigned int i;
315   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316     if (is_pattern_stmt_p (stmt_info))
317       return true;
318   return false;
319 }
320 
321 /* Return true when all lanes in the external or constant NODE have
322    the same value.  */
323 
324 static bool
vect_slp_tree_uniform_p(slp_tree node)325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328                 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329 
330   /* Pre-exsting vectors.  */
331   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332     return false;
333 
334   unsigned i;
335   tree op, first = NULL_TREE;
336   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337     if (!first)
338       first = op;
339     else if (!operand_equal_p (first, op, 0))
340       return false;
341 
342   return true;
343 }
344 
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
347    of the chain.  */
348 
349 int
vect_get_place_in_interleaving_chain(stmt_vec_info stmt_info,stmt_vec_info first_stmt_info)350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351                                               stmt_vec_info first_stmt_info)
352 {
353   stmt_vec_info next_stmt_info = first_stmt_info;
354   int result = 0;
355 
356   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357     return -1;
358 
359   do
360     {
361       if (next_stmt_info == stmt_info)
362           return result;
363       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364       if (next_stmt_info)
365           result += DR_GROUP_GAP (next_stmt_info);
366     }
367   while (next_stmt_info);
368 
369   return -1;
370 }
371 
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373    using the method implemented by duplicate_and_interleave.  Return true
374    if so, returning the number of intermediate vectors in *NVECTORS_OUT
375    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376    (if nonnull).  */
377 
378 bool
can_duplicate_and_interleave_p(vec_info * vinfo,unsigned int count,tree elt_type,unsigned int * nvectors_out,tree * vector_type_out,tree * permutes)379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380                                         tree elt_type, unsigned int *nvectors_out,
381                                         tree *vector_type_out,
382                                         tree *permutes)
383 {
384   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386     return false;
387 
388   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390   unsigned int nvectors = 1;
391   for (;;)
392     {
393       scalar_int_mode int_mode;
394       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
396           {
397             /* Get the natural vector type for this SLP group size.  */
398             tree int_type = build_nonstandard_integer_type
399               (GET_MODE_BITSIZE (int_mode), 1);
400             tree vector_type
401               = get_vectype_for_scalar_type (vinfo, int_type, count);
402             if (vector_type
403                 && VECTOR_MODE_P (TYPE_MODE (vector_type))
404                 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405                                  GET_MODE_SIZE (base_vector_mode)))
406               {
407                 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408                      together into elements of type INT_TYPE and using the result
409                      to build NVECTORS vectors.  */
410                 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411                 vec_perm_builder sel1 (nelts, 2, 3);
412                 vec_perm_builder sel2 (nelts, 2, 3);
413                 poly_int64 half_nelts = exact_div (nelts, 2);
414                 for (unsigned int i = 0; i < 3; ++i)
415                     {
416                       sel1.quick_push (i);
417                       sel1.quick_push (i + nelts);
418                       sel2.quick_push (half_nelts + i);
419                       sel2.quick_push (half_nelts + i + nelts);
420                     }
421                 vec_perm_indices indices1 (sel1, 2, nelts);
422                 vec_perm_indices indices2 (sel2, 2, nelts);
423                 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424                       && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
425                     {
426                       if (nvectors_out)
427                         *nvectors_out = nvectors;
428                       if (vector_type_out)
429                         *vector_type_out = vector_type;
430                       if (permutes)
431                         {
432                           permutes[0] = vect_gen_perm_mask_checked (vector_type,
433                                                                                 indices1);
434                           permutes[1] = vect_gen_perm_mask_checked (vector_type,
435                                                                                 indices2);
436                         }
437                       return true;
438                     }
439               }
440           }
441       if (!multiple_p (elt_bytes, 2, &elt_bytes))
442           return false;
443       nvectors *= 2;
444     }
445 }
446 
447 /* Return true if DTA and DTB match.  */
448 
449 static bool
vect_def_types_match(enum vect_def_type dta,enum vect_def_type dtb)450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452   return (dta == dtb
453             || ((dta == vect_external_def || dta == vect_constant_def)
454                 && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456 
/* Operand maps as returned by vect_get_operand_map.  The first entry of
   each map is the number of child nodes; the remaining entries give, per
   child, the index of the gimple argument it comes from.  Indices -1 and
   -2 denote the first and second operand of an embedded comparison.  */

/* Maps for a COND_EXPR with an embedded comparison, indexed by SWAP.  */
static const int cond_expr_maps[3][5] = {
  /* SWAP == 0: operands in their original order.  */
  { 4, -1, -2, 1, 2 },
  /* SWAP == 1: comparison operands exchanged.  */
  { 4, -2, -1, 1, 2 },
  /* SWAP == 2: arguments 1 and 2 exchanged.  */
  { 4, -1, -2, 2, 1 }
};

/* One child, taken from call argument 1.  */
static const int arg1_map[] = { 1, 1 };

/* One child, taken from call argument 2.  */
static const int arg2_map[] = { 1, 2 };

/* Two children, taken from call arguments 1 and 4.  */
static const int arg1_arg4_map[] = { 2, 1, 4 };
465 
466 /* For most SLP statements, there is a one-to-one mapping between
467    gimple arguments and child nodes.  If that is not true for STMT,
468    return an array that contains:
469 
470    - the number of child nodes, followed by
471    - for each child node, the index of the argument associated with that node.
472      The special index -1 is the first operand of an embedded comparison and
473      the special index -2 is the second operand of an embedded comparison.
474 
475    SWAP is as for vect_get_and_check_slp_defs.  */
476 
477 static const int *
vect_get_operand_map(const gimple * stmt,unsigned char swap=0)478 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
479 {
480   if (auto assign = dyn_cast<const gassign *> (stmt))
481     {
482       if (gimple_assign_rhs_code (assign) == COND_EXPR
483             && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
484           return cond_expr_maps[swap];
485     }
486   gcc_assert (!swap);
487   if (auto call = dyn_cast<const gcall *> (stmt))
488     {
489       if (gimple_call_internal_p (call))
490           switch (gimple_call_internal_fn (call))
491             {
492             case IFN_MASK_LOAD:
493               return arg2_map;
494 
495             case IFN_GATHER_LOAD:
496               return arg1_map;
497 
498             case IFN_MASK_GATHER_LOAD:
499               return arg1_arg4_map;
500 
501             default:
502               break;
503             }
504     }
505   return nullptr;
506 }
507 
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   SKIP_ARGS is indexed by child-operand number.  For an operand whose
   entry is set, a NULL def-stmt and a NULL_TREE operand are recorded and
   the operand takes no part in the matching against the first stmt; its
   def is still analyzed, so an unanalyzable def remains a fatal error.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             bool *skip_args,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  /* Index of the operand at which a swap with its successor may be tried;
     stays -1U unless set below.  */
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  /* Only calls, assignments and PHIs are handled.  */
  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map = vect_get_operand_map (stmt_info->stmt, swap);
  /* With an operand map, its first entry is the number of child nodes and
     MAP is advanced to point at the per-child argument indices.  */
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
        commutative_op = 0;
    }

  /* A swap requested by the caller counts as already swapped, which
     disables the operand swapping attempted further below.  */
  bool swapped = (swap != 0);
  bool backedge = false;
  /* Def types of this stmt's operands, one per child operand.  */
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      int opno = map ? map[i] : int (i);
      /* Negative indices select an operand of the comparison embedded in
         argument 0: -1 its first operand, -2 its second.  */
      if (opno < 0)
        oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
        {
          oprnd = gimple_arg (stmt_info->stmt, opno);
          /* For a PHI, note whether the source block of this argument's
             edge is dominated by the PHI's block.  */
          if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
            backedge = dominated_by_p (CDI_DOMINATORS,
                                       gimple_phi_arg_edge (stmt, opno)->src,
                                       gimple_bb (stmt_info->stmt));
        }
      /* Look through a VIEW_CONVERT_EXPR wrapper.  */
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      /* A skipped operand still gets a placeholder entry so that the
         per-stmt vectors stay indexable by STMT_NUM.  */
      if (skip_args[i])
        {
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (NULL_TREE);
          oprnd_info->first_dt = vect_uninitialized_def;
          continue;
        }

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
          && is_pattern_stmt_p (def_stmt_info))
        {
          if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
              != def_stmt_info)
            oprnd_info->any_pattern = true;
          else
            /* If we promote this to external use the original stmt def.  */
            oprnd_info->ops.last ()
              = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
        }

      /* If there's a extern def on a backedge make sure we can
         code-generate at the region start.
         ???  This is another case that could be fixed by adjusting
         how we split the function but at the moment we'd have conflicting
         goals there.  */
      if (backedge
          && dts[i] == vect_external_def
          && is_a <bb_vec_info> (vinfo)
          && TREE_CODE (oprnd) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
          && !dominated_by_p (CDI_DOMINATORS,
                              as_a <bb_vec_info> (vinfo)->bbs[0],
                              gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: extern def %T only defined "
                             "on backedge\n", oprnd);
          return -1;
        }

      /* The first stmt of the group establishes the def type and operand
         type that all later stmts are matched against.  */
      if (first)
        {
          tree type = TREE_TYPE (oprnd);
          dt = dts[i];
          /* With a variable-length vector mode, constant and external
             operands must not be boolean and must be loadable via
             duplicate_and_interleave.  */
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
                                                      type)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }

          /* For the swapping logic below force vect_reduction_def
             for the reduction op in a SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dts[i] = dt = vect_reduction_def;

          /* Check the types of the definition.  */
          switch (dt)
            {
            case vect_external_def:
            case vect_constant_def:
            case vect_internal_def:
            case vect_reduction_def:
            case vect_induction_def:
            case vect_nested_cycle:
              break;

            default:
              /* FORNOW: Not supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: illegal type of def %T\n",
                                 oprnd);
              return -1;
            }

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
        }
    }
  /* The first stmt only records; there is nothing to match it against.  */
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.
     The loop header has no increment: after swapping operands the same
     index is examined again.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
        {
          ++i;
          continue;
        }

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");
          return 1;
        }

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && def_stmt_info
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && ((!def_stmt_info
                   || STMT_VINFO_DATA_REF (def_stmt_info)
                   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                  != (oprnd_info->first_dt != vect_reduction_def))))
        {
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                                             dts[i+1])
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              /* Exchange what was recorded for operands I and I+1 of this
                 stmt, then re-check operand I.  */
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);
              swapped = true;
              continue;
            }

          if (is_a <bb_vec_info> (vinfo)
              && !oprnd_info->any_pattern)
            {
              /* Now for commutative ops we should see whether we can
                 make the other operand matching.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
              return 1;
            }
        }

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
        {
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
        }

      ++i;
    }

  /* The stmt itself is not modified: any swap was applied to the recorded
     operand info above (or requested via SWAP), so only report it.  */
  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  return 0;
}
803 
804 /* Return true if call statements CALL1 and CALL2 are similar enough
805    to be combined into the same SLP group.  */
806 
807 bool
compatible_calls_p(gcall * call1,gcall * call2)808 compatible_calls_p (gcall *call1, gcall *call2)
809 {
810   unsigned int nargs = gimple_call_num_args (call1);
811   if (nargs != gimple_call_num_args (call2))
812     return false;
813 
814   if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
815     return false;
816 
817   if (gimple_call_internal_p (call1))
818     {
819       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
820                                      TREE_TYPE (gimple_call_lhs (call2))))
821           return false;
822       for (unsigned int i = 0; i < nargs; ++i)
823           if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
824                                          TREE_TYPE (gimple_call_arg (call2, i))))
825             return false;
826     }
827   else
828     {
829       if (!operand_equal_p (gimple_call_fn (call1),
830                                   gimple_call_fn (call2), 0))
831           return false;
832 
833       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
834           return false;
835     }
836 
837   /* Check that any unvectorized arguments are equal.  */
838   if (const int *map = vect_get_operand_map (call1))
839     {
840       unsigned int nkept = *map++;
841       unsigned int mapi = 0;
842       for (unsigned int i = 0; i < nargs; ++i)
843           if (mapi < nkept && map[mapi] == int (i))
844             mapi += 1;
845           else if (!operand_equal_p (gimple_call_arg (call1, i),
846                                            gimple_call_arg (call2, i)))
847             return false;
848     }
849 
850   return true;
851 }
852 
853 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
854    caller's attempt to find the vector type in STMT_INFO with the narrowest
855    element type.  Return true if VECTYPE is nonnull and if it is valid
856    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
857    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
858    vect_build_slp_tree.  */
859 
860 static bool
vect_record_max_nunits(vec_info * vinfo,stmt_vec_info stmt_info,unsigned int group_size,tree vectype,poly_uint64 * max_nunits)861 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
862                               unsigned int group_size,
863                               tree vectype, poly_uint64 *max_nunits)
864 {
865   if (!vectype)
866     {
867       if (dump_enabled_p ())
868           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
869                                "Build SLP failed: unsupported data-type in %G\n",
870                                stmt_info->stmt);
871       /* Fatal mismatch.  */
872       return false;
873     }
874 
875   /* If populating the vector type requires unrolling then fail
876      before adjusting *max_nunits for basic-block vectorization.  */
877   if (is_a <bb_vec_info> (vinfo)
878       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
879     {
880       if (dump_enabled_p ())
881           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882                                "Build SLP failed: unrolling required "
883                                "in basic block SLP\n");
884       /* Fatal mismatch.  */
885       return false;
886     }
887 
888   /* In case of multiple types we need to detect the smallest type.  */
889   vect_update_max_nunits (max_nunits, vectype);
890   return true;
891 }
892 
893 /* Verify if the scalar stmts STMTS are isomorphic, require data
894    permutation or are of unsupported types of operation.  Return
895    true if they are, otherwise return false and indicate in *MATCHES
896    which stmts are not isomorphic to the first one.  If MATCHES[0]
897    is false then this indicates the comparison could not be
898    carried out or the stmts will never be vectorized by SLP.
899 
900    Note COND_EXPR is possibly isomorphic to another one after swapping its
901    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
902    the first stmt by swapping the two operands of comparison; set SWAP[i]
903    to 2 if stmt I is isormorphic to the first stmt by inverting the code
904    of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
905    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
906 
907 static bool
vect_build_slp_tree_1(vec_info * vinfo,unsigned char * swap,vec<stmt_vec_info> stmts,unsigned int group_size,poly_uint64 * max_nunits,bool * matches,bool * two_operators,tree * node_vectype)908 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
909                            vec<stmt_vec_info> stmts, unsigned int group_size,
910                            poly_uint64 *max_nunits, bool *matches,
911                            bool *two_operators, tree *node_vectype)
912 {
913   unsigned int i;
914   stmt_vec_info first_stmt_info = stmts[0];
915   code_helper first_stmt_code = ERROR_MARK;
916   code_helper alt_stmt_code = ERROR_MARK;
917   code_helper rhs_code = ERROR_MARK;
918   code_helper first_cond_code = ERROR_MARK;
919   tree lhs;
920   bool need_same_oprnds = false;
921   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
922   stmt_vec_info first_load = NULL, prev_first_load = NULL;
923   bool first_stmt_load_p = false, load_p = false;
924   bool first_stmt_phi_p = false, phi_p = false;
925   bool maybe_soft_fail = false;
926   tree soft_fail_nunits_vectype = NULL_TREE;
927 
928   /* For every stmt in NODE find its def stmt/s.  */
929   stmt_vec_info stmt_info;
930   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
931     {
932       gimple *stmt = stmt_info->stmt;
933       swap[i] = 0;
934       matches[i] = false;
935 
936       if (dump_enabled_p ())
937           dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
938 
939       /* Fail to vectorize statements marked as unvectorizable, throw
940            or are volatile.  */
941       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
942             || stmt_can_throw_internal (cfun, stmt)
943             || gimple_has_volatile_ops (stmt))
944         {
945           if (dump_enabled_p ())
946               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947                                    "Build SLP failed: unvectorizable statement %G",
948                                    stmt);
949             /* ???  For BB vectorization we want to commutate operands in a way
950                to shuffle all unvectorizable defs into one operand and have
951                the other still vectorized.  The following doesn't reliably
952                work for this though but it's the easiest we can do here.  */
953             if (is_a <bb_vec_info> (vinfo) && i != 0)
954               continue;
955             /* Fatal mismatch.  */
956             matches[0] = false;
957           return false;
958         }
959 
960       lhs = gimple_get_lhs (stmt);
961       if (lhs == NULL_TREE)
962           {
963             if (dump_enabled_p ())
964               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965                                    "Build SLP failed: not GIMPLE_ASSIGN nor "
966                                    "GIMPLE_CALL %G", stmt);
967             if (is_a <bb_vec_info> (vinfo) && i != 0)
968               continue;
969             /* Fatal mismatch.  */
970             matches[0] = false;
971             return false;
972           }
973 
974       tree nunits_vectype;
975       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
976                                                      &nunits_vectype, group_size))
977           {
978             if (is_a <bb_vec_info> (vinfo) && i != 0)
979               continue;
980             /* Fatal mismatch.  */
981             matches[0] = false;
982             return false;
983           }
984       /* Record nunits required but continue analysis, producing matches[]
985            as if nunits was not an issue.  This allows splitting of groups
986            to happen.  */
987       if (nunits_vectype
988             && !vect_record_max_nunits (vinfo, stmt_info, group_size,
989                                               nunits_vectype, max_nunits))
990           {
991             gcc_assert (is_a <bb_vec_info> (vinfo));
992             maybe_soft_fail = true;
993             soft_fail_nunits_vectype = nunits_vectype;
994           }
995 
996       gcc_assert (vectype);
997 
998       gcall *call_stmt = dyn_cast <gcall *> (stmt);
999       if (call_stmt)
1000           {
1001             combined_fn cfn = gimple_call_combined_fn (call_stmt);
1002             if (cfn != CFN_LAST)
1003               rhs_code = cfn;
1004             else
1005               rhs_code = CALL_EXPR;
1006 
1007             if (cfn == CFN_MASK_LOAD
1008                 || cfn == CFN_GATHER_LOAD
1009                 || cfn == CFN_MASK_GATHER_LOAD)
1010               load_p = true;
1011             else if ((internal_fn_p (cfn)
1012                         && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1013                        || gimple_call_tail_p (call_stmt)
1014                        || gimple_call_noreturn_p (call_stmt)
1015                        || gimple_call_chain (call_stmt))
1016               {
1017                 if (dump_enabled_p ())
1018                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019                                          "Build SLP failed: unsupported call type %G",
1020                                          call_stmt);
1021                 if (is_a <bb_vec_info> (vinfo) && i != 0)
1022                     continue;
1023                 /* Fatal mismatch.  */
1024                 matches[0] = false;
1025                 return false;
1026               }
1027           }
1028       else if (gimple_code (stmt) == GIMPLE_PHI)
1029           {
1030             rhs_code = ERROR_MARK;
1031             phi_p = true;
1032           }
1033       else
1034           {
1035             rhs_code = gimple_assign_rhs_code (stmt);
1036             load_p = gimple_vuse (stmt);
1037           }
1038 
1039       /* Check the operation.  */
1040       if (i == 0)
1041           {
1042             *node_vectype = vectype;
1043             first_stmt_code = rhs_code;
1044             first_stmt_load_p = load_p;
1045             first_stmt_phi_p = phi_p;
1046 
1047             /* Shift arguments should be equal in all the packed stmts for a
1048                vector shift with scalar shift operand.  */
1049             if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1050                 || rhs_code == LROTATE_EXPR
1051                 || rhs_code == RROTATE_EXPR)
1052               {
1053                 /* First see if we have a vector/vector shift.  */
1054                 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1055                     {
1056                       /* No vector/vector shift, try for a vector/scalar shift.  */
1057                       if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1058                         {
1059                           if (dump_enabled_p ())
1060                               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061                                                    "Build SLP failed: "
1062                                                    "op not supported by target.\n");
1063                           if (is_a <bb_vec_info> (vinfo) && i != 0)
1064                               continue;
1065                           /* Fatal mismatch.  */
1066                           matches[0] = false;
1067                           return false;
1068                         }
1069                       need_same_oprnds = true;
1070                       first_op1 = gimple_assign_rhs2 (stmt);
1071                     }
1072               }
1073             else if (rhs_code == WIDEN_LSHIFT_EXPR)
1074             {
1075               need_same_oprnds = true;
1076               first_op1 = gimple_assign_rhs2 (stmt);
1077             }
1078             else if (!load_p
1079                        && rhs_code == BIT_FIELD_REF)
1080               {
1081                 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1082                 if (!is_a <bb_vec_info> (vinfo)
1083                       || TREE_CODE (vec) != SSA_NAME
1084                       || !operand_equal_p (TYPE_SIZE (vectype),
1085                                                TYPE_SIZE (TREE_TYPE (vec))))
1086                     {
1087                       if (dump_enabled_p ())
1088                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089                                              "Build SLP failed: "
1090                                              "BIT_FIELD_REF not supported\n");
1091                       /* Fatal mismatch.  */
1092                       matches[0] = false;
1093                       return false;
1094                     }
1095               }
1096             else if (rhs_code == CFN_DIV_POW2)
1097               {
1098                 need_same_oprnds = true;
1099                 first_op1 = gimple_call_arg (call_stmt, 1);
1100               }
1101           }
1102       else
1103           {
1104             if (first_stmt_code != rhs_code
1105                 && alt_stmt_code == ERROR_MARK)
1106               alt_stmt_code = rhs_code;
1107             if ((first_stmt_code != rhs_code
1108                  && (first_stmt_code != IMAGPART_EXPR
1109                        || rhs_code != REALPART_EXPR)
1110                  && (first_stmt_code != REALPART_EXPR
1111                        || rhs_code != IMAGPART_EXPR)
1112                  /* Handle mismatches in plus/minus by computing both
1113                       and merging the results.  */
1114                  && !((first_stmt_code == PLUS_EXPR
1115                          || first_stmt_code == MINUS_EXPR)
1116                         && (alt_stmt_code == PLUS_EXPR
1117                               || alt_stmt_code == MINUS_EXPR)
1118                         && rhs_code == alt_stmt_code)
1119                  && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120                         && (first_stmt_code == ARRAY_REF
1121                               || first_stmt_code == BIT_FIELD_REF
1122                               || first_stmt_code == INDIRECT_REF
1123                               || first_stmt_code == COMPONENT_REF
1124                               || first_stmt_code == MEM_REF)
1125                         && (rhs_code == ARRAY_REF
1126                               || rhs_code == BIT_FIELD_REF
1127                               || rhs_code == INDIRECT_REF
1128                               || rhs_code == COMPONENT_REF
1129                               || rhs_code == MEM_REF)))
1130                 || first_stmt_load_p != load_p
1131                 || first_stmt_phi_p != phi_p)
1132               {
1133                 if (dump_enabled_p ())
1134                     {
1135                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136                                            "Build SLP failed: different operation "
1137                                            "in stmt %G", stmt);
1138                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139                                            "original stmt %G", first_stmt_info->stmt);
1140                     }
1141                 /* Mismatch.  */
1142                 continue;
1143               }
1144 
1145             if (!load_p
1146                 && first_stmt_code == BIT_FIELD_REF
1147                 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1148                       != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1149               {
1150                 if (dump_enabled_p ())
1151                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152                                          "Build SLP failed: different BIT_FIELD_REF "
1153                                          "arguments in %G", stmt);
1154                 /* Mismatch.  */
1155                 continue;
1156               }
1157 
1158             if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1159               {
1160                 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1161                                                call_stmt))
1162                     {
1163                       if (dump_enabled_p ())
1164                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165                                              "Build SLP failed: different calls in %G",
1166                                              stmt);
1167                       /* Mismatch.  */
1168                       continue;
1169                     }
1170               }
1171 
1172             if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1173                 && (gimple_bb (first_stmt_info->stmt)
1174                       != gimple_bb (stmt_info->stmt)))
1175               {
1176                 if (dump_enabled_p ())
1177                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178                                          "Build SLP failed: different BB for PHI "
1179                                          "or possibly trapping operation in %G", stmt);
1180                 /* Mismatch.  */
1181                 continue;
1182               }
1183 
1184             if (need_same_oprnds)
1185               {
1186                 tree other_op1 = gimple_arg (stmt, 1);
1187                 if (!operand_equal_p (first_op1, other_op1, 0))
1188                     {
1189                       if (dump_enabled_p ())
1190                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191                                              "Build SLP failed: different shift "
1192                                              "arguments in %G", stmt);
1193                       /* Mismatch.  */
1194                       continue;
1195                     }
1196               }
1197 
1198             if (!types_compatible_p (vectype, *node_vectype))
1199               {
1200                 if (dump_enabled_p ())
1201                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202                                          "Build SLP failed: different vector type "
1203                                          "in %G", stmt);
1204                 /* Mismatch.  */
1205                 continue;
1206               }
1207           }
1208 
1209       /* Grouped store or load.  */
1210       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1211           {
1212             if (REFERENCE_CLASS_P (lhs))
1213               {
1214                 /* Store.  */
1215                 ;
1216               }
1217             else
1218               {
1219                 /* Load.  */
1220                 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1221               if (prev_first_load)
1222                 {
1223                   /* Check that there are no loads from different interleaving
1224                      chains in the same node.  */
1225                   if (prev_first_load != first_load)
1226                     {
1227                       if (dump_enabled_p ())
1228                               dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1229                                                    vect_location,
1230                                                    "Build SLP failed: different "
1231                                                    "interleaving chains in one node %G",
1232                                                    stmt);
1233                           /* Mismatch.  */
1234                           continue;
1235                     }
1236                 }
1237               else
1238                 prev_first_load = first_load;
1239            }
1240         } /* Grouped access.  */
1241       else
1242           {
1243             if (load_p
1244                 && rhs_code != CFN_GATHER_LOAD
1245                 && rhs_code != CFN_MASK_GATHER_LOAD)
1246               {
1247                 /* Not grouped load.  */
1248                 if (dump_enabled_p ())
1249                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250                                          "Build SLP failed: not grouped load %G", stmt);
1251 
1252                 /* FORNOW: Not grouped loads are not supported.  */
1253                 if (is_a <bb_vec_info> (vinfo) && i != 0)
1254                     continue;
1255                 /* Fatal mismatch.  */
1256                 matches[0] = false;
1257                 return false;
1258               }
1259 
1260             /* Not memory operation.  */
1261             if (!phi_p
1262                 && rhs_code.is_tree_code ()
1263                 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1264                 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1265                 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1266                 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1267                 && rhs_code != VIEW_CONVERT_EXPR
1268                 && rhs_code != CALL_EXPR
1269                 && rhs_code != BIT_FIELD_REF)
1270               {
1271                 if (dump_enabled_p ())
1272                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273                                          "Build SLP failed: operation unsupported %G",
1274                                          stmt);
1275                 if (is_a <bb_vec_info> (vinfo) && i != 0)
1276                     continue;
1277                 /* Fatal mismatch.  */
1278                 matches[0] = false;
1279                 return false;
1280               }
1281 
1282             if (rhs_code == COND_EXPR)
1283               {
1284                 tree cond_expr = gimple_assign_rhs1 (stmt);
1285                 enum tree_code cond_code = TREE_CODE (cond_expr);
1286                 enum tree_code swap_code = ERROR_MARK;
1287                 enum tree_code invert_code = ERROR_MARK;
1288 
1289                 if (i == 0)
1290                     first_cond_code = TREE_CODE (cond_expr);
1291                 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1292                     {
1293                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1294                       swap_code = swap_tree_comparison (cond_code);
1295                       invert_code = invert_tree_comparison (cond_code, honor_nans);
1296                     }
1297 
1298                 if (first_cond_code == cond_code)
1299                     ;
1300                 /* Isomorphic can be achieved by swapping.  */
1301                 else if (first_cond_code == swap_code)
1302                     swap[i] = 1;
1303                 /* Isomorphic can be achieved by inverting.  */
1304                 else if (first_cond_code == invert_code)
1305                     swap[i] = 2;
1306                 else
1307                     {
1308                       if (dump_enabled_p ())
1309                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310                                              "Build SLP failed: different"
1311                                              " operation %G", stmt);
1312                       /* Mismatch.  */
1313                       continue;
1314                     }
1315               }
1316           }
1317 
1318       matches[i] = true;
1319     }
1320 
1321   for (i = 0; i < group_size; ++i)
1322     if (!matches[i])
1323       return false;
1324 
1325   /* If we allowed a two-operation SLP node verify the target can cope
1326      with the permute we are going to use.  */
1327   if (alt_stmt_code != ERROR_MARK
1328       && (!alt_stmt_code.is_tree_code ()
1329             || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
1330     {
1331       *two_operators = true;
1332     }
1333 
1334   if (maybe_soft_fail)
1335     {
1336       unsigned HOST_WIDE_INT const_nunits;
1337       if (!TYPE_VECTOR_SUBPARTS
1338               (soft_fail_nunits_vectype).is_constant (&const_nunits)
1339             || const_nunits > group_size)
1340           matches[0] = false;
1341       else
1342           {
1343             /* With constant vector elements simulate a mismatch at the
1344                point we need to split.  */
1345             unsigned tail = group_size & (const_nunits - 1);
1346             memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1347           }
1348       return false;
1349     }
1350 
1351   return true;
1352 }
1353 
1354 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1355    Note we never remove apart from at destruction time so we do not
1356    need a special value for deleted that differs from empty.  */
1357 struct bst_traits
1358 {
1359   typedef vec <stmt_vec_info> value_type;
1360   typedef vec <stmt_vec_info> compare_type;
1361   static inline hashval_t hash (value_type);
1362   static inline bool equal (value_type existing, value_type candidate);
is_emptybst_traits1363   static inline bool is_empty (value_type x) { return !x.exists (); }
is_deletedbst_traits1364   static inline bool is_deleted (value_type x) { return !x.exists (); }
1365   static const bool empty_zero_p = true;
mark_emptybst_traits1366   static inline void mark_empty (value_type &x) { x.release (); }
mark_deletedbst_traits1367   static inline void mark_deleted (value_type &x) { x.release (); }
removebst_traits1368   static inline void remove (value_type &x) { x.release (); }
1369 };
1370 inline hashval_t
hash(value_type x)1371 bst_traits::hash (value_type x)
1372 {
1373   inchash::hash h;
1374   for (unsigned i = 0; i < x.length (); ++i)
1375     h.add_int (gimple_uid (x[i]->stmt));
1376   return h.end ();
1377 }
1378 inline bool
equal(value_type existing,value_type candidate)1379 bst_traits::equal (value_type existing, value_type candidate)
1380 {
1381   if (existing.length () != candidate.length ())
1382     return false;
1383   for (unsigned i = 0; i < existing.length (); ++i)
1384     if (existing[i] != candidate[i])
1385       return false;
1386   return true;
1387 }
1388 
1389 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1390    but then vec::insert does memmove and that's not compatible with
1391    std::pair.  */
1392 struct chain_op_t
1393 {
chain_op_tchain_op_t1394   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1395       : code (code_), dt (dt_), op (op_) {}
1396   tree_code code;
1397   vect_def_type dt;
1398   tree op;
1399 };
1400 
1401 /* Comparator for sorting associatable chains.  */
1402 
1403 static int
dt_sort_cmp(const void * op1_,const void * op2_,void *)1404 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1405 {
1406   auto *op1 = (const chain_op_t *) op1_;
1407   auto *op2 = (const chain_op_t *) op2_;
1408   if (op1->dt != op2->dt)
1409     return (int)op1->dt - (int)op2->dt;
1410   return (int)op1->code - (int)op2->code;
1411 }
1412 
1413 /* Linearize the associatable expression chain at START with the
1414    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1415    filling CHAIN with the result and using WORKLIST as intermediate storage.
1416    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1417    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
1418    stmts, starting with START.  */
1419 
1420 static void
vect_slp_linearize_chain(vec_info * vinfo,vec<std::pair<tree_code,gimple * >> & worklist,vec<chain_op_t> & chain,enum tree_code code,gimple * start,gimple * & code_stmt,gimple * & alt_code_stmt,vec<gimple * > * chain_stmts)1421 vect_slp_linearize_chain (vec_info *vinfo,
1422                                 vec<std::pair<tree_code, gimple *> > &worklist,
1423                                 vec<chain_op_t> &chain,
1424                                 enum tree_code code, gimple *start,
1425                                 gimple *&code_stmt, gimple *&alt_code_stmt,
1426                                 vec<gimple *> *chain_stmts)
1427 {
1428   /* For each lane linearize the addition/subtraction (or other
1429      uniform associatable operation) expression tree.  */
1430   worklist.safe_push (std::make_pair (code, start));
1431   while (!worklist.is_empty ())
1432     {
1433       auto entry = worklist.pop ();
1434       gassign *stmt = as_a <gassign *> (entry.second);
1435       enum tree_code in_code = entry.first;
1436       enum tree_code this_code = gimple_assign_rhs_code (stmt);
1437       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1438       if (!code_stmt
1439             && gimple_assign_rhs_code (stmt) == code)
1440           code_stmt = stmt;
1441       else if (!alt_code_stmt
1442                  && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1443           alt_code_stmt = stmt;
1444       if (chain_stmts)
1445           chain_stmts->safe_push (stmt);
1446       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1447           {
1448             tree op = gimple_op (stmt, opnum);
1449             vect_def_type dt;
1450             stmt_vec_info def_stmt_info;
1451             bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1452             gcc_assert (res);
1453             if (dt == vect_internal_def
1454                 && is_pattern_stmt_p (def_stmt_info))
1455               op = gimple_get_lhs (def_stmt_info->stmt);
1456             gimple *use_stmt;
1457             use_operand_p use_p;
1458             if (dt == vect_internal_def
1459                 && single_imm_use (op, &use_p, &use_stmt)
1460                 && is_gimple_assign (def_stmt_info->stmt)
1461                 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1462                       || (code == PLUS_EXPR
1463                           && (gimple_assign_rhs_code (def_stmt_info->stmt)
1464                                 == MINUS_EXPR))))
1465               {
1466                 tree_code op_def_code = this_code;
1467                 if (op_def_code == MINUS_EXPR && opnum == 1)
1468                     op_def_code = PLUS_EXPR;
1469                 if (in_code == MINUS_EXPR)
1470                     op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1471                 worklist.safe_push (std::make_pair (op_def_code,
1472                                                               def_stmt_info->stmt));
1473               }
1474             else
1475               {
1476                 tree_code op_def_code = this_code;
1477                 if (op_def_code == MINUS_EXPR && opnum == 1)
1478                     op_def_code = PLUS_EXPR;
1479                 if (in_code == MINUS_EXPR)
1480                     op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1481                 chain.safe_push (chain_op_t (op_def_code, dt, op));
1482               }
1483           }
1484     }
1485 }
1486 
1487 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1488                       simple_hashmap_traits <bst_traits, slp_tree> >
1489   scalar_stmts_to_slp_tree_map_t;
1490 
1491 static slp_tree
1492 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1493                            vec<stmt_vec_info> stmts, unsigned int group_size,
1494                            poly_uint64 *max_nunits,
1495                            bool *matches, unsigned *limit, unsigned *tree_size,
1496                            scalar_stmts_to_slp_tree_map_t *bst_map);
1497 
1498 static slp_tree
vect_build_slp_tree(vec_info * vinfo,vec<stmt_vec_info> stmts,unsigned int group_size,poly_uint64 * max_nunits,bool * matches,unsigned * limit,unsigned * tree_size,scalar_stmts_to_slp_tree_map_t * bst_map)1499 vect_build_slp_tree (vec_info *vinfo,
1500                          vec<stmt_vec_info> stmts, unsigned int group_size,
1501                          poly_uint64 *max_nunits,
1502                          bool *matches, unsigned *limit, unsigned *tree_size,
1503                          scalar_stmts_to_slp_tree_map_t *bst_map)
1504 {
1505   if (slp_tree *leader = bst_map->get (stmts))
1506     {
1507       if (dump_enabled_p ())
1508           dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1509                                !(*leader)->failed ? "" : "failed ", *leader);
1510       if (!(*leader)->failed)
1511           {
1512             SLP_TREE_REF_COUNT (*leader)++;
1513             vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1514             stmts.release ();
1515             return *leader;
1516           }
1517       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1518       return NULL;
1519     }
1520 
1521   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1522      so we can pick up backedge destinations during discovery.  */
1523   slp_tree res = new _slp_tree;
1524   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1525   SLP_TREE_SCALAR_STMTS (res) = stmts;
1526   bst_map->put (stmts.copy (), res);
1527 
1528   if (*limit == 0)
1529     {
1530       if (dump_enabled_p ())
1531           dump_printf_loc (MSG_NOTE, vect_location,
1532                                "SLP discovery limit exceeded\n");
1533       /* Mark the node invalid so we can detect those when still in use
1534            as backedge destinations.  */
1535       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1536       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1537       res->failed = XNEWVEC (bool, group_size);
1538       memset (res->failed, 0, sizeof (bool) * group_size);
1539       memset (matches, 0, sizeof (bool) * group_size);
1540       return NULL;
1541     }
1542   --*limit;
1543 
1544   if (dump_enabled_p ())
1545     dump_printf_loc (MSG_NOTE, vect_location,
1546                          "starting SLP discovery for node %p\n", res);
1547 
1548   poly_uint64 this_max_nunits = 1;
1549   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1550                                                   &this_max_nunits,
1551                                                   matches, limit, tree_size, bst_map);
1552   if (!res_)
1553     {
1554       if (dump_enabled_p ())
1555           dump_printf_loc (MSG_NOTE, vect_location,
1556                                "SLP discovery for node %p failed\n", res);
1557       /* Mark the node invalid so we can detect those when still in use
1558            as backedge destinations.  */
1559       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1560       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1561       res->failed = XNEWVEC (bool, group_size);
1562       if (flag_checking)
1563           {
1564             unsigned i;
1565             for (i = 0; i < group_size; ++i)
1566               if (!matches[i])
1567                 break;
1568             gcc_assert (i < group_size);
1569           }
1570       memcpy (res->failed, matches, sizeof (bool) * group_size);
1571     }
1572   else
1573     {
1574       if (dump_enabled_p ())
1575           dump_printf_loc (MSG_NOTE, vect_location,
1576                                "SLP discovery for node %p succeeded\n", res);
1577       gcc_assert (res_ == res);
1578       res->max_nunits = this_max_nunits;
1579       vect_update_max_nunits (max_nunits, this_max_nunits);
1580       /* Keep a reference for the bst_map use.  */
1581       SLP_TREE_REF_COUNT (res)++;
1582     }
1583   return res_;
1584 }
1585 
1586 /* Helper for building an associated SLP node chain.  */
1587 
1588 static void
vect_slp_build_two_operator_nodes(slp_tree perm,tree vectype,slp_tree op0,slp_tree op1,stmt_vec_info oper1,stmt_vec_info oper2,vec<std::pair<unsigned,unsigned>> lperm)1589 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1590                                            slp_tree op0, slp_tree op1,
1591                                            stmt_vec_info oper1, stmt_vec_info oper2,
1592                                            vec<std::pair<unsigned, unsigned> > lperm)
1593 {
1594   unsigned group_size = SLP_TREE_LANES (op1);
1595 
1596   slp_tree child1 = new _slp_tree;
1597   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1598   SLP_TREE_VECTYPE (child1) = vectype;
1599   SLP_TREE_LANES (child1) = group_size;
1600   SLP_TREE_CHILDREN (child1).create (2);
1601   SLP_TREE_CHILDREN (child1).quick_push (op0);
1602   SLP_TREE_CHILDREN (child1).quick_push (op1);
1603   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1604 
1605   slp_tree child2 = new _slp_tree;
1606   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1607   SLP_TREE_VECTYPE (child2) = vectype;
1608   SLP_TREE_LANES (child2) = group_size;
1609   SLP_TREE_CHILDREN (child2).create (2);
1610   SLP_TREE_CHILDREN (child2).quick_push (op0);
1611   SLP_TREE_REF_COUNT (op0)++;
1612   SLP_TREE_CHILDREN (child2).quick_push (op1);
1613   SLP_TREE_REF_COUNT (op1)++;
1614   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1615 
1616   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1617   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1618   SLP_TREE_VECTYPE (perm) = vectype;
1619   SLP_TREE_LANES (perm) = group_size;
1620   /* ???  We should set this NULL but that's not expected.  */
1621   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1622   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1623   SLP_TREE_CHILDREN (perm).quick_push (child1);
1624   SLP_TREE_CHILDREN (perm).quick_push (child2);
1625 }
1626 
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  On failure
   the entries of MATCHES are set to false for lanes where a mismatch
   was found.  Otherwise, return the built SLP node.  */
1633 
1634 static slp_tree
vect_build_slp_tree_2(vec_info * vinfo,slp_tree node,vec<stmt_vec_info> stmts,unsigned int group_size,poly_uint64 * max_nunits,bool * matches,unsigned * limit,unsigned * tree_size,scalar_stmts_to_slp_tree_map_t * bst_map)1635 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1636                            vec<stmt_vec_info> stmts, unsigned int group_size,
1637                            poly_uint64 *max_nunits,
1638                            bool *matches, unsigned *limit, unsigned *tree_size,
1639                            scalar_stmts_to_slp_tree_map_t *bst_map)
1640 {
1641   unsigned nops, i, this_tree_size = 0;
1642   poly_uint64 this_max_nunits = *max_nunits;
1643 
1644   matches[0] = false;
1645 
1646   stmt_vec_info stmt_info = stmts[0];
1647   if (!is_a<gcall *> (stmt_info->stmt)
1648       && !is_a<gassign *> (stmt_info->stmt)
1649       && !is_a<gphi *> (stmt_info->stmt))
1650     return NULL;
1651 
1652   nops = gimple_num_args (stmt_info->stmt);
1653   if (const int *map = vect_get_operand_map (stmt_info->stmt))
1654     nops = map[0];
1655 
1656   /* If the SLP node is a PHI (induction or reduction), terminate
1657      the recursion.  */
1658   bool *skip_args = XALLOCAVEC (bool, nops);
1659   memset (skip_args, 0, sizeof (bool) * nops);
1660   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1661     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1662       {
1663           tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1664           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1665                                                                 group_size);
1666           if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1667                                              max_nunits))
1668             return NULL;
1669 
1670           vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1671           if (def_type == vect_induction_def)
1672             {
1673               /* Induction PHIs are not cycles but walk the initial
1674                  value.  Only for inner loops through, for outer loops
1675                  we need to pick up the value from the actual PHIs
1676                  to more easily support peeling and epilogue vectorization.  */
1677               class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678               if (!nested_in_vect_loop_p (loop, stmt_info))
1679                 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1680               else
1681                 loop = loop->inner;
1682               skip_args[loop_latch_edge (loop)->dest_idx] = true;
1683             }
1684           else if (def_type == vect_reduction_def
1685                      || def_type == vect_double_reduction_def
1686                      || def_type == vect_nested_cycle)
1687             {
1688               /* Else def types have to match.  */
1689               stmt_vec_info other_info;
1690               bool all_same = true;
1691               FOR_EACH_VEC_ELT (stmts, i, other_info)
1692                 {
1693                     if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1694                       return NULL;
1695                     if (other_info != stmt_info)
1696                       all_same = false;
1697                 }
1698               class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1699               /* Reduction initial values are not explicitely represented.  */
1700               if (!nested_in_vect_loop_p (loop, stmt_info))
1701                 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1702               /* Reduction chain backedge defs are filled manually.
1703                  ???  Need a better way to identify a SLP reduction chain PHI.
1704                  Or a better overall way to SLP match those.  */
1705               if (all_same && def_type == vect_reduction_def)
1706                 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1707             }
1708           else if (def_type != vect_internal_def)
1709             return NULL;
1710       }
1711 
1712 
1713   bool two_operators = false;
1714   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1715   tree vectype = NULL_TREE;
1716   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1717                                     &this_max_nunits, matches, &two_operators,
1718                                     &vectype))
1719     return NULL;
1720 
1721   /* If the SLP node is a load, terminate the recursion unless masked.  */
1722   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1723       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1724     {
1725       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1726           gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1727       else
1728           {
1729             *max_nunits = this_max_nunits;
1730             (*tree_size)++;
1731             node = vect_create_new_slp_node (node, stmts, 0);
1732             SLP_TREE_VECTYPE (node) = vectype;
1733             /* And compute the load permutation.  Whether it is actually
1734                a permutation depends on the unrolling factor which is
1735                decided later.  */
1736             vec<unsigned> load_permutation;
1737             int j;
1738             stmt_vec_info load_info;
1739             load_permutation.create (group_size);
1740             stmt_vec_info first_stmt_info
1741               = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1742             bool any_permute = false;
1743             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1744               {
1745                 int load_place = vect_get_place_in_interleaving_chain
1746                       (load_info, first_stmt_info);
1747                 gcc_assert (load_place != -1);
1748                 any_permute |= load_place != j;
1749                 load_permutation.quick_push (load_place);
1750               }
1751 
1752             if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1753               {
1754                 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1755                                 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1756                                 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1757                 load_permutation.release ();
1758                 /* We cannot handle permuted masked loads, see PR114375.  */
1759                 if (any_permute
1760                       || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1761                           && DR_GROUP_SIZE (first_stmt_info) != group_size)
1762                       || STMT_VINFO_STRIDED_P (stmt_info))
1763                     {
1764                       matches[0] = false;
1765                       return NULL;
1766                     }
1767               }
1768             else
1769               {
1770                 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1771                 return node;
1772               }
1773           }
1774     }
1775   else if (gimple_assign_single_p (stmt_info->stmt)
1776              && !gimple_vuse (stmt_info->stmt)
1777              && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1778     {
1779       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1780            the same SSA name vector of a compatible type to vectype.  */
1781       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1782       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1783       stmt_vec_info estmt_info;
1784       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1785           {
1786             gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1787             tree bfref = gimple_assign_rhs1 (estmt);
1788             HOST_WIDE_INT lane;
1789             if (!known_eq (bit_field_size (bfref),
1790                                tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1791                 || !constant_multiple_p (bit_field_offset (bfref),
1792                                                bit_field_size (bfref), &lane))
1793               {
1794                 lperm.release ();
1795                 matches[0] = false;
1796                 return NULL;
1797               }
1798             lperm.safe_push (std::make_pair (0, (unsigned)lane));
1799           }
1800       slp_tree vnode = vect_create_new_slp_node (vNULL);
1801       /* ???  We record vectype here but we hide eventually necessary
1802            punning and instead rely on code generation to materialize
1803            VIEW_CONVERT_EXPRs as necessary.  We instead should make
1804            this explicit somehow.  */
1805       SLP_TREE_VECTYPE (vnode) = vectype;
1806       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1807       /* We are always building a permutation node even if it is an identity
1808            permute to shield the rest of the vectorizer from the odd node
1809            representing an actual vector without any scalar ops.
1810            ???  We could hide it completely with making the permute node
1811            external?  */
1812       node = vect_create_new_slp_node (node, stmts, 1);
1813       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1814       SLP_TREE_LANE_PERMUTATION (node) = lperm;
1815       SLP_TREE_VECTYPE (node) = vectype;
1816       SLP_TREE_CHILDREN (node).quick_push (vnode);
1817       return node;
1818     }
1819   /* When discovery reaches an associatable operation see whether we can
1820      improve that to match up lanes in a way superior to the operand
1821      swapping code which at most looks at two defs.
1822      ???  For BB vectorization we cannot do the brute-force search
1823      for matching as we can succeed by means of builds from scalars
1824      and have no good way to "cost" one build against another.  */
1825   else if (is_a <loop_vec_info> (vinfo)
1826              /* ???  We don't handle !vect_internal_def defs below.  */
1827              && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1828              && is_gimple_assign (stmt_info->stmt)
1829              && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1830                  || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1831              && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1832                  || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1833                        && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1834     {
1835       /* See if we have a chain of (mixed) adds or subtracts or other
1836            associatable ops.  */
1837       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1838       if (code == MINUS_EXPR)
1839           code = PLUS_EXPR;
1840       stmt_vec_info other_op_stmt_info = NULL;
1841       stmt_vec_info op_stmt_info = NULL;
1842       unsigned chain_len = 0;
1843       auto_vec<chain_op_t> chain;
1844       auto_vec<std::pair<tree_code, gimple *> > worklist;
1845       auto_vec<vec<chain_op_t> > chains (group_size);
1846       auto_vec<slp_tree, 4> children;
1847       bool hard_fail = true;
1848       for (unsigned lane = 0; lane < group_size; ++lane)
1849           {
1850             /* For each lane linearize the addition/subtraction (or other
1851                uniform associatable operation) expression tree.  */
1852             gimple *op_stmt = NULL, *other_op_stmt = NULL;
1853             vect_slp_linearize_chain (vinfo, worklist, chain, code,
1854                                             stmts[lane]->stmt, op_stmt, other_op_stmt,
1855                                             NULL);
1856             if (!op_stmt_info && op_stmt)
1857               op_stmt_info = vinfo->lookup_stmt (op_stmt);
1858             if (!other_op_stmt_info && other_op_stmt)
1859               other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1860             if (chain.length () == 2)
1861               {
1862                 /* In a chain of just two elements resort to the regular
1863                      operand swapping scheme.  If we run into a length
1864                      mismatch still hard-FAIL.  */
1865                 if (chain_len == 0)
1866                     hard_fail = false;
1867                 else
1868                     {
1869                       matches[lane] = false;
1870                       /* ???  We might want to process the other lanes, but
1871                          make sure to not give false matching hints to the
1872                          caller for lanes we did not process.  */
1873                       if (lane != group_size - 1)
1874                         matches[0] = false;
1875                     }
1876                 break;
1877               }
1878             else if (chain_len == 0)
1879               chain_len = chain.length ();
1880             else if (chain.length () != chain_len)
1881               {
1882                 /* ???  Here we could slip in magic to compensate with
1883                      neutral operands.  */
1884                 matches[lane] = false;
1885                 if (lane != group_size - 1)
1886                     matches[0] = false;
1887                 break;
1888               }
1889             chains.quick_push (chain.copy ());
1890             chain.truncate (0);
1891           }
1892       if (chains.length () == group_size)
1893           {
1894             /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
1895             if (!op_stmt_info)
1896               {
1897                 hard_fail = false;
1898                 goto out;
1899               }
1900             /* Now we have a set of chains with the same length.  */
1901             /* 1. pre-sort according to def_type and operation.  */
1902             for (unsigned lane = 0; lane < group_size; ++lane)
1903               chains[lane].stablesort (dt_sort_cmp, vinfo);
1904             if (dump_enabled_p ())
1905               {
1906                 dump_printf_loc (MSG_NOTE, vect_location,
1907                                      "pre-sorted chains of %s\n",
1908                                      get_tree_code_name (code));
1909                 for (unsigned lane = 0; lane < group_size; ++lane)
1910                     {
1911                       for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1912                         dump_printf (MSG_NOTE, "%s %T ",
1913                                          get_tree_code_name (chains[lane][opnum].code),
1914                                          chains[lane][opnum].op);
1915                       dump_printf (MSG_NOTE, "\n");
1916                     }
1917               }
1918             /* 2. try to build children nodes, associating as necessary.  */
1919             for (unsigned n = 0; n < chain_len; ++n)
1920               {
1921                 vect_def_type dt = chains[0][n].dt;
1922                 unsigned lane;
1923                 for (lane = 0; lane < group_size; ++lane)
1924                     if (chains[lane][n].dt != dt)
1925                       {
1926                         if (dt == vect_constant_def
1927                               && chains[lane][n].dt == vect_external_def)
1928                           dt = vect_external_def;
1929                         else if (dt == vect_external_def
1930                                    && chains[lane][n].dt == vect_constant_def)
1931                           ;
1932                         else
1933                           break;
1934                       }
1935                 if (lane != group_size)
1936                     {
1937                       if (dump_enabled_p ())
1938                         dump_printf_loc (MSG_NOTE, vect_location,
1939                                              "giving up on chain due to mismatched "
1940                                              "def types\n");
1941                       matches[lane] = false;
1942                       if (lane != group_size - 1)
1943                         matches[0] = false;
1944                       goto out;
1945                     }
1946                 if (dt == vect_constant_def
1947                       || dt == vect_external_def)
1948                     {
1949                       /* Check whether we can build the invariant.  If we can't
1950                          we never will be able to.  */
1951                       tree type = TREE_TYPE (chains[0][n].op);
1952                       if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1953                           && (TREE_CODE (type) == BOOLEAN_TYPE
1954                                 || !can_duplicate_and_interleave_p (vinfo, group_size,
1955                                                                             type)))
1956                         {
1957                           matches[0] = false;
1958                           goto out;
1959                         }
1960                       vec<tree> ops;
1961                       ops.create (group_size);
1962                       for (lane = 0; lane < group_size; ++lane)
1963                         ops.quick_push (chains[lane][n].op);
1964                       slp_tree child = vect_create_new_slp_node (ops);
1965                       SLP_TREE_DEF_TYPE (child) = dt;
1966                       children.safe_push (child);
1967                     }
1968                 else if (dt != vect_internal_def)
1969                     {
1970                       /* Not sure, we might need sth special.
1971                          gcc.dg/vect/pr96854.c,
1972                          gfortran.dg/vect/fast-math-pr37021.f90
1973                          and gfortran.dg/vect/pr61171.f trigger.  */
1974                       /* Soft-fail for now.  */
1975                       hard_fail = false;
1976                       goto out;
1977                     }
1978                 else
1979                     {
1980                       vec<stmt_vec_info> op_stmts;
1981                       op_stmts.create (group_size);
1982                       slp_tree child = NULL;
1983                       /* Brute-force our way.  We have to consider a lane
1984                          failing after fixing an earlier fail up in the
1985                          SLP discovery recursion.  So track the current
1986                          permute per lane.  */
1987                       unsigned *perms = XALLOCAVEC (unsigned, group_size);
1988                       memset (perms, 0, sizeof (unsigned) * group_size);
1989                       do
1990                         {
1991                           op_stmts.truncate (0);
1992                           for (lane = 0; lane < group_size; ++lane)
1993                               op_stmts.quick_push
1994                                 (vinfo->lookup_def (chains[lane][n].op));
1995                           child = vect_build_slp_tree (vinfo, op_stmts,
1996                                                                group_size, &this_max_nunits,
1997                                                                matches, limit,
1998                                                                &this_tree_size, bst_map);
1999                           /* ???  We're likely getting too many fatal mismatches
2000                                here so maybe we want to ignore them (but then we
2001                                have no idea which lanes fatally mismatched).  */
2002                           if (child || !matches[0])
2003                               break;
2004                           /* Swap another lane we have not yet matched up into
2005                                lanes that did not match.  If we run out of
2006                                permute possibilities for a lane terminate the
2007                                search.  */
2008                           bool term = false;
2009                           for (lane = 1; lane < group_size; ++lane)
2010                               if (!matches[lane])
2011                                 {
2012                                   if (n + perms[lane] + 1 == chain_len)
2013                                     {
2014                                         term = true;
2015                                         break;
2016                                     }
2017                                   std::swap (chains[lane][n],
2018                                                chains[lane][n + perms[lane] + 1]);
2019                                   perms[lane]++;
2020                                 }
2021                           if (term)
2022                               break;
2023                         }
2024                       while (1);
2025                       if (!child)
2026                         {
2027                           if (dump_enabled_p ())
2028                               dump_printf_loc (MSG_NOTE, vect_location,
2029                                                    "failed to match up op %d\n", n);
2030                           op_stmts.release ();
2031                           if (lane != group_size - 1)
2032                               matches[0] = false;
2033                           else
2034                               matches[lane] = false;
2035                           goto out;
2036                         }
2037                       if (dump_enabled_p ())
2038                         {
2039                           dump_printf_loc (MSG_NOTE, vect_location,
2040                                                "matched up op %d to\n", n);
2041                           vect_print_slp_tree (MSG_NOTE, vect_location, child);
2042                         }
2043                       children.safe_push (child);
2044                     }
2045               }
2046             /* 3. build SLP nodes to combine the chain.  */
2047             for (unsigned lane = 0; lane < group_size; ++lane)
2048               if (chains[lane][0].code != code)
2049                 {
2050                     /* See if there's any alternate all-PLUS entry.  */
2051                     unsigned n;
2052                     for (n = 1; n < chain_len; ++n)
2053                       {
2054                         for (lane = 0; lane < group_size; ++lane)
2055                           if (chains[lane][n].code != code)
2056                               break;
2057                         if (lane == group_size)
2058                           break;
2059                       }
2060                     if (n != chain_len)
2061                       {
2062                         /* Swap that in at first position.  */
2063                         std::swap (children[0], children[n]);
2064                         for (lane = 0; lane < group_size; ++lane)
2065                           std::swap (chains[lane][0], chains[lane][n]);
2066                       }
2067                     else
2068                       {
2069                         /* ???  When this triggers and we end up with two
2070                            vect_constant/external_def up-front things break (ICE)
2071                            spectacularly finding an insertion place for the
2072                            all-constant op.  We should have a fully
2073                            vect_internal_def operand though(?) so we can swap
2074                            that into first place and then prepend the all-zero
2075                            constant.  */
2076                         if (dump_enabled_p ())
2077                           dump_printf_loc (MSG_NOTE, vect_location,
2078                                                "inserting constant zero to compensate "
2079                                                "for (partially) negated first "
2080                                                "operand\n");
2081                         chain_len++;
2082                         for (lane = 0; lane < group_size; ++lane)
2083                           chains[lane].safe_insert
2084                               (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2085                         vec<tree> zero_ops;
2086                         zero_ops.create (group_size);
2087                         zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2088                         for (lane = 1; lane < group_size; ++lane)
2089                           zero_ops.quick_push (zero_ops[0]);
2090                         slp_tree zero = vect_create_new_slp_node (zero_ops);
2091                         SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2092                         children.safe_insert (0, zero);
2093                       }
2094                     break;
2095                 }
2096             for (unsigned i = 1; i < children.length (); ++i)
2097               {
2098                 slp_tree op0 = children[i - 1];
2099                 slp_tree op1 = children[i];
2100                 bool this_two_op = false;
2101                 for (unsigned lane = 0; lane < group_size; ++lane)
2102                     if (chains[lane][i].code != chains[0][i].code)
2103                       {
2104                         this_two_op = true;
2105                         break;
2106                       }
2107                 slp_tree child;
2108                 if (i == children.length () - 1)
2109                     child = vect_create_new_slp_node (node, stmts, 2);
2110                 else
2111                     child = vect_create_new_slp_node (2, ERROR_MARK);
2112                 if (this_two_op)
2113                     {
2114                       vec<std::pair<unsigned, unsigned> > lperm;
2115                       lperm.create (group_size);
2116                       for (unsigned lane = 0; lane < group_size; ++lane)
2117                         lperm.quick_push (std::make_pair
2118                           (chains[lane][i].code != chains[0][i].code, lane));
2119                       vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2120                                                                  (chains[0][i].code == code
2121                                                                   ? op_stmt_info
2122                                                                   : other_op_stmt_info),
2123                                                                  (chains[0][i].code == code
2124                                                                   ? other_op_stmt_info
2125                                                                   : op_stmt_info),
2126                                                                  lperm);
2127                     }
2128                 else
2129                     {
2130                       SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2131                       SLP_TREE_VECTYPE (child) = vectype;
2132                       SLP_TREE_LANES (child) = group_size;
2133                       SLP_TREE_CHILDREN (child).quick_push (op0);
2134                       SLP_TREE_CHILDREN (child).quick_push (op1);
2135                       SLP_TREE_REPRESENTATIVE (child)
2136                         = (chains[0][i].code == code
2137                            ? op_stmt_info : other_op_stmt_info);
2138                     }
2139                 children[i] = child;
2140               }
2141             *tree_size += this_tree_size + 1;
2142             *max_nunits = this_max_nunits;
2143             while (!chains.is_empty ())
2144               chains.pop ().release ();
2145             return node;
2146           }
2147 out:
2148       while (!children.is_empty ())
2149           vect_free_slp_tree (children.pop ());
2150       while (!chains.is_empty ())
2151           chains.pop ().release ();
2152       /* Hard-fail, otherwise we might run into quadratic processing of the
2153            chains starting one stmt into the chain again.  */
2154       if (hard_fail)
2155           return NULL;
2156       /* Fall thru to normal processing.  */
2157     }
2158 
2159   /* Get at the operands, verifying they are compatible.  */
2160   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2161   slp_oprnd_info oprnd_info;
2162   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2163     {
2164       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2165                                                        stmts, i, &oprnds_info);
2166       if (res != 0)
2167           matches[(res == -1) ? 0 : i] = false;
2168       if (!matches[0])
2169           break;
2170     }
2171   for (i = 0; i < group_size; ++i)
2172     if (!matches[i])
2173       {
2174           vect_free_oprnd_info (oprnds_info);
2175           return NULL;
2176       }
2177   swap = NULL;
2178 
2179   auto_vec<slp_tree, 4> children;
2180 
2181   stmt_info = stmts[0];
2182 
2183   /* Create SLP_TREE nodes for the definition node/s.  */
2184   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2185     {
2186       slp_tree child;
2187       unsigned int j;
2188 
2189       /* We're skipping certain operands from processing, for example
2190            outer loop reduction initial defs.  */
2191       if (skip_args[i])
2192           {
2193             children.safe_push (NULL);
2194             continue;
2195           }
2196 
2197       if (oprnd_info->first_dt == vect_uninitialized_def)
2198           {
2199             /* COND_EXPR have one too many eventually if the condition
2200                is a SSA name.  */
2201             gcc_assert (i == 3 && nops == 4);
2202             continue;
2203           }
2204 
2205       if (is_a <bb_vec_info> (vinfo)
2206             && oprnd_info->first_dt == vect_internal_def
2207             && !oprnd_info->any_pattern)
2208           {
2209             /* For BB vectorization, if all defs are the same do not
2210                bother to continue the build along the single-lane
2211                graph but use a splat of the scalar value.  */
2212             stmt_vec_info first_def = oprnd_info->def_stmts[0];
2213             for (j = 1; j < group_size; ++j)
2214               if (oprnd_info->def_stmts[j] != first_def)
2215                 break;
2216             if (j == group_size
2217                 /* But avoid doing this for loads where we may be
2218                      able to CSE things, unless the stmt is not
2219                      vectorizable.  */
2220                 && (!STMT_VINFO_VECTORIZABLE (first_def)
2221                       || !gimple_vuse (first_def->stmt)))
2222               {
2223                 if (dump_enabled_p ())
2224                     dump_printf_loc (MSG_NOTE, vect_location,
2225                                          "Using a splat of the uniform operand %G",
2226                                          first_def->stmt);
2227                 oprnd_info->first_dt = vect_external_def;
2228               }
2229           }
2230 
2231       if (oprnd_info->first_dt == vect_external_def
2232             || oprnd_info->first_dt == vect_constant_def)
2233           {
2234             slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2235             SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2236             oprnd_info->ops = vNULL;
2237             children.safe_push (invnode);
2238             continue;
2239           }
2240 
2241       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2242                                                   group_size, &this_max_nunits,
2243                                                   matches, limit,
2244                                                   &this_tree_size, bst_map)) != NULL)
2245           {
2246             oprnd_info->def_stmts = vNULL;
2247             children.safe_push (child);
2248             continue;
2249           }
2250 
2251       /* If the SLP build for operand zero failed and operand zero
2252            and one can be commutated try that for the scalar stmts
2253            that failed the match.  */
2254       if (i == 0
2255             /* A first scalar stmt mismatch signals a fatal mismatch.  */
2256             && matches[0]
2257             /* ???  For COND_EXPRs we can swap the comparison operands
2258                as well as the arms under some constraints.  */
2259             && nops == 2
2260             && oprnds_info[1]->first_dt == vect_internal_def
2261             && is_gimple_assign (stmt_info->stmt)
2262             /* Swapping operands for reductions breaks assumptions later on.  */
2263             && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2264             && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2265           {
2266             /* See whether we can swap the matching or the non-matching
2267                stmt operands.  */
2268             bool swap_not_matching = true;
2269             do
2270               {
2271                 for (j = 0; j < group_size; ++j)
2272                     {
2273                       if (matches[j] != !swap_not_matching)
2274                         continue;
2275                       stmt_vec_info stmt_info = stmts[j];
2276                       /* Verify if we can swap operands of this stmt.  */
2277                       gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2278                       if (!stmt
2279                           || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2280                         {
2281                           if (!swap_not_matching)
2282                               goto fail;
2283                           swap_not_matching = false;
2284                           break;
2285                         }
2286                     }
2287               }
2288             while (j != group_size);
2289 
2290             /* Swap mismatched definition stmts.  */
2291             if (dump_enabled_p ())
2292               dump_printf_loc (MSG_NOTE, vect_location,
2293                                    "Re-trying with swapped operands of stmts ");
2294             for (j = 0; j < group_size; ++j)
2295               if (matches[j] == !swap_not_matching)
2296                 {
2297                     std::swap (oprnds_info[0]->def_stmts[j],
2298                                  oprnds_info[1]->def_stmts[j]);
2299                     std::swap (oprnds_info[0]->ops[j],
2300                                  oprnds_info[1]->ops[j]);
2301                     if (dump_enabled_p ())
2302                       dump_printf (MSG_NOTE, "%d ", j);
2303                 }
2304             if (dump_enabled_p ())
2305               dump_printf (MSG_NOTE, "\n");
2306             /* After swapping some operands we lost track whether an
2307                operand has any pattern defs so be conservative here.  */
2308             if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2309               oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2310             /* And try again with scratch 'matches' ... */
2311             bool *tem = XALLOCAVEC (bool, group_size);
2312             if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2313                                                       group_size, &this_max_nunits,
2314                                                       tem, limit,
2315                                                       &this_tree_size, bst_map)) != NULL)
2316               {
2317                 oprnd_info->def_stmts = vNULL;
2318                 children.safe_push (child);
2319                 continue;
2320               }
2321           }
2322 fail:
2323 
2324       /* If the SLP build failed and we analyze a basic-block
2325            simply treat nodes we fail to build as externally defined
2326            (and thus build vectors from the scalar defs).
2327            The cost model will reject outright expensive cases.
           ???  This doesn't treat cases where permutation ultimately
2329            fails (or we don't try permutation below).  Ideally we'd
2330            even compute a permutation that will end up with the maximum
2331            SLP tree size...  */
2332       if (is_a <bb_vec_info> (vinfo)
2333             /* ???  Rejecting patterns this way doesn't work.  We'd have to
2334                do extra work to cancel the pattern so the uses see the
2335                scalar version.  */
2336             && !is_pattern_stmt_p (stmt_info)
2337             && !oprnd_info->any_pattern)
2338           {
2339             /* But if there's a leading vector sized set of matching stmts
2340                fail here so we can split the group.  This matches the condition
2341                vect_analyze_slp_instance uses.  */
2342             /* ???  We might want to split here and combine the results to support
2343                multiple vector sizes better.  */
2344             for (j = 0; j < group_size; ++j)
2345               if (!matches[j])
2346                 break;
2347             if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2348               {
2349                 if (dump_enabled_p ())
2350                     dump_printf_loc (MSG_NOTE, vect_location,
2351                                          "Building vector operands from scalars\n");
2352                 this_tree_size++;
2353                 child = vect_create_new_slp_node (oprnd_info->ops);
2354                 children.safe_push (child);
2355                 oprnd_info->ops = vNULL;
2356                 continue;
2357               }
2358           }
2359 
2360       gcc_assert (child == NULL);
2361       FOR_EACH_VEC_ELT (children, j, child)
2362           if (child)
2363             vect_free_slp_tree (child);
2364       vect_free_oprnd_info (oprnds_info);
2365       return NULL;
2366     }
2367 
2368   vect_free_oprnd_info (oprnds_info);
2369 
2370   /* If we have all children of a child built up from uniform scalars
2371      or does more than one possibly expensive vector construction then
2372      just throw that away, causing it built up from scalars.
2373      The exception is the SLP node for the vector store.  */
2374   if (is_a <bb_vec_info> (vinfo)
2375       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2376       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2377            do extra work to cancel the pattern so the uses see the
2378            scalar version.  */
2379       && !is_pattern_stmt_p (stmt_info))
2380     {
2381       slp_tree child;
2382       unsigned j;
2383       bool all_uniform_p = true;
2384       unsigned n_vector_builds = 0;
2385       FOR_EACH_VEC_ELT (children, j, child)
2386           {
2387             if (!child)
2388               ;
2389             else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2390               all_uniform_p = false;
2391             else if (!vect_slp_tree_uniform_p (child))
2392               {
2393                 all_uniform_p = false;
2394                 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2395                     n_vector_builds++;
2396               }
2397           }
2398       if (all_uniform_p
2399             || n_vector_builds > 1
2400             || (n_vector_builds == children.length ()
2401                 && is_a <gphi *> (stmt_info->stmt)))
2402           {
2403             /* Roll back.  */
2404             matches[0] = false;
2405             FOR_EACH_VEC_ELT (children, j, child)
2406               if (child)
2407                 vect_free_slp_tree (child);
2408 
2409             if (dump_enabled_p ())
2410               dump_printf_loc (MSG_NOTE, vect_location,
2411                                    "Building parent vector operands from "
2412                                    "scalars instead\n");
2413             return NULL;
2414           }
2415     }
2416 
2417   *tree_size += this_tree_size + 1;
2418   *max_nunits = this_max_nunits;
2419 
2420   if (two_operators)
2421     {
2422       /* ???  We'd likely want to either cache in bst_map sth like
2423            { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2424            the true { a+b, a+b, a+b, a+b } ... but there we don't have
2425            explicit stmts to put in so the keying on 'stmts' doesn't
2426            work (but we have the same issue with nodes that use 'ops').  */
2427       slp_tree one = new _slp_tree;
2428       slp_tree two = new _slp_tree;
2429       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2430       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2431       SLP_TREE_VECTYPE (one) = vectype;
2432       SLP_TREE_VECTYPE (two) = vectype;
2433       SLP_TREE_CHILDREN (one).safe_splice (children);
2434       SLP_TREE_CHILDREN (two).safe_splice (children);
2435       slp_tree child;
2436       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2437           SLP_TREE_REF_COUNT (child)++;
2438 
2439       /* Here we record the original defs since this
2440            node represents the final lane configuration.  */
2441       node = vect_create_new_slp_node (node, stmts, 2);
2442       SLP_TREE_VECTYPE (node) = vectype;
2443       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2444       SLP_TREE_CHILDREN (node).quick_push (one);
2445       SLP_TREE_CHILDREN (node).quick_push (two);
2446       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2447       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2448       enum tree_code ocode = ERROR_MARK;
2449       stmt_vec_info ostmt_info;
2450       unsigned j = 0;
2451       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2452           {
2453             gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2454             if (gimple_assign_rhs_code (ostmt) != code0)
2455               {
2456                 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2457                 ocode = gimple_assign_rhs_code (ostmt);
2458                 j = i;
2459               }
2460             else
2461               SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2462           }
2463       SLP_TREE_CODE (one) = code0;
2464       SLP_TREE_CODE (two) = ocode;
2465       SLP_TREE_LANES (one) = stmts.length ();
2466       SLP_TREE_LANES (two) = stmts.length ();
2467       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2468       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2469       return node;
2470     }
2471 
2472   node = vect_create_new_slp_node (node, stmts, nops);
2473   SLP_TREE_VECTYPE (node) = vectype;
2474   SLP_TREE_CHILDREN (node).splice (children);
2475   return node;
2476 }
2477 
2478 /* Dump a single SLP tree NODE.  */
2479 
2480 static void
vect_print_slp_tree(dump_flags_t dump_kind,dump_location_t loc,slp_tree node)2481 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2482                          slp_tree node)
2483 {
2484   unsigned i, j;
2485   slp_tree child;
2486   stmt_vec_info stmt_info;
2487   tree op;
2488 
2489   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2490   dump_user_location_t user_loc = loc.get_user_location ();
2491   dump_printf_loc (metadata, user_loc,
2492                        "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2493                        ", refcnt=%u)",
2494                        SLP_TREE_DEF_TYPE (node) == vect_external_def
2495                        ? " (external)"
2496                        : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2497                           ? " (constant)"
2498                           : ""), node,
2499                        estimated_poly_value (node->max_nunits),
2500                                                    SLP_TREE_REF_COUNT (node));
2501   if (SLP_TREE_VECTYPE (node))
2502     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2503   dump_printf (metadata, "\n");
2504   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2505     {
2506       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2507           dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2508       else
2509           dump_printf_loc (metadata, user_loc, "op template: %G",
2510                                SLP_TREE_REPRESENTATIVE (node)->stmt);
2511     }
2512   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2513     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2514       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2515   else
2516     {
2517       dump_printf_loc (metadata, user_loc, "\t{ ");
2518       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2519           dump_printf (metadata, "%T%s ", op,
2520                          i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2521       dump_printf (metadata, "}\n");
2522     }
2523   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2524     {
2525       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2526       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2527           dump_printf (dump_kind, " %u", j);
2528       dump_printf (dump_kind, " }\n");
2529     }
2530   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2531     {
2532       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2533       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2534           dump_printf (dump_kind, " %u[%u]",
2535                          SLP_TREE_LANE_PERMUTATION (node)[i].first,
2536                          SLP_TREE_LANE_PERMUTATION (node)[i].second);
2537       dump_printf (dump_kind, " }\n");
2538     }
2539   if (SLP_TREE_CHILDREN (node).is_empty ())
2540     return;
2541   dump_printf_loc (metadata, user_loc, "\tchildren");
2542   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2543     dump_printf (dump_kind, " %p", (void *)child);
2544   dump_printf (dump_kind, "\n");
2545 }
2546 
2547 DEBUG_FUNCTION void
debug(slp_tree node)2548 debug (slp_tree node)
2549 {
2550   debug_dump_context ctx;
2551   vect_print_slp_tree (MSG_NOTE,
2552                            dump_location_t::from_location_t (UNKNOWN_LOCATION),
2553                            node);
2554 }
2555 
2556 /* Recursive helper for the dot producer below.  */
2557 
2558 static void
dot_slp_tree(FILE * f,slp_tree node,hash_set<slp_tree> & visited)2559 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2560 {
2561   if (visited.add (node))
2562     return;
2563 
2564   fprintf (f, "\"%p\" [label=\"", (void *)node);
2565   vect_print_slp_tree (MSG_NOTE,
2566                            dump_location_t::from_location_t (UNKNOWN_LOCATION),
2567                            node);
2568   fprintf (f, "\"];\n");
2569 
2570 
2571   for (slp_tree child : SLP_TREE_CHILDREN (node))
2572     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2573 
2574   for (slp_tree child : SLP_TREE_CHILDREN (node))
2575     if (child)
2576       dot_slp_tree (f, child, visited);
2577 }
2578 
2579 DEBUG_FUNCTION void
dot_slp_tree(const char * fname,slp_tree node)2580 dot_slp_tree (const char *fname, slp_tree node)
2581 {
2582   FILE *f = fopen (fname, "w");
2583   fprintf (f, "digraph {\n");
2584   fflush (f);
2585     {
2586       debug_dump_context ctx (f);
2587       hash_set<slp_tree> visited;
2588       dot_slp_tree (f, node, visited);
2589     }
2590   fflush (f);
2591   fprintf (f, "}\n");
2592   fclose (f);
2593 }
2594 
2595 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2596 
2597 static void
vect_print_slp_graph(dump_flags_t dump_kind,dump_location_t loc,slp_tree node,hash_set<slp_tree> & visited)2598 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2599                           slp_tree node, hash_set<slp_tree> &visited)
2600 {
2601   unsigned i;
2602   slp_tree child;
2603 
2604   if (visited.add (node))
2605     return;
2606 
2607   vect_print_slp_tree (dump_kind, loc, node);
2608 
2609   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2610     if (child)
2611       vect_print_slp_graph (dump_kind, loc, child, visited);
2612 }
2613 
2614 static void
vect_print_slp_graph(dump_flags_t dump_kind,dump_location_t loc,slp_tree entry)2615 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2616                           slp_tree entry)
2617 {
2618   hash_set<slp_tree> visited;
2619   vect_print_slp_graph (dump_kind, loc, entry, visited);
2620 }
2621 
2622 /* Mark the tree rooted at NODE with PURE_SLP.  */
2623 
2624 static void
vect_mark_slp_stmts(slp_tree node,hash_set<slp_tree> & visited)2625 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2626 {
2627   int i;
2628   stmt_vec_info stmt_info;
2629   slp_tree child;
2630 
2631   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2632     return;
2633 
2634   if (visited.add (node))
2635     return;
2636 
2637   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2638     STMT_SLP_TYPE (stmt_info) = pure_slp;
2639 
2640   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2641     if (child)
2642       vect_mark_slp_stmts (child, visited);
2643 }
2644 
2645 static void
vect_mark_slp_stmts(slp_tree node)2646 vect_mark_slp_stmts (slp_tree node)
2647 {
2648   hash_set<slp_tree> visited;
2649   vect_mark_slp_stmts (node, visited);
2650 }
2651 
2652 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2653 
2654 static void
vect_mark_slp_stmts_relevant(slp_tree node,hash_set<slp_tree> & visited)2655 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2656 {
2657   int i;
2658   stmt_vec_info stmt_info;
2659   slp_tree child;
2660 
2661   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2662     return;
2663 
2664   if (visited.add (node))
2665     return;
2666 
2667   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2668     {
2669       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2670                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2671       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2672     }
2673 
2674   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2675     if (child)
2676       vect_mark_slp_stmts_relevant (child, visited);
2677 }
2678 
2679 static void
vect_mark_slp_stmts_relevant(slp_tree node)2680 vect_mark_slp_stmts_relevant (slp_tree node)
2681 {
2682   hash_set<slp_tree> visited;
2683   vect_mark_slp_stmts_relevant (node, visited);
2684 }
2685 
2686 
2687 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
2688 
2689 static void
vect_gather_slp_loads(vec<slp_tree> & loads,slp_tree node,hash_set<slp_tree> & visited)2690 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2691                            hash_set<slp_tree> &visited)
2692 {
2693   if (!node || visited.add (node))
2694     return;
2695 
2696   if (SLP_TREE_CHILDREN (node).length () == 0)
2697     {
2698       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2699           return;
2700       stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2701       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2702             && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2703           loads.safe_push (node);
2704     }
2705   else
2706     {
2707       unsigned i;
2708       slp_tree child;
2709       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2710           vect_gather_slp_loads (loads, child, visited);
2711     }
2712 }
2713 
2714 
2715 /* Find the last store in SLP INSTANCE.  */
2716 
2717 stmt_vec_info
vect_find_last_scalar_stmt_in_slp(slp_tree node)2718 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2719 {
2720   stmt_vec_info last = NULL;
2721   stmt_vec_info stmt_vinfo;
2722 
2723   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2724     {
2725       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2726       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2727     }
2728 
2729   return last;
2730 }
2731 
2732 /* Find the first stmt in NODE.  */
2733 
2734 stmt_vec_info
vect_find_first_scalar_stmt_in_slp(slp_tree node)2735 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2736 {
2737   stmt_vec_info first = NULL;
2738   stmt_vec_info stmt_vinfo;
2739 
2740   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2741     {
2742       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2743       if (!first
2744             || get_later_stmt (stmt_vinfo, first) == first)
2745           first = stmt_vinfo;
2746     }
2747 
2748   return first;
2749 }
2750 
2751 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2752    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2753    (also containing the first GROUP1_SIZE stmts, since stores are
2754    consecutive), the second containing the remainder.
2755    Return the first stmt in the second group.  */
2756 
2757 static stmt_vec_info
vect_split_slp_store_group(stmt_vec_info first_vinfo,unsigned group1_size)2758 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2759 {
2760   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2761   gcc_assert (group1_size > 0);
2762   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2763   gcc_assert (group2_size > 0);
2764   DR_GROUP_SIZE (first_vinfo) = group1_size;
2765 
2766   stmt_vec_info stmt_info = first_vinfo;
2767   for (unsigned i = group1_size; i > 1; i--)
2768     {
2769       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2770       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2771     }
2772   /* STMT is now the last element of the first group.  */
2773   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2774   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2775 
2776   DR_GROUP_SIZE (group2) = group2_size;
2777   for (stmt_info = group2; stmt_info;
2778        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2779     {
2780       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2781       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2782     }
2783 
2784   /* For the second group, the DR_GROUP_GAP is that before the original group,
2785      plus skipping over the first vector.  */
2786   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2787 
2788   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
2789   DR_GROUP_GAP (first_vinfo) += group2_size;
2790 
2791   if (dump_enabled_p ())
2792     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2793                          group1_size, group2_size);
2794 
2795   return group2;
2796 }
2797 
2798 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2799    statements and a vector of NUNITS elements.  */
2800 
2801 static poly_uint64
calculate_unrolling_factor(poly_uint64 nunits,unsigned int group_size)2802 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2803 {
2804   return exact_div (common_multiple (nunits, group_size), group_size);
2805 }
2806 
2807 /* Helper that checks to see if a node is a load node.  */
2808 
2809 static inline bool
vect_is_slp_load_node(slp_tree root)2810 vect_is_slp_load_node  (slp_tree root)
2811 {
2812   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2813            && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2814            && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2815 }
2816 
2817 
2818 /* Helper function of optimize_load_redistribution that performs the operation
2819    recursively.  */
2820 
2821 static slp_tree
optimize_load_redistribution_1(scalar_stmts_to_slp_tree_map_t * bst_map,vec_info * vinfo,unsigned int group_size,hash_map<slp_tree,slp_tree> * load_map,slp_tree root)2822 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2823                                         vec_info *vinfo, unsigned int group_size,
2824                                         hash_map<slp_tree, slp_tree> *load_map,
2825                                         slp_tree root)
2826 {
2827   if (slp_tree *leader = load_map->get (root))
2828     return *leader;
2829 
2830   slp_tree node;
2831   unsigned i;
2832 
2833   /* For now, we don't know anything about externals so do not do anything.  */
2834   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2835     return NULL;
2836   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2837     {
2838       /* First convert this node into a load node and add it to the leaves
2839            list and flatten the permute from a lane to a load one.  If it's
2840            unneeded it will be elided later.  */
2841       vec<stmt_vec_info> stmts;
2842       stmts.create (SLP_TREE_LANES (root));
2843       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2844       for (unsigned j = 0; j < lane_perm.length (); j++)
2845           {
2846             std::pair<unsigned, unsigned> perm = lane_perm[j];
2847             node = SLP_TREE_CHILDREN (root)[perm.first];
2848 
2849             if (!vect_is_slp_load_node (node)
2850                 || SLP_TREE_CHILDREN (node).exists ())
2851               {
2852                 stmts.release ();
2853                 goto next;
2854               }
2855 
2856             stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2857           }
2858 
2859       if (dump_enabled_p ())
2860           dump_printf_loc (MSG_NOTE, vect_location,
2861                                "converting stmts on permute node %p\n", root);
2862 
2863       bool *matches = XALLOCAVEC (bool, group_size);
2864       poly_uint64 max_nunits = 1;
2865       unsigned tree_size = 0, limit = 1;
2866       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2867                                           matches, &limit, &tree_size, bst_map);
2868       if (!node)
2869           stmts.release ();
2870 
2871       load_map->put (root, node);
2872       return node;
2873     }
2874 
2875 next:
2876   load_map->put (root, NULL);
2877 
2878   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2879     {
2880       slp_tree value
2881           = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2882                                                     node);
2883       if (value)
2884           {
2885             SLP_TREE_REF_COUNT (value)++;
2886             SLP_TREE_CHILDREN (root)[i] = value;
2887             /* ???  We know the original leafs of the replaced nodes will
2888                be referenced by bst_map, only the permutes created by
2889                pattern matching are not.  */
2890             if (SLP_TREE_REF_COUNT (node) == 1)
2891               load_map->remove (node);
2892             vect_free_slp_tree (node);
2893           }
2894     }
2895 
2896   return NULL;
2897 }
2898 
2899 /* Temporary workaround for loads not being CSEd during SLP build.  This
2900    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2901    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2902    same DR such that the final operation is equal to a permuted load.  Such
2903    NODES are then directly converted into LOADS themselves.  The nodes are
2904    CSEd using BST_MAP.  */
2905 
2906 static void
optimize_load_redistribution(scalar_stmts_to_slp_tree_map_t * bst_map,vec_info * vinfo,unsigned int group_size,hash_map<slp_tree,slp_tree> * load_map,slp_tree root)2907 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2908                                     vec_info *vinfo, unsigned int group_size,
2909                                     hash_map<slp_tree, slp_tree> *load_map,
2910                                     slp_tree root)
2911 {
2912   slp_tree node;
2913   unsigned i;
2914 
2915   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2916     {
2917       slp_tree value
2918           = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2919                                                     node);
2920       if (value)
2921           {
2922             SLP_TREE_REF_COUNT (value)++;
2923             SLP_TREE_CHILDREN (root)[i] = value;
2924             /* ???  We know the original leafs of the replaced nodes will
2925                be referenced by bst_map, only the permutes created by
2926                pattern matching are not.  */
2927             if (SLP_TREE_REF_COUNT (node) == 1)
2928               load_map->remove (node);
2929             vect_free_slp_tree (node);
2930           }
2931     }
2932 }
2933 
2934 /* Helper function of vect_match_slp_patterns.
2935 
2936    Attempts to match patterns against the slp tree rooted in REF_NODE using
2937    VINFO.  Patterns are matched in post-order traversal.
2938 
2939    If matching is successful the value in REF_NODE is updated and returned, if
2940    not then it is returned unchanged.  */
2941 
2942 static bool
vect_match_slp_patterns_2(slp_tree * ref_node,vec_info * vinfo,slp_tree_to_load_perm_map_t * perm_cache,slp_compat_nodes_map_t * compat_cache,hash_set<slp_tree> * visited)2943 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2944                                  slp_tree_to_load_perm_map_t *perm_cache,
2945                                  slp_compat_nodes_map_t *compat_cache,
2946                                  hash_set<slp_tree> *visited)
2947 {
2948   unsigned i;
2949   slp_tree node = *ref_node;
2950   bool found_p = false;
2951   if (!node || visited->add (node))
2952     return false;
2953 
2954   slp_tree child;
2955   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2956     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2957                                                     vinfo, perm_cache, compat_cache,
2958                                                     visited);
2959 
2960   for (unsigned x = 0; x < num__slp_patterns; x++)
2961     {
2962       vect_pattern *pattern
2963           = slp_patterns[x] (perm_cache, compat_cache, ref_node);
2964       if (pattern)
2965           {
2966             pattern->build (vinfo);
2967             delete pattern;
2968             found_p = true;
2969           }
2970     }
2971 
2972   return found_p;
2973 }
2974 
2975 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2976    vec_info VINFO.
2977 
2978    The modified tree is returned.  Patterns are tried in order and multiple
2979    patterns may match.  */
2980 
2981 static bool
vect_match_slp_patterns(slp_instance instance,vec_info * vinfo,hash_set<slp_tree> * visited,slp_tree_to_load_perm_map_t * perm_cache,slp_compat_nodes_map_t * compat_cache)2982 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2983                                hash_set<slp_tree> *visited,
2984                                slp_tree_to_load_perm_map_t *perm_cache,
2985                                slp_compat_nodes_map_t *compat_cache)
2986 {
2987   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2988   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2989 
2990   if (dump_enabled_p ())
2991     dump_printf_loc (MSG_NOTE, vect_location,
2992                          "Analyzing SLP tree %p for patterns\n",
2993                          SLP_INSTANCE_TREE (instance));
2994 
2995   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2996                                             visited);
2997 }
2998 
2999 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3000    splitting into two, with the first split group having size NEW_GROUP_SIZE.
3001    Return true if we could use IFN_STORE_LANES instead and if that appears
3002    to be the better approach.  */
3003 
3004 static bool
vect_slp_prefer_store_lanes_p(vec_info * vinfo,stmt_vec_info stmt_info,unsigned int group_size,unsigned int new_group_size)3005 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3006                                      unsigned int group_size,
3007                                      unsigned int new_group_size)
3008 {
3009   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3010   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3011   if (!vectype)
3012     return false;
3013   /* Allow the split if one of the two new groups would operate on full
3014      vectors *within* rather than across one scalar loop iteration.
3015      This is purely a heuristic, but it should work well for group
3016      sizes of 3 and 4, where the possible splits are:
3017 
3018        3->2+1:  OK if the vector has exactly two elements
3019        4->2+2:  Likewise
3020        4->3+1:  Less clear-cut.  */
3021   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3022       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3023     return false;
3024   return vect_store_lanes_supported (vectype, group_size, false);
3025 }
3026 
3027 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3028    vect_build_slp_tree to build a tree of packed stmts if possible.
3029    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3030 
3031 static bool
3032 vect_analyze_slp_instance (vec_info *vinfo,
3033                                  scalar_stmts_to_slp_tree_map_t *bst_map,
3034                                  stmt_vec_info stmt_info, slp_instance_kind kind,
3035                                  unsigned max_tree_size, unsigned *limit);
3036 
3037 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3038    of KIND.  Return true if successful.  */
3039 
3040 static bool
vect_build_slp_instance(vec_info * vinfo,slp_instance_kind kind,vec<stmt_vec_info> & scalar_stmts,vec<stmt_vec_info> & root_stmt_infos,unsigned max_tree_size,unsigned * limit,scalar_stmts_to_slp_tree_map_t * bst_map,stmt_vec_info stmt_info_)3041 vect_build_slp_instance (vec_info *vinfo,
3042                                slp_instance_kind kind,
3043                                vec<stmt_vec_info> &scalar_stmts,
3044                                vec<stmt_vec_info> &root_stmt_infos,
3045                                unsigned max_tree_size, unsigned *limit,
3046                                scalar_stmts_to_slp_tree_map_t *bst_map,
3047                                /* ???  We need stmt_info for group splitting.  */
3048                                stmt_vec_info stmt_info_)
3049 {
3050   if (dump_enabled_p ())
3051     {
3052       dump_printf_loc (MSG_NOTE, vect_location,
3053                            "Starting SLP discovery for\n");
3054       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3055           dump_printf_loc (MSG_NOTE, vect_location,
3056                                "  %G", scalar_stmts[i]->stmt);
3057     }
3058 
3059   /* Build the tree for the SLP instance.  */
3060   unsigned int group_size = scalar_stmts.length ();
3061   bool *matches = XALLOCAVEC (bool, group_size);
3062   poly_uint64 max_nunits = 1;
3063   unsigned tree_size = 0;
3064   unsigned i;
3065   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3066                                                &max_nunits, matches, limit,
3067                                                &tree_size, bst_map);
3068   if (node != NULL)
3069     {
3070       /* Calculate the unrolling factor based on the smallest type.  */
3071       poly_uint64 unrolling_factor
3072           = calculate_unrolling_factor (max_nunits, group_size);
3073 
3074       if (maybe_ne (unrolling_factor, 1U)
3075             && is_a <bb_vec_info> (vinfo))
3076           {
3077             unsigned HOST_WIDE_INT const_max_nunits;
3078             if (!max_nunits.is_constant (&const_max_nunits)
3079                 || const_max_nunits > group_size)
3080               {
3081                 if (dump_enabled_p ())
3082                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3083                                          "Build SLP failed: store group "
3084                                          "size not a multiple of the vector size "
3085                                          "in basic block SLP\n");
3086                 vect_free_slp_tree (node);
3087                 return false;
3088               }
3089             /* Fatal mismatch.  */
3090             if (dump_enabled_p ())
3091               dump_printf_loc (MSG_NOTE, vect_location,
3092                                    "SLP discovery succeeded but node needs "
3093                                    "splitting\n");
3094             memset (matches, true, group_size);
3095             matches[group_size / const_max_nunits * const_max_nunits] = false;
3096             vect_free_slp_tree (node);
3097           }
3098       else
3099           {
3100             /* Create a new SLP instance.  */
3101             slp_instance new_instance = XNEW (class _slp_instance);
3102             SLP_INSTANCE_TREE (new_instance) = node;
3103             SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3104             SLP_INSTANCE_LOADS (new_instance) = vNULL;
3105             SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3106             SLP_INSTANCE_KIND (new_instance) = kind;
3107             new_instance->reduc_phis = NULL;
3108             new_instance->cost_vec = vNULL;
3109             new_instance->subgraph_entries = vNULL;
3110 
3111             if (dump_enabled_p ())
3112               dump_printf_loc (MSG_NOTE, vect_location,
3113                                    "SLP size %u vs. limit %u.\n",
3114                                    tree_size, max_tree_size);
3115 
3116             /* Fixup SLP reduction chains.  */
3117             if (kind == slp_inst_kind_reduc_chain)
3118               {
3119                 /* If this is a reduction chain with a conversion in front
3120                      amend the SLP tree with a node for that.  */
3121                 gimple *scalar_def
3122                     = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3123                 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3124                     {
3125                       /* Get at the conversion stmt - we know it's the single use
3126                          of the last stmt of the reduction chain.  */
3127                       use_operand_p use_p;
3128                       bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3129                                                      &use_p, &scalar_def);
3130                       gcc_assert (r);
3131                       stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3132                       next_info = vect_stmt_to_vectorize (next_info);
3133                       scalar_stmts = vNULL;
3134                       scalar_stmts.create (group_size);
3135                       for (unsigned i = 0; i < group_size; ++i)
3136                         scalar_stmts.quick_push (next_info);
3137                       slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3138                       SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3139                       SLP_TREE_CHILDREN (conv).quick_push (node);
3140                       SLP_INSTANCE_TREE (new_instance) = conv;
3141                       /* We also have to fake this conversion stmt as SLP reduction
3142                          group so we don't have to mess with too much code
3143                          elsewhere.  */
3144                       REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3145                       REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3146                     }
3147                 /* Fill the backedge child of the PHI SLP node.  The
3148                      general matching code cannot find it because the
3149                      scalar code does not reflect how we vectorize the
3150                      reduction.  */
3151                 use_operand_p use_p;
3152                 imm_use_iterator imm_iter;
3153                 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3154                 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3155                                              gimple_get_lhs (scalar_def))
3156                     /* There are exactly two non-debug uses, the reduction
3157                        PHI and the loop-closed PHI node.  */
3158                     if (!is_gimple_debug (USE_STMT (use_p))
3159                         && gimple_bb (USE_STMT (use_p)) == loop->header)
3160                       {
3161                         auto_vec<stmt_vec_info, 64> phis (group_size);
3162                         stmt_vec_info phi_info
3163                           = vinfo->lookup_stmt (USE_STMT (use_p));
3164                         for (unsigned i = 0; i < group_size; ++i)
3165                           phis.quick_push (phi_info);
3166                         slp_tree *phi_node = bst_map->get (phis);
3167                         unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3168                         SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3169                           = SLP_INSTANCE_TREE (new_instance);
3170                         SLP_INSTANCE_TREE (new_instance)->refcnt++;
3171                       }
3172               }
3173 
3174             vinfo->slp_instances.safe_push (new_instance);
3175 
3176             /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3177                the number of scalar stmts in the root in a few places.
3178                Verify that assumption holds.  */
3179             gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3180                               .length () == group_size);
3181 
3182             if (dump_enabled_p ())
3183               {
3184                 dump_printf_loc (MSG_NOTE, vect_location,
3185                                      "Final SLP tree for instance %p:\n", new_instance);
3186                 vect_print_slp_graph (MSG_NOTE, vect_location,
3187                                             SLP_INSTANCE_TREE (new_instance));
3188               }
3189 
3190             return true;
3191           }
3192     }
3193   else
3194     {
3195       /* Failed to SLP.  */
3196       /* Free the allocated memory.  */
3197       scalar_stmts.release ();
3198     }
3199 
3200   stmt_vec_info stmt_info = stmt_info_;
3201   /* Try to break the group up into pieces.  */
3202   if (kind == slp_inst_kind_store)
3203     {
3204       /* ???  We could delay all the actual splitting of store-groups
3205            until after SLP discovery of the original group completed.
3206            Then we can recurse to vect_build_slp_instance directly.  */
3207       for (i = 0; i < group_size; i++)
3208           if (!matches[i])
3209             break;
3210 
3211       /* For basic block SLP, try to break the group up into multiples of
3212            a vector size.  */
3213       if (is_a <bb_vec_info> (vinfo)
3214             && (i > 1 && i < group_size))
3215           {
3216             tree scalar_type
3217               = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3218             tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3219                                                                   1 << floor_log2 (i));
3220             unsigned HOST_WIDE_INT const_nunits;
3221             if (vectype
3222                 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3223               {
3224                 /* Split into two groups at the first vector boundary.  */
3225                 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3226                 unsigned group1_size = i & ~(const_nunits - 1);
3227 
3228                 if (dump_enabled_p ())
3229                     dump_printf_loc (MSG_NOTE, vect_location,
3230                                          "Splitting SLP group at stmt %u\n", i);
3231                 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3232                                                                              group1_size);
3233                 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3234                                                                 kind, max_tree_size,
3235                                                                 limit);
3236                 /* Split the rest at the failure point and possibly
3237                      re-analyze the remaining matching part if it has
3238                      at least two lanes.  */
3239                 if (group1_size < i
3240                       && (i + 1 < group_size
3241                           || i - group1_size > 1))
3242                     {
3243                       stmt_vec_info rest2 = rest;
3244                       rest = vect_split_slp_store_group (rest, i - group1_size);
3245                       if (i - group1_size > 1)
3246                         res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3247                                                                   kind, max_tree_size,
3248                                                                   limit);
3249                     }
3250                 /* Re-analyze the non-matching tail if it has at least
3251                      two lanes.  */
3252                 if (i + 1 < group_size)
3253                     res |= vect_analyze_slp_instance (vinfo, bst_map,
3254                                                               rest, kind, max_tree_size,
3255                                                               limit);
3256                 return res;
3257               }
3258           }
3259 
3260       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3261       if (is_a <loop_vec_info> (vinfo)
3262             && (i > 1 && i < group_size)
3263             && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3264           {
3265             unsigned group1_size = i;
3266 
3267             if (dump_enabled_p ())
3268               dump_printf_loc (MSG_NOTE, vect_location,
3269                                    "Splitting SLP group at stmt %u\n", i);
3270 
3271             stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3272                                                                          group1_size);
3273             /* Loop vectorization cannot handle gaps in stores, make sure
3274                the split group appears as strided.  */
3275             STMT_VINFO_STRIDED_P (rest) = 1;
3276             DR_GROUP_GAP (rest) = 0;
3277             STMT_VINFO_STRIDED_P (stmt_info) = 1;
3278             DR_GROUP_GAP (stmt_info) = 0;
3279 
3280             bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3281                                                             kind, max_tree_size, limit);
3282             if (i + 1 < group_size)
3283               res |= vect_analyze_slp_instance (vinfo, bst_map,
3284                                                         rest, kind, max_tree_size, limit);
3285 
3286             return res;
3287           }
3288 
3289       /* Even though the first vector did not all match, we might be able to SLP
3290            (some) of the remainder.  FORNOW ignore this possibility.  */
3291     }
3292 
3293   /* Failed to SLP.  */
3294   if (dump_enabled_p ())
3295     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3296   return false;
3297 }
3298 
3299 
3300 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3301    vect_build_slp_tree to build a tree of packed stmts if possible.
3302    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3303 
3304 static bool
vect_analyze_slp_instance(vec_info * vinfo,scalar_stmts_to_slp_tree_map_t * bst_map,stmt_vec_info stmt_info,slp_instance_kind kind,unsigned max_tree_size,unsigned * limit)3305 vect_analyze_slp_instance (vec_info *vinfo,
3306                                  scalar_stmts_to_slp_tree_map_t *bst_map,
3307                                  stmt_vec_info stmt_info,
3308                                  slp_instance_kind kind,
3309                                  unsigned max_tree_size, unsigned *limit)
3310 {
3311   unsigned int i;
3312   vec<stmt_vec_info> scalar_stmts;
3313 
3314   if (is_a <bb_vec_info> (vinfo))
3315     vect_location = stmt_info->stmt;
3316 
3317   stmt_vec_info next_info = stmt_info;
3318   if (kind == slp_inst_kind_store)
3319     {
3320       /* Collect the stores and store them in scalar_stmts.  */
3321       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3322       while (next_info)
3323           {
3324             scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3325             next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3326           }
3327     }
3328   else if (kind == slp_inst_kind_reduc_chain)
3329     {
3330       /* Collect the reduction stmts and store them in scalar_stmts.  */
3331       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3332       while (next_info)
3333           {
3334             scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3335             next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3336           }
3337       /* Mark the first element of the reduction chain as reduction to properly
3338            transform the node.  In the reduction analysis phase only the last
3339            element of the chain is marked as reduction.  */
3340       STMT_VINFO_DEF_TYPE (stmt_info)
3341           = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3342       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3343           = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3344     }
3345   else if (kind == slp_inst_kind_ctor)
3346     {
3347       tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3348       tree val;
3349       scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3350       FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3351           {
3352             stmt_vec_info def_info = vinfo->lookup_def (val);
3353             def_info = vect_stmt_to_vectorize (def_info);
3354             scalar_stmts.quick_push (def_info);
3355           }
3356       if (dump_enabled_p ())
3357           dump_printf_loc (MSG_NOTE, vect_location,
3358                                "Analyzing vectorizable constructor: %G\n",
3359                                stmt_info->stmt);
3360     }
3361   else if (kind == slp_inst_kind_reduc_group)
3362     {
3363       /* Collect reduction statements.  */
3364       const vec<stmt_vec_info> &reductions
3365           = as_a <loop_vec_info> (vinfo)->reductions;
3366       scalar_stmts.create (reductions.length ());
3367       for (i = 0; reductions.iterate (i, &next_info); i++)
3368           if ((STMT_VINFO_RELEVANT_P (next_info)
3369                || STMT_VINFO_LIVE_P (next_info))
3370               /* ???  Make sure we didn't skip a conversion around a reduction
3371                  path.  In that case we'd have to reverse engineer that conversion
3372                  stmt following the chain using reduc_idx and from the PHI
3373                  using reduc_def.  */
3374               && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3375             scalar_stmts.quick_push (next_info);
3376       /* If less than two were relevant/live there's nothing to SLP.  */
3377       if (scalar_stmts.length () < 2)
3378           return false;
3379     }
3380   else
3381     gcc_unreachable ();
3382 
3383   vec<stmt_vec_info> roots = vNULL;
3384   if (kind == slp_inst_kind_ctor)
3385     {
3386       roots.create (1);
3387       roots.quick_push (stmt_info);
3388     }
3389   /* Build the tree for the SLP instance.  */
3390   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3391                                               roots,
3392                                               max_tree_size, limit, bst_map,
3393                                               kind == slp_inst_kind_store
3394                                               ? stmt_info : NULL);
3395   if (!res)
3396     roots.release ();
3397 
3398   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3399      where we should do store group splitting.  */
3400 
3401   return res;
3402 }
3403 
3404 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
3405    trees of packed scalar stmts if SLP is possible.  */
3406 
3407 opt_result
vect_analyze_slp(vec_info * vinfo,unsigned max_tree_size)3408 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3409 {
3410   unsigned int i;
3411   stmt_vec_info first_element;
3412   slp_instance instance;
3413 
3414   DUMP_VECT_SCOPE ("vect_analyze_slp");
3415 
3416   unsigned limit = max_tree_size;
3417 
3418   scalar_stmts_to_slp_tree_map_t *bst_map
3419     = new scalar_stmts_to_slp_tree_map_t ();
3420 
3421   /* Find SLP sequences starting from groups of grouped stores.  */
3422   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3423     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3424                                      STMT_VINFO_GROUPED_ACCESS (first_element)
3425                                      ? slp_inst_kind_store : slp_inst_kind_ctor,
3426                                      max_tree_size, &limit);
3427 
3428   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3429     {
3430       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3431           {
3432             vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3433             /* Apply patterns.  */
3434             for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3435               bb_vinfo->roots[i].stmts[j]
3436                 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3437             if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3438                                                bb_vinfo->roots[i].stmts,
3439                                                bb_vinfo->roots[i].roots,
3440                                                max_tree_size, &limit, bst_map, NULL))
3441               {
3442                 bb_vinfo->roots[i].stmts = vNULL;
3443                 bb_vinfo->roots[i].roots = vNULL;
3444               }
3445           }
3446     }
3447 
3448   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3449     {
3450       /* Find SLP sequences starting from reduction chains.  */
3451       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3452           if (! STMT_VINFO_RELEVANT_P (first_element)
3453               && ! STMT_VINFO_LIVE_P (first_element))
3454             ;
3455           else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3456                                                         slp_inst_kind_reduc_chain,
3457                                                         max_tree_size, &limit))
3458             {
3459               /* Dissolve reduction chain group.  */
3460               stmt_vec_info vinfo = first_element;
3461               stmt_vec_info last = NULL;
3462               while (vinfo)
3463                 {
3464                     stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3465                     REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3466                     REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3467                     last = vinfo;
3468                     vinfo = next;
3469                 }
3470               STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3471               /* It can be still vectorized as part of an SLP reduction.  */
3472               loop_vinfo->reductions.safe_push (last);
3473             }
3474 
3475       /* Find SLP sequences starting from groups of reductions.  */
3476       if (loop_vinfo->reductions.length () > 1)
3477           vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3478                                            slp_inst_kind_reduc_group, max_tree_size,
3479                                            &limit);
3480     }
3481 
3482   hash_set<slp_tree> visited_patterns;
3483   slp_tree_to_load_perm_map_t perm_cache;
3484   slp_compat_nodes_map_t compat_cache;
3485 
3486   /* See if any patterns can be found in the SLP tree.  */
3487   bool pattern_found = false;
3488   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3489     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3490                                                         &visited_patterns, &perm_cache,
3491                                                         &compat_cache);
3492 
3493   /* If any were found optimize permutations of loads.  */
3494   if (pattern_found)
3495     {
3496       hash_map<slp_tree, slp_tree> load_map;
3497       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3498           {
3499             slp_tree root = SLP_INSTANCE_TREE (instance);
3500             optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3501                                                   &load_map, root);
3502           }
3503     }
3504 
3505 
3506 
3507   /* The map keeps a reference on SLP nodes built, release that.  */
3508   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3509        it != bst_map->end (); ++it)
3510     if ((*it).second)
3511       vect_free_slp_tree ((*it).second);
3512   delete bst_map;
3513 
3514   if (pattern_found && dump_enabled_p ())
3515     {
3516       dump_printf_loc (MSG_NOTE, vect_location,
3517                            "Pattern matched SLP tree\n");
3518       hash_set<slp_tree> visited;
3519       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3520           vect_print_slp_graph (MSG_NOTE, vect_location,
3521                                     SLP_INSTANCE_TREE (instance), visited);
3522     }
3523 
3524   return opt_result::success ();
3525 }
3526 
3527 struct slpg_vertex
3528 {
slpg_vertexslpg_vertex3529   slpg_vertex (slp_tree node_)
3530     : node (node_), perm_in (-1), perm_out (-1) {}
3531 
get_perm_materializedslpg_vertex3532   int get_perm_materialized () const
3533     { return perm_in != perm_out ? perm_in : 0; }
3534 
3535   slp_tree node;
3536   /* The common permutation on the incoming lanes (towards SLP children).  */
3537   int perm_in;
3538   /* The permutation on the outgoing lanes (towards SLP parents).  When
3539      the node is a materialization point for a permute this differs
3540      from perm_in (and is then usually zero).  Materialization happens
3541      on the input side.  */
3542   int perm_out;
3543 };
3544 
3545 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
3546 
3547 static void
vect_slp_build_vertices(hash_set<slp_tree> & visited,slp_tree node,vec<slpg_vertex> & vertices,vec<int> & leafs)3548 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3549                                vec<slpg_vertex> &vertices, vec<int> &leafs)
3550 {
3551   unsigned i;
3552   slp_tree child;
3553 
3554   if (visited.add (node))
3555     return;
3556 
3557   node->vertex = vertices.length ();
3558   vertices.safe_push (slpg_vertex (node));
3559 
3560   bool leaf = true;
3561   bool force_leaf = false;
3562   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3563     if (child)
3564       {
3565           leaf = false;
3566           vect_slp_build_vertices (visited, child, vertices, leafs);
3567       }
3568     else
3569       force_leaf = true;
3570   /* Since SLP discovery works along use-def edges all cycles have an
3571      entry - but there's the exception of cycles where we do not handle
3572      the entry explicitely (but with a NULL SLP node), like some reductions
3573      and inductions.  Force those SLP PHIs to act as leafs to make them
3574      backwards reachable.  */
3575   if (leaf || force_leaf)
3576     leafs.safe_push (node->vertex);
3577 }
3578 
3579 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
3580 
3581 static void
vect_slp_build_vertices(vec_info * info,vec<slpg_vertex> & vertices,vec<int> & leafs)3582 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3583                                vec<int> &leafs)
3584 {
3585   hash_set<slp_tree> visited;
3586   unsigned i;
3587   slp_instance instance;
3588   FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3589     vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3590                                    leafs);
3591 }
3592 
3593 /* Apply (reverse) bijectite PERM to VEC.  */
3594 
3595 template <class T>
3596 static void
vect_slp_permute(vec<unsigned> perm,vec<T> & vec,bool reverse)3597 vect_slp_permute (vec<unsigned> perm,
3598                       vec<T> &vec, bool reverse)
3599 {
3600   auto_vec<T, 64> saved;
3601   saved.create (vec.length ());
3602   for (unsigned i = 0; i < vec.length (); ++i)
3603     saved.quick_push (vec[i]);
3604 
3605   if (reverse)
3606     {
3607       for (unsigned i = 0; i < vec.length (); ++i)
3608           vec[perm[i]] = saved[i];
3609       for (unsigned i = 0; i < vec.length (); ++i)
3610           gcc_assert (vec[perm[i]] == saved[i]);
3611     }
3612   else
3613     {
3614       for (unsigned i = 0; i < vec.length (); ++i)
3615           vec[i] = saved[perm[i]];
3616       for (unsigned i = 0; i < vec.length (); ++i)
3617           gcc_assert (vec[i] == saved[perm[i]]);
3618     }
3619 }
3620 
3621 /* Return whether permutations PERM_A and PERM_B as recorded in the
3622    PERMS vector are equal.  */
3623 
3624 static bool
vect_slp_perms_eq(const vec<vec<unsigned>> & perms,int perm_a,int perm_b)3625 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3626                        int perm_a, int perm_b)
3627 {
3628   return (perm_a == perm_b
3629             || (perm_a != -1 && perm_b != -1
3630                 && perms[perm_a].length () == perms[perm_b].length ()
3631                 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3632                                sizeof (unsigned) * perms[perm_a].length ()) == 0));
3633 }
3634 
3635 /* Optimize the SLP graph of VINFO.  */
3636 
3637 void
vect_optimize_slp(vec_info * vinfo)3638 vect_optimize_slp (vec_info *vinfo)
3639 {
3640   if (vinfo->slp_instances.is_empty ())
3641     return;
3642 
3643   slp_tree node;
3644   unsigned i;
3645   auto_vec<slpg_vertex> vertices;
3646   auto_vec<int> leafs;
3647   vect_slp_build_vertices (vinfo, vertices, leafs);
3648 
3649   struct graph *slpg = new_graph (vertices.length ());
3650   for (slpg_vertex &v : vertices)
3651     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3652       if (child)
3653           add_edge (slpg, v.node->vertex, child->vertex);
3654 
3655   /* Compute (reverse) postorder on the inverted graph.  */
3656   auto_vec<int> ipo;
3657   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3658 
3659   auto_vec<vec<unsigned> > perms;
3660   perms.safe_push (vNULL); /* zero is no permute */
3661 
3662   /* Produce initial permutations.  */
3663   for (i = 0; i < leafs.length (); ++i)
3664     {
3665       int idx = leafs[i];
3666       slp_tree node = vertices[idx].node;
3667 
3668       /* Handle externals and constants optimistically throughout the
3669            iteration.  But treat existing vectors as fixed since we
3670            do not handle permuting them below.  */
3671       if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3672              && !SLP_TREE_VEC_DEFS (node).exists ())
3673             || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3674           continue;
3675 
3676       /* Leafs do not change across iterations.  Note leafs also double
3677            as entries to the reverse graph.  */
3678       if (!slpg->vertices[idx].succ)
3679           {
3680             vertices[idx].perm_in = 0;
3681             vertices[idx].perm_out = 0;
3682           }
3683 
3684       /* Loads are the only thing generating permutes.  */
3685       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3686           continue;
3687 
3688       /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3689            node unpermuted, record this permute.  */
3690       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3691       if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3692           continue;
3693       dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3694       unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3695       bool any_permute = false;
3696       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3697           {
3698             unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3699             imin = MIN (imin, idx);
3700             imax = MAX (imax, idx);
3701             if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3702               any_permute = true;
3703           }
3704       /* If there's no permute no need to split one out.  */
3705       if (!any_permute)
3706           continue;
3707       /* If the span doesn't match we'd disrupt VF computation, avoid
3708            that for now.  */
3709       if (imax - imin + 1 != SLP_TREE_LANES (node))
3710           continue;
3711 
3712       /* For now only handle true permutes, like
3713            vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
3714            when permuting constants and invariants keeping the permute
3715            bijective.  */
3716       auto_sbitmap load_index (SLP_TREE_LANES (node));
3717       bitmap_clear (load_index);
3718       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3719           bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3720       unsigned j;
3721       for (j = 0; j < SLP_TREE_LANES (node); ++j)
3722           if (!bitmap_bit_p (load_index, j))
3723             break;
3724       if (j != SLP_TREE_LANES (node))
3725           continue;
3726 
3727       vec<unsigned> perm = vNULL;
3728       perm.safe_grow (SLP_TREE_LANES (node), true);
3729       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3730           perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3731       perms.safe_push (perm);
3732       vertices[idx].perm_in = perms.length () - 1;
3733       vertices[idx].perm_out = perms.length () - 1;
3734     }
3735 
3736   /* We have to mark outgoing permutations facing non-associating-reduction
3737      graph entries that are not represented as to be materialized.  */
3738   for (slp_instance instance : vinfo->slp_instances)
3739     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3740       {
3741           /* Just setting perm_out isn't enough for the propagation to
3742              pick this up.  */
3743           vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3744           vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3745       }
3746     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
3747       {
3748           stmt_vec_info stmt_info
3749             = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
3750           stmt_vec_info reduc_info = info_for_reduction (vinfo, stmt_info);
3751           if (needs_fold_left_reduction_p (TREE_TYPE
3752                                                      (gimple_get_lhs (stmt_info->stmt)),
3753                                                    STMT_VINFO_REDUC_CODE (reduc_info)))
3754             {
3755               unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
3756               vertices[node_i].perm_in = 0;
3757               vertices[node_i].perm_out = 0;
3758             }
3759       }
3760 
3761   /* Propagate permutes along the graph and compute materialization points.  */
3762   bool changed;
3763   bool do_materialization = false;
3764   unsigned iteration = 0;
3765   do
3766     {
3767       changed = false;
3768       ++iteration;
3769 
3770       if (dump_enabled_p ())
3771           dump_printf_loc (MSG_NOTE, vect_location,
3772                                "SLP optimize iteration %d\n", iteration);
3773 
3774       for (i = vertices.length (); i > 0 ; --i)
3775           {
3776             int idx = ipo[i-1];
3777             slp_tree node = vertices[idx].node;
3778 
3779             /* Handle externals and constants optimistically throughout the
3780                iteration.  */
3781             if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3782                 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3783               continue;
3784 
3785             /* We still eventually have failed backedge SLP nodes in the
3786                graph, those are only cancelled when analyzing operations.
3787                Simply treat them as transparent ops, propagating permutes
3788                through them.  */
3789             if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3790               {
3791                 /* We do not handle stores with a permutation, so all
3792                      incoming permutes must have been materialized.  */
3793                 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3794                 if (STMT_VINFO_DATA_REF (rep)
3795                       && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3796                     {
3797                       /* ???  We're forcing materialization in place
3798                          of the child here, we'd need special handling
3799                          in materialization to leave perm_in -1 here.  */
3800                       vertices[idx].perm_in = 0;
3801                       vertices[idx].perm_out = 0;
3802                     }
3803                 /* We cannot move a permute across an operation that is
3804                      not independent on lanes.  Note this is an explicit
3805                      negative list since that's much shorter than the respective
3806                      positive one but it's critical to keep maintaining it.  */
3807                 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3808                     switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3809                       {
3810                       case CFN_COMPLEX_ADD_ROT90:
3811                       case CFN_COMPLEX_ADD_ROT270:
3812                       case CFN_COMPLEX_MUL:
3813                       case CFN_COMPLEX_MUL_CONJ:
3814                       case CFN_VEC_ADDSUB:
3815                       case CFN_VEC_FMADDSUB:
3816                       case CFN_VEC_FMSUBADD:
3817                         vertices[idx].perm_in = 0;
3818                         vertices[idx].perm_out = 0;
3819                       default:;
3820                       }
3821               }
3822 
3823             if (!slpg->vertices[idx].succ)
3824               /* Pick up pre-computed leaf values.  */
3825               ;
3826             else
3827               {
3828                 bool any_succ_perm_out_m1 = false;
3829                 int perm_in = vertices[idx].perm_in;
3830                 for (graph_edge *succ = slpg->vertices[idx].succ;
3831                        succ; succ = succ->succ_next)
3832                     {
3833                       int succ_idx = succ->dest;
3834                       int succ_perm = vertices[succ_idx].perm_out;
3835                       /* Handle unvisited (and constant) nodes optimistically.  */
3836                       /* ???  But for constants once we want to handle
3837                          non-bijective permutes we have to verify the permute,
3838                          when unifying lanes, will not unify different constants.
3839                          For example see gcc.dg/vect/bb-slp-14.c for a case
3840                          that would break.  */
3841                       if (succ_perm == -1)
3842                         {
3843                           /* When we handled a non-leaf optimistically, note
3844                                that so we can adjust its outgoing permute below.  */
3845                           slp_tree succ_node = vertices[succ_idx].node;
3846                           if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3847                                 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3848                               any_succ_perm_out_m1 = true;
3849                           continue;
3850                         }
3851                       if (perm_in == -1)
3852                         perm_in = succ_perm;
3853                       else if (succ_perm == 0
3854                                  || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3855                         {
3856                           perm_in = 0;
3857                           break;
3858                         }
3859                     }
3860 
3861                 /* Adjust any incoming permutes we treated optimistically.  */
3862                 if (perm_in != -1 && any_succ_perm_out_m1)
3863                     {
3864                       for (graph_edge *succ = slpg->vertices[idx].succ;
3865                            succ; succ = succ->succ_next)
3866                         {
3867                           slp_tree succ_node = vertices[succ->dest].node;
3868                           if (vertices[succ->dest].perm_out == -1
3869                                 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3870                                 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3871                               {
3872                                 vertices[succ->dest].perm_out = perm_in;
3873                                 /* And ensure this propagates.  */
3874                                 if (vertices[succ->dest].perm_in == -1)
3875                                   vertices[succ->dest].perm_in = perm_in;
3876                               }
3877                         }
3878                       changed = true;
3879                     }
3880 
3881                 if (!vect_slp_perms_eq (perms, perm_in,
3882                                               vertices[idx].perm_in))
3883                     {
3884                       /* Make sure we eventually converge.  */
3885                       gcc_checking_assert (vertices[idx].perm_in == -1
3886                                                || perm_in == 0);
3887                       vertices[idx].perm_in = perm_in;
3888 
3889                       /* While we can handle VEC_PERM nodes as transparent
3890                          pass-through they can be a cheap materialization
3891                          point as well.  In addition they can act as source
3892                          of a random permutation as well.
3893                          The following ensures that former materialization
3894                          points that now have zero incoming permutes no
3895                          longer appear as such and that former "any" permutes
3896                          get pass-through.  We keep VEC_PERM nodes optimistic
3897                          as "any" outgoing permute though.  */
3898                       if (vertices[idx].perm_out != 0
3899                           && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3900                         vertices[idx].perm_out = perm_in;
3901                       changed = true;
3902                     }
3903               }
3904 
3905             /* Elide pruning at materialization points in the first
3906                iteration phase.  */
3907             if (!do_materialization)
3908               continue;
3909 
3910             int perm = vertices[idx].perm_out;
3911             if (perm == 0 || perm == -1)
3912               continue;
3913 
3914             /* Decide on permute materialization.  Look whether there's
3915                a use (pred) edge that is permuted differently than us.
3916                In that case mark ourselves so the permutation is applied.  */
3917             bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3918             if (all_preds_permuted)
3919               for (graph_edge *pred = slpg->vertices[idx].pred;
3920                      pred; pred = pred->pred_next)
3921                 {
3922                     int pred_perm = vertices[pred->src].perm_in;
3923                     gcc_checking_assert (pred_perm != -1);
3924                     if (!vect_slp_perms_eq (perms, perm, pred_perm))
3925                       {
3926                         all_preds_permuted = false;
3927                         break;
3928                       }
3929                 }
3930             if (!all_preds_permuted)
3931               {
3932                 vertices[idx].perm_out = 0;
3933                 changed = true;
3934               }
3935           }
3936 
3937       /* If the initial propagation converged, switch on materialization
3938            and re-propagate.  */
3939       if (!changed && !do_materialization)
3940           {
3941             do_materialization = true;
3942             changed = true;
3943           }
3944     }
3945   while (changed);
3946   statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3947 
3948   /* Materialize.  */
3949   for (i = 0; i < vertices.length (); ++i)
3950     {
3951       int perm_in = vertices[i].perm_in;
3952       slp_tree node = vertices[i].node;
3953 
3954       /* First permute invariant/external original successors, we handle
3955            those optimistically during propagation and duplicate them if
3956            they are used with different permutations.  */
3957       unsigned j;
3958       slp_tree child;
3959       if (perm_in > 0)
3960           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3961             {
3962               if (!child
3963                     || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3964                         && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3965                 continue;
3966 
3967               /* If the vector is uniform there's nothing to do.  */
3968               if (vect_slp_tree_uniform_p (child))
3969                 continue;
3970 
3971               /* We can end up sharing some externals via two_operator
3972                  handling.  Be prepared to unshare those.  */
3973               if (child->refcnt != 1)
3974                 {
3975                     gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3976                     SLP_TREE_CHILDREN (node)[j] = child
3977                       = vect_create_new_slp_node
3978                           (SLP_TREE_SCALAR_OPS (child).copy ());
3979                 }
3980               vect_slp_permute (perms[perm_in],
3981                                     SLP_TREE_SCALAR_OPS (child), true);
3982             }
3983 
3984       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3985           {
3986             /* Apply the common permutes to the input vectors.  */
3987             if (perm_in > 0)
3988               {
3989                 /* If the node is already a permute node we can apply
3990                      the permutation to the lane selection, effectively
3991                      materializing it on the incoming vectors.  */
3992                 if (dump_enabled_p ())
3993                     dump_printf_loc (MSG_NOTE, vect_location,
3994                                          "simplifying permute node %p\n",
3995                                          node);
3996                 for (unsigned k = 0;
3997                        k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3998                     SLP_TREE_LANE_PERMUTATION (node)[k].second
3999                       = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
4000               }
4001             /* Apply the anticipated output permute to the permute and
4002                stmt vectors.  */
4003             int perm_out = vertices[i].perm_out;
4004             if (perm_out > 0)
4005               {
4006                 vect_slp_permute (perms[perm_out],
4007                                         SLP_TREE_SCALAR_STMTS (node), true);
4008                 vect_slp_permute (perms[perm_out],
4009                                         SLP_TREE_LANE_PERMUTATION (node), true);
4010               }
4011           }
4012       else if (vertices[i].get_perm_materialized () != 0)
4013           {
4014             if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4015               /* For loads simply drop the permutation, the load permutation
4016                  already performs the desired permutation.  */
4017               ;
4018             else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4019               gcc_unreachable ();
4020             else
4021               {
4022                 if (dump_enabled_p ())
4023                     dump_printf_loc (MSG_NOTE, vect_location,
4024                                          "inserting permute node in place of %p\n",
4025                                          node);
4026 
4027                 /* Make a copy of NODE and in-place change it to a
4028                      VEC_PERM node to permute the lanes of the copy.  */
4029                 slp_tree copy = new _slp_tree;
4030                 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
4031                 SLP_TREE_CHILDREN (node) = vNULL;
4032                 SLP_TREE_SCALAR_STMTS (copy)
4033                     = SLP_TREE_SCALAR_STMTS (node).copy ();
4034                 vect_slp_permute (perms[perm_in],
4035                                         SLP_TREE_SCALAR_STMTS (copy), true);
4036                 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
4037                 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
4038                 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4039                 SLP_TREE_LANE_PERMUTATION (copy)
4040                     = SLP_TREE_LANE_PERMUTATION (node);
4041                 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4042                 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4043                 copy->refcnt = 1;
4044                 copy->max_nunits = node->max_nunits;
4045                 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4046                 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4047                 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4048 
4049                 /* Now turn NODE into a VEC_PERM.  */
4050                 SLP_TREE_CHILDREN (node).safe_push (copy);
4051                 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4052                 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4053                     SLP_TREE_LANE_PERMUTATION (node)
4054                       .quick_push (std::make_pair (0, perms[perm_in][j]));
4055                 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4056               }
4057           }
4058       else if (perm_in > 0) /* perm_in == perm_out */
4059           {
4060             /* Apply the reverse permutation to our stmts.  */
4061             vect_slp_permute (perms[perm_in],
4062                                   SLP_TREE_SCALAR_STMTS (node), true);
4063             /* And to the lane/load permutation, which we can simply
4064                make regular by design.  */
4065             if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4066               {
4067                 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4068                 /* ???  When we handle non-bijective permutes the idea
4069                      is that we can force the load-permutation to be
4070                      { min, min + 1, min + 2, ... max }.  But then the
4071                      scalar defs might no longer match the lane content
4072                      which means wrong-code with live lane vectorization.
4073                      So we possibly have to have NULL entries for those.  */
4074                 vect_slp_permute (perms[perm_in],
4075                                         SLP_TREE_LOAD_PERMUTATION (node), true);
4076               }
4077             else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4078               gcc_unreachable ();
4079           }
4080     }
4081 
4082   /* Elide any permutations at BB reduction roots.  */
4083   if (is_a <bb_vec_info> (vinfo))
4084     {
4085       for (slp_instance instance : vinfo->slp_instances)
4086           {
4087             if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4088               continue;
4089             slp_tree old = SLP_INSTANCE_TREE (instance);
4090             if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4091                 && SLP_TREE_CHILDREN (old).length () == 1)
4092               {
4093                 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4094                 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4095                     {
4096                       /* Preserve the special VEC_PERM we use to shield existing
4097                          vector defs from the rest.  But make it a no-op.  */
4098                       auto_vec<stmt_vec_info, 64> saved;
4099                       saved.create (SLP_TREE_SCALAR_STMTS (old).length ());
4100                       for (unsigned i = 0;
4101                            i < SLP_TREE_SCALAR_STMTS (old).length (); ++i)
4102                         saved.quick_push (SLP_TREE_SCALAR_STMTS (old)[i]);
4103                       for (unsigned i = 0;
4104                            i < SLP_TREE_SCALAR_STMTS (old).length (); ++i)
4105                         SLP_TREE_SCALAR_STMTS (old)[i]
4106                           = saved[SLP_TREE_LANE_PERMUTATION (old)[i].second];
4107                       unsigned i = 0;
4108                       for (std::pair<unsigned, unsigned> &p
4109                            : SLP_TREE_LANE_PERMUTATION (old))
4110                         p.second = i++;
4111                     }
4112                 else
4113                     {
4114                       SLP_INSTANCE_TREE (instance) = child;
4115                       SLP_TREE_REF_COUNT (child)++;
4116                       vect_free_slp_tree (old);
4117                     }
4118               }
4119             else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4120                        && SLP_TREE_REF_COUNT (old) == 1
4121                        && vertices[old->vertex].get_perm_materialized () != 0)
4122               {
4123                 /* ???  For loads the situation is more complex since
4124                      we can't modify the permute in place in case the
4125                      node is used multiple times.  In fact for loads this
4126                      should be somehow handled in the propagation engine.  */
4127                 /* Apply the reverse permutation to our stmts.  */
4128                 int perm = vertices[old->vertex].get_perm_materialized ();
4129                 vect_slp_permute (perms[perm],
4130                                         SLP_TREE_SCALAR_STMTS (old), true);
4131                 vect_slp_permute (perms[perm],
4132                                         SLP_TREE_LOAD_PERMUTATION (old), true);
4133               }
4134           }
4135     }
4136 
4137   /* Free the perms vector used for propagation.  */
4138   while (!perms.is_empty ())
4139     perms.pop ().release ();
4140   free_graph (slpg);
4141 
4142 
4143   /* Now elide load permutations that are not necessary.  */
4144   for (i = 0; i < leafs.length (); ++i)
4145     {
4146       node = vertices[leafs[i]].node;
4147       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4148           continue;
4149 
4150       /* In basic block vectorization we allow any subchain of an interleaving
4151            chain.
4152            FORNOW: not in loop SLP because of realignment complications.  */
4153       if (is_a <bb_vec_info> (vinfo))
4154           {
4155             bool subchain_p = true;
4156             stmt_vec_info next_load_info = NULL;
4157             stmt_vec_info load_info;
4158             unsigned j;
4159             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4160               {
4161                 if (j != 0
4162                       && (next_load_info != load_info
4163                           || DR_GROUP_GAP (load_info) != 1))
4164                     {
4165                       subchain_p = false;
4166                       break;
4167                     }
4168                 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4169               }
4170             if (subchain_p)
4171               {
4172                 SLP_TREE_LOAD_PERMUTATION (node).release ();
4173                 continue;
4174               }
4175           }
4176       else
4177           {
4178             stmt_vec_info load_info;
4179             bool this_load_permuted = false;
4180             unsigned j;
4181             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4182               if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4183                 {
4184                     this_load_permuted = true;
4185                     break;
4186                 }
4187             stmt_vec_info first_stmt_info
4188               = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4189             if (!this_load_permuted
4190                 /* The load requires permutation when unrolling exposes
4191                      a gap either because the group is larger than the SLP
4192                      group-size or because there is a gap between the groups.  */
4193                 && (known_eq (LOOP_VINFO_VECT_FACTOR
4194                                     (as_a <loop_vec_info> (vinfo)), 1U)
4195                       || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4196                           && DR_GROUP_GAP (first_stmt_info) == 0)))
4197               {
4198                 SLP_TREE_LOAD_PERMUTATION (node).release ();
4199                 continue;
4200               }
4201           }
4202     }
4203 }
4204 
4205 /* Gather loads reachable from the individual SLP graph entries.  */
4206 
4207 void
vect_gather_slp_loads(vec_info * vinfo)4208 vect_gather_slp_loads (vec_info *vinfo)
4209 {
4210   unsigned i;
4211   slp_instance instance;
4212   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4213     {
4214       hash_set<slp_tree> visited;
4215       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4216                                    SLP_INSTANCE_TREE (instance), visited);
4217     }
4218 }
4219 
4220 
4221 /* For each possible SLP instance decide whether to SLP it and calculate overall
4222    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
4223    least one instance.  */
4224 
4225 bool
vect_make_slp_decision(loop_vec_info loop_vinfo)4226 vect_make_slp_decision (loop_vec_info loop_vinfo)
4227 {
4228   unsigned int i;
4229   poly_uint64 unrolling_factor = 1;
4230   const vec<slp_instance> &slp_instances
4231     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4232   slp_instance instance;
4233   int decided_to_slp = 0;
4234 
4235   DUMP_VECT_SCOPE ("vect_make_slp_decision");
4236 
4237   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4238     {
4239       /* FORNOW: SLP if you can.  */
4240       /* All unroll factors have the form:
4241 
4242              GET_MODE_SIZE (vinfo->vector_mode) * X
4243 
4244            for some rational X, so they must have a common multiple.  */
4245       unrolling_factor
4246           = force_common_multiple (unrolling_factor,
4247                                          SLP_INSTANCE_UNROLLING_FACTOR (instance));
4248 
4249       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
4250            call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4251            loop-based vectorization.  Such stmts will be marked as HYBRID.  */
4252       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4253       decided_to_slp++;
4254     }
4255 
4256   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4257 
4258   if (decided_to_slp && dump_enabled_p ())
4259     {
4260       dump_printf_loc (MSG_NOTE, vect_location,
4261                            "Decided to SLP %d instances. Unrolling factor ",
4262                            decided_to_slp);
4263       dump_dec (MSG_NOTE, unrolling_factor);
4264       dump_printf (MSG_NOTE, "\n");
4265     }
4266 
4267   return (decided_to_slp > 0);
4268 }
4269 
4270 /* Private data for vect_detect_hybrid_slp.  */
4271 struct vdhs_data
4272 {
4273   loop_vec_info loop_vinfo;
4274   vec<stmt_vec_info> *worklist;
4275 };
4276 
4277 /* Walker for walk_gimple_op.  */
4278 
4279 static tree
vect_detect_hybrid_slp(tree * tp,int *,void * data)4280 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4281 {
4282   walk_stmt_info *wi = (walk_stmt_info *)data;
4283   vdhs_data *dat = (vdhs_data *)wi->info;
4284 
4285   if (wi->is_lhs)
4286     return NULL_TREE;
4287 
4288   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4289   if (!def_stmt_info)
4290     return NULL_TREE;
4291   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4292   if (PURE_SLP_STMT (def_stmt_info))
4293     {
4294       if (dump_enabled_p ())
4295           dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4296                                def_stmt_info->stmt);
4297       STMT_SLP_TYPE (def_stmt_info) = hybrid;
4298       dat->worklist->safe_push (def_stmt_info);
4299     }
4300 
4301   return NULL_TREE;
4302 }
4303 
4304 /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4305    if so, otherwise pushing it to WORKLIST.  */
4306 
4307 static void
maybe_push_to_hybrid_worklist(vec_info * vinfo,vec<stmt_vec_info> & worklist,stmt_vec_info stmt_info)4308 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4309                                      vec<stmt_vec_info> &worklist,
4310                                      stmt_vec_info stmt_info)
4311 {
4312   if (dump_enabled_p ())
4313     dump_printf_loc (MSG_NOTE, vect_location,
4314                          "Processing hybrid candidate : %G", stmt_info->stmt);
4315   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4316   imm_use_iterator iter2;
4317   ssa_op_iter iter1;
4318   use_operand_p use_p;
4319   def_operand_p def_p;
4320   bool any_def = false;
4321   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4322     {
4323       any_def = true;
4324       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4325           {
4326             if (is_gimple_debug (USE_STMT (use_p)))
4327               continue;
4328             stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4329             /* An out-of loop use means this is a loop_vect sink.  */
4330             if (!use_info)
4331               {
4332                 if (dump_enabled_p ())
4333                     dump_printf_loc (MSG_NOTE, vect_location,
4334                                          "Found loop_vect sink: %G", stmt_info->stmt);
4335                 worklist.safe_push (stmt_info);
4336                 return;
4337               }
4338             else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4339               {
4340                 if (dump_enabled_p ())
4341                     dump_printf_loc (MSG_NOTE, vect_location,
4342                                          "Found loop_vect use: %G", use_info->stmt);
4343                 worklist.safe_push (stmt_info);
4344                 return;
4345               }
4346           }
4347     }
4348   /* No def means this is a loo_vect sink.  */
4349   if (!any_def)
4350     {
4351       if (dump_enabled_p ())
4352           dump_printf_loc (MSG_NOTE, vect_location,
4353                                "Found loop_vect sink: %G", stmt_info->stmt);
4354       worklist.safe_push (stmt_info);
4355       return;
4356     }
4357   if (dump_enabled_p ())
4358     dump_printf_loc (MSG_NOTE, vect_location,
4359                          "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4360   STMT_SLP_TYPE (stmt_info) = pure_slp;
4361 }
4362 
4363 /* Find stmts that must be both vectorized and SLPed.  */
4364 
4365 void
vect_detect_hybrid_slp(loop_vec_info loop_vinfo)4366 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4367 {
4368   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4369 
4370   /* All stmts participating in SLP are marked pure_slp, all other
4371      stmts are loop_vect.
4372      First collect all loop_vect stmts into a worklist.
4373      SLP patterns cause not all original scalar stmts to appear in
4374      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4375      Rectify this here and do a backward walk over the IL only considering
4376      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4377      mark them as pure_slp.  */
4378   auto_vec<stmt_vec_info> worklist;
4379   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4380     {
4381       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4382       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4383              gsi_next (&gsi))
4384           {
4385             gphi *phi = gsi.phi ();
4386             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4387             if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4388               maybe_push_to_hybrid_worklist (loop_vinfo,
4389                                                      worklist, stmt_info);
4390           }
4391       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4392              gsi_prev (&gsi))
4393           {
4394             gimple *stmt = gsi_stmt (gsi);
4395             if (is_gimple_debug (stmt))
4396               continue;
4397             stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4398             if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4399               {
4400                 for (gimple_stmt_iterator gsi2
4401                          = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4402                        !gsi_end_p (gsi2); gsi_next (&gsi2))
4403                     {
4404                       stmt_vec_info patt_info
4405                         = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4406                       if (!STMT_SLP_TYPE (patt_info)
4407                           && STMT_VINFO_RELEVANT (patt_info))
4408                         maybe_push_to_hybrid_worklist (loop_vinfo,
4409                                                                worklist, patt_info);
4410                     }
4411                 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4412               }
4413             if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4414               maybe_push_to_hybrid_worklist (loop_vinfo,
4415                                                      worklist, stmt_info);
4416           }
4417     }
4418 
4419   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4420      mark any SLP vectorized stmt as hybrid.
4421      ???  We're visiting def stmts N times (once for each non-SLP and
4422      once for each hybrid-SLP use).  */
4423   walk_stmt_info wi;
4424   vdhs_data dat;
4425   dat.worklist = &worklist;
4426   dat.loop_vinfo = loop_vinfo;
4427   memset (&wi, 0, sizeof (wi));
4428   wi.info = (void *)&dat;
4429   while (!worklist.is_empty ())
4430     {
4431       stmt_vec_info stmt_info = worklist.pop ();
4432       /* Since SSA operands are not set up for pattern stmts we need
4433            to use walk_gimple_op.  */
4434       wi.is_lhs = 0;
4435       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4436       /* For gather/scatter make sure to walk the offset operand, that
4437            can be a scaling and conversion away.  */
4438       gather_scatter_info gs_info;
4439       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4440             && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
4441           {
4442             int dummy;
4443             vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
4444           }
4445     }
4446 }
4447 
4448 
4449 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
4450 
_bb_vec_info(vec<basic_block> _bbs,vec_info_shared * shared)4451 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4452   : vec_info (vec_info::bb, shared),
4453     bbs (_bbs),
4454     roots (vNULL)
4455 {
4456   for (unsigned i = 0; i < bbs.length (); ++i)
4457     {
4458       if (i != 0)
4459           for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4460                gsi_next (&si))
4461             {
4462               gphi *phi = si.phi ();
4463               gimple_set_uid (phi, 0);
4464               add_stmt (phi);
4465             }
4466       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4467              !gsi_end_p (gsi); gsi_next (&gsi))
4468           {
4469             gimple *stmt = gsi_stmt (gsi);
4470             gimple_set_uid (stmt, 0);
4471             if (is_gimple_debug (stmt))
4472               continue;
4473             add_stmt (stmt);
4474           }
4475     }
4476 }
4477 
4478 
4479 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4480    stmts in the basic block.  */
4481 
~_bb_vec_info()4482 _bb_vec_info::~_bb_vec_info ()
4483 {
4484   /* Reset region marker.  */
4485   for (unsigned i = 0; i < bbs.length (); ++i)
4486     {
4487       if (i != 0)
4488           for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4489                gsi_next (&si))
4490             {
4491               gphi *phi = si.phi ();
4492               gimple_set_uid (phi, -1);
4493             }
4494       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4495              !gsi_end_p (gsi); gsi_next (&gsi))
4496           {
4497             gimple *stmt = gsi_stmt (gsi);
4498             gimple_set_uid (stmt, -1);
4499           }
4500     }
4501 
4502   for (unsigned i = 0; i < roots.length (); ++i)
4503     {
4504       roots[i].stmts.release ();
4505       roots[i].roots.release ();
4506     }
4507   roots.release ();
4508 }
4509 
4510 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
4511    given then that child nodes have already been processed, and that
4512    their def types currently match their SLP node's def type.  */
4513 
4514 static bool
vect_slp_analyze_node_operations_1(vec_info * vinfo,slp_tree node,slp_instance node_instance,stmt_vector_for_cost * cost_vec)4515 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4516                                             slp_instance node_instance,
4517                                             stmt_vector_for_cost *cost_vec)
4518 {
4519   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4520 
4521   /* Calculate the number of vector statements to be created for the
4522      scalar stmts in this node.  For SLP reductions it is equal to the
4523      number of vector statements in the children (which has already been
4524      calculated by the recursive call).  Otherwise it is the number of
4525      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4526      VF divided by the number of elements in a vector.  */
4527   if (!STMT_VINFO_DATA_REF (stmt_info)
4528       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4529     {
4530       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4531           if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4532             {
4533               SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4534                 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4535               break;
4536             }
4537     }
4538   else
4539     {
4540       poly_uint64 vf;
4541       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4542           vf = loop_vinfo->vectorization_factor;
4543       else
4544           vf = 1;
4545       unsigned int group_size = SLP_TREE_LANES (node);
4546       tree vectype = SLP_TREE_VECTYPE (node);
4547       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4548           = vect_get_num_vectors (vf * group_size, vectype);
4549     }
4550 
4551   /* Handle purely internal nodes.  */
4552   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4553     {
4554       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
4555           return false;
4556 
4557       stmt_vec_info slp_stmt_info;
4558       unsigned int i;
4559       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
4560           {
4561             if (STMT_VINFO_LIVE_P (slp_stmt_info)
4562                 && !vectorizable_live_operation (vinfo,
4563                                                          slp_stmt_info, NULL, node,
4564                                                          node_instance, i,
4565                                                          false, cost_vec))
4566               return false;
4567           }
4568       return true;
4569     }
4570 
4571   bool dummy;
4572   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4573                                   node, node_instance, cost_vec);
4574 }
4575 
4576 /* Try to build NODE from scalars, returning true on success.
4577    NODE_INSTANCE is the SLP instance that contains NODE.  */
4578 
4579 static bool
vect_slp_convert_to_external(vec_info * vinfo,slp_tree node,slp_instance node_instance)4580 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4581                                     slp_instance node_instance)
4582 {
4583   stmt_vec_info stmt_info;
4584   unsigned int i;
4585 
4586   if (!is_a <bb_vec_info> (vinfo)
4587       || node == SLP_INSTANCE_TREE (node_instance)
4588       || !SLP_TREE_SCALAR_STMTS (node).exists ()
4589       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4590     return false;
4591 
4592   if (dump_enabled_p ())
4593     dump_printf_loc (MSG_NOTE, vect_location,
4594                          "Building vector operands of %p from scalars instead\n", node);
4595 
4596   /* Don't remove and free the child nodes here, since they could be
4597      referenced by other structures.  The analysis and scheduling phases
4598      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
4599   unsigned int group_size = SLP_TREE_LANES (node);
4600   SLP_TREE_DEF_TYPE (node) = vect_external_def;
4601   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4602   SLP_TREE_LOAD_PERMUTATION (node).release ();
4603   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4604     {
4605       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4606       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4607     }
4608   return true;
4609 }
4610 
4611 /* Return true if all elements of the slice are the same.  */
4612 bool
all_same_p() const4613 vect_scalar_ops_slice::all_same_p () const
4614 {
4615   for (unsigned int i = 1; i < length; ++i)
4616     if (!operand_equal_p (op (0), op (i)))
4617       return false;
4618   return true;
4619 }
4620 
4621 hashval_t
hash(const value_type & s)4622 vect_scalar_ops_slice_hash::hash (const value_type &s)
4623 {
4624   hashval_t hash = 0;
4625   for (unsigned i = 0; i < s.length; ++i)
4626     hash = iterative_hash_expr (s.op (i), hash);
4627   return hash;
4628 }
4629 
4630 bool
equal(const value_type & s1,const compare_type & s2)4631 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4632                                            const compare_type &s2)
4633 {
4634   if (s1.length != s2.length)
4635     return false;
4636   for (unsigned i = 0; i < s1.length; ++i)
4637     if (!operand_equal_p (s1.op (i), s2.op (i)))
4638       return false;
4639   return true;
4640 }
4641 
4642 /* Compute the prologue cost for invariant or constant operands represented
4643    by NODE.  */
4644 
4645 static void
vect_prologue_cost_for_slp(slp_tree node,stmt_vector_for_cost * cost_vec)4646 vect_prologue_cost_for_slp (slp_tree node,
4647                                   stmt_vector_for_cost *cost_vec)
4648 {
4649   /* There's a special case of an existing vector, that costs nothing.  */
4650   if (SLP_TREE_SCALAR_OPS (node).length () == 0
4651       && !SLP_TREE_VEC_DEFS (node).is_empty ())
4652     return;
4653   /* Without looking at the actual initializer a vector of
4654      constants can be implemented as load from the constant pool.
4655      When all elements are the same we can use a splat.  */
4656   tree vectype = SLP_TREE_VECTYPE (node);
4657   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4658   unsigned HOST_WIDE_INT const_nunits;
4659   unsigned nelt_limit;
4660   auto ops = &SLP_TREE_SCALAR_OPS (node);
4661   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4662   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4663       && ! multiple_p (const_nunits, group_size))
4664     {
4665       nelt_limit = const_nunits;
4666       hash_set<vect_scalar_ops_slice_hash> vector_ops;
4667       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4668           if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4669             starts.quick_push (i * const_nunits);
4670     }
4671   else
4672     {
4673       /* If either the vector has variable length or the vectors
4674            are composed of repeated whole groups we only need to
4675            cost construction once.  All vectors will be the same.  */
4676       nelt_limit = group_size;
4677       starts.quick_push (0);
4678     }
4679   /* ???  We're just tracking whether vectors in a single node are the same.
4680      Ideally we'd do something more global.  */
4681   for (unsigned int start : starts)
4682     {
4683       vect_cost_for_stmt kind;
4684       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4685           kind = vector_load;
4686       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4687           kind = scalar_to_vec;
4688       else
4689           kind = vec_construct;
4690       record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
4691     }
4692 }
4693 
4694 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4695    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4696 
4697    Return true if the operations are supported.  */
4698 
4699 static bool
vect_slp_analyze_node_operations(vec_info * vinfo,slp_tree node,slp_instance node_instance,hash_set<slp_tree> & visited_set,vec<slp_tree> & visited_vec,stmt_vector_for_cost * cost_vec)4700 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4701                                           slp_instance node_instance,
4702                                           hash_set<slp_tree> &visited_set,
4703                                           vec<slp_tree> &visited_vec,
4704                                           stmt_vector_for_cost *cost_vec)
4705 {
4706   int i, j;
4707   slp_tree child;
4708 
4709   /* Assume we can code-generate all invariants.  */
4710   if (!node
4711       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4712       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4713     return true;
4714 
4715   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4716     {
4717       if (dump_enabled_p ())
4718           dump_printf_loc (MSG_NOTE, vect_location,
4719                                "Failed cyclic SLP reference in %p\n", node);
4720       return false;
4721     }
4722   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4723 
4724   /* If we already analyzed the exact same set of scalar stmts we're done.
4725      We share the generated vector stmts for those.  */
4726   if (visited_set.add (node))
4727     return true;
4728   visited_vec.safe_push (node);
4729 
4730   bool res = true;
4731   unsigned visited_rec_start = visited_vec.length ();
4732   unsigned cost_vec_rec_start = cost_vec->length ();
4733   bool seen_non_constant_child = false;
4734   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4735     {
4736       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4737                                                         visited_set, visited_vec,
4738                                                         cost_vec);
4739       if (!res)
4740           break;
4741       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4742           seen_non_constant_child = true;
4743     }
4744   /* We're having difficulties scheduling nodes with just constant
4745      operands and no scalar stmts since we then cannot compute a stmt
4746      insertion place.  */
4747   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4748     {
4749       if (dump_enabled_p ())
4750           dump_printf_loc (MSG_NOTE, vect_location,
4751                                "Cannot vectorize all-constant op node %p\n", node);
4752       res = false;
4753     }
4754 
4755   if (res)
4756     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4757                                                         cost_vec);
4758   /* If analysis failed we have to pop all recursive visited nodes
4759      plus ourselves.  */
4760   if (!res)
4761     {
4762       while (visited_vec.length () >= visited_rec_start)
4763           visited_set.remove (visited_vec.pop ());
4764       cost_vec->truncate (cost_vec_rec_start);
4765     }
4766 
4767   /* When the node can be vectorized cost invariant nodes it references.
4768      This is not done in DFS order to allow the refering node
4769      vectorizable_* calls to nail down the invariant nodes vector type
4770      and possibly unshare it if it needs a different vector type than
4771      other referrers.  */
4772   if (res)
4773     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4774       if (child
4775             && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4776                 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4777             /* Perform usual caching, note code-generation still
4778                code-gens these nodes multiple times but we expect
4779                to CSE them later.  */
4780             && !visited_set.add (child))
4781           {
4782             visited_vec.safe_push (child);
4783             /* ???  After auditing more code paths make a "default"
4784                and push the vector type from NODE to all children
4785                if it is not already set.  */
4786             /* Compute the number of vectors to be generated.  */
4787             tree vector_type = SLP_TREE_VECTYPE (child);
4788             if (!vector_type)
4789               {
4790                 /* For shifts with a scalar argument we don't need
4791                      to cost or code-generate anything.
4792                      ???  Represent this more explicitely.  */
4793                 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4794                                  == shift_vec_info_type)
4795                                 && j == 1);
4796                 continue;
4797               }
4798             unsigned group_size = SLP_TREE_LANES (child);
4799             poly_uint64 vf = 1;
4800             if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4801               vf = loop_vinfo->vectorization_factor;
4802             SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4803               = vect_get_num_vectors (vf * group_size, vector_type);
4804             /* And cost them.  */
4805             vect_prologue_cost_for_slp (child, cost_vec);
4806           }
4807 
4808   /* If this node or any of its children can't be vectorized, try pruning
4809      the tree here rather than felling the whole thing.  */
4810   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4811     {
4812       /* We'll need to revisit this for invariant costing and number
4813            of vectorized stmt setting.   */
4814       res = true;
4815     }
4816 
4817   return res;
4818 }
4819 
4820 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4821    region and that can be vectorized using vectorizable_live_operation
4822    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
4823    scalar code computing it to be retained.  */
4824 
4825 static void
vect_bb_slp_mark_live_stmts(bb_vec_info bb_vinfo,slp_tree node,slp_instance instance,stmt_vector_for_cost * cost_vec,hash_set<stmt_vec_info> & svisited,hash_set<slp_tree> & visited)4826 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4827                                    slp_instance instance,
4828                                    stmt_vector_for_cost *cost_vec,
4829                                    hash_set<stmt_vec_info> &svisited,
4830                                    hash_set<slp_tree> &visited)
4831 {
4832   if (visited.add (node))
4833     return;
4834 
4835   unsigned i;
4836   stmt_vec_info stmt_info;
4837   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4838   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4839     {
4840       if (svisited.contains (stmt_info))
4841           continue;
4842       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4843       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4844             && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4845           /* Only the pattern root stmt computes the original scalar value.  */
4846           continue;
4847       bool mark_visited = true;
4848       gimple *orig_stmt = orig_stmt_info->stmt;
4849       ssa_op_iter op_iter;
4850       def_operand_p def_p;
4851       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4852           {
4853             imm_use_iterator use_iter;
4854             gimple *use_stmt;
4855             stmt_vec_info use_stmt_info;
4856             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4857               if (!is_gimple_debug (use_stmt))
4858                 {
4859                     use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4860                     if (!use_stmt_info
4861                         || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4862                       {
4863                         STMT_VINFO_LIVE_P (stmt_info) = true;
4864                         if (vectorizable_live_operation (bb_vinfo, stmt_info,
4865                                                                  NULL, node, instance, i,
4866                                                                  false, cost_vec))
4867                           /* ???  So we know we can vectorize the live stmt
4868                                from one SLP node.  If we cannot do so from all
4869                                or none consistently we'd have to record which
4870                                SLP node (and lane) we want to use for the live
4871                                operation.  So make sure we can code-generate
4872                                from all nodes.  */
4873                           mark_visited = false;
4874                         else
4875                           STMT_VINFO_LIVE_P (stmt_info) = false;
4876                         break;
4877                       }
4878                 }
4879             /* We have to verify whether we can insert the lane extract
4880                before all uses.  The following is a conservative approximation.
4881                We cannot put this into vectorizable_live_operation because
4882                iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4883                doesn't work.
4884                Note that while the fact that we emit code for loads at the
4885                first load should make this a non-problem leafs we construct
4886                from scalars are vectorized after the last scalar def.
4887                ???  If we'd actually compute the insert location during
4888                analysis we could use sth less conservative than the last
4889                scalar stmt in the node for the dominance check.  */
4890             /* ???  What remains is "live" uses in vector CTORs in the same
4891                SLP graph which is where those uses can end up code-generated
4892                right after their definition instead of close to their original
4893                use.  But that would restrict us to code-generate lane-extracts
4894                from the latest stmt in a node.  So we compensate for this
4895                during code-generation, simply not replacing uses for those
4896                hopefully rare cases.  */
4897             if (STMT_VINFO_LIVE_P (stmt_info))
4898               FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4899                 if (!is_gimple_debug (use_stmt)
4900                       && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4901                           || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4902                       && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4903                     {
4904                       if (dump_enabled_p ())
4905                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4906                                              "Cannot determine insertion place for "
4907                                              "lane extract\n");
4908                       STMT_VINFO_LIVE_P (stmt_info) = false;
4909                       mark_visited = true;
4910                     }
4911           }
4912       if (mark_visited)
4913           svisited.add (stmt_info);
4914     }
4915 
4916   slp_tree child;
4917   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4918     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4919       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4920                                            cost_vec, svisited, visited);
4921 }
4922 
4923 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
4924 
4925 static bool
vectorizable_bb_reduc_epilogue(slp_instance instance,stmt_vector_for_cost * cost_vec)4926 vectorizable_bb_reduc_epilogue (slp_instance instance,
4927                                         stmt_vector_for_cost *cost_vec)
4928 {
4929   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4930   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4931   if (reduc_code == MINUS_EXPR)
4932     reduc_code = PLUS_EXPR;
4933   internal_fn reduc_fn;
4934   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4935   if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4936       || reduc_fn == IFN_LAST
4937       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4938       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4939                                              TREE_TYPE (vectype)))
4940     return false;
4941 
4942   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4943      cost log2 vector operations plus shuffles and one extraction.  */
4944   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4945   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4946                         vectype, 0, vect_body);
4947   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4948                         vectype, 0, vect_body);
4949   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4950                         vectype, 0, vect_body);
4951   return true;
4952 }
4953 
4954 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4955    and recurse to children.  */
4956 
4957 static void
vect_slp_prune_covered_roots(slp_tree node,hash_set<stmt_vec_info> & roots,hash_set<slp_tree> & visited)4958 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4959                                     hash_set<slp_tree> &visited)
4960 {
4961   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4962       || visited.add (node))
4963     return;
4964 
4965   stmt_vec_info stmt;
4966   unsigned i;
4967   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4968     roots.remove (vect_orig_stmt (stmt));
4969 
4970   slp_tree child;
4971   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4972     if (child)
4973       vect_slp_prune_covered_roots (child, roots, visited);
4974 }
4975 
4976 /* Analyze statements in SLP instances of VINFO.  Return true if the
4977    operations are supported. */
4978 
4979 bool
vect_slp_analyze_operations(vec_info * vinfo)4980 vect_slp_analyze_operations (vec_info *vinfo)
4981 {
4982   slp_instance instance;
4983   int i;
4984 
4985   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4986 
4987   hash_set<slp_tree> visited;
4988   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4989     {
4990       auto_vec<slp_tree> visited_vec;
4991       stmt_vector_for_cost cost_vec;
4992       cost_vec.create (2);
4993       if (is_a <bb_vec_info> (vinfo))
4994           vect_location = instance->location ();
4995       if (!vect_slp_analyze_node_operations (vinfo,
4996                                                        SLP_INSTANCE_TREE (instance),
4997                                                        instance, visited, visited_vec,
4998                                                        &cost_vec)
4999             /* CTOR instances require vectorized defs for the SLP tree root.  */
5000             || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
5001                 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
5002                       != vect_internal_def
5003                       /* Make sure we vectorized with the expected type.  */
5004                       || !useless_type_conversion_p
5005                               (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
5006                                                         (instance->root_stmts[0]->stmt))),
5007                                TREE_TYPE (SLP_TREE_VECTYPE
5008                                                       (SLP_INSTANCE_TREE (instance))))))
5009             /* Check we can vectorize the reduction.  */
5010             || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
5011                 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
5012         {
5013             slp_tree node = SLP_INSTANCE_TREE (instance);
5014             stmt_vec_info stmt_info;
5015             if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5016               stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5017             else
5018               stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5019             if (dump_enabled_p ())
5020               dump_printf_loc (MSG_NOTE, vect_location,
5021                                    "removing SLP instance operations starting from: %G",
5022                                    stmt_info->stmt);
5023             vect_free_slp_instance (instance);
5024           vinfo->slp_instances.ordered_remove (i);
5025             cost_vec.release ();
5026             while (!visited_vec.is_empty ())
5027               visited.remove (visited_vec.pop ());
5028           }
5029       else
5030           {
5031             i++;
5032             if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
5033               {
5034                 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
5035                 cost_vec.release ();
5036               }
5037             else
5038               /* For BB vectorization remember the SLP graph entry
5039                  cost for later.  */
5040               instance->cost_vec = cost_vec;
5041           }
5042     }
5043 
5044   /* Now look for SLP instances with a root that are covered by other
5045      instances and remove them.  */
5046   hash_set<stmt_vec_info> roots;
5047   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5048     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5049       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
5050   if (!roots.is_empty ())
5051     {
5052       visited.empty ();
5053       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5054           vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
5055                                               visited);
5056       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
5057           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
5058               && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
5059             {
5060               stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5061               if (dump_enabled_p ())
5062                 dump_printf_loc (MSG_NOTE, vect_location,
5063                                      "removing SLP instance operations starting "
5064                                      "from: %G", root->stmt);
5065               vect_free_slp_instance (instance);
5066               vinfo->slp_instances.ordered_remove (i);
5067             }
5068           else
5069             ++i;
5070     }
5071 
5072   /* Compute vectorizable live stmts.  */
5073   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5074     {
5075       hash_set<stmt_vec_info> svisited;
5076       hash_set<slp_tree> visited;
5077       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5078           {
5079             vect_location = instance->location ();
5080             vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5081                                                instance, &instance->cost_vec, svisited,
5082                                                visited);
5083           }
5084     }
5085 
5086   return !vinfo->slp_instances.is_empty ();
5087 }
5088 
5089 /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
5090    closing the eventual chain.  */
5091 
5092 static slp_instance
get_ultimate_leader(slp_instance instance,hash_map<slp_instance,slp_instance> & instance_leader)5093 get_ultimate_leader (slp_instance instance,
5094                          hash_map<slp_instance, slp_instance> &instance_leader)
5095 {
5096   auto_vec<slp_instance *, 8> chain;
5097   slp_instance *tem;
5098   while (*(tem = instance_leader.get (instance)) != instance)
5099     {
5100       chain.safe_push (tem);
5101       instance = *tem;
5102     }
5103   while (!chain.is_empty ())
5104     *chain.pop () = instance;
5105   return instance;
5106 }
5107 
5108 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
5109 
5110 static void
vect_bb_partition_graph_r(bb_vec_info bb_vinfo,slp_instance instance,slp_tree node,hash_map<stmt_vec_info,slp_instance> & stmt_to_instance,hash_map<slp_instance,slp_instance> & instance_leader,hash_set<slp_tree> & visited)5111 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5112                                  slp_instance instance, slp_tree node,
5113                                  hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5114                                  hash_map<slp_instance, slp_instance> &instance_leader,
5115                                  hash_set<slp_tree> &visited)
5116 {
5117   stmt_vec_info stmt_info;
5118   unsigned i;
5119 
5120   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5121     {
5122       bool existed_p;
5123       slp_instance &stmt_instance
5124           = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5125       if (!existed_p)
5126           ;
5127       else if (stmt_instance != instance)
5128           {
5129             /* If we're running into a previously marked stmt make us the
5130                leader of the current ultimate leader.  This keeps the
5131                leader chain acyclic and works even when the current instance
5132                connects two previously independent graph parts.  */
5133             slp_instance stmt_leader
5134               = get_ultimate_leader (stmt_instance, instance_leader);
5135             if (stmt_leader != instance)
5136               instance_leader.put (stmt_leader, instance);
5137           }
5138       stmt_instance = instance;
5139     }
5140 
5141   if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5142     return;
5143 
5144   slp_tree child;
5145   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5146     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5147       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5148                                          instance_leader, visited);
5149 }
5150 
5151 /* Partition the SLP graph into pieces that can be costed independently.  */
5152 
5153 static void
vect_bb_partition_graph(bb_vec_info bb_vinfo)5154 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5155 {
5156   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5157 
5158   /* First walk the SLP graph assigning each involved scalar stmt a
5159      corresponding SLP graph entry and upon visiting a previously
5160      marked stmt, make the stmts leader the current SLP graph entry.  */
5161   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5162   hash_map<slp_instance, slp_instance> instance_leader;
5163   hash_set<slp_tree> visited;
5164   slp_instance instance;
5165   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5166     {
5167       instance_leader.put (instance, instance);
5168       vect_bb_partition_graph_r (bb_vinfo,
5169                                          instance, SLP_INSTANCE_TREE (instance),
5170                                          stmt_to_instance, instance_leader,
5171                                          visited);
5172     }
5173 
5174   /* Then collect entries to each independent subgraph.  */
5175   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5176     {
5177       slp_instance leader = get_ultimate_leader (instance, instance_leader);
5178       leader->subgraph_entries.safe_push (instance);
5179       if (dump_enabled_p ()
5180             && leader != instance)
5181           dump_printf_loc (MSG_NOTE, vect_location,
5182                                "instance %p is leader of %p\n",
5183                                leader, instance);
5184     }
5185 }
5186 
5187 /* Compute the set of scalar stmts participating in internal and external
5188    nodes.  */
5189 
5190 static void
vect_slp_gather_vectorized_scalar_stmts(vec_info * vinfo,slp_tree node,hash_set<slp_tree> & visited,hash_set<stmt_vec_info> & vstmts,hash_set<stmt_vec_info> & estmts)5191 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5192                                                    hash_set<slp_tree> &visited,
5193                                                    hash_set<stmt_vec_info> &vstmts,
5194                                                    hash_set<stmt_vec_info> &estmts)
5195 {
5196   int i;
5197   stmt_vec_info stmt_info;
5198   slp_tree child;
5199 
5200   if (visited.add (node))
5201     return;
5202 
5203   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5204     {
5205       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5206           vstmts.add (stmt_info);
5207 
5208       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5209           if (child)
5210             vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5211                                                                vstmts, estmts);
5212     }
5213   else
5214     for (tree def : SLP_TREE_SCALAR_OPS (node))
5215       {
5216           stmt_vec_info def_stmt = vinfo->lookup_def (def);
5217           if (def_stmt)
5218             estmts.add (def_stmt);
5219       }
5220 }
5221 
5222 
5223 /* Compute the scalar cost of the SLP node NODE and its children
5224    and return it.  Do not account defs that are marked in LIFE and
5225    update LIFE according to uses of NODE.  */
5226 
5227 static void
vect_bb_slp_scalar_cost(vec_info * vinfo,slp_tree node,vec<bool,va_heap> * life,stmt_vector_for_cost * cost_vec,hash_set<stmt_vec_info> & vectorized_scalar_stmts,hash_set<slp_tree> & visited)5228 vect_bb_slp_scalar_cost (vec_info *vinfo,
5229                                slp_tree node, vec<bool, va_heap> *life,
5230                                stmt_vector_for_cost *cost_vec,
5231                                hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5232                                hash_set<slp_tree> &visited)
5233 {
5234   unsigned i;
5235   stmt_vec_info stmt_info;
5236   slp_tree child;
5237 
5238   if (visited.add (node))
5239     return;
5240 
5241   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5242     {
5243       ssa_op_iter op_iter;
5244       def_operand_p def_p;
5245 
5246       if ((*life)[i])
5247           continue;
5248 
5249       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5250       gimple *orig_stmt = orig_stmt_info->stmt;
5251 
5252       /* If there is a non-vectorized use of the defs then the scalar
5253          stmt is kept live in which case we do not account it or any
5254            required defs in the SLP children in the scalar cost.  This
5255            way we make the vectorization more costly when compared to
5256            the scalar cost.  */
5257       if (!STMT_VINFO_LIVE_P (stmt_info))
5258           {
5259             auto_vec<gimple *, 8> worklist;
5260             hash_set<gimple *> *worklist_visited = NULL;
5261             worklist.quick_push (orig_stmt);
5262             do
5263               {
5264                 gimple *work_stmt = worklist.pop ();
5265                 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
5266                     {
5267                       imm_use_iterator use_iter;
5268                       gimple *use_stmt;
5269                       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
5270                                                    DEF_FROM_PTR (def_p))
5271                         if (!is_gimple_debug (use_stmt))
5272                           {
5273                               stmt_vec_info use_stmt_info
5274                                 = vinfo->lookup_stmt (use_stmt);
5275                               if (!use_stmt_info
5276                                   || !vectorized_scalar_stmts.contains (use_stmt_info))
5277                                 {
5278                                   if (use_stmt_info
5279                                         && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
5280                                     {
5281                                         /* For stmts participating in patterns we have
5282                                            to check its uses recursively.  */
5283                                         if (!worklist_visited)
5284                                           worklist_visited = new hash_set<gimple *> ();
5285                                         if (!worklist_visited->add (use_stmt))
5286                                           worklist.safe_push (use_stmt);
5287                                         continue;
5288                                     }
5289                                   (*life)[i] = true;
5290                                   goto next_lane;
5291                                 }
5292                           }
5293                     }
5294               }
5295             while (!worklist.is_empty ());
5296 next_lane:
5297             if (worklist_visited)
5298               delete worklist_visited;
5299             if ((*life)[i])
5300               continue;
5301           }
5302 
5303       /* Count scalar stmts only once.  */
5304       if (gimple_visited_p (orig_stmt))
5305           continue;
5306       gimple_set_visited (orig_stmt, true);
5307 
5308       vect_cost_for_stmt kind;
5309       if (STMT_VINFO_DATA_REF (orig_stmt_info))
5310           {
5311             if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5312               kind = scalar_load;
5313             else
5314               kind = scalar_store;
5315           }
5316       else if (vect_nop_conversion_p (orig_stmt_info))
5317           continue;
5318       /* For single-argument PHIs assume coalescing which means zero cost
5319            for the scalar and the vector PHIs.  This avoids artificially
5320            favoring the vector path (but may pessimize it in some cases).  */
5321       else if (is_a <gphi *> (orig_stmt_info->stmt)
5322                  && gimple_phi_num_args
5323                         (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5324           continue;
5325       else
5326           kind = scalar_stmt;
5327       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5328                               SLP_TREE_VECTYPE (node), 0, vect_body);
5329     }
5330 
5331   auto_vec<bool, 20> subtree_life;
5332   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5333     {
5334       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5335           {
5336             /* Do not directly pass LIFE to the recursive call, copy it to
5337                confine changes in the callee to the current child/subtree.  */
5338             if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5339               {
5340                 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5341                 for (unsigned j = 0;
5342                        j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5343                     {
5344                       auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5345                       if (perm.first == i)
5346                         subtree_life[perm.second] = (*life)[j];
5347                     }
5348               }
5349             else
5350               {
5351                 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5352                 subtree_life.safe_splice (*life);
5353               }
5354             vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5355                                            vectorized_scalar_stmts, visited);
5356             subtree_life.truncate (0);
5357           }
5358     }
5359 }
5360 
5361 /* Comparator for the loop-index sorted cost vectors.  */
5362 
5363 static int
li_cost_vec_cmp(const void * a_,const void * b_)5364 li_cost_vec_cmp (const void *a_, const void *b_)
5365 {
5366   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5367   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5368   if (a->first < b->first)
5369     return -1;
5370   else if (a->first == b->first)
5371     return 0;
5372   return 1;
5373 }
5374 
5375 /* Check if vectorization of the basic block is profitable for the
5376    subgraph denoted by SLP_INSTANCES.  */
5377 
5378 static bool
vect_bb_vectorization_profitable_p(bb_vec_info bb_vinfo,vec<slp_instance> slp_instances,loop_p orig_loop)5379 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5380                                             vec<slp_instance> slp_instances,
5381                                             loop_p orig_loop)
5382 {
5383   slp_instance instance;
5384   int i;
5385   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5386   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5387 
5388   if (dump_enabled_p ())
5389     {
5390       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5391       hash_set<slp_tree> visited;
5392       FOR_EACH_VEC_ELT (slp_instances, i, instance)
5393           vect_print_slp_graph (MSG_NOTE, vect_location,
5394                                     SLP_INSTANCE_TREE (instance), visited);
5395     }
5396 
5397   /* Compute the set of scalar stmts we know will go away 'locally' when
5398      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
5399      not accurate for nodes promoted extern late or for scalar stmts that
5400      are used both in extern defs and in vectorized defs.  */
5401   hash_set<stmt_vec_info> vectorized_scalar_stmts;
5402   hash_set<stmt_vec_info> scalar_stmts_in_externs;
5403   hash_set<slp_tree> visited;
5404   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5405     {
5406       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5407                                                          SLP_INSTANCE_TREE (instance),
5408                                                          visited,
5409                                                          vectorized_scalar_stmts,
5410                                                          scalar_stmts_in_externs);
5411       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5412           vectorized_scalar_stmts.add (rstmt);
5413     }
5414   /* Scalar stmts used as defs in external nodes need to be preseved, so
5415      remove them from vectorized_scalar_stmts.  */
5416   for (stmt_vec_info stmt : scalar_stmts_in_externs)
5417     vectorized_scalar_stmts.remove (stmt);
5418 
5419   /* Calculate scalar cost and sum the cost for the vector stmts
5420      previously collected.  */
5421   stmt_vector_for_cost scalar_costs = vNULL;
5422   stmt_vector_for_cost vector_costs = vNULL;
5423   visited.empty ();
5424   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5425     {
5426       auto_vec<bool, 20> life;
5427       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5428                                     true);
5429       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5430           record_stmt_cost (&scalar_costs,
5431                                 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5432                                 scalar_stmt,
5433                                 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5434       vect_bb_slp_scalar_cost (bb_vinfo,
5435                                      SLP_INSTANCE_TREE (instance),
5436                                      &life, &scalar_costs, vectorized_scalar_stmts,
5437                                      visited);
5438       vector_costs.safe_splice (instance->cost_vec);
5439       instance->cost_vec.release ();
5440     }
5441 
5442   if (dump_enabled_p ())
5443     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5444 
5445   /* When costing non-loop vectorization we need to consider each covered
5446      loop independently and make sure vectorization is profitable.  For
5447      now we assume a loop may be not entered or executed an arbitrary
5448      number of iterations (???  static information can provide more
5449      precise info here) which means we can simply cost each containing
5450      loops stmts separately.  */
5451 
5452   /* First produce cost vectors sorted by loop index.  */
5453   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5454     li_scalar_costs (scalar_costs.length ());
5455   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5456     li_vector_costs (vector_costs.length ());
5457   stmt_info_for_cost *cost;
5458   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5459     {
5460       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5461       li_scalar_costs.quick_push (std::make_pair (l, cost));
5462     }
5463   /* Use a random used loop as fallback in case the first vector_costs
5464      entry does not have a stmt_info associated with it.  */
5465   unsigned l = li_scalar_costs[0].first;
5466   FOR_EACH_VEC_ELT (vector_costs, i, cost)
5467     {
5468       /* We inherit from the previous COST, invariants, externals and
5469            extracts immediately follow the cost for the related stmt.  */
5470       if (cost->stmt_info)
5471           l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5472       li_vector_costs.quick_push (std::make_pair (l, cost));
5473     }
5474   li_scalar_costs.qsort (li_cost_vec_cmp);
5475   li_vector_costs.qsort (li_cost_vec_cmp);
5476 
5477   /* Now cost the portions individually.  */
5478   unsigned vi = 0;
5479   unsigned si = 0;
5480   bool profitable = true;
5481   while (si < li_scalar_costs.length ()
5482            && vi < li_vector_costs.length ())
5483     {
5484       unsigned sl = li_scalar_costs[si].first;
5485       unsigned vl = li_vector_costs[vi].first;
5486       if (sl != vl)
5487           {
5488             if (dump_enabled_p ())
5489               dump_printf_loc (MSG_NOTE, vect_location,
5490                                    "Scalar %d and vector %d loop part do not "
5491                                    "match up, skipping scalar part\n", sl, vl);
5492             /* Skip the scalar part, assuming zero cost on the vector side.  */
5493             do
5494               {
5495                 si++;
5496               }
5497             while (si < li_scalar_costs.length ()
5498                      && li_scalar_costs[si].first == sl);
5499             continue;
5500           }
5501 
5502       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5503       do
5504           {
5505             add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5506             si++;
5507           }
5508       while (si < li_scalar_costs.length ()
5509                && li_scalar_costs[si].first == sl);
5510       unsigned dummy;
5511       finish_cost (scalar_target_cost_data, nullptr,
5512                        &dummy, &scalar_cost, &dummy);
5513 
5514       /* Complete the target-specific vector cost calculation.  */
5515       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5516       do
5517           {
5518             add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5519             vi++;
5520           }
5521       while (vi < li_vector_costs.length ()
5522                && li_vector_costs[vi].first == vl);
5523       finish_cost (vect_target_cost_data, scalar_target_cost_data,
5524                        &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5525       delete scalar_target_cost_data;
5526       delete vect_target_cost_data;
5527 
5528       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5529 
5530       if (dump_enabled_p ())
5531           {
5532             dump_printf_loc (MSG_NOTE, vect_location,
5533                                  "Cost model analysis for part in loop %d:\n", sl);
5534             dump_printf (MSG_NOTE, "  Vector cost: %d\n",
5535                            vec_inside_cost + vec_outside_cost);
5536             dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
5537           }
5538 
5539       /* Vectorization is profitable if its cost is more than the cost of scalar
5540            version.  Note that we err on the vector side for equal cost because
5541            the cost estimate is otherwise quite pessimistic (constant uses are
5542            free on the scalar side but cost a load on the vector side for
5543            example).  */
5544       if (vec_outside_cost + vec_inside_cost > scalar_cost)
5545           {
5546             profitable = false;
5547             break;
5548           }
5549     }
5550   if (profitable && vi < li_vector_costs.length ())
5551     {
5552       if (dump_enabled_p ())
5553           dump_printf_loc (MSG_NOTE, vect_location,
5554                                "Excess vector cost for part in loop %d:\n",
5555                                li_vector_costs[vi].first);
5556       profitable = false;
5557     }
5558 
5559   /* Unset visited flag.  This is delayed when the subgraph is profitable
5560      and we process the loop for remaining unvectorized if-converted code.  */
5561   if (!orig_loop || !profitable)
5562     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5563       gimple_set_visited  (cost->stmt_info->stmt, false);
5564 
5565   scalar_costs.release ();
5566   vector_costs.release ();
5567 
5568   return profitable;
5569 }
5570 
5571 /* qsort comparator for lane defs.  */
5572 
5573 static int
vld_cmp(const void * a_,const void * b_)5574 vld_cmp (const void *a_, const void *b_)
5575 {
5576   auto *a = (const std::pair<unsigned, tree> *)a_;
5577   auto *b = (const std::pair<unsigned, tree> *)b_;
5578   return a->first - b->first;
5579 }
5580 
5581 /* Return true if USE_STMT is a vector lane insert into VEC and set
5582    *THIS_LANE to the lane number that is set.  */
5583 
5584 static bool
vect_slp_is_lane_insert(gimple * use_stmt,tree vec,unsigned * this_lane)5585 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5586 {
5587   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5588   if (!use_ass
5589       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5590       || (vec
5591             ? gimple_assign_rhs1 (use_ass) != vec
5592             : ((vec = gimple_assign_rhs1 (use_ass)), false))
5593       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5594                                              TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5595       || !constant_multiple_p
5596               (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5597                tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5598                this_lane))
5599     return false;
5600   return true;
5601 }
5602 
5603 /* Find any vectorizable constructors and add them to the grouped_store
5604    array.  */
5605 
5606 static void
vect_slp_check_for_constructors(bb_vec_info bb_vinfo)5607 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5608 {
5609   for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5610     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5611            !gsi_end_p (gsi); gsi_next (&gsi))
5612     {
5613       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5614       if (!assign)
5615           continue;
5616 
5617       tree rhs = gimple_assign_rhs1 (assign);
5618       enum tree_code code = gimple_assign_rhs_code (assign);
5619       use_operand_p use_p;
5620       gimple *use_stmt;
5621       if (code == CONSTRUCTOR)
5622           {
5623             if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5624                 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5625                                  CONSTRUCTOR_NELTS (rhs))
5626                 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5627                 || uniform_vector_p (rhs))
5628               continue;
5629 
5630             unsigned j;
5631             tree val;
5632             FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5633                 if (TREE_CODE (val) != SSA_NAME
5634                       || !bb_vinfo->lookup_def (val))
5635                     break;
5636             if (j != CONSTRUCTOR_NELTS (rhs))
5637               continue;
5638 
5639             stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5640             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5641           }
5642       else if (code == BIT_INSERT_EXPR
5643                  && VECTOR_TYPE_P (TREE_TYPE (rhs))
5644                  && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5645                  && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5646                  && integer_zerop (gimple_assign_rhs3 (assign))
5647                  && useless_type_conversion_p
5648                         (TREE_TYPE (TREE_TYPE (rhs)),
5649                          TREE_TYPE (gimple_assign_rhs2 (assign)))
5650                  && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5651           {
5652             /* We start to match on insert to lane zero but since the
5653                inserts need not be ordered we'd have to search both
5654                the def and the use chains.  */
5655             tree vectype = TREE_TYPE (rhs);
5656             unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5657             auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5658             auto_sbitmap lanes (nlanes);
5659             bitmap_clear (lanes);
5660             bitmap_set_bit (lanes, 0);
5661             tree def = gimple_assign_lhs (assign);
5662             lane_defs.quick_push
5663                           (std::make_pair (0, gimple_assign_rhs2 (assign)));
5664             unsigned lanes_found = 1;
5665             /* Start with the use chains, the last stmt will be the root.  */
5666             stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5667             vec<stmt_vec_info> roots = vNULL;
5668             roots.safe_push (last);
5669             do
5670               {
5671                 use_operand_p use_p;
5672                 gimple *use_stmt;
5673                 if (!single_imm_use (def, &use_p, &use_stmt))
5674                     break;
5675                 unsigned this_lane;
5676                 if (!bb_vinfo->lookup_stmt (use_stmt)
5677                       || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5678                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5679                     break;
5680                 if (bitmap_bit_p (lanes, this_lane))
5681                     break;
5682                 lanes_found++;
5683                 bitmap_set_bit (lanes, this_lane);
5684                 gassign *use_ass = as_a <gassign *> (use_stmt);
5685                 lane_defs.quick_push (std::make_pair
5686                                              (this_lane, gimple_assign_rhs2 (use_ass)));
5687                 last = bb_vinfo->lookup_stmt (use_ass);
5688                 roots.safe_push (last);
5689                 def = gimple_assign_lhs (use_ass);
5690               }
5691             while (lanes_found < nlanes);
5692             if (roots.length () > 1)
5693               std::swap(roots[0], roots[roots.length () - 1]);
5694             if (lanes_found < nlanes)
5695               {
5696                 /* Now search the def chain.  */
5697                 def = gimple_assign_rhs1 (assign);
5698                 do
5699                     {
5700                       if (TREE_CODE (def) != SSA_NAME
5701                           || !has_single_use (def))
5702                         break;
5703                       gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5704                       unsigned this_lane;
5705                       if (!bb_vinfo->lookup_stmt (def_stmt)
5706                           || !vect_slp_is_lane_insert (def_stmt,
5707                                                                NULL_TREE, &this_lane)
5708                           || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5709                         break;
5710                       if (bitmap_bit_p (lanes, this_lane))
5711                         break;
5712                       lanes_found++;
5713                       bitmap_set_bit (lanes, this_lane);
5714                       lane_defs.quick_push (std::make_pair
5715                                                     (this_lane,
5716                                                      gimple_assign_rhs2 (def_stmt)));
5717                       roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5718                       def = gimple_assign_rhs1 (def_stmt);
5719                     }
5720                 while (lanes_found < nlanes);
5721               }
5722             if (lanes_found == nlanes)
5723               {
5724                 /* Sort lane_defs after the lane index and register the root.  */
5725                 lane_defs.qsort (vld_cmp);
5726                 vec<stmt_vec_info> stmts;
5727                 stmts.create (nlanes);
5728                 for (unsigned i = 0; i < nlanes; ++i)
5729                     stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5730                 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5731                                                                stmts, roots));
5732               }
5733             else
5734               roots.release ();
5735           }
5736       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5737                  && (associative_tree_code (code) || code == MINUS_EXPR)
5738                  /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
5739                       checks pessimize a two-element reduction.  PR54400.
5740                       ???  In-order reduction could be handled if we only
5741                       traverse one operand chain in vect_slp_linearize_chain.  */
5742                  && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5743                        || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5744                            && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5745                  /* Ops with constants at the tail can be stripped here.  */
5746                  && TREE_CODE (rhs) == SSA_NAME
5747                  && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5748                  /* Should be the chain end.  */
5749                  && (!single_imm_use (gimple_assign_lhs (assign),
5750                                             &use_p, &use_stmt)
5751                        || !is_gimple_assign (use_stmt)
5752                        || (gimple_assign_rhs_code (use_stmt) != code
5753                            && ((code != PLUS_EXPR && code != MINUS_EXPR)
5754                                  || (gimple_assign_rhs_code (use_stmt)
5755                                      != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5756           {
5757             /* We start the match at the end of a possible association
5758                chain.  */
5759             auto_vec<chain_op_t> chain;
5760             auto_vec<std::pair<tree_code, gimple *> > worklist;
5761             auto_vec<gimple *> chain_stmts;
5762             gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5763             if (code == MINUS_EXPR)
5764               code = PLUS_EXPR;
5765             internal_fn reduc_fn;
5766             if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5767                 || reduc_fn == IFN_LAST)
5768               continue;
5769             vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5770                                             /* ??? */
5771                                             code_stmt, alt_code_stmt, &chain_stmts);
5772             if (chain.length () > 1)
5773               {
5774                 /* Sort the chain according to def_type and operation.  */
5775                 chain.sort (dt_sort_cmp, bb_vinfo);
5776                 /* ???  Now we'd want to strip externals and constants
5777                      but record those to be handled in the epilogue.  */
5778                 /* ???  For now do not allow mixing ops or externs/constants.  */
5779                 bool invalid = false;
5780                 for (unsigned i = 0; i < chain.length (); ++i)
5781                     if (chain[i].dt != vect_internal_def
5782                         || chain[i].code != code)
5783                       invalid = true;
5784                 if (!invalid)
5785                     {
5786                       vec<stmt_vec_info> stmts;
5787                       stmts.create (chain.length ());
5788                       for (unsigned i = 0; i < chain.length (); ++i)
5789                         stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5790                       vec<stmt_vec_info> roots;
5791                       roots.create (chain_stmts.length ());
5792                       for (unsigned i = 0; i < chain_stmts.length (); ++i)
5793                         roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5794                       bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5795                                                                    stmts, roots));
5796                     }
5797               }
5798           }
5799     }
5800 }
5801 
5802 /* Walk the grouped store chains and replace entries with their
5803    pattern variant if any.  */
5804 
5805 static void
vect_fixup_store_groups_with_patterns(vec_info * vinfo)5806 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5807 {
5808   stmt_vec_info first_element;
5809   unsigned i;
5810 
5811   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5812     {
5813       /* We also have CTORs in this array.  */
5814       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5815           continue;
5816       if (STMT_VINFO_IN_PATTERN_P (first_element))
5817           {
5818             stmt_vec_info orig = first_element;
5819             first_element = STMT_VINFO_RELATED_STMT (first_element);
5820             DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5821             DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5822             DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5823             DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5824             vinfo->grouped_stores[i] = first_element;
5825           }
5826       stmt_vec_info prev = first_element;
5827       while (DR_GROUP_NEXT_ELEMENT (prev))
5828           {
5829             stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5830             if (STMT_VINFO_IN_PATTERN_P (elt))
5831               {
5832                 stmt_vec_info orig = elt;
5833                 elt = STMT_VINFO_RELATED_STMT (elt);
5834                 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5835                 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5836                 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5837               }
5838             DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5839             prev = elt;
5840           }
5841     }
5842 }
5843 
5844 /* Check if the region described by BB_VINFO can be vectorized, returning
5845    true if so.  When returning false, set FATAL to true if the same failure
5846    would prevent vectorization at other vector sizes, false if it is still
5847    worth trying other sizes.  N_STMTS is the number of statements in the
5848    region.  */
5849 
5850 static bool
vect_slp_analyze_bb_1(bb_vec_info bb_vinfo,int n_stmts,bool & fatal,vec<int> * dataref_groups)5851 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5852                            vec<int> *dataref_groups)
5853 {
5854   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5855 
5856   slp_instance instance;
5857   int i;
5858   poly_uint64 min_vf = 2;
5859 
5860   /* The first group of checks is independent of the vector size.  */
5861   fatal = true;
5862 
5863   /* Analyze the data references.  */
5864 
5865   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5866     {
5867       if (dump_enabled_p ())
5868         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5869                                "not vectorized: unhandled data-ref in basic "
5870                                "block.\n");
5871       return false;
5872     }
5873 
5874   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5875     {
5876      if (dump_enabled_p ())
5877        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5878                               "not vectorized: unhandled data access in "
5879                               "basic block.\n");
5880       return false;
5881     }
5882 
5883   vect_slp_check_for_constructors (bb_vinfo);
5884 
5885   /* If there are no grouped stores and no constructors in the region
5886      there is no need to continue with pattern recog as vect_analyze_slp
5887      will fail anyway.  */
5888   if (bb_vinfo->grouped_stores.is_empty ()
5889       && bb_vinfo->roots.is_empty ())
5890     {
5891       if (dump_enabled_p ())
5892           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5893                                "not vectorized: no grouped stores in "
5894                                "basic block.\n");
5895       return false;
5896     }
5897 
5898   /* While the rest of the analysis below depends on it in some way.  */
5899   fatal = false;
5900 
5901   vect_pattern_recog (bb_vinfo);
5902 
5903   /* Update store groups from pattern processing.  */
5904   vect_fixup_store_groups_with_patterns (bb_vinfo);
5905 
5906   /* Check the SLP opportunities in the basic block, analyze and build SLP
5907      trees.  */
5908   if (!vect_analyze_slp (bb_vinfo, n_stmts))
5909     {
5910       if (dump_enabled_p ())
5911           {
5912             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5913                                  "Failed to SLP the basic block.\n");
5914             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5915                                  "not vectorized: failed to find SLP opportunities "
5916                                  "in basic block.\n");
5917           }
5918       return false;
5919     }
5920 
5921   /* Optimize permutations.  */
5922   vect_optimize_slp (bb_vinfo);
5923 
5924   /* Gather the loads reachable from the SLP graph entries.  */
5925   vect_gather_slp_loads (bb_vinfo);
5926 
5927   vect_record_base_alignments (bb_vinfo);
5928 
5929   /* Analyze and verify the alignment of data references and the
5930      dependence in the SLP instances.  */
5931   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5932     {
5933       vect_location = instance->location ();
5934       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5935             || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5936           {
5937             slp_tree node = SLP_INSTANCE_TREE (instance);
5938             stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5939             if (dump_enabled_p ())
5940               dump_printf_loc (MSG_NOTE, vect_location,
5941                                    "removing SLP instance operations starting from: %G",
5942                                    stmt_info->stmt);
5943             vect_free_slp_instance (instance);
5944             BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5945             continue;
5946           }
5947 
5948       /* Mark all the statements that we want to vectorize as pure SLP and
5949            relevant.  */
5950       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5951       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5952       unsigned j;
5953       stmt_vec_info root;
5954       /* Likewise consider instance root stmts as vectorized.  */
5955       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5956           STMT_SLP_TYPE (root) = pure_slp;
5957 
5958       i++;
5959     }
5960   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5961     return false;
5962 
5963   if (!vect_slp_analyze_operations (bb_vinfo))
5964     {
5965       if (dump_enabled_p ())
5966         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5967                                "not vectorized: bad operation in basic block.\n");
5968       return false;
5969     }
5970 
5971   vect_bb_partition_graph (bb_vinfo);
5972 
5973   return true;
5974 }
5975 
5976 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
5977    basic blocks in BBS, returning true on success.
5978    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
5979 
5980 static bool
vect_slp_region(vec<basic_block> bbs,vec<data_reference_p> datarefs,vec<int> * dataref_groups,unsigned int n_stmts,loop_p orig_loop)5981 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5982                      vec<int> *dataref_groups, unsigned int n_stmts,
5983                      loop_p orig_loop)
5984 {
5985   bb_vec_info bb_vinfo;
5986   auto_vector_modes vector_modes;
5987 
5988   /* Autodetect first vector size we try.  */
5989   machine_mode next_vector_mode = VOIDmode;
5990   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5991   unsigned int mode_i = 0;
5992 
5993   vec_info_shared shared;
5994 
5995   machine_mode autodetected_vector_mode = VOIDmode;
5996   while (1)
5997     {
5998       bool vectorized = false;
5999       bool fatal = false;
6000       bb_vinfo = new _bb_vec_info (bbs, &shared);
6001 
6002       bool first_time_p = shared.datarefs.is_empty ();
6003       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
6004       if (first_time_p)
6005           bb_vinfo->shared->save_datarefs ();
6006       else
6007           bb_vinfo->shared->check_datarefs ();
6008       bb_vinfo->vector_mode = next_vector_mode;
6009 
6010       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
6011           {
6012             if (dump_enabled_p ())
6013               {
6014                 dump_printf_loc (MSG_NOTE, vect_location,
6015                                      "***** Analysis succeeded with vector mode"
6016                                      " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
6017                 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
6018               }
6019 
6020             bb_vinfo->shared->check_datarefs ();
6021 
6022             auto_vec<slp_instance> profitable_subgraphs;
6023             for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
6024               {
6025                 if (instance->subgraph_entries.is_empty ())
6026                     continue;
6027 
6028                 vect_location = instance->location ();
6029                 if (!unlimited_cost_model (NULL)
6030                       && !vect_bb_vectorization_profitable_p
6031                               (bb_vinfo, instance->subgraph_entries, orig_loop))
6032                     {
6033                       if (dump_enabled_p ())
6034                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6035                                              "not vectorized: vectorization is not "
6036                                              "profitable.\n");
6037                       continue;
6038                     }
6039 
6040                 if (!dbg_cnt (vect_slp))
6041                     continue;
6042 
6043                 profitable_subgraphs.safe_push (instance);
6044               }
6045 
6046             /* When we're vectorizing an if-converted loop body make sure
6047                we vectorized all if-converted code.  */
6048             if (!profitable_subgraphs.is_empty ()
6049                 && orig_loop)
6050               {
6051                 gcc_assert (bb_vinfo->bbs.length () == 1);
6052                 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
6053                        !gsi_end_p (gsi); gsi_next (&gsi))
6054                     {
6055                       /* The costing above left us with DCEable vectorized scalar
6056                          stmts having the visited flag set on profitable
6057                          subgraphs.  Do the delayed clearing of the flag here.  */
6058                       if (gimple_visited_p (gsi_stmt (gsi)))
6059                         {
6060                           gimple_set_visited (gsi_stmt (gsi), false);
6061                           continue;
6062                         }
6063                       if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
6064                         continue;
6065 
6066                       if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
6067                         if (gimple_assign_rhs_code (ass) == COND_EXPR)
6068                           {
6069                               if (!profitable_subgraphs.is_empty ()
6070                                   && dump_enabled_p ())
6071                                 dump_printf_loc (MSG_NOTE, vect_location,
6072                                                      "not profitable because of "
6073                                                      "unprofitable if-converted scalar "
6074                                                      "code\n");
6075                               profitable_subgraphs.truncate (0);
6076                           }
6077                     }
6078               }
6079 
6080             /* Finally schedule the profitable subgraphs.  */
6081             for (slp_instance instance : profitable_subgraphs)
6082               {
6083                 if (!vectorized && dump_enabled_p ())
6084                     dump_printf_loc (MSG_NOTE, vect_location,
6085                                          "Basic block will be vectorized "
6086                                          "using SLP\n");
6087                 vectorized = true;
6088 
6089                 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
6090 
6091                 unsigned HOST_WIDE_INT bytes;
6092                 if (dump_enabled_p ())
6093                     {
6094                       if (GET_MODE_SIZE
6095                               (bb_vinfo->vector_mode).is_constant (&bytes))
6096                         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6097                                              "basic block part vectorized using %wu "
6098                                              "byte vectors\n", bytes);
6099                       else
6100                         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6101                                              "basic block part vectorized using "
6102                                              "variable length vectors\n");
6103                     }
6104               }
6105           }
6106       else
6107           {
6108             if (dump_enabled_p ())
6109               dump_printf_loc (MSG_NOTE, vect_location,
6110                                    "***** Analysis failed with vector mode %s\n",
6111                                    GET_MODE_NAME (bb_vinfo->vector_mode));
6112           }
6113 
6114       if (mode_i == 0)
6115           autodetected_vector_mode = bb_vinfo->vector_mode;
6116 
6117       if (!fatal)
6118           while (mode_i < vector_modes.length ()
6119                  && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6120             {
6121               if (dump_enabled_p ())
6122                 dump_printf_loc (MSG_NOTE, vect_location,
6123                                      "***** The result for vector mode %s would"
6124                                      " be the same\n",
6125                                      GET_MODE_NAME (vector_modes[mode_i]));
6126               mode_i += 1;
6127             }
6128 
6129       delete bb_vinfo;
6130 
6131       if (mode_i < vector_modes.length ()
6132             && VECTOR_MODE_P (autodetected_vector_mode)
6133             && (related_vector_mode (vector_modes[mode_i],
6134                                            GET_MODE_INNER (autodetected_vector_mode))
6135                 == autodetected_vector_mode)
6136             && (related_vector_mode (autodetected_vector_mode,
6137                                            GET_MODE_INNER (vector_modes[mode_i]))
6138                 == vector_modes[mode_i]))
6139           {
6140             if (dump_enabled_p ())
6141               dump_printf_loc (MSG_NOTE, vect_location,
6142                                    "***** Skipping vector mode %s, which would"
6143                                    " repeat the analysis for %s\n",
6144                                    GET_MODE_NAME (vector_modes[mode_i]),
6145                                    GET_MODE_NAME (autodetected_vector_mode));
6146             mode_i += 1;
6147           }
6148 
6149       if (vectorized
6150             || mode_i == vector_modes.length ()
6151             || autodetected_vector_mode == VOIDmode
6152             /* If vect_slp_analyze_bb_1 signaled that analysis for all
6153                vector sizes will fail do not bother iterating.  */
6154             || fatal)
6155           return vectorized;
6156 
6157       /* Try the next biggest vector size.  */
6158       next_vector_mode = vector_modes[mode_i++];
6159       if (dump_enabled_p ())
6160           dump_printf_loc (MSG_NOTE, vect_location,
6161                                "***** Re-trying analysis with vector mode %s\n",
6162                                GET_MODE_NAME (next_vector_mode));
6163     }
6164 }
6165 
6166 
6167 /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
6168    true if anything in the basic-block was vectorized.  */
6169 
6170 static bool
vect_slp_bbs(const vec<basic_block> & bbs,loop_p orig_loop)6171 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6172 {
6173   vec<data_reference_p> datarefs = vNULL;
6174   auto_vec<int> dataref_groups;
6175   int insns = 0;
6176   int current_group = 0;
6177 
6178   for (unsigned i = 0; i < bbs.length (); i++)
6179     {
6180       basic_block bb = bbs[i];
6181       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6182              gsi_next (&gsi))
6183           {
6184             gimple *stmt = gsi_stmt (gsi);
6185             if (is_gimple_debug (stmt))
6186               continue;
6187 
6188             insns++;
6189 
6190             if (gimple_location (stmt) != UNKNOWN_LOCATION)
6191               vect_location = stmt;
6192 
6193             if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6194                                                         &dataref_groups, current_group))
6195               ++current_group;
6196           }
6197       /* New BBs always start a new DR group.  */
6198       ++current_group;
6199     }
6200 
6201   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6202 }
6203 
6204 /* Special entry for the BB vectorizer.  Analyze and transform a single
6205    if-converted BB with ORIG_LOOPs body being the not if-converted
6206    representation.  Returns true if anything in the basic-block was
6207    vectorized.  */
6208 
6209 bool
vect_slp_if_converted_bb(basic_block bb,loop_p orig_loop)6210 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6211 {
6212   auto_vec<basic_block> bbs;
6213   bbs.safe_push (bb);
6214   return vect_slp_bbs (bbs, orig_loop);
6215 }
6216 
6217 /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
6218    true if anything in the basic-block was vectorized.  */
6219 
6220 bool
vect_slp_function(function * fun)6221 vect_slp_function (function *fun)
6222 {
6223   bool r = false;
6224   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6225   unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6226 
6227   /* For the moment split the function into pieces to avoid making
6228      the iteration on the vector mode moot.  Split at points we know
6229      to not handle well which is CFG merges (SLP discovery doesn't
6230      handle non-loop-header PHIs) and loop exits.  Since pattern
6231      recog requires reverse iteration to visit uses before defs
6232      simply chop RPO into pieces.  */
6233   auto_vec<basic_block> bbs;
6234   for (unsigned i = 0; i < n; i++)
6235     {
6236       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6237       bool split = false;
6238 
6239       /* Split when a BB is not dominated by the first block.  */
6240       if (!bbs.is_empty ()
6241             && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6242           {
6243             if (dump_enabled_p ())
6244               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245                                    "splitting region at dominance boundary bb%d\n",
6246                                    bb->index);
6247             split = true;
6248           }
6249       /* Split when the loop determined by the first block
6250            is exited.  This is because we eventually insert
6251            invariants at region begin.  */
6252       else if (!bbs.is_empty ()
6253                  && bbs[0]->loop_father != bb->loop_father
6254                  && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6255           {
6256             if (dump_enabled_p ())
6257               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258                                    "splitting region at loop %d exit at bb%d\n",
6259                                    bbs[0]->loop_father->num, bb->index);
6260             split = true;
6261           }
6262 
6263       if (split && !bbs.is_empty ())
6264           {
6265             r |= vect_slp_bbs (bbs, NULL);
6266             bbs.truncate (0);
6267           }
6268 
6269       /* We need to be able to insert at the head of the region which
6270            we cannot for region starting with a returns-twice call.  */
6271       if (bbs.is_empty ())
6272           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
6273             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
6274               {
6275                 if (dump_enabled_p ())
6276                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277                                          "skipping bb%d as start of region as it "
6278                                          "starts with returns-twice call\n",
6279                                          bb->index);
6280                 continue;
6281               }
6282 
6283       bbs.safe_push (bb);
6284 
6285       /* When we have a stmt ending this block and defining a
6286            value we have to insert on edges when inserting after it for
6287            a vector containing its definition.  Avoid this for now.  */
6288       if (gimple *last = last_stmt (bb))
6289           if (gimple_get_lhs (last)
6290               && is_ctrl_altering_stmt (last))
6291             {
6292               if (dump_enabled_p ())
6293                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6294                                      "splitting region at control altering "
6295                                      "definition %G", last);
6296               r |= vect_slp_bbs (bbs, NULL);
6297               bbs.truncate (0);
6298             }
6299     }
6300 
6301   if (!bbs.is_empty ())
6302     r |= vect_slp_bbs (bbs, NULL);
6303 
6304   free (rpo);
6305 
6306   return r;
6307 }
6308 
6309 /* Build a variable-length vector in which the elements in ELTS are repeated
6310    to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
6311    RESULTS and add any new instructions to SEQ.
6312 
6313    The approach we use is:
6314 
6315    (1) Find a vector mode VM with integer elements of mode IM.
6316 
6317    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6318        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
6319        from small vectors to IM.
6320 
6321    (3) Duplicate each ELTS'[I] into a vector of mode VM.
6322 
6323    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6324        correct byte contents.
6325 
6326    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6327 
6328    We try to find the largest IM for which this sequence works, in order
6329    to cut down on the number of interleaves.  */
6330 
6331 void
duplicate_and_interleave(vec_info * vinfo,gimple_seq * seq,tree vector_type,const vec<tree> & elts,unsigned int nresults,vec<tree> & results)6332 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6333                                 const vec<tree> &elts, unsigned int nresults,
6334                                 vec<tree> &results)
6335 {
6336   unsigned int nelts = elts.length ();
6337   tree element_type = TREE_TYPE (vector_type);
6338 
6339   /* (1) Find a vector mode VM with integer elements of mode IM.  */
6340   unsigned int nvectors = 1;
6341   tree new_vector_type;
6342   tree permutes[2];
6343   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6344                                                &nvectors, &new_vector_type,
6345                                                permutes))
6346     gcc_unreachable ();
6347 
6348   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
6349   unsigned int partial_nelts = nelts / nvectors;
6350   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6351 
6352   tree_vector_builder partial_elts;
6353   auto_vec<tree, 32> pieces (nvectors * 2);
6354   pieces.quick_grow_cleared (nvectors * 2);
6355   for (unsigned int i = 0; i < nvectors; ++i)
6356     {
6357       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6358                ELTS' has mode IM.  */
6359       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6360       for (unsigned int j = 0; j < partial_nelts; ++j)
6361           partial_elts.quick_push (elts[i * partial_nelts + j]);
6362       tree t = gimple_build_vector (seq, &partial_elts);
6363       t = gimple_build (seq, VIEW_CONVERT_EXPR,
6364                               TREE_TYPE (new_vector_type), t);
6365 
6366       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
6367       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6368     }
6369 
6370   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6371            correct byte contents.
6372 
6373      Conceptually, we need to repeat the following operation log2(nvectors)
6374      times, where hi_start = nvectors / 2:
6375 
6376           out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6377           out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6378 
6379      However, if each input repeats every N elements and the VF is
6380      a multiple of N * 2, the HI result is the same as the LO result.
6381      This will be true for the first N1 iterations of the outer loop,
6382      followed by N2 iterations for which both the LO and HI results
6383      are needed.  I.e.:
6384 
6385           N1 + N2 = log2(nvectors)
6386 
6387      Each "N1 iteration" doubles the number of redundant vectors and the
6388      effect of the process as a whole is to have a sequence of nvectors/2**N1
6389      vectors that repeats 2**N1 times.  Rather than generate these redundant
6390      vectors, we halve the number of vectors for each N1 iteration.  */
6391   unsigned int in_start = 0;
6392   unsigned int out_start = nvectors;
6393   unsigned int new_nvectors = nvectors;
6394   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6395     {
6396       unsigned int hi_start = new_nvectors / 2;
6397       unsigned int out_i = 0;
6398       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6399           {
6400             if ((in_i & 1) != 0
6401                 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6402                                    2 * in_repeat))
6403               continue;
6404 
6405             tree output = make_ssa_name (new_vector_type);
6406             tree input1 = pieces[in_start + (in_i / 2)];
6407             tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6408             gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6409                                                          input1, input2,
6410                                                          permutes[in_i & 1]);
6411             gimple_seq_add_stmt (seq, stmt);
6412             pieces[out_start + out_i] = output;
6413             out_i += 1;
6414           }
6415       std::swap (in_start, out_start);
6416       new_nvectors = out_i;
6417     }
6418 
6419   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
6420   results.reserve (nresults);
6421   for (unsigned int i = 0; i < nresults; ++i)
6422     if (i < new_nvectors)
6423       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6424                                                   pieces[in_start + i]));
6425     else
6426       results.quick_push (results[i - new_nvectors]);
6427 }
6428 
6429 
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.

   SLP_TREE_VECTYPE (OP_NODE) supplies the vector type and
   SLP_TREE_NUMBER_OF_VEC_STMTS (OP_NODE) the number of vector defs to
   produce.  Statements needed to build a vector are inserted after the
   latest definition of its scalar operands that VINFO knows about when
   VINFO is a bb_vec_info and such a definition exists, and through
   VINFO->insert_seq_on_entry otherwise.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  /* Statements needed to build the vector currently being assembled.  */
  gimple_seq ctor_seq = NULL;
  /* Filled lazily by duplicate_and_interleave, at most once.  */
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  /* The vectors in the order they are built, which is the reverse of the
     order in which they are finally stored.  */
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors. It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  /* Whether every element placed in the current vector so far is a
     constant.  */
  constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  /* Latest definition seen so far among the operands of the current
     vector that VINFO knows about, or NULL if there is none.  */
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      /* Walk the operands from last to first.  I is unsigned, so the
         decrement past zero wraps around and the loop relies on
         iterate () rejecting that out-of-range index to terminate.  */
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
        {
          /* Create 'vect_ = {op0,op1,...,opn}'.  */
          number_of_places_left_in_vector--;
          /* Remember the operand as written; OP may be replaced by a
             converted value below.  */
          tree orig_op = op;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
                {
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      /* Can't use VIEW_CONVERT_EXPR for booleans because
                         of possibly different sizes of scalar value and
                         vector element.  */
                      if (integer_zerop (op))
                        op = build_int_cst (TREE_TYPE (vector_type), 0);
                      else if (integer_onep (op))
                        op = build_all_ones_cst (TREE_TYPE (vector_type));
                      else
                        gcc_unreachable ();
                    }
                  else
                    op = fold_unary (VIEW_CONVERT_EXPR,
                                     TREE_TYPE (vector_type), op);
                  gcc_assert (op && CONSTANT_CLASS_P (op));
                }
              else
                {
                  /* Non-constant operand: emit a statement into CTOR_SEQ
                     that converts it to the vector element type.  */
                  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
                  gimple *init_stmt;
                  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
                    {
                      /* Select all-ones or zero depending on OP.  */
                      tree true_val
                        = build_all_ones_cst (TREE_TYPE (vector_type));
                      tree false_val
                        = build_zero_cst (TREE_TYPE (vector_type));
                      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
                      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
                                                       op, true_val,
                                                       false_val);
                    }
                  else
                    {
                      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
                                   op);
                      init_stmt
                        = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
                                               op);
                    }
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
            }
          /* The vector is filled from its last element towards its
             first, matching the reverse walk over the operands.  */
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
          /* For BB vectorization we have to compute an insert location
             when a def is inside the analyzed region since we cannot
             simply insert at the BB start in this case.  */
          stmt_vec_info opdef;
          if (TREE_CODE (orig_op) == SSA_NAME
              && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
              && is_a <bb_vec_info> (vinfo)
              && (opdef = vinfo->lookup_def (orig_op)))
            {
              if (!insert_after)
                insert_after = opdef;
              else
                insert_after = get_later_stmt (insert_after, opdef);
            }

          /* A full vector's worth of elements has been collected.  */
          if (number_of_places_left_in_vector == 0)
            {
              /* Build the vector directly from ELTS when the element count
                 allows it: a multiple of NUNITS suffices for an
                 all-constant vector, otherwise it must equal NUNITS.
                 If not, fall back to duplicate_and_interleave.  */
              if (constant_p
                  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
                  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
                vec_cst = gimple_build_vector (&ctor_seq, &elts);
              else
                {
                  /* All NUMBER_OF_VECTORS results are generated on first
                     use; pick them back to front because VOPRNDS is
                     reversed at the end.  */
                  if (permute_results.is_empty ())
                    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
                                              elts, number_of_vectors,
                                              permute_results);
                  vec_cst = permute_results[number_of_vectors - j - 1];
                }
              if (!gimple_seq_empty_p (ctor_seq))
                {
                  if (insert_after)
                    {
                      gimple_stmt_iterator gsi;
                      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
                        {
                          /* Nothing can go directly after a PHI; insert at
                             the first position after the block's labels.  */
                          gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
                          gsi_insert_seq_before (&gsi, ctor_seq,
                                                 GSI_CONTINUE_LINKING);
                        }
                      else if (!stmt_ends_bb_p (insert_after->stmt))
                        {
                          gsi = gsi_for_stmt (insert_after->stmt);
                          gsi_insert_seq_after (&gsi, ctor_seq,
                                                GSI_CONTINUE_LINKING);
                        }
                      else
                        {
                          /* When we want to insert after a def where the
                             defining stmt throws then insert on the fallthru
                             edge.  */
                          edge e = find_fallthru_edge
                                     (gimple_bb (insert_after->stmt)->succs);
                          basic_block new_bb
                            = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
                          /* The insertion must not have split the edge.  */
                          gcc_assert (!new_bb);
                        }
                    }
                  else
                    vinfo->insert_seq_on_entry (NULL, ctor_seq);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (vec_cst);
              /* Reset the per-vector state for the next vector.  */
              insert_after = NULL;
              number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
              elts.quick_grow (nunits);
            }
        }
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  Each pass of the outer loop appends another
     copy of the first VEC_NUM defs.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
         i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
6633 
6634 /* Get the Ith vectorized definition from SLP_NODE.  */
6635 
6636 tree
vect_get_slp_vect_def(slp_tree slp_node,unsigned i)6637 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6638 {
6639   if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6640     return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6641   else
6642     return SLP_TREE_VEC_DEFS (slp_node)[i];
6643 }
6644 
6645 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */
6646 
6647 void
vect_get_slp_defs(slp_tree slp_node,vec<tree> * vec_defs)6648 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6649 {
6650   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6651   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6652     {
6653       unsigned j;
6654       gimple *vec_def_stmt;
6655       FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6656           vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6657     }
6658   else
6659     vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6660 }
6661 
6662 /* Get N vectorized definitions for SLP_NODE.  */
6663 
6664 void
vect_get_slp_defs(vec_info *,slp_tree slp_node,vec<vec<tree>> * vec_oprnds,unsigned n)6665 vect_get_slp_defs (vec_info *,
6666                        slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6667 {
6668   if (n == -1U)
6669     n = SLP_TREE_CHILDREN (slp_node).length ();
6670 
6671   for (unsigned i = 0; i < n; ++i)
6672     {
6673       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6674       vec<tree> vec_defs = vNULL;
6675       vect_get_slp_defs (child, &vec_defs);
6676       vec_oprnds->quick_push (vec_defs);
6677     }
6678 }
6679 
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   that were not needed.

   N_LOADS may be null, in which case the load count is not computed.
   When not analyzing, the generated (or reused) statements are stored in
   SLP_TREE_VEC_STMTS (NODE) and new statements are emitted at GSI.

   Return false if the first scalar statement of NODE is not a grouped
   access, if a separate mask per vector statement is needed but the
   number of vector elements or VF is not constant, if a permute would
   need more than two input vectors, or if the target does not support a
   required constant permute.  The last two are only expected during
   analysis and are asserted as such.  Return true otherwise.  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
                              slp_tree node, const vec<tree> &dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
                              unsigned int *n_loads, bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  /* From here on STMT_INFO refers to the first element of the group.  */
  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
         i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  /* Next slot of SLP_TREE_VEC_STMTS (NODE) to fill.  */
  int vect_stmts_counter = 0;
  /* Position within the mask currently being built.  */
  unsigned int index = 0;
  /* Indices into DR_CHAIN of the inputs of the current permute; -1 while
     not yet chosen.  */
  int first_vec_index = -1;
  int second_vec_index = -1;
  /* Whether the mask built so far is the identity permutation.  */
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  /* Total number of mask elements to compute.  */
  unsigned int nelts_to_build;
  /* Number of vector statements emitted per completed mask.  */
  unsigned int nvectors_per_build;
  /* Number of input lanes tracked in USED_IN_LANES.  */
  unsigned int in_nlanes;
  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
                      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
         (a) all permutes can use the same mask; and
         (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
          || !vf.is_constant (&const_vf))
        return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
    }
  /* Input lanes referenced by any mask element; used for *N_LOADS.  */
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  /* Indices into DR_CHAIN of the defs used by generated statements; only
     updated when DCE_CHAIN is set.  */
  auto_bitmap used_defs;

  /* Number of elements that make up one complete mask.  */
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      /* Lane of the load group, counted across iterations, that output
         lane J reads from.  */
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                        + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
        {
          first_vec_index = 0;
          mask_element = i;
        }
      else
        {
          /* Enforced before the loop when !repeating_p.  */
          unsigned int const_nunits = nunits.to_constant ();
          /* Split lane I into the input vector holding it and the element
             position within that vector.  */
          vec_index = i / const_nunits;
          mask_element = i % const_nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              /* Elements of the second input are numbered after those of
                 the first.  */
              second_vec_index = vec_index;
              mask_element += const_nunits;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "permutation requires at "
                                 "least three vectors %G",
                                 stmt_info->stmt);
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * const_nunits);
        }

      if (mask_element != index)
        noop_p = false;
      mask[index++] = mask_element;

      /* A complete, non-identity mask: check that the target supports it
         and count it.  */
      if (index == count && !noop_p)
        {
          indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
          if (!can_vec_perm_const_p (mode, indices))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported vect permute { ");
                  for (i = 0; i < count; ++i)
                    {
                      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
                  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
              gcc_assert (analyze_only);
              return false;
            }

          ++*n_perms;
        }

      /* A complete mask: generate code if requested, then reset the
         per-mask state.  */
      if (index == count)
        {
          if (!analyze_only)
            {
              tree mask_vec = NULL_TREE;

              if (! noop_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

              /* A single-input permute uses the same vector for both
                 operands.  */
              if (second_vec_index == -1)
                second_vec_index = first_vec_index;

              for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
                {
                  /* Generate the permute statement if necessary.  */
                  tree first_vec = dr_chain[first_vec_index + ri];
                  tree second_vec = dr_chain[second_vec_index + ri];
                  gimple *perm_stmt;
                  if (! noop_p)
                    {
                      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      perm_stmt
                        = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                               first_vec, second_vec,
                                               mask_vec);
                      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
                                                   gsi);
                      if (dce_chain)
                        {
                          bitmap_set_bit (used_defs, first_vec_index + ri);
                          bitmap_set_bit (used_defs, second_vec_index + ri);
                        }
                    }
                  else
                    {
                      /* If mask was NULL_TREE generate the requested
                         identity transform.  */
                      perm_stmt = SSA_NAME_DEF_STMT (first_vec);
                      if (dce_chain)
                        bitmap_set_bit (used_defs, first_vec_index + ri);
                    }

                  /* Store the vector statement in NODE.  */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
                }
            }

          index = 0;
          first_vec_index = -1;
          second_vec_index = -1;
          noop_p = true;
        }
    }

  if (n_loads)
    {
      if (repeating_p)
        *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      else
        {
          /* Enforced above when !repeating_p.  */
          unsigned int const_nunits = nunits.to_constant ();
          /* Count the groups of CONST_NUNITS consecutive input lanes that
             contain at least one used lane.  */
          *n_loads = 0;
          bool load_seen = false;
          for (unsigned i = 0; i < in_nlanes; ++i)
            {
              if (i % const_nunits == 0)
                {
                  if (load_seen)
                    *n_loads += 1;
                  load_seen = false;
                }
              if (bitmap_bit_p (used_in_lanes, i))
                load_seen = true;
            }
          /* Account for the final group.  */
          if (load_seen)
            *n_loads += 1;
        }
    }

  /* Remove the defining statements of DR_CHAIN entries not marked used.
     NOTE(review): USED_DEFS is only populated when !ANALYZE_ONLY, so with
     ANALYZE_ONLY set every entry of DR_CHAIN would be removed here;
     presumably callers never pass DCE_CHAIN together with ANALYZE_ONLY
     and a nonempty DR_CHAIN -- confirm.  */
  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
        {
          gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
          gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
          gsi_remove (&rgsi, true);
          release_defs (stmt);
        }

  return true;
}
6944 
6945 /* Produce the next vector result for SLP permutation NODE by adding a vector
6946    statement at GSI.  If MASK_VEC is nonnull, add:
6947 
6948       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6949 
6950    otherwise add:
6951 
6952       <new SSA name> = FIRST_DEF.  */
6953 
6954 static void
vect_add_slp_permutation(vec_info * vinfo,gimple_stmt_iterator * gsi,slp_tree node,tree first_def,tree second_def,tree mask_vec)6955 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6956                                 slp_tree node, tree first_def, tree second_def,
6957                                 tree mask_vec)
6958 {
6959   tree vectype = SLP_TREE_VECTYPE (node);
6960 
6961   /* ???  We SLP match existing vector element extracts but
6962      allow punning which we need to re-instantiate at uses
6963      but have no good way of explicitly representing.  */
6964   if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6965     {
6966       gassign *conv_stmt
6967           = gimple_build_assign (make_ssa_name (vectype),
6968                                      build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6969       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6970       first_def = gimple_assign_lhs (conv_stmt);
6971     }
6972   gassign *perm_stmt;
6973   tree perm_dest = make_ssa_name (vectype);
6974   if (mask_vec)
6975     {
6976       if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6977           {
6978             gassign *conv_stmt
6979               = gimple_build_assign (make_ssa_name (vectype),
6980                                            build1 (VIEW_CONVERT_EXPR,
6981                                                      vectype, second_def));
6982             vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6983             second_def = gimple_assign_lhs (conv_stmt);
6984           }
6985       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6986                                                first_def, second_def,
6987                                                mask_vec);
6988     }
6989   else
6990     /* We need a copy here in case the def was external.  */
6991     perm_stmt = gimple_build_assign (perm_dest, first_def);
6992   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6993   /* Store the vector statement in NODE.  */
6994   SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6995 }
6996 
6997 /* Vectorize the SLP permutations in NODE as specified
6998    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6999    child number and lane number.
7000    Interleaving of two two-lane two-child SLP subtrees (not supported):
7001      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
7002    A blend of two four-lane two-child SLP subtrees:
7003      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
7004    Highpart of a four-lane one-child SLP subtree (not supported):
7005      [ { 0, 2 }, { 0, 3 } ]
7006    Where currently only a subset is supported by code generating below.  */
7007 
7008 static bool
vectorizable_slp_permutation(vec_info * vinfo,gimple_stmt_iterator * gsi,slp_tree node,stmt_vector_for_cost * cost_vec)7009 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
7010                                     slp_tree node, stmt_vector_for_cost *cost_vec)
7011 {
7012   tree vectype = SLP_TREE_VECTYPE (node);
7013 
7014   /* ???  We currently only support all same vector input and output types
7015      while the SLP IL should really do a concat + select and thus accept
7016      arbitrary mismatches.  */
7017   slp_tree child;
7018   unsigned i;
7019   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7020   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
7021   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7022     {
7023       if (!vect_maybe_update_slp_op_vectype (child, vectype)
7024             || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
7025           {
7026             if (dump_enabled_p ())
7027               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7028                                    "Unsupported lane permutation\n");
7029             return false;
7030           }
7031       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
7032           repeating_p = false;
7033     }
7034 
7035   vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
7036   gcc_assert (perm.length () == SLP_TREE_LANES (node));
7037   if (dump_enabled_p ())
7038     {
7039       dump_printf_loc (MSG_NOTE, vect_location,
7040                            "vectorizing permutation");
7041       for (unsigned i = 0; i < perm.length (); ++i)
7042           dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
7043       if (repeating_p)
7044           dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
7045       dump_printf (MSG_NOTE, "\n");
7046     }
7047 
7048   /* REPEATING_P is true if every output vector is guaranteed to use the
7049      same permute vector.  We can handle that case for both variable-length
7050      and constant-length vectors, but we only handle other cases for
7051      constant-length vectors.
7052 
7053      Set:
7054 
7055      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
7056        mask vector that we want to build.
7057 
7058      - NCOPIES to the number of copies of PERM that we need in order
7059        to build the necessary permute mask vectors.
7060 
7061      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
7062        for each permute mask vector.  This is only relevant when GSI is
7063        nonnull.  */
7064   uint64_t npatterns;
7065   unsigned nelts_per_pattern;
7066   uint64_t ncopies;
7067   unsigned noutputs_per_mask;
7068   if (repeating_p)
7069     {
7070       /* We need a single permute mask vector that has the form:
7071 
7072              { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
7073 
7074            In other words, the original n-element permute in PERM is
7075            "unrolled" to fill a full vector.  The stepped vector encoding
7076            that we use for permutes requires 3n elements.  */
7077       npatterns = SLP_TREE_LANES (node);
7078       nelts_per_pattern = ncopies = 3;
7079       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7080     }
7081   else
7082     {
7083       /* Calculate every element of every permute mask vector explicitly,
7084            instead of relying on the pattern described above.  */
7085       if (!nunits.is_constant (&npatterns))
7086           return false;
7087       nelts_per_pattern = ncopies = 1;
7088       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
7089           if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
7090             return false;
7091       noutputs_per_mask = 1;
7092     }
7093   unsigned olanes = ncopies * SLP_TREE_LANES (node);
7094   gcc_assert (repeating_p || multiple_p (olanes, nunits));
7095 
7096   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
7097      from the { SLP operand, scalar lane } permutation as recorded in the
7098      SLP node as intermediate step.  This part should already work
7099      with SLP children with arbitrary number of lanes.  */
7100   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
7101   auto_vec<unsigned> active_lane;
7102   vperm.create (olanes);
7103   active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
7104   for (unsigned i = 0; i < ncopies; ++i)
7105     {
7106       for (unsigned pi = 0; pi < perm.length (); ++pi)
7107           {
7108             std::pair<unsigned, unsigned> p = perm[pi];
7109             tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
7110             if (repeating_p)
7111               vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
7112             else
7113               {
7114                 /* We checked above that the vectors are constant-length.  */
7115                 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
7116                 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
7117                 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
7118                 vperm.quick_push ({{p.first, vi}, vl});
7119               }
7120           }
7121       /* Advance to the next group.  */
7122       for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
7123           active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
7124     }
7125 
7126   if (dump_enabled_p ())
7127     {
7128       dump_printf_loc (MSG_NOTE, vect_location, "as");
7129       for (unsigned i = 0; i < vperm.length (); ++i)
7130           {
7131             if (i != 0
7132                 && (repeating_p
7133                       ? multiple_p (i, npatterns)
7134                       : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
7135               dump_printf (MSG_NOTE, ",");
7136             dump_printf (MSG_NOTE, " vops%u[%u][%u]",
7137                            vperm[i].first.first, vperm[i].first.second,
7138                            vperm[i].second);
7139           }
7140       dump_printf (MSG_NOTE, "\n");
7141     }
7142 
7143   /* We can only handle two-vector permutes, everything else should
7144      be lowered on the SLP level.  The following is closely inspired
7145      by vect_transform_slp_perm_load and is supposed to eventually
7146      replace it.
7147      ???   As intermediate step do code-gen in the SLP tree representation
7148      somehow?  */
7149   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
7150   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
7151   unsigned int index = 0;
7152   poly_uint64 mask_element;
7153   vec_perm_builder mask;
7154   mask.new_vector (nunits, npatterns, nelts_per_pattern);
7155   unsigned int count = mask.encoded_nelts ();
7156   mask.quick_grow (count);
7157   vec_perm_indices indices;
7158   unsigned nperms = 0;
7159   for (unsigned i = 0; i < vperm.length (); ++i)
7160     {
7161       mask_element = vperm[i].second;
7162       if (first_vec.first == -1U
7163             || first_vec == vperm[i].first)
7164           first_vec = vperm[i].first;
7165       else if (second_vec.first == -1U
7166                  || second_vec == vperm[i].first)
7167           {
7168             second_vec = vperm[i].first;
7169             mask_element += nunits;
7170           }
7171       else
7172           {
7173             if (dump_enabled_p ())
7174               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7175                                    "permutation requires at "
7176                                    "least three vectors\n");
7177             gcc_assert (!gsi);
7178             return false;
7179           }
7180 
7181       mask[index++] = mask_element;
7182 
7183       if (index == count)
7184           {
7185             indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
7186             bool identity_p = indices.series_p (0, 1, 0, 1);
7187             if (!identity_p
7188                 && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7189               {
7190                 if (dump_enabled_p ())
7191                     {
7192                       dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7193                                            vect_location,
7194                                            "unsupported vect permute { ");
7195                       for (i = 0; i < count; ++i)
7196                         {
7197                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7198                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7199                         }
7200                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7201                     }
7202                 gcc_assert (!gsi);
7203                 return false;
7204               }
7205 
7206             if (!identity_p)
7207               nperms++;
7208             if (gsi)
7209               {
7210                 if (second_vec.first == -1U)
7211                     second_vec = first_vec;
7212 
7213                 slp_tree
7214                     first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7215                     second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7216 
7217                 tree mask_vec = NULL_TREE;
7218                 if (!identity_p)
7219                     mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7220 
7221                 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7222                     {
7223                       tree first_def
7224                         = vect_get_slp_vect_def (first_node,
7225                                                        first_vec.second + vi);
7226                       tree second_def
7227                         = vect_get_slp_vect_def (second_node,
7228                                                        second_vec.second + vi);
7229                       vect_add_slp_permutation (vinfo, gsi, node, first_def,
7230                                                       second_def, mask_vec);
7231                     }
7232               }
7233 
7234             index = 0;
7235             first_vec = std::make_pair (-1U, -1U);
7236             second_vec = std::make_pair (-1U, -1U);
7237           }
7238     }
7239 
7240   if (!gsi)
7241     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
7242 
7243   return true;
7244 }
7245 
/* Vectorize SLP NODE of INSTANCE.

   Constant and external nodes get their vectors created by
   vect_create_constant_vectors unless they have no vector type or already
   have vector defs.  For all other nodes this first determines the
   statement iterator SI at which vector statements are to be emitted and
   then either code generates the permutation (VEC_PERM_EXPR nodes, including
   live lane handling) or hands over to vect_transform_stmt.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
                              slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ???  vectorizable_shift can end up using a scalar operand which is
           currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
           node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
          return;

      /* There are two reasons vector defs might already exist.  The first
           is that we are vectorizing an existing vector def.  The second is
           when performing BB vectorization shared constant/external nodes
           are not split apart during partitioning so during the code-gen
           DFS walk we can end up visiting them twice.  */
      if (! SLP_TREE_VEC_DEFS (node).exists ())
          vect_create_constant_vectors (vinfo, node);
      return;
    }

  /* From here on NODE is neither constant nor external and is expected
     to have no vector defs recorded yet.  */
  gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Reserve room for all vector statements up front.  */
  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                         "------>vectorizing SLP node starting from: %G",
                         stmt_info->stmt);

  /* Determine the insertion point SI.  */
  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
           ready early, vectorized stores go before the last scalar
           stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
          last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
          last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
              || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
              || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
             && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
           earliest possible.  LAST_STMT tracks the latest (in dominance
           order) definition seen so far.  */
      gimple *last_stmt = NULL;
      if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
          if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
              || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
            {
              /* But avoid scheduling internal defs outside of the loop when
                 we might have only implicitly tracked loop mask/len defs.
                 Note this SI is local to this block and shadows the
                 function-level SI; only LAST_STMT is taken from it.  */
              gimple_stmt_iterator si
                = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
              last_stmt = gsi_stmt (si);
            }
      /* Set when a child is a pre-existing vector whose first def is not
         known to VINFO.  */
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
          if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
            {
              /* For fold-left reductions we are retaining the scalar
                 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
                 set so the representation isn't perfect.  Resort to the
                 last scalar def here.  */
              if (SLP_TREE_VEC_STMTS (child).is_empty ())
                {
                    gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
                                  == cycle_phi_info_type);
                    gphi *phi = as_a <gphi *>
                                    (vect_find_last_scalar_stmt_in_slp (child)->stmt);
                    if (!last_stmt
                        || vect_stmt_dominates_stmt_p (last_stmt, phi))
                      last_stmt = phi;
                }
              /* We are emitting all vectorized stmts in the same place and
                 the last one is the last.
                 ???  Unless we have a load permutation applied and that
                 figures to re-use an earlier generated load.  */
              unsigned j;
              gimple *vstmt;
              FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
                if (!last_stmt
                      || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
                    last_stmt = vstmt;
            }
          else if (!SLP_TREE_VECTYPE (child))
            {
              /* For externals that are used unvectorized (no vector type)
                 look at all scalar defs.  */
              unsigned j;
              tree def;
              FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
                if (TREE_CODE (def) == SSA_NAME
                      && !SSA_NAME_IS_DEFAULT_DEF (def))
                    {
                      gimple *stmt = SSA_NAME_DEF_STMT (def);
                      if (!last_stmt
                          || vect_stmt_dominates_stmt_p (last_stmt, stmt))
                        last_stmt = stmt;
                    }
            }
          else
            {
              /* For externals we have to look at all defs since their
                 insertion place is decided per vector.  But beware
                 of pre-existing vectors where we need to make sure
                 we do not insert before the region boundary.  */
              if (SLP_TREE_SCALAR_OPS (child).is_empty ()
                    && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
                seen_vector_def = true;
              else
                {
                    unsigned j;
                    tree vdef;
                    FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
                      if (TREE_CODE (vdef) == SSA_NAME
                          && !SSA_NAME_IS_DEFAULT_DEF (vdef))
                        {
                          gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
                          if (!last_stmt
                                || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
                              last_stmt = vstmt;
                        }
                }
            }
      /* This can happen when all children are pre-existing vectors or
           constants.
           NOTE(review): the result of vect_find_first_scalar_stmt_in_slp is
           dereferenced unconditionally here, so the !last_stmt test that
           follows can only trigger when its ->stmt is null -- confirm that
           the helper never returns a null stmt_vec_info for such nodes.  */
      if (!last_stmt)
          last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
          {
            gcc_assert (seen_vector_def);
            si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
          }
      else if (is_ctrl_altering_stmt (last_stmt))
          {
            /* We split regions to vectorize at control altering stmts
               with a definition so this must be an external which
               we can insert at the start of the region.  */
            si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
          }
      else if (is_a <bb_vec_info> (vinfo)
                 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
                 && gimple_could_trap_p (stmt_info->stmt))
          {
            /* We've constrained possibly trapping operations to all come
               from the same basic-block, if vectorized defs would allow earlier
               scheduling still force vectorized stmts to the original block.
               This is only necessary for BB vectorization since for loop vect
               all operations are in a single BB and scalar stmt based
               placement doesn't play well with epilogue vectorization.  */
            gcc_assert (dominated_by_p (CDI_DOMINATORS,
                                              gimple_bb (stmt_info->stmt),
                                              gimple_bb (last_stmt)));
            si = gsi_after_labels (gimple_bb (stmt_info->stmt));
          }
      else if (is_a <gphi *> (last_stmt))
          /* Nothing can be inserted after a PHI; use the start of its
             block instead.  */
          si = gsi_after_labels (gimple_bb (last_stmt));
      else
          {
            /* Insert right after LAST_STMT.  */
            si = gsi_for_stmt (last_stmt);
            gsi_next (&si);
          }
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
           be shared with different SLP nodes (but usually it's the same
           operation apart from the case the stmt is only there for denoting
           the actual scalar lane defs ...).  So do not call vect_transform_stmt
           but open-code it here (partly).  */
      /* Passing the iterator (and no cost vector) requests code
         generation, which must not fail at this point.  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      stmt_vec_info slp_stmt_info;
      unsigned int i;
      /* Code generate the live lanes of the permutation node as well.  */
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
          if (STMT_VINFO_LIVE_P (slp_stmt_info))
            {
              done = vectorizable_live_operation (vinfo,
                                                            slp_stmt_info, &si, node,
                                                            instance, i, true, NULL);
              gcc_assert (done);
            }
    }
  else
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
7456 
7457 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
7458    For loop vectorization this is done in vectorizable_call, but for SLP
7459    it needs to be deferred until end of vect_schedule_slp, because multiple
7460    SLP instances may refer to the same scalar stmt.  */
7461 
7462 static void
vect_remove_slp_scalar_calls(vec_info * vinfo,slp_tree node,hash_set<slp_tree> & visited)7463 vect_remove_slp_scalar_calls (vec_info *vinfo,
7464                                     slp_tree node, hash_set<slp_tree> &visited)
7465 {
7466   gimple *new_stmt;
7467   gimple_stmt_iterator gsi;
7468   int i;
7469   slp_tree child;
7470   tree lhs;
7471   stmt_vec_info stmt_info;
7472 
7473   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7474     return;
7475 
7476   if (visited.add (node))
7477     return;
7478 
7479   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7480     vect_remove_slp_scalar_calls (vinfo, child, visited);
7481 
7482   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7483     {
7484       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7485       if (!stmt || gimple_bb (stmt) == NULL)
7486           continue;
7487       if (is_pattern_stmt_p (stmt_info)
7488             || !PURE_SLP_STMT (stmt_info))
7489           continue;
7490       lhs = gimple_call_lhs (stmt);
7491       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7492       gsi = gsi_for_stmt (stmt);
7493       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
7494       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7495     }
7496 }
7497 
7498 static void
vect_remove_slp_scalar_calls(vec_info * vinfo,slp_tree node)7499 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7500 {
7501   hash_set<slp_tree> visited;
7502   vect_remove_slp_scalar_calls (vinfo, node, visited);
7503 }
7504 
7505 /* Vectorize the instance root.  */
7506 
7507 void
vectorize_slp_instance_root_stmt(slp_tree node,slp_instance instance)7508 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7509 {
7510   gassign *rstmt = NULL;
7511 
7512   if (instance->kind == slp_inst_kind_ctor)
7513     {
7514       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7515           {
7516             gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7517             tree vect_lhs = gimple_get_lhs (child_stmt);
7518             tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7519             if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7520                                                     TREE_TYPE (vect_lhs)))
7521               vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7522                                      vect_lhs);
7523             rstmt = gimple_build_assign (root_lhs, vect_lhs);
7524           }
7525       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7526           {
7527             int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7528             gimple *child_stmt;
7529             int j;
7530             vec<constructor_elt, va_gc> *v;
7531             vec_alloc (v, nelts);
7532 
7533             /* A CTOR can handle V16HI composition from VNx8HI so we
7534                do not need to convert vector elements if the types
7535                do not match.  */
7536             FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7537               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7538                                             gimple_get_lhs (child_stmt));
7539             tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7540             tree rtype
7541               = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7542             tree r_constructor = build_constructor (rtype, v);
7543             rstmt = gimple_build_assign (lhs, r_constructor);
7544           }
7545     }
7546   else if (instance->kind == slp_inst_kind_bb_reduc)
7547     {
7548       /* Largely inspired by reduction chain epilogue handling in
7549            vect_create_epilog_for_reduction.  */
7550       vec<tree> vec_defs = vNULL;
7551       vect_get_slp_defs (node, &vec_defs);
7552       enum tree_code reduc_code
7553           = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7554       /* ???  We actually have to reflect signs somewhere.  */
7555       if (reduc_code == MINUS_EXPR)
7556           reduc_code = PLUS_EXPR;
7557       gimple_seq epilogue = NULL;
7558       /* We may end up with more than one vector result, reduce them
7559            to one vector.  */
7560       tree vec_def = vec_defs[0];
7561       for (unsigned i = 1; i < vec_defs.length (); ++i)
7562           vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7563                                         vec_def, vec_defs[i]);
7564       vec_defs.release ();
7565       /* ???  Support other schemes than direct internal fn.  */
7566       internal_fn reduc_fn;
7567       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7568             || reduc_fn == IFN_LAST)
7569           gcc_unreachable ();
7570       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7571                                               TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7572 
7573       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7574       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7575       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7576       update_stmt (gsi_stmt (rgsi));
7577       return;
7578     }
7579   else
7580     gcc_unreachable ();
7581 
7582   gcc_assert (rstmt);
7583 
7584   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7585   gsi_replace (&rgsi, rstmt, true);
7586 }
7587 
/* Per-node state of the SCC-collecting DFS walk in vect_schedule_scc.  */

struct slp_scc_info
{
  /* True while the node is on the DFS stack awaiting scheduling; cleared
     once the node has been scheduled.  */
  bool on_stack;
  /* DFS number assigned when the node is first visited.  */
  int dfs;
  /* Smallest DFS number found reachable from the node so far; the node
     heads an SCC when this still equals DFS after visiting its children.  */
  int lowlink;
};
7594 
/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.

   NODE is the node to visit; it must not have an entry in SCC_INFO yet.
   SCC_INFO maps the nodes visited so far to their DFS state, MAXDFS is the
   next DFS number to hand out and STACK holds the visited nodes that have
   not been scheduled yet.  Nodes are code generated through
   vect_schedule_slp_node once the SCC they belong to is complete; cycles
   are broken at PHI nodes whose remaining arguments are filled in after
   the whole SCC has been scheduled.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
                       hash_map<slp_tree, slp_scc_info> &scc_info,
                       int &maxdfs, vec<slp_tree> &stack)
{
  /* Number NODE; each node is entered exactly once.  */
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf.  Nodes that are not internal defs are scheduled right away and
     never enter the stack.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!child)
          continue;
      slp_scc_info *child_info = scc_info.get (child);
      if (!child_info)
          {
            /* Unvisited child: recurse and inherit its lowlink.  */
            vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
            /* Recursion might have re-allocated the node.  */
            info = scc_info.get (node);
            child_info = scc_info.get (child);
            info->lowlink = MIN (info->lowlink, child_info->lowlink);
          }
      else if (child_info->on_stack)
          /* Edge to a node that is still on the stack.  */
          info->lowlink = MIN (info->lowlink, child_info->dfs);
    }
  /* NODE reaches a node visited earlier that is still on the stack, so it
     is not the head of its SCC; leave it on the stack for the head.  */
  if (info->lowlink != info->dfs)
    return;

  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
            && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
          phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  Its members are the stack entries from NODE upwards.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
          last_idx--;
      /* We can break the cycle at PHIs who have at least one child
           code generated.  Then we could re-start the DFS walk until
           all nodes in the SCC are covered (we might have new entries
           for only back-reachable nodes).  But it's simpler to just
           iterate and schedule those that are ready.  */
      unsigned todo = stack.length () - last_idx;
      /* NOTE(review): the loop below terminates only if every sweep
         schedules at least one entry -- presumably guaranteed by each
         cycle containing a PHI; confirm.  */
      do
          {
            for (int idx = stack.length () - 1; idx >= last_idx; --idx)
              {
                /* Entries already scheduled have been cleared.  */
                slp_tree entry = stack[idx];
                if (!entry)
                    continue;
                bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
                                && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
                /* A non-PHI entry is ready once none of its children is
                   still on the stack.  A PHI entry is ready as soon as one
                   of its children is missing or no longer on the stack.  */
                bool ready = !phi;
                FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
                      if (!child)
                        {
                          gcc_assert (phi);
                          ready = true;
                          break;
                        }
                      else if (scc_info.get (child)->on_stack)
                        {
                          if (!phi)
                              {
                                ready = false;
                                break;
                              }
                        }
                      else
                        {
                          if (phi)
                              {
                                ready = true;
                                break;
                              }
                        }
                if (ready)
                    {
                      vect_schedule_slp_node (vinfo, entry, instance);
                      scc_info.get (entry)->on_stack = false;
                      stack[idx] = NULL;
                      todo--;
                      if (phi)
                        phis_to_fixup.safe_push (entry);
                    }
              }
          }
      while (todo != 0);

      /* Pop the SCC.  */
      stack.truncate (last_idx);
    }

  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
          {
            /* The SLP child for an incoming edge is indexed by the edge's
               destination index.  Only internal-def children are handled
               here.  */
            unsigned dest_idx = e->dest_idx;
            child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
            if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
              continue;
            /* Simply fill all args.  */
            for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
              add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
                               vect_get_slp_vect_def (child, i),
                               e, gimple_phi_arg_location (phi, dest_idx));
          }
    }
}
7737 
7738 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
7739 
7740 void
vect_schedule_slp(vec_info * vinfo,const vec<slp_instance> & slp_instances)7741 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7742 {
7743   slp_instance instance;
7744   unsigned int i;
7745 
7746   hash_map<slp_tree, slp_scc_info> scc_info;
7747   int maxdfs = 0;
7748   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7749     {
7750       slp_tree node = SLP_INSTANCE_TREE (instance);
7751       if (dump_enabled_p ())
7752           {
7753             dump_printf_loc (MSG_NOTE, vect_location,
7754                                  "Vectorizing SLP tree:\n");
7755             /* ???  Dump all?  */
7756             if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7757               dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7758                                SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7759             vect_print_slp_graph (MSG_NOTE, vect_location,
7760                                         SLP_INSTANCE_TREE (instance));
7761           }
7762       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
7763            have a PHI be the node breaking the cycle.  */
7764       auto_vec<slp_tree> stack;
7765       if (!scc_info.get (node))
7766           vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7767 
7768       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7769           vectorize_slp_instance_root_stmt (node, instance);
7770 
7771       if (dump_enabled_p ())
7772           dump_printf_loc (MSG_NOTE, vect_location,
7773                          "vectorizing stmts using SLP.\n");
7774     }
7775 
7776   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7777     {
7778       slp_tree root = SLP_INSTANCE_TREE (instance);
7779       stmt_vec_info store_info;
7780       unsigned int j;
7781 
7782       /* Remove scalar call stmts.  Do not do this for basic-block
7783            vectorization as not all uses may be vectorized.
7784            ???  Why should this be necessary?  DCE should be able to
7785            remove the stmts itself.
7786            ???  For BB vectorization we can as well remove scalar
7787            stmts starting from the SLP tree root if they have no
7788            uses.  */
7789       if (is_a <loop_vec_info> (vinfo))
7790           vect_remove_slp_scalar_calls (vinfo, root);
7791 
7792       /* Remove vectorized stores original scalar stmts.  */
7793       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7794         {
7795             if (!STMT_VINFO_DATA_REF (store_info)
7796                 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7797               break;
7798 
7799             store_info = vect_orig_stmt (store_info);
7800             /* Free the attached stmt_vec_info and remove the stmt.  */
7801             vinfo->remove_stmt (store_info);
7802 
7803             /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
7804                to not crash in vect_free_slp_tree later.  */
7805             if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7806               SLP_TREE_REPRESENTATIVE (root) = NULL;
7807         }
7808     }
7809 }
7810