1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
60 
61 static bool
vect_lanes_optab_supported_p(const char * name,convert_optab optab,tree vectype,unsigned HOST_WIDE_INT count)62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63                                     tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74           {
75             if (dump_enabled_p ())
76               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77                                    "no array mode for %s[%wu]\n",
78                                    GET_MODE_NAME (mode), count);
79             return false;
80           }
81     }
82 
83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84     {
85       if (dump_enabled_p ())
86           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87                          "cannot use %s<%s><%s>\n", name,
88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89       return false;
90     }
91 
92   if (dump_enabled_p ())
93     dump_printf_loc (MSG_NOTE, vect_location,
94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95                      GET_MODE_NAME (mode));
96 
97   return true;
98 }
99 
100 
101 /* Return the smallest scalar part of STMT_INFO.
102    This is used to determine the vectype of the stmt.  We generally set the
103    vectype according to the type of the result (lhs).  For stmts whose
104    result-type is different than the type of the arguments (e.g., demotion,
105    promotion), vectype will be reset appropriately (later).  Note that we have
106    to visit the smallest datatype in this function, because that determines the
107    VF.  If the smallest datatype in the loop is present only as the rhs of a
108    promotion operation - we'd miss it.
109    Such a case, where a variable of this datatype does not appear in the lhs
110    anywhere in the loop, can only occur if it's an invariant: e.g.:
111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
112    invariant motion.  However, we cannot rely on invariant motion to always
113    take invariants out of the loop, and so in the case of promotion we also
114    have to check the rhs.
115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116    types.  */
117 
118 tree
vect_get_smallest_scalar_type(stmt_vec_info stmt_info,tree scalar_type)119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
120 {
121   HOST_WIDE_INT lhs, rhs;
122 
123   /* During the analysis phase, this function is called on arbitrary
124      statements that might not have scalar results.  */
125   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
126     return scalar_type;
127 
128   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 
130   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
131   if (assign)
132     {
133       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
134       if (gimple_assign_cast_p (assign)
135             || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136             || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137             || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138             || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139             || gimple_assign_rhs_code (assign) == WIDEN_PLUS_EXPR
140             || gimple_assign_rhs_code (assign) == WIDEN_MINUS_EXPR
141             || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
142           {
143             tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
144 
145             rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146             if (rhs < lhs)
147               scalar_type = rhs_type;
148           }
149     }
150   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
151     {
152       unsigned int i = 0;
153       if (gimple_call_internal_p (call))
154           {
155             internal_fn ifn = gimple_call_internal_fn (call);
156             if (internal_load_fn_p (ifn))
157               /* For loads the LHS type does the trick.  */
158               i = ~0U;
159             else if (internal_store_fn_p (ifn))
160               {
161                 /* For stores use the tyep of the stored value.  */
162                 i = internal_fn_stored_value_index (ifn);
163                 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
164                 i = ~0U;
165               }
166             else if (internal_fn_mask_index (ifn) == 0)
167               i = 1;
168           }
169       if (i < gimple_call_num_args (call))
170           {
171             tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
172             if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
173               {
174                 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
175                 if (rhs < lhs)
176                     scalar_type = rhs_type;
177               }
178           }
179     }
180 
181   return scalar_type;
182 }
183 
184 
185 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
186    tested at run-time.  Return TRUE if DDR was successfully inserted.
187    Return false if versioning is not supported.  */
188 
189 static opt_result
vect_mark_for_runtime_alias_test(ddr_p ddr,loop_vec_info loop_vinfo)190 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
191 {
192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
193 
194   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
195     return opt_result::failure_at (vect_location,
196                                            "will not create alias checks, as"
197                                            " --param vect-max-version-for-alias-checks"
198                                            " == 0\n");
199 
200   opt_result res
201     = runtime_alias_check_p (ddr, loop,
202                                    optimize_loop_nest_for_speed_p (loop));
203   if (!res)
204     return res;
205 
206   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
207   return opt_result::success ();
208 }
209 
210 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
211 
212 static void
vect_check_nonzero_value(loop_vec_info loop_vinfo,tree value)213 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
214 {
215   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
216   for (unsigned int i = 0; i < checks.length(); ++i)
217     if (checks[i] == value)
218       return;
219 
220   if (dump_enabled_p ())
221     dump_printf_loc (MSG_NOTE, vect_location,
222                          "need run-time check that %T is nonzero\n",
223                          value);
224   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
225 }
226 
227 /* Return true if we know that the order of vectorized DR_INFO_A and
228    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
229    DR_INFO_B.  At least one of the accesses is a write.  */
230 
231 static bool
vect_preserves_scalar_order_p(dr_vec_info * dr_info_a,dr_vec_info * dr_info_b)232 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
233 {
234   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
235   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
236 
237   /* Single statements are always kept in their original order.  */
238   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
239       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
240     return true;
241 
242   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
243      emitted at the position of the first scalar load.
244      Stores in a group are emitted at the position of the last scalar store.
245      Compute that position and check whether the resulting order matches
246      the current one.  */
247   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
248   if (il_a)
249     {
250       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
251           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
252                s = DR_GROUP_NEXT_ELEMENT (s))
253             il_a = get_later_stmt (il_a, s);
254       else /* DR_IS_READ */
255           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
256                s = DR_GROUP_NEXT_ELEMENT (s))
257             if (get_later_stmt (il_a, s) == il_a)
258               il_a = s;
259     }
260   else
261     il_a = stmtinfo_a;
262   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
263   if (il_b)
264     {
265       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
266           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
267                s = DR_GROUP_NEXT_ELEMENT (s))
268             il_b = get_later_stmt (il_b, s);
269       else /* DR_IS_READ */
270           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
271                s = DR_GROUP_NEXT_ELEMENT (s))
272             if (get_later_stmt (il_b, s) == il_b)
273               il_b = s;
274     }
275   else
276     il_b = stmtinfo_b;
277   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
278   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
279 }
280 
281 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
282    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
283    distances.  These distances are conservatively correct but they don't
284    reflect a guaranteed dependence.
285 
286    Return true if this function does all the work necessary to avoid
287    an alias or false if the caller should use the dependence distances
288    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
289    the depth of the loop described by LOOP_VINFO and the other arguments
290    are as for vect_analyze_data_ref_dependence.  */
291 
292 static bool
vect_analyze_possibly_independent_ddr(data_dependence_relation * ddr,loop_vec_info loop_vinfo,int loop_depth,unsigned int * max_vf)293 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
294                                                loop_vec_info loop_vinfo,
295                                                int loop_depth, unsigned int *max_vf)
296 {
297   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
298   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
299     {
300       int dist = dist_v[loop_depth];
301       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
302           {
303             /* If the user asserted safelen >= DIST consecutive iterations
304                can be executed concurrently, assume independence.
305 
306                ??? An alternative would be to add the alias check even
307                in this case, and vectorize the fallback loop with the
308                maximum VF set to safelen.  However, if the user has
309                explicitly given a length, it's less likely that that
310                would be a win.  */
311             if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
312               {
313                 if ((unsigned int) loop->safelen < *max_vf)
314                     *max_vf = loop->safelen;
315                 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
316                 continue;
317               }
318 
319             /* For dependence distances of 2 or more, we have the option
320                of limiting VF or checking for an alias at runtime.
321                Prefer to check at runtime if we can, to avoid limiting
322                the VF unnecessarily when the bases are in fact independent.
323 
324                Note that the alias checks will be removed if the VF ends up
325                being small enough.  */
326             dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
327             dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
328             return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
329                       && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
330                       && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
331           }
332     }
333   return true;
334 }
335 
336 
337 /* Function vect_analyze_data_ref_dependence.
338 
339    FIXME: I needed to change the sense of the returned flag.
340 
341    Return FALSE if there (might) exist a dependence between a memory-reference
342    DRA and a memory-reference DRB.  When versioning for alias may check a
343    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
344    the data dependence.  */
345 
346 static opt_result
vect_analyze_data_ref_dependence(struct data_dependence_relation * ddr,loop_vec_info loop_vinfo,unsigned int * max_vf)347 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
348                                           loop_vec_info loop_vinfo,
349                                           unsigned int *max_vf)
350 {
351   unsigned int i;
352   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
353   struct data_reference *dra = DDR_A (ddr);
354   struct data_reference *drb = DDR_B (ddr);
355   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
356   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
357   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
358   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
359   lambda_vector dist_v;
360   unsigned int loop_depth;
361 
362   /* If user asserted safelen consecutive iterations can be
363      executed concurrently, assume independence.  */
364   auto apply_safelen = [&]()
365     {
366       if (loop->safelen >= 2)
367           {
368             if ((unsigned int) loop->safelen < *max_vf)
369               *max_vf = loop->safelen;
370             LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
371             return true;
372           }
373       return false;
374     };
375 
376   /* In loop analysis all data references should be vectorizable.  */
377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
379     gcc_unreachable ();
380 
381   /* Independent data accesses.  */
382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
383     return opt_result::success ();
384 
385   if (dra == drb
386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
387     return opt_result::success ();
388 
389   /* We do not have to consider dependences between accesses that belong
390      to the same group, unless the stride could be smaller than the
391      group size.  */
392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
394             == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
396     return opt_result::success ();
397 
398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
399      least two scalar iterations, there is always also a true dependence.
400      As the vectorizer does not re-order loads and stores we can ignore
401      the anti-dependence if TBAA can disambiguate both DRs similar to the
402      case with known negative distance anti-dependences (positive
403      distance anti-dependences would violate TBAA constraints).  */
404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
407                                          get_alias_set (DR_REF (drb))))
408     return opt_result::success ();
409 
410   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
411       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
412     {
413       if (apply_safelen ())
414           return opt_result::success ();
415 
416       return opt_result::failure_at
417           (stmtinfo_a->stmt,
418            "possible alias involving gather/scatter between %T and %T\n",
419            DR_REF (dra), DR_REF (drb));
420     }
421 
422   /* Unknown data dependence.  */
423   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
424     {
425       if (apply_safelen ())
426           return opt_result::success ();
427 
428       if (dump_enabled_p ())
429           dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
430                                "versioning for alias required: "
431                                "can't determine dependence between %T and %T\n",
432                                DR_REF (dra), DR_REF (drb));
433 
434       /* Add to list of ddrs that need to be tested at run-time.  */
435       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
436     }
437 
438   /* Known data dependence.  */
439   if (DDR_NUM_DIST_VECTS (ddr) == 0)
440     {
441       if (apply_safelen ())
442           return opt_result::success ();
443 
444       if (dump_enabled_p ())
445           dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
446                                "versioning for alias required: "
447                                "bad dist vector for %T and %T\n",
448                                DR_REF (dra), DR_REF (drb));
449       /* Add to list of ddrs that need to be tested at run-time.  */
450       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
451     }
452 
453   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
454 
455   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
456       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
457                                                             loop_depth, max_vf))
458     return opt_result::success ();
459 
460   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
461     {
462       int dist = dist_v[loop_depth];
463 
464       if (dump_enabled_p ())
465           dump_printf_loc (MSG_NOTE, vect_location,
466                          "dependence distance  = %d.\n", dist);
467 
468       if (dist == 0)
469           {
470             if (dump_enabled_p ())
471               dump_printf_loc (MSG_NOTE, vect_location,
472                                    "dependence distance == 0 between %T and %T\n",
473                                    DR_REF (dra), DR_REF (drb));
474 
475             /* When we perform grouped accesses and perform implicit CSE
476                by detecting equal accesses and doing disambiguation with
477                runtime alias tests like for
478                   .. = a[i];
479                     .. = a[i+1];
480                     a[i] = ..;
481                     a[i+1] = ..;
482                     *p = ..;
483                     .. = a[i];
484                     .. = a[i+1];
485                where we will end up loading { a[i], a[i+1] } once, make
486                sure that inserting group loads before the first load and
487                stores after the last store will do the right thing.
488                Similar for groups like
489                   a[i] = ...;
490                     ... = a[i];
491                     a[i+1] = ...;
492                where loads from the group interleave with the store.  */
493             if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
494               return opt_result::failure_at (stmtinfo_a->stmt,
495                                                      "READ_WRITE dependence"
496                                                      " in interleaving.\n");
497 
498             if (loop->safelen < 2)
499               {
500                 tree indicator = dr_zero_step_indicator (dra);
501                 if (!indicator || integer_zerop (indicator))
502                     return opt_result::failure_at (stmtinfo_a->stmt,
503                                                          "access also has a zero step\n");
504                 else if (TREE_CODE (indicator) != INTEGER_CST)
505                     vect_check_nonzero_value (loop_vinfo, indicator);
506               }
507             continue;
508           }
509 
510       if (dist > 0 && DDR_REVERSED_P (ddr))
511           {
512             /* If DDR_REVERSED_P the order of the data-refs in DDR was
513                reversed (to make distance vector positive), and the actual
514                distance is negative.  */
515             if (dump_enabled_p ())
516               dump_printf_loc (MSG_NOTE, vect_location,
517                                "dependence distance negative.\n");
518             /* When doing outer loop vectorization, we need to check if there is
519                a backward dependence at the inner loop level if the dependence
520                at the outer loop is reversed.  See PR81740.  */
521             if (nested_in_vect_loop_p (loop, stmtinfo_a)
522                 || nested_in_vect_loop_p (loop, stmtinfo_b))
523               {
524                 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
525                                                                        DDR_LOOP_NEST (ddr));
526                 if (dist_v[inner_depth] < 0)
527                     return opt_result::failure_at (stmtinfo_a->stmt,
528                                                          "not vectorized, dependence "
529                                                          "between data-refs %T and %T\n",
530                                                          DR_REF (dra), DR_REF (drb));
531               }
532             /* Record a negative dependence distance to later limit the
533                amount of stmt copying / unrolling we can perform.
534                Only need to handle read-after-write dependence.  */
535             if (DR_IS_READ (drb)
536                 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
537                       || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
538               STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
539             continue;
540           }
541 
542       unsigned int abs_dist = abs (dist);
543       if (abs_dist >= 2 && abs_dist < *max_vf)
544           {
545             /* The dependence distance requires reduction of the maximal
546                vectorization factor.  */
547             *max_vf = abs_dist;
548             if (dump_enabled_p ())
549               dump_printf_loc (MSG_NOTE, vect_location,
550                                "adjusting maximal vectorization factor to %i\n",
551                                *max_vf);
552           }
553 
554       if (abs_dist >= *max_vf)
555           {
556             /* Dependence distance does not create dependence, as far as
557                vectorization is concerned, in this case.  */
558             if (dump_enabled_p ())
559               dump_printf_loc (MSG_NOTE, vect_location,
560                                "dependence distance >= VF.\n");
561             continue;
562           }
563 
564       return opt_result::failure_at (stmtinfo_a->stmt,
565                                              "not vectorized, possible dependence "
566                                              "between data-refs %T and %T\n",
567                                              DR_REF (dra), DR_REF (drb));
568     }
569 
570   return opt_result::success ();
571 }
572 
573 /* Function vect_analyze_data_ref_dependences.
574 
575    Examine all the data references in the loop, and make sure there do not
576    exist any data dependences between them.  Set *MAX_VF according to
577    the maximum vectorization factor the data dependences allow.  */
578 
579 opt_result
vect_analyze_data_ref_dependences(loop_vec_info loop_vinfo,unsigned int * max_vf)580 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
581                                            unsigned int *max_vf)
582 {
583   unsigned int i;
584   struct data_dependence_relation *ddr;
585 
586   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
587 
588   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
589     {
590       LOOP_VINFO_DDRS (loop_vinfo)
591           .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
592                      * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
593       /* We do not need read-read dependences.  */
594       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
595                                                     &LOOP_VINFO_DDRS (loop_vinfo),
596                                                     LOOP_VINFO_LOOP_NEST (loop_vinfo),
597                                                     false);
598       gcc_assert (res);
599     }
600 
601   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
602 
603   /* For epilogues we either have no aliases or alias versioning
604      was applied to original loop.  Therefore we may just get max_vf
605      using VF of original loop.  */
606   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
607     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
608   else
609     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
610       {
611           opt_result res
612             = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
613           if (!res)
614             return res;
615       }
616 
617   return opt_result::success ();
618 }
619 
620 
621 /* Function vect_slp_analyze_data_ref_dependence.
622 
623    Return TRUE if there (might) exist a dependence between a memory-reference
624    DRA and a memory-reference DRB for VINFO.  When versioning for alias
625    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
626    according to the data dependence.  */
627 
628 static bool
vect_slp_analyze_data_ref_dependence(vec_info * vinfo,struct data_dependence_relation * ddr)629 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
630                                               struct data_dependence_relation *ddr)
631 {
632   struct data_reference *dra = DDR_A (ddr);
633   struct data_reference *drb = DDR_B (ddr);
634   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
635   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
636 
637   /* We need to check dependences of statements marked as unvectorizable
638      as well, they still can prohibit vectorization.  */
639 
640   /* Independent data accesses.  */
641   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
642     return false;
643 
644   if (dra == drb)
645     return false;
646 
647   /* Read-read is OK.  */
648   if (DR_IS_READ (dra) && DR_IS_READ (drb))
649     return false;
650 
651   /* If dra and drb are part of the same interleaving chain consider
652      them independent.  */
653   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
654       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
655             == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
656     return false;
657 
658   /* Unknown data dependence.  */
659   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
660     {
661       if  (dump_enabled_p ())
662           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663                                "can't determine dependence between %T and %T\n",
664                                DR_REF (dra), DR_REF (drb));
665     }
666   else if (dump_enabled_p ())
667     dump_printf_loc (MSG_NOTE, vect_location,
668                          "determined dependence between %T and %T\n",
669                          DR_REF (dra), DR_REF (drb));
670 
671   return true;
672 }
673 
674 
675 /* Analyze dependences involved in the transform of a store SLP NODE.  */
676 
677 static bool
vect_slp_analyze_store_dependences(vec_info * vinfo,slp_tree node)678 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
679 {
680   /* This walks over all stmts involved in the SLP store done
681      in NODE verifying we can sink them up to the last stmt in the
682      group.  */
683   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
684   gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
685 
686   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
687     {
688       stmt_vec_info access_info
689           = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
690       if (access_info == last_access_info)
691           continue;
692       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
693       ao_ref ref;
694       bool ref_initialized_p = false;
695       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
696              gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
697           {
698             gimple *stmt = gsi_stmt (gsi);
699             if (! gimple_vuse (stmt))
700               continue;
701 
702             /* If we couldn't record a (single) data reference for this
703                stmt we have to resort to the alias oracle.  */
704             stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
705             data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
706             if (!dr_b)
707               {
708                 /* We are moving a store - this means
709                      we cannot use TBAA for disambiguation.  */
710                 if (!ref_initialized_p)
711                     ao_ref_init (&ref, DR_REF (dr_a));
712                 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
713                       || ref_maybe_used_by_stmt_p (stmt, &ref, false))
714                     return false;
715                 continue;
716               }
717 
718             gcc_assert (!gimple_visited_p (stmt));
719 
720             ddr_p ddr = initialize_data_dependence_relation (dr_a,
721                                                                          dr_b, vNULL);
722             bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
723             free_dependence_relation (ddr);
724             if (dependent)
725               return false;
726           }
727     }
728   return true;
729 }
730 
731 /* Analyze dependences involved in the transform of a load SLP NODE.  STORES
732    contain the vector of scalar stores of this instance if we are
733    disambiguating the loads.  */
734 
735 static bool
vect_slp_analyze_load_dependences(vec_info * vinfo,slp_tree node,vec<stmt_vec_info> stores,stmt_vec_info last_store_info)736 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
737                                            vec<stmt_vec_info> stores,
738                                            stmt_vec_info last_store_info)
739 {
740   /* This walks over all stmts involved in the SLP load done
741      in NODE verifying we can hoist them up to the first stmt in the
742      group.  */
743   stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
744   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
745 
746   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
747     {
748       stmt_vec_info access_info
749           = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
750       if (access_info == first_access_info)
751           continue;
752       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
753       ao_ref ref;
754       bool ref_initialized_p = false;
755       hash_set<stmt_vec_info> grp_visited;
756       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
757              gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
758           {
759             gimple *stmt = gsi_stmt (gsi);
760             if (! gimple_vdef (stmt))
761               continue;
762 
763             stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
764 
765             /* If we run into a store of this same instance (we've just
766                marked those) then delay dependence checking until we run
767                into the last store because this is where it will have
768                been sunk to (and we verified that we can do that already).  */
769             if (gimple_visited_p (stmt))
770               {
771                 if (stmt_info != last_store_info)
772                     continue;
773 
774                 for (stmt_vec_info &store_info : stores)
775                     {
776                       data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
777                       ddr_p ddr = initialize_data_dependence_relation
778                                         (dr_a, store_dr, vNULL);
779                       bool dependent
780                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
781                       free_dependence_relation (ddr);
782                       if (dependent)
783                         return false;
784                     }
785                 continue;
786               }
787 
788             auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
789               {
790                 /* We are hoisting a load - this means we can use TBAA for
791                      disambiguation.  */
792                 if (!ref_initialized_p)
793                     ao_ref_init (&ref, DR_REF (dr_a));
794                 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
795                     {
796                       /* If we couldn't record a (single) data reference for this
797                          stmt we have to give up now.  */
798                       data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
799                       if (!dr_b)
800                         return false;
801                       ddr_p ddr = initialize_data_dependence_relation (dr_a,
802                                                                                    dr_b, vNULL);
803                       bool dependent
804                         = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
805                       free_dependence_relation (ddr);
806                       if (dependent)
807                         return false;
808                     }
809                 /* No dependence.  */
810                 return true;
811               };
812             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
813               {
814                 /* When we run into a store group we have to honor
815                      that earlier stores might be moved here.  We don't
816                      know exactly which and where to since we lack a
817                      back-mapping from DR to SLP node, so assume all
818                      earlier stores are sunk here.  It's enough to
819                      consider the last stmt of a group for this.
820                      ???  Both this and the fact that we disregard that
821                      the conflicting instance might be removed later
822                      is overly conservative.  */
823                 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
824                     for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
825                          store_info != NULL;
826                          store_info = DR_GROUP_NEXT_ELEMENT (store_info))
827                       if ((store_info == stmt_info
828                            || get_later_stmt (store_info, stmt_info) == stmt_info)
829                           && !check_hoist (store_info))
830                         return false;
831               }
832             else
833               {
834                 if (!check_hoist (stmt_info))
835                     return false;
836               }
837           }
838     }
839   return true;
840 }
841 
842 
843 /* Function vect_analyze_data_ref_dependences.
844 
845    Examine all the data references in the basic-block, and make sure there
846    do not exist any data dependences between them.  Set *MAX_VF according to
847    the maximum vectorization factor the data dependences allow.  */
848 
849 bool
vect_slp_analyze_instance_dependence(vec_info * vinfo,slp_instance instance)850 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
851 {
852   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
853 
854   /* The stores of this instance are at the root of the SLP tree.  */
855   slp_tree store = NULL;
856   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
857     store = SLP_INSTANCE_TREE (instance);
858 
859   /* Verify we can sink stores to the vectorized stmt insert location.  */
860   stmt_vec_info last_store_info = NULL;
861   if (store)
862     {
863       if (! vect_slp_analyze_store_dependences (vinfo, store))
864           return false;
865 
866       /* Mark stores in this instance and remember the last one.  */
867       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
868       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
869           gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
870     }
871 
872   bool res = true;
873 
874   /* Verify we can sink loads to the vectorized stmt insert location,
875      special-casing stores of this instance.  */
876   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
877     if (! vect_slp_analyze_load_dependences (vinfo, load,
878                                                        store
879                                                        ? SLP_TREE_SCALAR_STMTS (store)
880                                                        : vNULL, last_store_info))
881       {
882           res = false;
883           break;
884       }
885 
886   /* Unset the visited flag.  */
887   if (store)
888     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
889       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
890 
891   return res;
892 }
893 
894 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
895    applied.  */
896 
897 int
dr_misalignment(dr_vec_info * dr_info,tree vectype,poly_int64 offset)898 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
899 {
900   HOST_WIDE_INT diff = 0;
901   /* Alignment is only analyzed for the first element of a DR group,
902      use that but adjust misalignment by the offset of the access.  */
903   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
904     {
905       dr_vec_info *first_dr
906           = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
907       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
908            INTEGER_CSTs and the first element in the group has the lowest
909            address.  */
910       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
911                 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
912       gcc_assert (diff >= 0);
913       dr_info = first_dr;
914     }
915 
916   int misalign = dr_info->misalignment;
917   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
918   if (misalign == DR_MISALIGNMENT_UNKNOWN)
919     return misalign;
920 
921   /* If the access is only aligned for a vector type with smaller alignment
922      requirement the access has unknown misalignment.  */
923   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
924                     targetm.vectorize.preferred_vector_alignment (vectype)))
925     return DR_MISALIGNMENT_UNKNOWN;
926 
927   /* Apply the offset from the DR group start and the externally supplied
928      offset which can for example result from a negative stride access.  */
929   poly_int64 misalignment = misalign + diff + offset;
930 
931   /* vect_compute_data_ref_alignment will have ensured that target_alignment
932      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
933   unsigned HOST_WIDE_INT target_alignment_c
934     = dr_info->target_alignment.to_constant ();
935   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
936     return DR_MISALIGNMENT_UNKNOWN;
937   return misalign;
938 }
939 
940 /* Record the base alignment guarantee given by DRB, which occurs
941    in STMT_INFO.  */
942 
943 static void
vect_record_base_alignment(vec_info * vinfo,stmt_vec_info stmt_info,innermost_loop_behavior * drb)944 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
945                                   innermost_loop_behavior *drb)
946 {
947   bool existed;
948   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
949     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
950   if (!existed || entry.second->base_alignment < drb->base_alignment)
951     {
952       entry = std::make_pair (stmt_info, drb);
953       if (dump_enabled_p ())
954           dump_printf_loc (MSG_NOTE, vect_location,
955                                "recording new base alignment for %T\n"
956                                "  alignment:    %d\n"
957                                "  misalignment: %d\n"
958                                "  based on:     %G",
959                                drb->base_address,
960                                drb->base_alignment,
961                                drb->base_misalignment,
962                                stmt_info->stmt);
963     }
964 }
965 
966 /* If the region we're going to vectorize is reached, all unconditional
967    data references occur at least once.  We can therefore pool the base
968    alignment guarantees from each unconditional reference.  Do this by
969    going through all the data references in VINFO and checking whether
970    the containing statement makes the reference unconditionally.  If so,
971    record the alignment of the base address in VINFO so that it can be
972    used for all other references with the same base.  */
973 
974 void
vect_record_base_alignments(vec_info * vinfo)975 vect_record_base_alignments (vec_info *vinfo)
976 {
977   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
978   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
979   for (data_reference *dr : vinfo->shared->datarefs)
980     {
981       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
982       stmt_vec_info stmt_info = dr_info->stmt;
983       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
984             && STMT_VINFO_VECTORIZABLE (stmt_info)
985             && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
986           {
987             vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
988 
989             /* If DR is nested in the loop that is being vectorized, we can also
990                record the alignment of the base wrt the outer loop.  */
991             if (loop && nested_in_vect_loop_p (loop, stmt_info))
992               vect_record_base_alignment
993                 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
994           }
995     }
996 }
997 
998 /* Function vect_compute_data_ref_alignment
999 
1000    Compute the misalignment of the data reference DR_INFO when vectorizing
1001    with VECTYPE.
1002 
1003    Output:
1004    1. initialized misalignment info for DR_INFO
1005 
1006    FOR NOW: No analysis is actually performed. Misalignment is calculated
1007    only for trivial cases. TODO.  */
1008 
1009 static void
vect_compute_data_ref_alignment(vec_info * vinfo,dr_vec_info * dr_info,tree vectype)1010 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1011                                          tree vectype)
1012 {
1013   stmt_vec_info stmt_info = dr_info->stmt;
1014   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1015   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1016   class loop *loop = NULL;
1017   tree ref = DR_REF (dr_info->dr);
1018 
1019   if (dump_enabled_p ())
1020     dump_printf_loc (MSG_NOTE, vect_location,
1021                      "vect_compute_data_ref_alignment:\n");
1022 
1023   if (loop_vinfo)
1024     loop = LOOP_VINFO_LOOP (loop_vinfo);
1025 
1026   /* Initialize misalignment to unknown.  */
1027   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1028 
1029   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1030     return;
1031 
1032   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1033   bool step_preserves_misalignment_p;
1034 
1035   poly_uint64 vector_alignment
1036     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1037                      BITS_PER_UNIT);
1038   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1039 
1040   /* If the main loop has peeled for alignment we have no way of knowing
1041      whether the data accesses in the epilogues are aligned.  We can't at
1042      compile time answer the question whether we have entered the main loop or
1043      not.  Fixes PR 92351.  */
1044   if (loop_vinfo)
1045     {
1046       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1047       if (orig_loop_vinfo
1048             && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1049           return;
1050     }
1051 
1052   unsigned HOST_WIDE_INT vect_align_c;
1053   if (!vector_alignment.is_constant (&vect_align_c))
1054     return;
1055 
1056   /* No step for BB vectorization.  */
1057   if (!loop)
1058     {
1059       gcc_assert (integer_zerop (drb->step));
1060       step_preserves_misalignment_p = true;
1061     }
1062 
1063   /* In case the dataref is in an inner-loop of the loop that is being
1064      vectorized (LOOP), we use the base and misalignment information
1065      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1066      stays the same throughout the execution of the inner-loop, which is why
1067      we have to check that the stride of the dataref in the inner-loop evenly
1068      divides by the vector alignment.  */
1069   else if (nested_in_vect_loop_p (loop, stmt_info))
1070     {
1071       step_preserves_misalignment_p
1072           = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1073 
1074       if (dump_enabled_p ())
1075           {
1076             if (step_preserves_misalignment_p)
1077               dump_printf_loc (MSG_NOTE, vect_location,
1078                                    "inner step divides the vector alignment.\n");
1079             else
1080               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1081                                    "inner step doesn't divide the vector"
1082                                    " alignment.\n");
1083           }
1084     }
1085 
1086   /* Similarly we can only use base and misalignment information relative to
1087      an innermost loop if the misalignment stays the same throughout the
1088      execution of the loop.  As above, this is the case if the stride of
1089      the dataref evenly divides by the alignment.  */
1090   else
1091     {
1092       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1093       step_preserves_misalignment_p
1094           = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1095 
1096       if (!step_preserves_misalignment_p && dump_enabled_p ())
1097           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1098                                "step doesn't divide the vector alignment.\n");
1099     }
1100 
1101   unsigned int base_alignment = drb->base_alignment;
1102   unsigned int base_misalignment = drb->base_misalignment;
1103 
1104   /* Calculate the maximum of the pooled base address alignment and the
1105      alignment that we can compute for DR itself.  */
1106   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1107     = base_alignments->get (drb->base_address);
1108   if (entry
1109       && base_alignment < (*entry).second->base_alignment
1110       && (loop_vinfo
1111             || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1112                                     gimple_bb (entry->first->stmt))
1113                 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1114                       || (entry->first->dr_aux.group <= dr_info->group)))))
1115     {
1116       base_alignment = entry->second->base_alignment;
1117       base_misalignment = entry->second->base_misalignment;
1118     }
1119 
1120   if (drb->offset_alignment < vect_align_c
1121       || !step_preserves_misalignment_p
1122       /* We need to know whether the step wrt the vectorized loop is
1123            negative when computing the starting misalignment below.  */
1124       || TREE_CODE (drb->step) != INTEGER_CST)
1125     {
1126       if (dump_enabled_p ())
1127           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128                                "Unknown alignment for access: %T\n", ref);
1129       return;
1130     }
1131 
1132   if (base_alignment < vect_align_c)
1133     {
1134       unsigned int max_alignment;
1135       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1136       if (max_alignment < vect_align_c
1137             || !vect_can_force_dr_alignment_p (base,
1138                                                        vect_align_c * BITS_PER_UNIT))
1139           {
1140             if (dump_enabled_p ())
1141               dump_printf_loc (MSG_NOTE, vect_location,
1142                                    "can't force alignment of ref: %T\n", ref);
1143             return;
1144           }
1145 
1146       /* Force the alignment of the decl.
1147            NOTE: This is the only change to the code we make during
1148            the analysis phase, before deciding to vectorize the loop.  */
1149       if (dump_enabled_p ())
1150           dump_printf_loc (MSG_NOTE, vect_location,
1151                                "force alignment of %T\n", ref);
1152 
1153       dr_info->base_decl = base;
1154       dr_info->base_misaligned = true;
1155       base_misalignment = 0;
1156     }
1157   poly_int64 misalignment
1158     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1159 
1160   unsigned int const_misalignment;
1161   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1162     {
1163       if (dump_enabled_p ())
1164           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165                                "Non-constant misalignment for access: %T\n", ref);
1166       return;
1167     }
1168 
1169   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1170 
1171   if (dump_enabled_p ())
1172     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173                          "misalign = %d bytes of ref %T\n",
1174                          const_misalignment, ref);
1175 
1176   return;
1177 }
1178 
1179 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1180    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1181    is made aligned via peeling.  */
1182 
1183 static bool
vect_dr_aligned_if_related_peeled_dr_is(dr_vec_info * dr_info,dr_vec_info * dr_peel_info)1184 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1185                                                    dr_vec_info *dr_peel_info)
1186 {
1187   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1188                       DR_TARGET_ALIGNMENT (dr_info)))
1189     {
1190       poly_offset_int diff
1191           = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1192              - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1193       if (known_eq (diff, 0)
1194             || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1195           return true;
1196     }
1197   return false;
1198 }
1199 
1200 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1201    aligned via peeling.  */
1202 
1203 static bool
vect_dr_aligned_if_peeled_dr_is(dr_vec_info * dr_info,dr_vec_info * dr_peel_info)1204 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1205                                          dr_vec_info *dr_peel_info)
1206 {
1207   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1208                               DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1209       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1210                                  DR_OFFSET (dr_peel_info->dr), 0)
1211       || !operand_equal_p (DR_STEP (dr_info->dr),
1212                                  DR_STEP (dr_peel_info->dr), 0))
1213     return false;
1214 
1215   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1216 }
1217 
1218 /* Compute the value for dr_info->misalign so that the access appears
1219    aligned.  This is used by peeling to compensate for dr_misalignment
1220    applying the offset for negative step.  */
1221 
1222 int
vect_dr_misalign_for_aligned_access(dr_vec_info * dr_info)1223 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1224 {
1225   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1226     return 0;
1227 
1228   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1229   poly_int64 misalignment
1230     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1231        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1232 
1233   unsigned HOST_WIDE_INT target_alignment_c;
1234   int misalign;
1235   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1236       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1237     return DR_MISALIGNMENT_UNKNOWN;
1238   return misalign;
1239 }
1240 
1241 /* Function vect_update_misalignment_for_peel.
1242    Sets DR_INFO's misalignment
1243    - to 0 if it has the same alignment as DR_PEEL_INFO,
1244    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1245    - to -1 (unknown) otherwise.
1246 
1247    DR_INFO - the data reference whose misalignment is to be adjusted.
1248    DR_PEEL_INFO - the data reference whose misalignment is being made
1249                       zero in the vector loop by the peel.
1250    NPEEL - the number of iterations in the peel loop if the misalignment
1251            of DR_PEEL_INFO is known at compile time.  */
1252 
1253 static void
vect_update_misalignment_for_peel(dr_vec_info * dr_info,dr_vec_info * dr_peel_info,int npeel)1254 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1255                                            dr_vec_info *dr_peel_info, int npeel)
1256 {
1257   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1258   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1259     {
1260       SET_DR_MISALIGNMENT (dr_info,
1261                                  vect_dr_misalign_for_aligned_access (dr_peel_info));
1262       return;
1263     }
1264 
1265   unsigned HOST_WIDE_INT alignment;
1266   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1267       && known_alignment_for_access_p (dr_info,
1268                                                STMT_VINFO_VECTYPE (dr_info->stmt))
1269       && known_alignment_for_access_p (dr_peel_info,
1270                                                STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1271     {
1272       int misal = dr_info->misalignment;
1273       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1274       misal &= alignment - 1;
1275       set_dr_misalignment (dr_info, misal);
1276       return;
1277     }
1278 
1279   if (dump_enabled_p ())
1280     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1281                          "to unknown (-1).\n");
1282   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1283 }
1284 
1285 /* Return true if alignment is relevant for DR_INFO.  */
1286 
1287 static bool
vect_relevant_for_alignment_p(dr_vec_info * dr_info)1288 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1289 {
1290   stmt_vec_info stmt_info = dr_info->stmt;
1291 
1292   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1293     return false;
1294 
1295   /* For interleaving, only the alignment of the first access matters.  */
1296   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1297       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1298     return false;
1299 
1300   /* Scatter-gather and invariant accesses continue to address individual
1301      scalars, so vector-level alignment is irrelevant.  */
1302   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1303       || integer_zerop (DR_STEP (dr_info->dr)))
1304     return false;
1305 
1306   /* Strided accesses perform only component accesses, alignment is
1307      irrelevant for them.  */
1308   if (STMT_VINFO_STRIDED_P (stmt_info)
1309       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1310     return false;
1311 
1312   return true;
1313 }
1314 
1315 /* Given an memory reference EXP return whether its alignment is less
1316    than its size.  */
1317 
1318 static bool
not_size_aligned(tree exp)1319 not_size_aligned (tree exp)
1320 {
1321   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1322     return true;
1323 
1324   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1325             > get_object_alignment (exp));
1326 }
1327 
1328 /* Function vector_alignment_reachable_p
1329 
1330    Return true if vector alignment for DR_INFO is reachable by peeling
1331    a few loop iterations.  Return false otherwise.  */
1332 
1333 static bool
vector_alignment_reachable_p(dr_vec_info * dr_info)1334 vector_alignment_reachable_p (dr_vec_info *dr_info)
1335 {
1336   stmt_vec_info stmt_info = dr_info->stmt;
1337   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1338 
1339   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1340     {
1341       /* For interleaved access we peel only if number of iterations in
1342            the prolog loop ({VF - misalignment}), is a multiple of the
1343            number of the interleaved accesses.  */
1344       int elem_size, mis_in_elements;
1345 
1346       /* FORNOW: handle only known alignment.  */
1347       if (!known_alignment_for_access_p (dr_info, vectype))
1348           return false;
1349 
1350       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1351       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1352       elem_size = vector_element_size (vector_size, nelements);
1353       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1354 
1355       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1356           return false;
1357     }
1358 
1359   /* If misalignment is known at the compile time then allow peeling
1360      only if natural alignment is reachable through peeling.  */
1361   if (known_alignment_for_access_p (dr_info, vectype)
1362       && !aligned_access_p (dr_info, vectype))
1363     {
1364       HOST_WIDE_INT elmsize =
1365                     int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1366       if (dump_enabled_p ())
1367           {
1368             dump_printf_loc (MSG_NOTE, vect_location,
1369                              "data size = %wd. misalignment = %d.\n", elmsize,
1370                                  dr_misalignment (dr_info, vectype));
1371           }
1372       if (dr_misalignment (dr_info, vectype) % elmsize)
1373           {
1374             if (dump_enabled_p ())
1375               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1376                                "data size does not divide the misalignment.\n");
1377             return false;
1378           }
1379     }
1380 
1381   if (!known_alignment_for_access_p (dr_info, vectype))
1382     {
1383       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1384       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1385       if (dump_enabled_p ())
1386           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387                            "Unknown misalignment, %snaturally aligned\n",
1388                                is_packed ? "not " : "");
1389       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1390     }
1391 
1392   return true;
1393 }
1394 
1395 
1396 /* Calculate the cost of the memory access represented by DR_INFO.  */
1397 
1398 static void
vect_get_data_access_cost(vec_info * vinfo,dr_vec_info * dr_info,dr_alignment_support alignment_support_scheme,int misalignment,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec,stmt_vector_for_cost * prologue_cost_vec)1399 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1400                                  dr_alignment_support alignment_support_scheme,
1401                                  int misalignment,
1402                                  unsigned int *inside_cost,
1403                            unsigned int *outside_cost,
1404                                  stmt_vector_for_cost *body_cost_vec,
1405                                  stmt_vector_for_cost *prologue_cost_vec)
1406 {
1407   stmt_vec_info stmt_info = dr_info->stmt;
1408   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1409   int ncopies;
1410 
1411   if (PURE_SLP_STMT (stmt_info))
1412     ncopies = 1;
1413   else
1414     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1415 
1416   if (DR_IS_READ (dr_info->dr))
1417     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1418                               misalignment, true, inside_cost,
1419                               outside_cost, prologue_cost_vec, body_cost_vec, false);
1420   else
1421     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1422                                misalignment, inside_cost, body_cost_vec);
1423 
1424   if (dump_enabled_p ())
1425     dump_printf_loc (MSG_NOTE, vect_location,
1426                      "vect_get_data_access_cost: inside_cost = %d, "
1427                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1428 }
1429 
1430 
1431 typedef struct _vect_peel_info
1432 {
1433   dr_vec_info *dr_info;
1434   int npeel;
1435   unsigned int count;
1436 } *vect_peel_info;
1437 
1438 typedef struct _vect_peel_extended_info
1439 {
1440   vec_info *vinfo;
1441   struct _vect_peel_info peel_info;
1442   unsigned int inside_cost;
1443   unsigned int outside_cost;
1444 } *vect_peel_extended_info;
1445 
1446 
1447 /* Peeling hashtable helpers.  */
1448 
1449 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1450 {
1451   static inline hashval_t hash (const _vect_peel_info *);
1452   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1453 };
1454 
1455 inline hashval_t
hash(const _vect_peel_info * peel_info)1456 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1457 {
1458   return (hashval_t) peel_info->npeel;
1459 }
1460 
1461 inline bool
equal(const _vect_peel_info * a,const _vect_peel_info * b)1462 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1463 {
1464   return (a->npeel == b->npeel);
1465 }
1466 
1467 
1468 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1469 
1470 static void
vect_peeling_hash_insert(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo,dr_vec_info * dr_info,int npeel,bool supportable_if_not_aligned)1471 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1472                                 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1473                                 int npeel, bool supportable_if_not_aligned)
1474 {
1475   struct _vect_peel_info elem, *slot;
1476   _vect_peel_info **new_slot;
1477 
1478   elem.npeel = npeel;
1479   slot = peeling_htab->find (&elem);
1480   if (slot)
1481     slot->count++;
1482   else
1483     {
1484       slot = XNEW (struct _vect_peel_info);
1485       slot->npeel = npeel;
1486       slot->dr_info = dr_info;
1487       slot->count = 1;
1488       new_slot = peeling_htab->find_slot (slot, INSERT);
1489       *new_slot = slot;
1490     }
1491 
1492   /* If this DR is not supported with unknown misalignment then bias
1493      this slot when the cost model is disabled.  */
1494   if (!supportable_if_not_aligned
1495       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1496     slot->count += VECT_MAX_COST;
1497 }
1498 
1499 
1500 /* Traverse peeling hash table to find peeling option that aligns maximum
1501    number of data accesses.  */
1502 
1503 int
vect_peeling_hash_get_most_frequent(_vect_peel_info ** slot,_vect_peel_extended_info * max)1504 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1505                                              _vect_peel_extended_info *max)
1506 {
1507   vect_peel_info elem = *slot;
1508 
1509   if (elem->count > max->peel_info.count
1510       || (elem->count == max->peel_info.count
1511           && max->peel_info.npeel > elem->npeel))
1512     {
1513       max->peel_info.npeel = elem->npeel;
1514       max->peel_info.count = elem->count;
1515       max->peel_info.dr_info = elem->dr_info;
1516     }
1517 
1518   return 1;
1519 }
1520 
1521 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1522    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1523    npeel is computed at runtime but DR0_INFO's misalignment will be zero
1524    after peeling.  */
1525 
1526 static void
vect_get_peeling_costs_all_drs(loop_vec_info loop_vinfo,dr_vec_info * dr0_info,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec,stmt_vector_for_cost * prologue_cost_vec,unsigned int npeel)1527 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1528                                         dr_vec_info *dr0_info,
1529                                         unsigned int *inside_cost,
1530                                         unsigned int *outside_cost,
1531                                         stmt_vector_for_cost *body_cost_vec,
1532                                         stmt_vector_for_cost *prologue_cost_vec,
1533                                         unsigned int npeel)
1534 {
1535   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1536 
1537   bool dr0_alignment_known_p
1538     = (dr0_info
1539        && known_alignment_for_access_p (dr0_info,
1540                                                   STMT_VINFO_VECTYPE (dr0_info->stmt)));
1541 
1542   for (data_reference *dr : datarefs)
1543     {
1544       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1545       if (!vect_relevant_for_alignment_p (dr_info))
1546           continue;
1547 
1548       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1549       dr_alignment_support alignment_support_scheme;
1550       int misalignment;
1551       unsigned HOST_WIDE_INT alignment;
1552 
1553       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1554                                                       size_zero_node) < 0;
1555       poly_int64 off = 0;
1556       if (negative)
1557           off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1558                  * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1559 
1560       if (npeel == 0)
1561           misalignment = dr_misalignment (dr_info, vectype, off);
1562       else if (dr_info == dr0_info
1563                  || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1564           misalignment = 0;
1565       else if (!dr0_alignment_known_p
1566                  || !known_alignment_for_access_p (dr_info, vectype)
1567                  || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1568           misalignment = DR_MISALIGNMENT_UNKNOWN;
1569       else
1570           {
1571             misalignment = dr_misalignment (dr_info, vectype, off);
1572             misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1573             misalignment &= alignment - 1;
1574           }
1575       alignment_support_scheme
1576           = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1577                                                    misalignment);
1578 
1579       vect_get_data_access_cost (loop_vinfo, dr_info,
1580                                          alignment_support_scheme, misalignment,
1581                                          inside_cost, outside_cost,
1582                                          body_cost_vec, prologue_cost_vec);
1583     }
1584 }
1585 
1586 /* Traverse peeling hash table and calculate cost for each peeling option.
1587    Find the one with the lowest cost.  */
1588 
1589 int
vect_peeling_hash_get_lowest_cost(_vect_peel_info ** slot,_vect_peel_extended_info * min)1590 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1591                                            _vect_peel_extended_info *min)
1592 {
1593   vect_peel_info elem = *slot;
1594   int dummy;
1595   unsigned int inside_cost = 0, outside_cost = 0;
1596   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1597   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1598                            epilogue_cost_vec;
1599 
1600   prologue_cost_vec.create (2);
1601   body_cost_vec.create (2);
1602   epilogue_cost_vec.create (2);
1603 
1604   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1605                                           &outside_cost, &body_cost_vec,
1606                                           &prologue_cost_vec, elem->npeel);
1607 
1608   body_cost_vec.release ();
1609 
1610   outside_cost += vect_get_known_peeling_cost
1611     (loop_vinfo, elem->npeel, &dummy,
1612      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1613      &prologue_cost_vec, &epilogue_cost_vec);
1614 
1615   /* Prologue and epilogue costs are added to the target model later.
1616      These costs depend only on the scalar iteration cost, the
1617      number of peeling iterations finally chosen, and the number of
1618      misaligned statements.  So discard the information found here.  */
1619   prologue_cost_vec.release ();
1620   epilogue_cost_vec.release ();
1621 
1622   if (inside_cost < min->inside_cost
1623       || (inside_cost == min->inside_cost
1624             && outside_cost < min->outside_cost))
1625     {
1626       min->inside_cost = inside_cost;
1627       min->outside_cost = outside_cost;
1628       min->peel_info.dr_info = elem->dr_info;
1629       min->peel_info.npeel = elem->npeel;
1630       min->peel_info.count = elem->count;
1631     }
1632 
1633   return 1;
1634 }
1635 
1636 
1637 /* Choose best peeling option by traversing peeling hash table and either
1638    choosing an option with the lowest cost (if cost model is enabled) or the
1639    option that aligns as many accesses as possible.  */
1640 
1641 static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo)1642 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1643                                                loop_vec_info loop_vinfo)
1644 {
1645    struct _vect_peel_extended_info res;
1646 
1647    res.peel_info.dr_info = NULL;
1648    res.vinfo = loop_vinfo;
1649 
1650    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1651      {
1652        res.inside_cost = INT_MAX;
1653        res.outside_cost = INT_MAX;
1654        peeling_htab->traverse <_vect_peel_extended_info *,
1655                                      vect_peeling_hash_get_lowest_cost> (&res);
1656      }
1657    else
1658      {
1659        res.peel_info.count = 0;
1660        peeling_htab->traverse <_vect_peel_extended_info *,
1661                                      vect_peeling_hash_get_most_frequent> (&res);
1662        res.inside_cost = 0;
1663        res.outside_cost = 0;
1664      }
1665 
1666    return res;
1667 }
1668 
1669 /* Return true if the new peeling NPEEL is supported.  */
1670 
1671 static bool
vect_peeling_supportable(loop_vec_info loop_vinfo,dr_vec_info * dr0_info,unsigned npeel)1672 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1673                                 unsigned npeel)
1674 {
1675   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1676   enum dr_alignment_support supportable_dr_alignment;
1677 
1678   bool dr0_alignment_known_p
1679     = known_alignment_for_access_p (dr0_info,
1680                                             STMT_VINFO_VECTYPE (dr0_info->stmt));
1681 
1682   /* Ensure that all data refs can be vectorized after the peel.  */
1683   for (data_reference *dr : datarefs)
1684     {
1685       if (dr == dr0_info->dr)
1686           continue;
1687 
1688       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1689       if (!vect_relevant_for_alignment_p (dr_info)
1690             || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1691           continue;
1692 
1693       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1694       int misalignment;
1695       unsigned HOST_WIDE_INT alignment;
1696       if (!dr0_alignment_known_p
1697             || !known_alignment_for_access_p (dr_info, vectype)
1698             || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1699           misalignment = DR_MISALIGNMENT_UNKNOWN;
1700       else
1701           {
1702             misalignment = dr_misalignment (dr_info, vectype);
1703             misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1704             misalignment &= alignment - 1;
1705           }
1706       supportable_dr_alignment
1707           = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1708                                                    misalignment);
1709       if (supportable_dr_alignment == dr_unaligned_unsupported)
1710           return false;
1711     }
1712 
1713   return true;
1714 }
1715 
1716 /* Compare two data-references DRA and DRB to group them into chunks
1717    with related alignment.  */
1718 
1719 static int
dr_align_group_sort_cmp(const void * dra_,const void * drb_)1720 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1721 {
1722   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1723   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1724   int cmp;
1725 
1726   /* Stabilize sort.  */
1727   if (dra == drb)
1728     return 0;
1729 
1730   /* Ordering of DRs according to base.  */
1731   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1732                                      DR_BASE_ADDRESS (drb));
1733   if (cmp != 0)
1734     return cmp;
1735 
1736   /* And according to DR_OFFSET.  */
1737   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1738   if (cmp != 0)
1739     return cmp;
1740 
1741   /* And after step.  */
1742   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1743   if (cmp != 0)
1744     return cmp;
1745 
1746   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1747   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1748   if (cmp == 0)
1749     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1750   return cmp;
1751 }
1752 
1753 /* Function vect_enhance_data_refs_alignment
1754 
1755    This pass will use loop versioning and loop peeling in order to enhance
1756    the alignment of data references in the loop.
1757 
1758    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1759    original loop is to be vectorized.  Any other loops that are created by
1760    the transformations performed in this pass - are not supposed to be
1761    vectorized.  This restriction will be relaxed.
1762 
1763    This pass will require a cost model to guide it whether to apply peeling
1764    or versioning or a combination of the two.  For example, the scheme that
1765    intel uses when given a loop with several memory accesses, is as follows:
1766    choose one memory access ('p') which alignment you want to force by doing
1767    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1768    other accesses are not necessarily aligned, or (2) use loop versioning to
1769    generate one loop in which all accesses are aligned, and another loop in
1770    which only 'p' is necessarily aligned.
1771 
1772    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1773    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1774    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1775 
1776    Devising a cost model is the most critical aspect of this work.  It will
1777    guide us on which access to peel for, whether to use loop versioning, how
1778    many versions to create, etc.  The cost model will probably consist of
1779    generic considerations as well as target specific considerations (on
1780    powerpc for example, misaligned stores are more painful than misaligned
1781    loads).
1782 
1783    Here are the general steps involved in alignment enhancements:
1784 
1785      -- original loop, before alignment analysis:
1786           for (i=0; i<N; i++){
1787             x = q[i];                             # DR_MISALIGNMENT(q) = unknown
1788             p[i] = y;                             # DR_MISALIGNMENT(p) = unknown
1789           }
1790 
1791      -- After vect_compute_data_refs_alignment:
1792           for (i=0; i<N; i++){
1793             x = q[i];                             # DR_MISALIGNMENT(q) = 3
1794             p[i] = y;                             # DR_MISALIGNMENT(p) = unknown
1795           }
1796 
1797      -- Possibility 1: we do loop versioning:
1798      if (p is aligned) {
1799           for (i=0; i<N; i++){          # loop 1A
1800             x = q[i];                             # DR_MISALIGNMENT(q) = 3
1801             p[i] = y;                             # DR_MISALIGNMENT(p) = 0
1802           }
1803      }
1804      else {
1805           for (i=0; i<N; i++){          # loop 1B
1806             x = q[i];                             # DR_MISALIGNMENT(q) = 3
1807             p[i] = y;                             # DR_MISALIGNMENT(p) = unaligned
1808           }
1809      }
1810 
1811      -- Possibility 2: we do loop peeling:
1812      for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1813           x = q[i];
1814           p[i] = y;
1815      }
1816      for (i = 3; i < N; i++){ # loop 2A
1817           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1818           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1819      }
1820 
1821      -- Possibility 3: combination of loop peeling and versioning:
1822      for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1823           x = q[i];
1824           p[i] = y;
1825      }
1826      if (p is aligned) {
1827           for (i = 3; i<N; i++){        # loop 3A
1828             x = q[i];                             # DR_MISALIGNMENT(q) = 0
1829             p[i] = y;                             # DR_MISALIGNMENT(p) = 0
1830           }
1831      }
1832      else {
1833           for (i = 3; i<N; i++){        # loop 3B
1834             x = q[i];                             # DR_MISALIGNMENT(q) = 0
1835             p[i] = y;                             # DR_MISALIGNMENT(p) = unaligned
1836           }
1837      }
1838 
1839      These loops are later passed to loop_transform to be vectorized.  The
1840      vectorizer will use the alignment information to guide the transformation
1841      (whether to generate regular loads/stores, or with special handling for
1842      misalignment).  */
1843 
1844 opt_result
vect_enhance_data_refs_alignment(loop_vec_info loop_vinfo)1845 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1846 {
1847   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1848   dr_vec_info *first_store = NULL;
1849   dr_vec_info *dr0_info = NULL;
1850   struct data_reference *dr;
1851   unsigned int i;
1852   bool do_peeling = false;
1853   bool do_versioning = false;
1854   unsigned int npeel = 0;
1855   bool one_misalignment_known = false;
1856   bool one_misalignment_unknown = false;
1857   bool one_dr_unsupportable = false;
1858   dr_vec_info *unsupportable_dr_info = NULL;
1859   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1860   hash_table<peel_info_hasher> peeling_htab (1);
1861 
1862   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1863 
1864   /* Reset data so we can safely be called multiple times.  */
1865   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1866   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1867 
1868   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1869     return opt_result::success ();
1870 
1871   /* Sort the vector of datarefs so DRs that have the same or dependent
1872      alignment are next to each other.  */
1873   auto_vec<data_reference_p> datarefs
1874     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1875   datarefs.qsort (dr_align_group_sort_cmp);
1876 
1877   /* Compute the number of DRs that become aligned when we peel
1878      a dataref so it becomes aligned.  */
1879   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1880   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1881   unsigned i0;
1882   for (i0 = 0; i0 < datarefs.length (); ++i0)
1883     if (DR_BASE_ADDRESS (datarefs[i0]))
1884       break;
1885   for (i = i0 + 1; i <= datarefs.length (); ++i)
1886     {
1887       if (i == datarefs.length ()
1888             || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1889                                      DR_BASE_ADDRESS (datarefs[i]), 0)
1890             || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1891                                      DR_OFFSET (datarefs[i]), 0)
1892             || !operand_equal_p (DR_STEP (datarefs[i0]),
1893                                      DR_STEP (datarefs[i]), 0))
1894           {
1895             /* The subgroup [i0, i-1] now only differs in DR_INIT and
1896                possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1897                will get known misalignment if we align one of the refs
1898                with the largest DR_TARGET_ALIGNMENT.  */
1899             for (unsigned j = i0; j < i; ++j)
1900               {
1901                 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1902                 for (unsigned k = i0; k < i; ++k)
1903                     {
1904                       if (k == j)
1905                         continue;
1906                       dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1907                       if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1908                                                                              dr_infoj))
1909                         n_same_align_refs[j]++;
1910                     }
1911               }
1912             i0 = i;
1913           }
1914     }
1915 
1916   /* While cost model enhancements are expected in the future, the high level
1917      view of the code at this time is as follows:
1918 
1919      A) If there is a misaligned access then see if peeling to align
1920         this access can make all data references satisfy
1921         vect_supportable_dr_alignment.  If so, update data structures
1922         as needed and return true.
1923 
1924      B) If peeling wasn't possible and there is a data reference with an
1925         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1926         then see if loop versioning checks can be used to make all data
1927         references satisfy vect_supportable_dr_alignment.  If so, update
1928         data structures as needed and return true.
1929 
1930      C) If neither peeling nor versioning were successful then return false if
1931         any data reference does not satisfy vect_supportable_dr_alignment.
1932 
1933      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1934 
1935      Note, Possibility 3 above (which is peeling and versioning together) is not
1936      being done at this time.  */
1937 
1938   /* (1) Peeling to force alignment.  */
1939 
1940   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1941      Considerations:
1942      + How many accesses will become aligned due to the peeling
1943      - How many accesses will become unaligned due to the peeling,
1944        and the cost of misaligned accesses.
1945      - The cost of peeling (the extra runtime checks, the increase
1946        in code size).  */
1947 
1948   FOR_EACH_VEC_ELT (datarefs, i, dr)
1949     {
1950       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1951       if (!vect_relevant_for_alignment_p (dr_info))
1952           continue;
1953 
1954       stmt_vec_info stmt_info = dr_info->stmt;
1955       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1956       do_peeling = vector_alignment_reachable_p (dr_info);
1957       if (do_peeling)
1958         {
1959             if (known_alignment_for_access_p (dr_info, vectype))
1960             {
1961                 unsigned int npeel_tmp = 0;
1962                 bool negative = tree_int_cst_compare (DR_STEP (dr),
1963                                                                 size_zero_node) < 0;
1964 
1965                 /* If known_alignment_for_access_p then we have set
1966                    DR_MISALIGNMENT which is only done if we know it at compiler
1967                    time, so it is safe to assume target alignment is constant.
1968                  */
1969                 unsigned int target_align =
1970                     DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1971                 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
1972                 poly_int64 off = 0;
1973                 if (negative)
1974                     off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
1975                 unsigned int mis = dr_misalignment (dr_info, vectype, off);
1976                 mis = negative ? mis : -mis;
1977                 if (mis != 0)
1978                     npeel_tmp = (mis & (target_align - 1)) / dr_size;
1979 
1980               /* For multiple types, it is possible that the bigger type access
1981                  will have more than one peeling option.  E.g., a loop with two
1982                  types: one of size (vector size / 4), and the other one of
1983                  size (vector size / 8).  Vectorization factor will 8.  If both
1984                  accesses are misaligned by 3, the first one needs one scalar
1985                  iteration to be aligned, and the second one needs 5.  But the
1986                      first one will be aligned also by peeling 5 scalar
1987                  iterations, and in that case both accesses will be aligned.
1988                  Hence, except for the immediate peeling amount, we also want
1989                  to try to add full vector size, while we don't exceed
1990                  vectorization factor.
1991                  We do this automatically for cost model, since we calculate
1992                      cost for every peeling option.  */
1993                 poly_uint64 nscalars = npeel_tmp;
1994               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1995                     {
1996                       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997                       nscalars = (STMT_SLP_TYPE (stmt_info)
1998                                     ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1999                     }
2000 
2001                 /* Save info about DR in the hash table.  Also include peeling
2002                      amounts according to the explanation above.  Indicate
2003                      the alignment status when the ref is not aligned.
2004                      ???  Rather than using unknown alignment here we should
2005                      prune all entries from the peeling hashtable which cause
2006                      DRs to be not supported.  */
2007                 bool supportable_if_not_aligned
2008                     = vect_supportable_dr_alignment
2009                         (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2010                 while (known_le (npeel_tmp, nscalars))
2011                 {
2012                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2013                                                       dr_info, npeel_tmp,
2014                                                       supportable_if_not_aligned);
2015                       npeel_tmp += MAX (1, target_align / dr_size);
2016                 }
2017 
2018                 one_misalignment_known = true;
2019             }
2020           else
2021             {
2022               /* If we don't know any misalignment values, we prefer
2023                  peeling for data-ref that has the maximum number of data-refs
2024                  with the same alignment, unless the target prefers to align
2025                  stores over load.  */
2026                 unsigned same_align_drs = n_same_align_refs[i];
2027                 if (!dr0_info
2028                       || dr0_same_align_drs < same_align_drs)
2029                     {
2030                       dr0_same_align_drs = same_align_drs;
2031                       dr0_info = dr_info;
2032                     }
2033                 /* For data-refs with the same number of related
2034                      accesses prefer the one where the misalign
2035                      computation will be invariant in the outermost loop.  */
2036                 else if (dr0_same_align_drs == same_align_drs)
2037                     {
2038                       class loop *ivloop0, *ivloop;
2039                       ivloop0 = outermost_invariant_loop_for_expr
2040                         (loop, DR_BASE_ADDRESS (dr0_info->dr));
2041                       ivloop = outermost_invariant_loop_for_expr
2042                         (loop, DR_BASE_ADDRESS (dr));
2043                       if ((ivloop && !ivloop0)
2044                           || (ivloop && ivloop0
2045                                 && flow_loop_nested_p (ivloop, ivloop0)))
2046                         dr0_info = dr_info;
2047                     }
2048 
2049                 one_misalignment_unknown = true;
2050 
2051                 /* Check for data refs with unsupportable alignment that
2052                    can be peeled.  */
2053                 enum dr_alignment_support supportable_dr_alignment
2054                     = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2055                                                              DR_MISALIGNMENT_UNKNOWN);
2056                 if (supportable_dr_alignment == dr_unaligned_unsupported)
2057                     {
2058                       one_dr_unsupportable = true;
2059                       unsupportable_dr_info = dr_info;
2060                     }
2061 
2062                 if (!first_store && DR_IS_WRITE (dr))
2063                     {
2064                       first_store = dr_info;
2065                       first_store_same_align_drs = same_align_drs;
2066                     }
2067             }
2068         }
2069       else
2070         {
2071             if (!aligned_access_p (dr_info, vectype))
2072             {
2073               if (dump_enabled_p ())
2074                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2075                                  "vector alignment may not be reachable\n");
2076               break;
2077             }
2078         }
2079     }
2080 
2081   /* Check if we can possibly peel the loop.  */
2082   if (!vect_can_advance_ivs_p (loop_vinfo)
2083       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
2084       || loop->inner)
2085     do_peeling = false;
2086 
2087   struct _vect_peel_extended_info peel_for_known_alignment;
2088   struct _vect_peel_extended_info peel_for_unknown_alignment;
2089   struct _vect_peel_extended_info best_peel;
2090 
2091   peel_for_unknown_alignment.inside_cost = INT_MAX;
2092   peel_for_unknown_alignment.outside_cost = INT_MAX;
2093   peel_for_unknown_alignment.peel_info.count = 0;
2094 
2095   if (do_peeling
2096       && one_misalignment_unknown)
2097     {
2098       /* Check if the target requires to prefer stores over loads, i.e., if
2099          misaligned stores are more expensive than misaligned loads (taking
2100          drs with same alignment into account).  */
2101       unsigned int load_inside_cost = 0;
2102       unsigned int load_outside_cost = 0;
2103       unsigned int store_inside_cost = 0;
2104       unsigned int store_outside_cost = 0;
2105       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2106 
2107       stmt_vector_for_cost dummy;
2108       dummy.create (2);
2109       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2110                                               &load_inside_cost,
2111                                               &load_outside_cost,
2112                                               &dummy, &dummy, estimated_npeels);
2113       dummy.release ();
2114 
2115       if (first_store)
2116           {
2117             dummy.create (2);
2118             vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2119                                                     &store_inside_cost,
2120                                                     &store_outside_cost,
2121                                                     &dummy, &dummy,
2122                                                     estimated_npeels);
2123             dummy.release ();
2124           }
2125       else
2126           {
2127             store_inside_cost = INT_MAX;
2128             store_outside_cost = INT_MAX;
2129           }
2130 
2131       if (load_inside_cost > store_inside_cost
2132             || (load_inside_cost == store_inside_cost
2133                 && load_outside_cost > store_outside_cost))
2134           {
2135             dr0_info = first_store;
2136             dr0_same_align_drs = first_store_same_align_drs;
2137             peel_for_unknown_alignment.inside_cost = store_inside_cost;
2138             peel_for_unknown_alignment.outside_cost = store_outside_cost;
2139           }
2140       else
2141           {
2142             peel_for_unknown_alignment.inside_cost = load_inside_cost;
2143             peel_for_unknown_alignment.outside_cost = load_outside_cost;
2144           }
2145 
2146       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2147       prologue_cost_vec.create (2);
2148       epilogue_cost_vec.create (2);
2149 
2150       int dummy2;
2151       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2152           (loop_vinfo, estimated_npeels, &dummy2,
2153            &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2154            &prologue_cost_vec, &epilogue_cost_vec);
2155 
2156       prologue_cost_vec.release ();
2157       epilogue_cost_vec.release ();
2158 
2159       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2160     }
2161 
2162   peel_for_unknown_alignment.peel_info.npeel = 0;
2163   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2164 
2165   best_peel = peel_for_unknown_alignment;
2166 
2167   peel_for_known_alignment.inside_cost = INT_MAX;
2168   peel_for_known_alignment.outside_cost = INT_MAX;
2169   peel_for_known_alignment.peel_info.count = 0;
2170   peel_for_known_alignment.peel_info.dr_info = NULL;
2171 
2172   if (do_peeling && one_misalignment_known)
2173     {
2174       /* Peeling is possible, but there is no data access that is not supported
2175          unless aligned.  So we try to choose the best possible peeling from
2176            the hash table.  */
2177       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2178           (&peeling_htab, loop_vinfo);
2179     }
2180 
2181   /* Compare costs of peeling for known and unknown alignment. */
2182   if (peel_for_known_alignment.peel_info.dr_info != NULL
2183       && peel_for_unknown_alignment.inside_cost
2184       >= peel_for_known_alignment.inside_cost)
2185     {
2186       best_peel = peel_for_known_alignment;
2187 
2188       /* If the best peeling for known alignment has NPEEL == 0, perform no
2189          peeling at all except if there is an unsupportable dr that we can
2190          align.  */
2191       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2192           do_peeling = false;
2193     }
2194 
2195   /* If there is an unsupportable data ref, prefer this over all choices so far
2196      since we'd have to discard a chosen peeling except when it accidentally
2197      aligned the unsupportable data ref.  */
2198   if (one_dr_unsupportable)
2199     dr0_info = unsupportable_dr_info;
2200   else if (do_peeling)
2201     {
2202       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2203            TODO: Use nopeel_outside_cost or get rid of it?  */
2204       unsigned nopeel_inside_cost = 0;
2205       unsigned nopeel_outside_cost = 0;
2206 
2207       stmt_vector_for_cost dummy;
2208       dummy.create (2);
2209       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2210                                               &nopeel_outside_cost, &dummy, &dummy, 0);
2211       dummy.release ();
2212 
2213       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2214            costs will be recorded.  */
2215       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2216       prologue_cost_vec.create (2);
2217       epilogue_cost_vec.create (2);
2218 
2219       int dummy2;
2220       nopeel_outside_cost += vect_get_known_peeling_cost
2221           (loop_vinfo, 0, &dummy2,
2222            &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2223            &prologue_cost_vec, &epilogue_cost_vec);
2224 
2225       prologue_cost_vec.release ();
2226       epilogue_cost_vec.release ();
2227 
2228       npeel = best_peel.peel_info.npeel;
2229       dr0_info = best_peel.peel_info.dr_info;
2230 
2231       /* If no peeling is not more expensive than the best peeling we
2232            have so far, don't perform any peeling.  */
2233       if (nopeel_inside_cost <= best_peel.inside_cost)
2234           do_peeling = false;
2235     }
2236 
2237   if (do_peeling)
2238     {
2239       stmt_vec_info stmt_info = dr0_info->stmt;
2240       if (known_alignment_for_access_p (dr0_info,
2241                                                   STMT_VINFO_VECTYPE (stmt_info)))
2242         {
2243             bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2244                                                             size_zero_node) < 0;
2245           if (!npeel)
2246             {
2247               /* Since it's known at compile time, compute the number of
2248                  iterations in the peeled loop (the peeling factor) for use in
2249                  updating DR_MISALIGNMENT values.  The peeling factor is the
2250                  vectorization factor minus the misalignment as an element
2251                  count.  */
2252                 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2253                 poly_int64 off = 0;
2254                 if (negative)
2255                     off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2256                            * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2257                 unsigned int mis
2258                     = dr_misalignment (dr0_info, vectype, off);
2259                 mis = negative ? mis : -mis;
2260                 /* If known_alignment_for_access_p then we have set
2261                    DR_MISALIGNMENT which is only done if we know it at compiler
2262                    time, so it is safe to assume target alignment is constant.
2263                  */
2264                 unsigned int target_align =
2265                     DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2266                 npeel = ((mis & (target_align - 1))
2267                            / vect_get_scalar_dr_size (dr0_info));
2268             }
2269 
2270             /* For interleaved data access every iteration accesses all the
2271                members of the group, therefore we divide the number of iterations
2272                by the group size.  */
2273             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2274               npeel /= DR_GROUP_SIZE (stmt_info);
2275 
2276           if (dump_enabled_p ())
2277             dump_printf_loc (MSG_NOTE, vect_location,
2278                              "Try peeling by %d\n", npeel);
2279         }
2280 
2281       /* Ensure that all datarefs can be vectorized after the peel.  */
2282       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2283           do_peeling = false;
2284 
2285       /* Check if all datarefs are supportable and log.  */
2286       if (do_peeling
2287             && npeel == 0
2288             && known_alignment_for_access_p (dr0_info,
2289                                                      STMT_VINFO_VECTYPE (stmt_info)))
2290           return opt_result::success ();
2291 
2292       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2293       if (do_peeling)
2294         {
2295           unsigned max_allowed_peel
2296               = param_vect_max_peeling_for_alignment;
2297             if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2298               max_allowed_peel = 0;
2299           if (max_allowed_peel != (unsigned)-1)
2300             {
2301               unsigned max_peel = npeel;
2302               if (max_peel == 0)
2303                 {
2304                       poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2305                       unsigned HOST_WIDE_INT target_align_c;
2306                       if (target_align.is_constant (&target_align_c))
2307                         max_peel =
2308                           target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2309                       else
2310                         {
2311                           do_peeling = false;
2312                           if (dump_enabled_p ())
2313                               dump_printf_loc (MSG_NOTE, vect_location,
2314                                 "Disable peeling, max peels set and vector"
2315                                 " alignment unknown\n");
2316                         }
2317                 }
2318               if (max_peel > max_allowed_peel)
2319                 {
2320                   do_peeling = false;
2321                   if (dump_enabled_p ())
2322                     dump_printf_loc (MSG_NOTE, vect_location,
2323                         "Disable peeling, max peels reached: %d\n", max_peel);
2324                 }
2325             }
2326         }
2327 
2328       /* Cost model #2 - if peeling may result in a remaining loop not
2329            iterating enough to be vectorized then do not peel.  Since this
2330            is a cost heuristic rather than a correctness decision, use the
2331            most likely runtime value for variable vectorization factors.  */
2332       if (do_peeling
2333             && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2334           {
2335             unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2336             unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2337             if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2338                 < assumed_vf + max_peel)
2339               do_peeling = false;
2340           }
2341 
2342       if (do_peeling)
2343         {
2344           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2345              If the misalignment of DR_i is identical to that of dr0 then set
2346              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2347              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2348              by the peeling factor times the element size of DR_i (MOD the
2349              vectorization factor times the size).  Otherwise, the
2350              misalignment of DR_i must be set to unknown.  */
2351             FOR_EACH_VEC_ELT (datarefs, i, dr)
2352               if (dr != dr0_info->dr)
2353                 {
2354                     dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2355                     if (!vect_relevant_for_alignment_p (dr_info))
2356                       continue;
2357 
2358                     vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2359                 }
2360 
2361           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2362           if (npeel)
2363             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2364           else
2365               LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2366             SET_DR_MISALIGNMENT (dr0_info,
2367                                      vect_dr_misalign_for_aligned_access (dr0_info));
2368             if (dump_enabled_p ())
2369             {
2370               dump_printf_loc (MSG_NOTE, vect_location,
2371                                "Alignment of access forced using peeling.\n");
2372               dump_printf_loc (MSG_NOTE, vect_location,
2373                                "Peeling for alignment will be applied.\n");
2374             }
2375 
2376             /* The inside-loop cost will be accounted for in vectorizable_load
2377                and vectorizable_store correctly with adjusted alignments.
2378                Drop the body_cst_vec on the floor here.  */
2379             return opt_result::success ();
2380         }
2381     }
2382 
2383   /* (2) Versioning to force alignment.  */
2384 
2385   /* Try versioning if:
2386      1) optimize loop for speed and the cost-model is not cheap
2387      2) there is at least one unsupported misaligned data ref with an unknown
2388         misalignment, and
2389      3) all misaligned data refs with a known misalignment are supported, and
2390      4) the number of runtime alignment checks is within reason.  */
2391 
2392   do_versioning
2393     = (optimize_loop_nest_for_speed_p (loop)
2394        && !loop->inner /* FORNOW */
2395        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2396 
2397   if (do_versioning)
2398     {
2399       FOR_EACH_VEC_ELT (datarefs, i, dr)
2400         {
2401             dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2402             if (!vect_relevant_for_alignment_p (dr_info))
2403               continue;
2404 
2405             stmt_vec_info stmt_info = dr_info->stmt;
2406             if (STMT_VINFO_STRIDED_P (stmt_info))
2407               {
2408                 do_versioning = false;
2409                 break;
2410               }
2411 
2412             tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2413             bool negative = tree_int_cst_compare (DR_STEP (dr),
2414                                                             size_zero_node) < 0;
2415             poly_int64 off = 0;
2416             if (negative)
2417               off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2418                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2419             int misalignment;
2420             if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2421               continue;
2422 
2423             enum dr_alignment_support supportable_dr_alignment
2424               = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2425                                                        misalignment);
2426             if (supportable_dr_alignment == dr_unaligned_unsupported)
2427             {
2428                 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2429                       || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2430                           >= (unsigned) param_vect_max_version_for_alignment_checks))
2431                 {
2432                   do_versioning = false;
2433                   break;
2434                 }
2435 
2436                 /* At present we don't support versioning for alignment
2437                      with variable VF, since there's no guarantee that the
2438                      VF is a power of two.  We could relax this if we added
2439                      a way of enforcing a power-of-two size.  */
2440                 unsigned HOST_WIDE_INT size;
2441                 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2442                     {
2443                       do_versioning = false;
2444                       break;
2445                     }
2446 
2447                 /* Forcing alignment in the first iteration is no good if
2448                      we don't keep it across iterations.  For now, just disable
2449                      versioning in this case.
2450                      ?? We could actually unroll the loop to achieve the required
2451                      overall step alignment, and forcing the alignment could be
2452                      done by doing some iterations of the non-vectorized loop.  */
2453                 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2454                                      * DR_STEP_ALIGNMENT (dr),
2455                                      DR_TARGET_ALIGNMENT (dr_info)))
2456                     {
2457                       do_versioning = false;
2458                       break;
2459                     }
2460 
2461               /* The rightmost bits of an aligned address must be zeros.
2462                  Construct the mask needed for this test.  For example,
2463                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2464                  mask must be 15 = 0xf. */
2465                 int mask = size - 1;
2466 
2467                 /* FORNOW: use the same mask to test all potentially unaligned
2468                      references in the loop.  */
2469                 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2470                       && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2471                     {
2472                       do_versioning = false;
2473                       break;
2474                     }
2475 
2476               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2477                 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2478             }
2479         }
2480 
2481       /* Versioning requires at least one misaligned data reference.  */
2482       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2483         do_versioning = false;
2484       else if (!do_versioning)
2485         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2486     }
2487 
2488   if (do_versioning)
2489     {
2490       const vec<stmt_vec_info> &may_misalign_stmts
2491           = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2492       stmt_vec_info stmt_info;
2493 
2494       /* It can now be assumed that the data references in the statements
2495          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2496          of the loop being vectorized.  */
2497       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2498         {
2499             dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2500             SET_DR_MISALIGNMENT (dr_info,
2501                                      vect_dr_misalign_for_aligned_access (dr_info));
2502             if (dump_enabled_p ())
2503             dump_printf_loc (MSG_NOTE, vect_location,
2504                              "Alignment of access forced using versioning.\n");
2505         }
2506 
2507       if (dump_enabled_p ())
2508         dump_printf_loc (MSG_NOTE, vect_location,
2509                          "Versioning for alignment will be applied.\n");
2510 
2511       /* Peeling and versioning can't be done together at this time.  */
2512       gcc_assert (! (do_peeling && do_versioning));
2513 
2514       return opt_result::success ();
2515     }
2516 
2517   /* This point is reached if neither peeling nor versioning is being done.  */
2518   gcc_assert (! (do_peeling || do_versioning));
2519 
2520   return opt_result::success ();
2521 }
2522 
2523 
2524 /* Function vect_analyze_data_refs_alignment
2525 
2526    Analyze the alignment of the data-references in the loop.
2527    Return FALSE if a data reference is found that cannot be vectorized.  */
2528 
2529 opt_result
vect_analyze_data_refs_alignment(loop_vec_info loop_vinfo)2530 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2531 {
2532   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2533 
2534   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2535   struct data_reference *dr;
2536   unsigned int i;
2537 
2538   vect_record_base_alignments (loop_vinfo);
2539   FOR_EACH_VEC_ELT (datarefs, i, dr)
2540     {
2541       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2542       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2543           {
2544             if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2545                 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2546               continue;
2547             vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2548                                                      STMT_VINFO_VECTYPE (dr_info->stmt));
2549           }
2550     }
2551 
2552   return opt_result::success ();
2553 }
2554 
2555 
2556 /* Analyze alignment of DRs of stmts in NODE.  */
2557 
2558 static bool
vect_slp_analyze_node_alignment(vec_info * vinfo,slp_tree node)2559 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2560 {
2561   /* Alignment is maintained in the first element of the group.  */
2562   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2563   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2564   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2565   tree vectype = SLP_TREE_VECTYPE (node);
2566   poly_uint64 vector_alignment
2567     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2568                      BITS_PER_UNIT);
2569   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2570     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2571   /* Re-analyze alignment when we're facing a vectorization with a bigger
2572      alignment requirement.  */
2573   else if (known_lt (dr_info->target_alignment, vector_alignment))
2574     {
2575       poly_uint64 old_target_alignment = dr_info->target_alignment;
2576       int old_misalignment = dr_info->misalignment;
2577       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2578       /* But keep knowledge about a smaller alignment.  */
2579       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2580             && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2581           {
2582             dr_info->target_alignment = old_target_alignment;
2583             dr_info->misalignment = old_misalignment;
2584           }
2585     }
2586   /* When we ever face unordered target alignments the first one wins in terms
2587      of analyzing and the other will become unknown in dr_misalignment.  */
2588   return true;
2589 }
2590 
2591 /* Function vect_slp_analyze_instance_alignment
2592 
2593    Analyze the alignment of the data-references in the SLP instance.
2594    Return FALSE if a data reference is found that cannot be vectorized.  */
2595 
2596 bool
vect_slp_analyze_instance_alignment(vec_info * vinfo,slp_instance instance)2597 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2598                                                             slp_instance instance)
2599 {
2600   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2601 
2602   slp_tree node;
2603   unsigned i;
2604   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2605     if (! vect_slp_analyze_node_alignment (vinfo, node))
2606       return false;
2607 
2608   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2609       && ! vect_slp_analyze_node_alignment
2610                (vinfo, SLP_INSTANCE_TREE (instance)))
2611     return false;
2612 
2613   return true;
2614 }
2615 
2616 
2617 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2618    accesses of legal size, step, etc.  Detect gaps, single element
2619    interleaving, and other special cases. Set grouped access info.
2620    Collect groups of strided stores for further use in SLP analysis.
2621    Worker for vect_analyze_group_access.  */
2622 
2623 static bool
vect_analyze_group_access_1(vec_info * vinfo,dr_vec_info * dr_info)2624 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2625 {
2626   data_reference *dr = dr_info->dr;
2627   tree step = DR_STEP (dr);
2628   tree scalar_type = TREE_TYPE (DR_REF (dr));
2629   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2630   stmt_vec_info stmt_info = dr_info->stmt;
2631   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2632   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2633   HOST_WIDE_INT dr_step = -1;
2634   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2635   bool slp_impossible = false;
2636 
2637   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2638      size of the interleaving group (including gaps).  */
2639   if (tree_fits_shwi_p (step))
2640     {
2641       dr_step = tree_to_shwi (step);
2642       /* Check that STEP is a multiple of type size.  Otherwise there is
2643          a non-element-sized gap at the end of the group which we
2644            cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2645            ???  As we can handle non-constant step fine here we should
2646            simply remove uses of DR_GROUP_GAP between the last and first
2647            element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2648            simply not include that gap.  */
2649       if ((dr_step % type_size) != 0)
2650           {
2651             if (dump_enabled_p ())
2652               dump_printf_loc (MSG_NOTE, vect_location,
2653                                    "Step %T is not a multiple of the element size"
2654                                    " for %T\n",
2655                                    step, DR_REF (dr));
2656             return false;
2657           }
2658       groupsize = absu_hwi (dr_step) / type_size;
2659     }
2660   else
2661     groupsize = 0;
2662 
2663   /* Not consecutive access is possible only if it is a part of interleaving.  */
2664   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2665     {
2666       /* Check if it this DR is a part of interleaving, and is a single
2667            element of the group that is accessed in the loop.  */
2668 
2669       /* Gaps are supported only for loads. STEP must be a multiple of the type
2670            size.  */
2671       if (DR_IS_READ (dr)
2672             && (dr_step % type_size) == 0
2673             && groupsize > 0
2674             /* This could be UINT_MAX but as we are generating code in a very
2675                inefficient way we have to cap earlier.
2676                See PR91403 for example.  */
2677             && groupsize <= 4096)
2678           {
2679             DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2680             DR_GROUP_SIZE (stmt_info) = groupsize;
2681             DR_GROUP_GAP (stmt_info) = groupsize - 1;
2682             if (dump_enabled_p ())
2683               dump_printf_loc (MSG_NOTE, vect_location,
2684                                    "Detected single element interleaving %T"
2685                                    " step %T\n",
2686                                    DR_REF (dr), step);
2687 
2688             return true;
2689           }
2690 
2691       if (dump_enabled_p ())
2692           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2693                                "not consecutive access %G", stmt_info->stmt);
2694 
2695       if (bb_vinfo)
2696           {
2697             /* Mark the statement as unvectorizable.  */
2698             STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2699             return true;
2700           }
2701 
2702       if (dump_enabled_p ())
2703           dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2704       STMT_VINFO_STRIDED_P (stmt_info) = true;
2705       return true;
2706     }
2707 
2708   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2709     {
2710       /* First stmt in the interleaving chain. Check the chain.  */
2711       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2712       struct data_reference *data_ref = dr;
2713       unsigned int count = 1;
2714       tree prev_init = DR_INIT (data_ref);
2715       HOST_WIDE_INT diff, gaps = 0;
2716 
2717       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2718       while (next)
2719         {
2720           /* We never have the same DR multiple times.  */
2721           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2722                                         DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2723 
2724             data_ref = STMT_VINFO_DATA_REF (next);
2725 
2726             /* All group members have the same STEP by construction.  */
2727             gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2728 
2729           /* Check that the distance between two accesses is equal to the type
2730              size. Otherwise, we have gaps.  */
2731           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2732                       - TREE_INT_CST_LOW (prev_init)) / type_size;
2733             if (diff < 1 || diff > UINT_MAX)
2734               {
2735                 /* For artificial testcases with array accesses with large
2736                      constant indices we can run into overflow issues which
2737                      can end up fooling the groupsize constraint below so
2738                      check the individual gaps (which are represented as
2739                      unsigned int) as well.  */
2740                 if (dump_enabled_p ())
2741                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742                                          "interleaved access with gap larger "
2743                                          "than representable\n");
2744                 return false;
2745               }
2746             if (diff != 1)
2747               {
2748                 /* FORNOW: SLP of accesses with gaps is not supported.  */
2749                 slp_impossible = true;
2750                 if (DR_IS_WRITE (data_ref))
2751                     {
2752                   if (dump_enabled_p ())
2753                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2754                                      "interleaved store with gaps\n");
2755                       return false;
2756                     }
2757 
2758               gaps += diff - 1;
2759               }
2760 
2761             last_accessed_element += diff;
2762 
2763           /* Store the gap from the previous member of the group. If there is no
2764              gap in the access, DR_GROUP_GAP is always 1.  */
2765             DR_GROUP_GAP (next) = diff;
2766 
2767             prev_init = DR_INIT (data_ref);
2768             next = DR_GROUP_NEXT_ELEMENT (next);
2769             /* Count the number of data-refs in the chain.  */
2770             count++;
2771         }
2772 
2773       if (groupsize == 0)
2774         groupsize = count + gaps;
2775 
2776       /* This could be UINT_MAX but as we are generating code in a very
2777          inefficient way we have to cap earlier.  See PR78699 for example.  */
2778       if (groupsize > 4096)
2779           {
2780             if (dump_enabled_p ())
2781               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2782                                    "group is too large\n");
2783             return false;
2784           }
2785 
2786       /* Check that the size of the interleaving is equal to count for stores,
2787          i.e., that there are no gaps.  */
2788       if (groupsize != count
2789             && !DR_IS_READ (dr))
2790         {
2791             groupsize = count;
2792             STMT_VINFO_STRIDED_P (stmt_info) = true;
2793           }
2794 
2795       /* If there is a gap after the last load in the group it is the
2796            difference between the groupsize and the last accessed
2797            element.
2798            When there is no gap, this difference should be 0.  */
2799       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2800 
2801       DR_GROUP_SIZE (stmt_info) = groupsize;
2802       if (dump_enabled_p ())
2803           {
2804             dump_printf_loc (MSG_NOTE, vect_location,
2805                                  "Detected interleaving ");
2806             if (DR_IS_READ (dr))
2807               dump_printf (MSG_NOTE, "load ");
2808             else if (STMT_VINFO_STRIDED_P (stmt_info))
2809               dump_printf (MSG_NOTE, "strided store ");
2810             else
2811               dump_printf (MSG_NOTE, "store ");
2812             dump_printf (MSG_NOTE, "of size %u\n",
2813                            (unsigned)groupsize);
2814             dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2815             next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2816             while (next)
2817               {
2818                 if (DR_GROUP_GAP (next) != 1)
2819                     dump_printf_loc (MSG_NOTE, vect_location,
2820                                          "\t<gap of %d elements>\n",
2821                                          DR_GROUP_GAP (next) - 1);
2822                 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2823                 next = DR_GROUP_NEXT_ELEMENT (next);
2824               }
2825             if (DR_GROUP_GAP (stmt_info) != 0)
2826               dump_printf_loc (MSG_NOTE, vect_location,
2827                                    "\t<gap of %d elements>\n",
2828                                    DR_GROUP_GAP (stmt_info));
2829           }
2830 
2831       /* SLP: create an SLP data structure for every interleaving group of
2832            stores for further analysis in vect_analyse_slp.  */
2833       if (DR_IS_WRITE (dr) && !slp_impossible)
2834           {
2835             if (loop_vinfo)
2836               LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2837             if (bb_vinfo)
2838               BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2839           }
2840     }
2841 
2842   return true;
2843 }
2844 
2845 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2846    accesses of legal size, step, etc.  Detect gaps, single element
2847    interleaving, and other special cases. Set grouped access info.
2848    Collect groups of strided stores for further use in SLP analysis.  */
2849 
2850 static bool
vect_analyze_group_access(vec_info * vinfo,dr_vec_info * dr_info)2851 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2852 {
2853   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2854     {
2855       /* Dissolve the group if present.  */
2856       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2857       while (stmt_info)
2858           {
2859             stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2860             DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2861             DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2862             stmt_info = next;
2863           }
2864       return false;
2865     }
2866   return true;
2867 }
2868 
2869 /* Analyze the access pattern of the data-reference DR_INFO.
2870    In case of non-consecutive accesses call vect_analyze_group_access() to
2871    analyze groups of accesses.  */
2872 
2873 static bool
vect_analyze_data_ref_access(vec_info * vinfo,dr_vec_info * dr_info)2874 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2875 {
2876   data_reference *dr = dr_info->dr;
2877   tree step = DR_STEP (dr);
2878   tree scalar_type = TREE_TYPE (DR_REF (dr));
2879   stmt_vec_info stmt_info = dr_info->stmt;
2880   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2881   class loop *loop = NULL;
2882 
2883   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2884     return true;
2885 
2886   if (loop_vinfo)
2887     loop = LOOP_VINFO_LOOP (loop_vinfo);
2888 
2889   if (loop_vinfo && !step)
2890     {
2891       if (dump_enabled_p ())
2892           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893                            "bad data-ref access in loop\n");
2894       return false;
2895     }
2896 
2897   /* Allow loads with zero step in inner-loop vectorization.  */
2898   if (loop_vinfo && integer_zerop (step))
2899     {
2900       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2901       if (!nested_in_vect_loop_p (loop, stmt_info))
2902           return DR_IS_READ (dr);
2903       /* Allow references with zero step for outer loops marked
2904            with pragma omp simd only - it guarantees absence of
2905            loop-carried dependencies between inner loop iterations.  */
2906       if (loop->safelen < 2)
2907           {
2908             if (dump_enabled_p ())
2909               dump_printf_loc (MSG_NOTE, vect_location,
2910                                    "zero step in inner loop of nest\n");
2911             return false;
2912           }
2913     }
2914 
2915   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2916     {
2917       /* Interleaved accesses are not yet supported within outer-loop
2918         vectorization for references in the inner-loop.  */
2919       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2920 
2921       /* For the rest of the analysis we use the outer-loop step.  */
2922       step = STMT_VINFO_DR_STEP (stmt_info);
2923       if (integer_zerop (step))
2924           {
2925             if (dump_enabled_p ())
2926               dump_printf_loc (MSG_NOTE, vect_location,
2927                                "zero step in outer loop.\n");
2928             return DR_IS_READ (dr);
2929           }
2930     }
2931 
2932   /* Consecutive?  */
2933   if (TREE_CODE (step) == INTEGER_CST)
2934     {
2935       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2936       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2937             || (dr_step < 0
2938                 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2939           {
2940             /* Mark that it is not interleaving.  */
2941             DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2942             return true;
2943           }
2944     }
2945 
2946   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2947     {
2948       if (dump_enabled_p ())
2949           dump_printf_loc (MSG_NOTE, vect_location,
2950                            "grouped access in outer loop.\n");
2951       return false;
2952     }
2953 
2954 
2955   /* Assume this is a DR handled by non-constant strided load case.  */
2956   if (TREE_CODE (step) != INTEGER_CST)
2957     return (STMT_VINFO_STRIDED_P (stmt_info)
2958               && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2959                     || vect_analyze_group_access (vinfo, dr_info)));
2960 
2961   /* Not consecutive access - check if it's a part of interleaving group.  */
2962   return vect_analyze_group_access (vinfo, dr_info);
2963 }
2964 
2965 /* Compare two data-references DRA and DRB to group them into chunks
2966    suitable for grouping.  */
2967 
2968 static int
dr_group_sort_cmp(const void * dra_,const void * drb_)2969 dr_group_sort_cmp (const void *dra_, const void *drb_)
2970 {
2971   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
2972   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
2973   data_reference_p dra = dra_info->dr;
2974   data_reference_p drb = drb_info->dr;
2975   int cmp;
2976 
2977   /* Stabilize sort.  */
2978   if (dra == drb)
2979     return 0;
2980 
2981   /* Different group IDs lead never belong to the same group.  */
2982   if (dra_info->group != drb_info->group)
2983     return dra_info->group < drb_info->group ? -1 : 1;
2984 
2985   /* Ordering of DRs according to base.  */
2986   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2987                                      DR_BASE_ADDRESS (drb));
2988   if (cmp != 0)
2989     return cmp;
2990 
2991   /* And according to DR_OFFSET.  */
2992   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2993   if (cmp != 0)
2994     return cmp;
2995 
2996   /* Put reads before writes.  */
2997   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2998     return DR_IS_READ (dra) ? -1 : 1;
2999 
3000   /* Then sort after access size.  */
3001   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3002                                      TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3003   if (cmp != 0)
3004     return cmp;
3005 
3006   /* And after step.  */
3007   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3008   if (cmp != 0)
3009     return cmp;
3010 
3011   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3012   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3013   if (cmp == 0)
3014     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3015   return cmp;
3016 }
3017 
3018 /* If OP is the result of a conversion, return the unconverted value,
3019    otherwise return null.  */
3020 
3021 static tree
strip_conversion(tree op)3022 strip_conversion (tree op)
3023 {
3024   if (TREE_CODE (op) != SSA_NAME)
3025     return NULL_TREE;
3026   gimple *stmt = SSA_NAME_DEF_STMT (op);
3027   if (!is_gimple_assign (stmt)
3028       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3029     return NULL_TREE;
3030   return gimple_assign_rhs1 (stmt);
3031 }
3032 
3033 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3034    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3035    be grouped in SLP mode.  */
3036 
3037 static bool
can_group_stmts_p(stmt_vec_info stmt1_info,stmt_vec_info stmt2_info,bool allow_slp_p)3038 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3039                        bool allow_slp_p)
3040 {
3041   if (gimple_assign_single_p (stmt1_info->stmt))
3042     return gimple_assign_single_p (stmt2_info->stmt);
3043 
3044   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3045   if (call1 && gimple_call_internal_p (call1))
3046     {
3047       /* Check for two masked loads or two masked stores.  */
3048       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3049       if (!call2 || !gimple_call_internal_p (call2))
3050           return false;
3051       internal_fn ifn = gimple_call_internal_fn (call1);
3052       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3053           return false;
3054       if (ifn != gimple_call_internal_fn (call2))
3055           return false;
3056 
3057       /* Check that the masks are the same.  Cope with casts of masks,
3058            like those created by build_mask_conversion.  */
3059       tree mask1 = gimple_call_arg (call1, 2);
3060       tree mask2 = gimple_call_arg (call2, 2);
3061       if (!operand_equal_p (mask1, mask2, 0)
3062           && (ifn == IFN_MASK_STORE || !allow_slp_p))
3063           {
3064             mask1 = strip_conversion (mask1);
3065             if (!mask1)
3066               return false;
3067             mask2 = strip_conversion (mask2);
3068             if (!mask2)
3069               return false;
3070             if (!operand_equal_p (mask1, mask2, 0))
3071               return false;
3072           }
3073       return true;
3074     }
3075 
3076   return false;
3077 }
3078 
3079 /* Function vect_analyze_data_ref_accesses.
3080 
3081    Analyze the access pattern of all the data references in the loop.
3082 
3083    FORNOW: the only access pattern that is considered vectorizable is a
3084              simple step 1 (consecutive) access.
3085 
3086    FORNOW: handle only arrays and pointer accesses.  */
3087 
3088 opt_result
vect_analyze_data_ref_accesses(vec_info * vinfo,vec<int> * dataref_groups)3089 vect_analyze_data_ref_accesses (vec_info *vinfo,
3090                                         vec<int> *dataref_groups)
3091 {
3092   unsigned int i;
3093   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3094 
3095   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3096 
3097   if (datarefs.is_empty ())
3098     return opt_result::success ();
3099 
3100   /* Sort the array of datarefs to make building the interleaving chains
3101      linear.  Don't modify the original vector's order, it is needed for
3102      determining what dependencies are reversed.  */
3103   vec<dr_vec_info *> datarefs_copy;
3104   datarefs_copy.create (datarefs.length ());
3105   for (unsigned i = 0; i < datarefs.length (); i++)
3106     {
3107       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3108       /* If the caller computed DR grouping use that, otherwise group by
3109            basic blocks.  */
3110       if (dataref_groups)
3111           dr_info->group = (*dataref_groups)[i];
3112       else
3113           dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3114       datarefs_copy.quick_push (dr_info);
3115     }
3116   datarefs_copy.qsort (dr_group_sort_cmp);
3117   hash_set<stmt_vec_info> to_fixup;
3118 
3119   /* Build the interleaving chains.  */
3120   for (i = 0; i < datarefs_copy.length () - 1;)
3121     {
3122       dr_vec_info *dr_info_a = datarefs_copy[i];
3123       data_reference_p dra = dr_info_a->dr;
3124       int dra_group_id = dr_info_a->group;
3125       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3126       stmt_vec_info lastinfo = NULL;
3127       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3128             || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3129           {
3130             ++i;
3131             continue;
3132           }
3133       for (i = i + 1; i < datarefs_copy.length (); ++i)
3134           {
3135             dr_vec_info *dr_info_b = datarefs_copy[i];
3136             data_reference_p drb = dr_info_b->dr;
3137             int drb_group_id = dr_info_b->group;
3138             stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3139             if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3140                 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3141               break;
3142 
3143             /* ???  Imperfect sorting (non-compatible types, non-modulo
3144                accesses, same accesses) can lead to a group to be artificially
3145                split here as we don't just skip over those.  If it really
3146                matters we can push those to a worklist and re-iterate
3147                over them.  The we can just skip ahead to the next DR here.  */
3148 
3149             /* DRs in a different DR group should not be put into the same
3150                interleaving group.  */
3151             if (dra_group_id != drb_group_id)
3152               break;
3153 
3154             /* Check that the data-refs have same first location (except init)
3155                and they are both either store or load (not load and store,
3156                not masked loads or stores).  */
3157             if (DR_IS_READ (dra) != DR_IS_READ (drb)
3158                 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3159                                                   DR_BASE_ADDRESS (drb)) != 0
3160                 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3161                 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3162               break;
3163 
3164             /* Check that the data-refs have the same constant size.  */
3165             tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3166             tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3167             if (!tree_fits_uhwi_p (sza)
3168                 || !tree_fits_uhwi_p (szb)
3169                 || !tree_int_cst_equal (sza, szb))
3170               break;
3171 
3172             /* Check that the data-refs have the same step.  */
3173             if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3174               break;
3175 
3176             /* Check the types are compatible.
3177                ???  We don't distinguish this during sorting.  */
3178             if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3179                                            TREE_TYPE (DR_REF (drb))))
3180               break;
3181 
3182             /* Check that the DR_INITs are compile-time constants.  */
3183             if (!tree_fits_shwi_p (DR_INIT (dra))
3184                 || !tree_fits_shwi_p (DR_INIT (drb)))
3185               break;
3186 
3187             /* Different .GOMP_SIMD_LANE calls still give the same lane,
3188                just hold extra information.  */
3189             if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3190                 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3191                 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3192               break;
3193 
3194             /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3195             HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3196             HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3197             HOST_WIDE_INT init_prev
3198               = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3199             gcc_assert (init_a <= init_b
3200                           && init_a <= init_prev
3201                           && init_prev <= init_b);
3202 
3203             /* Do not place the same access in the interleaving chain twice.  */
3204             if (init_b == init_prev)
3205               {
3206                 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3207                                 < gimple_uid (DR_STMT (drb)));
3208                 /* Simply link in duplicates and fix up the chain below.  */
3209               }
3210             else
3211               {
3212                 /* If init_b == init_a + the size of the type * k, we have an
3213                      interleaving, and DRA is accessed before DRB.  */
3214                 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3215                 if (type_size_a == 0
3216                       || (((unsigned HOST_WIDE_INT)init_b - init_a)
3217                           % type_size_a != 0))
3218                     break;
3219 
3220                 /* If we have a store, the accesses are adjacent.  This splits
3221                      groups into chunks we support (we don't support vectorization
3222                      of stores with gaps).  */
3223                 if (!DR_IS_READ (dra)
3224                       && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3225                           != type_size_a))
3226                     break;
3227 
3228                 /* If the step (if not zero or non-constant) is smaller than the
3229                      difference between data-refs' inits this splits groups into
3230                      suitable sizes.  */
3231                 if (tree_fits_shwi_p (DR_STEP (dra)))
3232                     {
3233                       unsigned HOST_WIDE_INT step
3234                         = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3235                       if (step != 0
3236                           && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3237                         break;
3238                     }
3239               }
3240 
3241             if (dump_enabled_p ())
3242               dump_printf_loc (MSG_NOTE, vect_location,
3243                                    DR_IS_READ (dra)
3244                                    ? "Detected interleaving load %T and %T\n"
3245                                    : "Detected interleaving store %T and %T\n",
3246                                    DR_REF (dra), DR_REF (drb));
3247 
3248             /* Link the found element into the group list.  */
3249             if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3250               {
3251                 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3252                 lastinfo = stmtinfo_a;
3253               }
3254             DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3255             DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3256             lastinfo = stmtinfo_b;
3257 
3258             STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3259               = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3260 
3261             if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3262               dump_printf_loc (MSG_NOTE, vect_location,
3263                                    "Load suitable for SLP vectorization only.\n");
3264 
3265             if (init_b == init_prev
3266                 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3267                 && dump_enabled_p ())
3268               dump_printf_loc (MSG_NOTE, vect_location,
3269                                    "Queuing group with duplicate access for fixup\n");
3270           }
3271     }
3272 
3273   /* Fixup groups with duplicate entries by splitting it.  */
3274   while (1)
3275     {
3276       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3277       if (!(it != to_fixup.end ()))
3278           break;
3279       stmt_vec_info grp = *it;
3280       to_fixup.remove (grp);
3281 
3282       /* Find the earliest duplicate group member.  */
3283       unsigned first_duplicate = -1u;
3284       stmt_vec_info next, g = grp;
3285       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3286           {
3287             if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3288                                           DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3289                 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3290               first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3291             g = next;
3292           }
3293       if (first_duplicate == -1U)
3294           continue;
3295 
3296       /* Then move all stmts after the first duplicate to a new group.
3297          Note this is a heuristic but one with the property that *it
3298            is fixed up completely.  */
3299       g = grp;
3300       stmt_vec_info newgroup = NULL, ng = grp;
3301       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3302           {
3303             if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3304               {
3305                 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3306                 if (!newgroup)
3307                     newgroup = next;
3308                 else
3309                     DR_GROUP_NEXT_ELEMENT (ng) = next;
3310                 ng = next;
3311                 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3312               }
3313             else
3314               g = DR_GROUP_NEXT_ELEMENT (g);
3315           }
3316       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3317 
3318       /* Fixup the new group which still may contain duplicates.  */
3319       to_fixup.add (newgroup);
3320     }
3321 
3322   dr_vec_info *dr_info;
3323   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3324     {
3325       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3326             && !vect_analyze_data_ref_access (vinfo, dr_info))
3327           {
3328             if (dump_enabled_p ())
3329               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3330                                    "not vectorized: complicated access pattern.\n");
3331 
3332             if (is_a <bb_vec_info> (vinfo))
3333               {
3334                 /* Mark the statement as not vectorizable.  */
3335                 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3336                 continue;
3337               }
3338             else
3339               {
3340                 datarefs_copy.release ();
3341                 return opt_result::failure_at (dr_info->stmt->stmt,
3342                                                        "not vectorized:"
3343                                                        " complicated access pattern.\n");
3344               }
3345           }
3346     }
3347 
3348   datarefs_copy.release ();
3349   return opt_result::success ();
3350 }
3351 
3352 /* Function vect_vfa_segment_size.
3353 
3354    Input:
3355      DR_INFO: The data reference.
3356      LENGTH_FACTOR: segment length to consider.
3357 
3358    Return a value suitable for the dr_with_seg_len::seg_len field.
3359    This is the "distance travelled" by the pointer from the first
3360    iteration in the segment to the last.  Note that it does not include
3361    the size of the access; in effect it only describes the first byte.  */
3362 
3363 static tree
vect_vfa_segment_size(dr_vec_info * dr_info,tree length_factor)3364 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3365 {
3366   length_factor = size_binop (MINUS_EXPR,
3367                                     fold_convert (sizetype, length_factor),
3368                                     size_one_node);
3369   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3370                          length_factor);
3371 }
3372 
3373 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3374    gives the worst-case number of bytes covered by the segment.  */
3375 
3376 static unsigned HOST_WIDE_INT
vect_vfa_access_size(vec_info * vinfo,dr_vec_info * dr_info)3377 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3378 {
3379   stmt_vec_info stmt_vinfo = dr_info->stmt;
3380   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3381   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3382   unsigned HOST_WIDE_INT access_size = ref_size;
3383   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3384     {
3385       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3386       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3387     }
3388   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3389   int misalignment;
3390   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3391       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3392       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3393             == dr_explicit_realign_optimized))
3394     {
3395       /* We might access a full vector's worth.  */
3396       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3397     }
3398   return access_size;
3399 }
3400 
3401 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3402    describes.  */
3403 
3404 static unsigned int
vect_vfa_align(dr_vec_info * dr_info)3405 vect_vfa_align (dr_vec_info *dr_info)
3406 {
3407   return dr_alignment (dr_info->dr);
3408 }
3409 
3410 /* Function vect_no_alias_p.
3411 
3412    Given data references A and B with equal base and offset, see whether
3413    the alias relation can be decided at compilation time.  Return 1 if
3414    it can and the references alias, 0 if it can and the references do
3415    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3416    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3417    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3418 
3419 static int
vect_compile_time_alias(dr_vec_info * a,dr_vec_info * b,tree segment_length_a,tree segment_length_b,unsigned HOST_WIDE_INT access_size_a,unsigned HOST_WIDE_INT access_size_b)3420 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3421                                tree segment_length_a, tree segment_length_b,
3422                                unsigned HOST_WIDE_INT access_size_a,
3423                                unsigned HOST_WIDE_INT access_size_b)
3424 {
3425   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3426   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3427   poly_uint64 const_length_a;
3428   poly_uint64 const_length_b;
3429 
3430   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3431      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3432      [a, a+12) */
3433   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3434     {
3435       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3436       offset_a -= const_length_a;
3437     }
3438   else
3439     const_length_a = tree_to_poly_uint64 (segment_length_a);
3440   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3441     {
3442       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3443       offset_b -= const_length_b;
3444     }
3445   else
3446     const_length_b = tree_to_poly_uint64 (segment_length_b);
3447 
3448   const_length_a += access_size_a;
3449   const_length_b += access_size_b;
3450 
3451   if (ranges_known_overlap_p (offset_a, const_length_a,
3452                                     offset_b, const_length_b))
3453     return 1;
3454 
3455   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3456                                      offset_b, const_length_b))
3457     return 0;
3458 
3459   return -1;
3460 }
3461 
3462 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3463    in DDR is >= VF.  */
3464 
3465 static bool
dependence_distance_ge_vf(data_dependence_relation * ddr,unsigned int loop_depth,poly_uint64 vf)3466 dependence_distance_ge_vf (data_dependence_relation *ddr,
3467                                  unsigned int loop_depth, poly_uint64 vf)
3468 {
3469   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3470       || DDR_NUM_DIST_VECTS (ddr) == 0)
3471     return false;
3472 
3473   /* If the dependence is exact, we should have limited the VF instead.  */
3474   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3475 
3476   unsigned int i;
3477   lambda_vector dist_v;
3478   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3479     {
3480       HOST_WIDE_INT dist = dist_v[loop_depth];
3481       if (dist != 0
3482             && !(dist > 0 && DDR_REVERSED_P (ddr))
3483             && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3484           return false;
3485     }
3486 
3487   if (dump_enabled_p ())
3488     dump_printf_loc (MSG_NOTE, vect_location,
3489                          "dependence distance between %T and %T is >= VF\n",
3490                          DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3491 
3492   return true;
3493 }
3494 
3495 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3496 
3497 static void
dump_lower_bound(dump_flags_t dump_kind,const vec_lower_bound & lower_bound)3498 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3499 {
3500   dump_printf (dump_kind, "%s (%T) >= ",
3501                  lower_bound.unsigned_p ? "unsigned" : "abs",
3502                  lower_bound.expr);
3503   dump_dec (dump_kind, lower_bound.min_value);
3504 }
3505 
3506 /* Record that the vectorized loop requires the vec_lower_bound described
3507    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3508 
3509 static void
vect_check_lower_bound(loop_vec_info loop_vinfo,tree expr,bool unsigned_p,poly_uint64 min_value)3510 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3511                               poly_uint64 min_value)
3512 {
3513   vec<vec_lower_bound> &lower_bounds
3514     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3515   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3516     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3517       {
3518           unsigned_p &= lower_bounds[i].unsigned_p;
3519           min_value = upper_bound (lower_bounds[i].min_value, min_value);
3520           if (lower_bounds[i].unsigned_p != unsigned_p
3521               || maybe_lt (lower_bounds[i].min_value, min_value))
3522             {
3523               lower_bounds[i].unsigned_p = unsigned_p;
3524               lower_bounds[i].min_value = min_value;
3525               if (dump_enabled_p ())
3526                 {
3527                     dump_printf_loc (MSG_NOTE, vect_location,
3528                                          "updating run-time check to ");
3529                     dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3530                     dump_printf (MSG_NOTE, "\n");
3531                 }
3532             }
3533           return;
3534       }
3535 
3536   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3537   if (dump_enabled_p ())
3538     {
3539       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3540       dump_lower_bound (MSG_NOTE, lower_bound);
3541       dump_printf (MSG_NOTE, "\n");
3542     }
3543   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3544 }
3545 
3546 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3547    will span fewer than GAP bytes.  */
3548 
3549 static bool
vect_small_gap_p(loop_vec_info loop_vinfo,dr_vec_info * dr_info,poly_int64 gap)3550 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3551                       poly_int64 gap)
3552 {
3553   stmt_vec_info stmt_info = dr_info->stmt;
3554   HOST_WIDE_INT count
3555     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3556   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3557     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3558   return (estimated_poly_value (gap)
3559             <= count * vect_get_scalar_dr_size (dr_info));
3560 }
3561 
3562 /* Return true if we know that there is no alias between DR_INFO_A and
3563    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3564    When returning true, set *LOWER_BOUND_OUT to this N.  */
3565 
3566 static bool
vectorizable_with_step_bound_p(dr_vec_info * dr_info_a,dr_vec_info * dr_info_b,poly_uint64 * lower_bound_out)3567 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3568                                         poly_uint64 *lower_bound_out)
3569 {
3570   /* Check that there is a constant gap of known sign between DR_A
3571      and DR_B.  */
3572   data_reference *dr_a = dr_info_a->dr;
3573   data_reference *dr_b = dr_info_b->dr;
3574   poly_int64 init_a, init_b;
3575   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3576       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3577       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3578       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3579       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3580       || !ordered_p (init_a, init_b))
3581     return false;
3582 
3583   /* Sort DR_A and DR_B by the address they access.  */
3584   if (maybe_lt (init_b, init_a))
3585     {
3586       std::swap (init_a, init_b);
3587       std::swap (dr_info_a, dr_info_b);
3588       std::swap (dr_a, dr_b);
3589     }
3590 
3591   /* If the two accesses could be dependent within a scalar iteration,
3592      make sure that we'd retain their order.  */
3593   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3594       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3595     return false;
3596 
3597   /* There is no alias if abs (DR_STEP) is greater than or equal to
3598      the bytes spanned by the combination of the two accesses.  */
3599   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3600   return true;
3601 }
3602 
3603 /* Function vect_prune_runtime_alias_test_list.
3604 
3605    Prune a list of ddrs to be tested at run-time by versioning for alias.
3606    Merge several alias checks into one if possible.
3607    Return FALSE if resulting list of ddrs is longer then allowed by
3608    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3609 
3610 opt_result
vect_prune_runtime_alias_test_list(loop_vec_info loop_vinfo)3611 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3612 {
3613   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3614   hash_set <tree_pair_hash> compared_objects;
3615 
3616   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3617   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3618     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3619   const vec<vec_object_pair> &check_unequal_addrs
3620     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3621   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3622   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3623 
3624   ddr_p ddr;
3625   unsigned int i;
3626   tree length_factor;
3627 
3628   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3629 
3630   /* Step values are irrelevant for aliasing if the number of vector
3631      iterations is equal to the number of scalar iterations (which can
3632      happen for fully-SLP loops).  */
3633   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3634 
3635   if (!vf_one_p)
3636     {
3637       /* Convert the checks for nonzero steps into bound tests.  */
3638       tree value;
3639       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3640           vect_check_lower_bound (loop_vinfo, value, true, 1);
3641     }
3642 
3643   if (may_alias_ddrs.is_empty ())
3644     return opt_result::success ();
3645 
3646   comp_alias_ddrs.create (may_alias_ddrs.length ());
3647 
3648   unsigned int loop_depth
3649     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3650                                 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3651 
3652   /* First, we collect all data ref pairs for aliasing checks.  */
3653   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3654     {
3655       poly_uint64 lower_bound;
3656       tree segment_length_a, segment_length_b;
3657       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3658       unsigned int align_a, align_b;
3659 
3660       /* Ignore the alias if the VF we chose ended up being no greater
3661            than the dependence distance.  */
3662       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3663           continue;
3664 
3665       if (DDR_OBJECT_A (ddr))
3666           {
3667             vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3668             if (!compared_objects.add (new_pair))
3669               {
3670                 if (dump_enabled_p ())
3671                     dump_printf_loc (MSG_NOTE, vect_location,
3672                                          "checking that %T and %T"
3673                                          " have different addresses\n",
3674                                          new_pair.first, new_pair.second);
3675                 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3676               }
3677             continue;
3678           }
3679 
3680       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3681       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3682 
3683       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3684       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3685 
3686       bool preserves_scalar_order_p
3687           = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3688       bool ignore_step_p
3689             = (vf_one_p
3690                && (preserves_scalar_order_p
3691                      || operand_equal_p (DR_STEP (dr_info_a->dr),
3692                                              DR_STEP (dr_info_b->dr))));
3693 
3694       /* Skip the pair if inter-iteration dependencies are irrelevant
3695            and intra-iteration dependencies are guaranteed to be honored.  */
3696       if (ignore_step_p
3697             && (preserves_scalar_order_p
3698                 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3699                                                              &lower_bound)))
3700           {
3701             if (dump_enabled_p ())
3702               dump_printf_loc (MSG_NOTE, vect_location,
3703                                    "no need for alias check between "
3704                                    "%T and %T when VF is 1\n",
3705                                    DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3706             continue;
3707           }
3708 
3709       /* See whether we can handle the alias using a bounds check on
3710            the step, and whether that's likely to be the best approach.
3711            (It might not be, for example, if the minimum step is much larger
3712            than the number of bytes handled by one vector iteration.)  */
3713       if (!ignore_step_p
3714             && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3715             && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3716                                                        &lower_bound)
3717             && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3718                 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3719           {
3720             bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3721             if (dump_enabled_p ())
3722               {
3723                 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3724                                      "%T and %T when the step %T is outside ",
3725                                      DR_REF (dr_info_a->dr),
3726                                      DR_REF (dr_info_b->dr),
3727                                      DR_STEP (dr_info_a->dr));
3728                 if (unsigned_p)
3729                     dump_printf (MSG_NOTE, "[0");
3730                 else
3731                     {
3732                       dump_printf (MSG_NOTE, "(");
3733                       dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3734                     }
3735                 dump_printf (MSG_NOTE, ", ");
3736                 dump_dec (MSG_NOTE, lower_bound);
3737                 dump_printf (MSG_NOTE, ")\n");
3738               }
3739             vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3740                                           unsigned_p, lower_bound);
3741             continue;
3742           }
3743 
3744       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3745       if (dr_group_first_a)
3746           {
3747             stmt_info_a = dr_group_first_a;
3748             dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3749           }
3750 
3751       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3752       if (dr_group_first_b)
3753           {
3754             stmt_info_b = dr_group_first_b;
3755             dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3756           }
3757 
3758       if (ignore_step_p)
3759           {
3760             segment_length_a = size_zero_node;
3761             segment_length_b = size_zero_node;
3762           }
3763       else
3764           {
3765             if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3766                                         DR_STEP (dr_info_b->dr), 0))
3767               length_factor = scalar_loop_iters;
3768             else
3769               length_factor = size_int (vect_factor);
3770             segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3771             segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3772           }
3773       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3774       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3775       align_a = vect_vfa_align (dr_info_a);
3776       align_b = vect_vfa_align (dr_info_b);
3777 
3778       /* See whether the alias is known at compilation time.  */
3779       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3780                                  DR_BASE_ADDRESS (dr_info_b->dr), 0)
3781             && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3782                                     DR_OFFSET (dr_info_b->dr), 0)
3783             && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3784             && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3785             && poly_int_tree_p (segment_length_a)
3786             && poly_int_tree_p (segment_length_b))
3787           {
3788             int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3789                                                        segment_length_a,
3790                                                        segment_length_b,
3791                                                        access_size_a,
3792                                                        access_size_b);
3793             if (res >= 0 && dump_enabled_p ())
3794               {
3795                 dump_printf_loc (MSG_NOTE, vect_location,
3796                                      "can tell at compile time that %T and %T",
3797                                      DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3798                 if (res == 0)
3799                     dump_printf (MSG_NOTE, " do not alias\n");
3800                 else
3801                     dump_printf (MSG_NOTE, " alias\n");
3802               }
3803 
3804             if (res == 0)
3805               continue;
3806 
3807             if (res == 1)
3808               return opt_result::failure_at (stmt_info_b->stmt,
3809                                                      "not vectorized:"
3810                                                      " compilation time alias: %G%G",
3811                                                      stmt_info_a->stmt,
3812                                                      stmt_info_b->stmt);
3813           }
3814 
3815       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3816                                   access_size_a, align_a);
3817       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3818                                   access_size_b, align_b);
3819       /* Canonicalize the order to be the one that's needed for accurate
3820            RAW, WAR and WAW flags, in cases where the data references are
3821            well-ordered.  The order doesn't really matter otherwise,
3822            but we might as well be consistent.  */
3823       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3824           std::swap (dr_a, dr_b);
3825 
3826       dr_with_seg_len_pair_t dr_with_seg_len_pair
3827           (dr_a, dr_b, (preserves_scalar_order_p
3828                           ? dr_with_seg_len_pair_t::WELL_ORDERED
3829                           : dr_with_seg_len_pair_t::REORDERED));
3830 
3831       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3832     }
3833 
3834   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3835 
3836   unsigned int count = (comp_alias_ddrs.length ()
3837                               + check_unequal_addrs.length ());
3838 
3839   if (count
3840       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3841             == VECT_COST_MODEL_VERY_CHEAP))
3842     return opt_result::failure_at
3843       (vect_location, "would need a runtime alias check\n");
3844 
3845   if (dump_enabled_p ())
3846     dump_printf_loc (MSG_NOTE, vect_location,
3847                          "improved number of alias checks from %d to %d\n",
3848                          may_alias_ddrs.length (), count);
3849   unsigned limit = param_vect_max_version_for_alias_checks;
3850   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3851     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3852   if (count > limit)
3853     return opt_result::failure_at
3854       (vect_location,
3855        "number of versioning for alias run-time tests exceeds %d "
3856        "(--param vect-max-version-for-alias-checks)\n", limit);
3857 
3858   return opt_result::success ();
3859 }
3860 
3861 /* Check whether we can use an internal function for a gather load
3862    or scatter store.  READ_P is true for loads and false for stores.
3863    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3864    the type of the memory elements being loaded or stored.  OFFSET_TYPE
3865    is the type of the offset that is being applied to the invariant
3866    base address.  SCALE is the amount by which the offset should
3867    be multiplied *after* it has been converted to address width.
3868 
3869    Return true if the function is supported, storing the function id in
3870    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
3871 
3872 bool
vect_gather_scatter_fn_p(vec_info * vinfo,bool read_p,bool masked_p,tree vectype,tree memory_type,tree offset_type,int scale,internal_fn * ifn_out,tree * offset_vectype_out)3873 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3874                                 tree vectype, tree memory_type, tree offset_type,
3875                                 int scale, internal_fn *ifn_out,
3876                                 tree *offset_vectype_out)
3877 {
3878   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3879   unsigned int element_bits = vector_element_bits (vectype);
3880   if (element_bits != memory_bits)
3881     /* For now the vector elements must be the same width as the
3882        memory elements.  */
3883     return false;
3884 
3885   /* Work out which function we need.  */
3886   internal_fn ifn, alt_ifn;
3887   if (read_p)
3888     {
3889       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3890       alt_ifn = IFN_MASK_GATHER_LOAD;
3891     }
3892   else
3893     {
3894       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3895       alt_ifn = IFN_MASK_SCATTER_STORE;
3896     }
3897 
3898   for (;;)
3899     {
3900       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3901       if (!offset_vectype)
3902           return false;
3903 
3904       /* Test whether the target supports this combination.  */
3905       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3906                                                               offset_vectype, scale))
3907           {
3908             *ifn_out = ifn;
3909             *offset_vectype_out = offset_vectype;
3910             return true;
3911           }
3912       else if (!masked_p
3913                  && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
3914                                                                         memory_type,
3915                                                                         offset_vectype,
3916                                                                         scale))
3917           {
3918             *ifn_out = alt_ifn;
3919             *offset_vectype_out = offset_vectype;
3920             return true;
3921           }
3922 
3923       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3924             && TYPE_PRECISION (offset_type) >= element_bits)
3925           return false;
3926 
3927       offset_type = build_nonstandard_integer_type
3928           (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3929     }
3930 }
3931 
3932 /* STMT_INFO is a call to an internal gather load or scatter store function.
3933    Describe the operation in INFO.  */
3934 
3935 static void
vect_describe_gather_scatter_call(stmt_vec_info stmt_info,gather_scatter_info * info)3936 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3937                                            gather_scatter_info *info)
3938 {
3939   gcall *call = as_a <gcall *> (stmt_info->stmt);
3940   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3941   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3942 
3943   info->ifn = gimple_call_internal_fn (call);
3944   info->decl = NULL_TREE;
3945   info->base = gimple_call_arg (call, 0);
3946   info->offset = gimple_call_arg (call, 1);
3947   info->offset_dt = vect_unknown_def_type;
3948   info->offset_vectype = NULL_TREE;
3949   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3950   info->element_type = TREE_TYPE (vectype);
3951   info->memory_type = TREE_TYPE (DR_REF (dr));
3952 }
3953 
3954 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3955    gather load or scatter store.  Describe the operation in *INFO if so.  */
3956 
3957 bool
vect_check_gather_scatter(stmt_vec_info stmt_info,loop_vec_info loop_vinfo,gather_scatter_info * info)3958 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3959                                  gather_scatter_info *info)
3960 {
3961   HOST_WIDE_INT scale = 1;
3962   poly_int64 pbitpos, pbitsize;
3963   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3964   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3965   tree offtype = NULL_TREE;
3966   tree decl = NULL_TREE, base, off;
3967   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3968   tree memory_type = TREE_TYPE (DR_REF (dr));
3969   machine_mode pmode;
3970   int punsignedp, reversep, pvolatilep = 0;
3971   internal_fn ifn;
3972   tree offset_vectype;
3973   bool masked_p = false;
3974 
3975   /* See whether this is already a call to a gather/scatter internal function.
3976      If not, see whether it's a masked load or store.  */
3977   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3978   if (call && gimple_call_internal_p (call))
3979     {
3980       ifn = gimple_call_internal_fn (call);
3981       if (internal_gather_scatter_fn_p (ifn))
3982           {
3983             vect_describe_gather_scatter_call (stmt_info, info);
3984             return true;
3985           }
3986       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3987     }
3988 
3989   /* True if we should aim to use internal functions rather than
3990      built-in functions.  */
3991   bool use_ifn_p = (DR_IS_READ (dr)
3992                         ? supports_vec_gather_load_p (TYPE_MODE (vectype))
3993                         : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
3994 
3995   base = DR_REF (dr);
3996   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3997      see if we can use the def stmt of the address.  */
3998   if (masked_p
3999       && TREE_CODE (base) == MEM_REF
4000       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4001       && integer_zerop (TREE_OPERAND (base, 1))
4002       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4003     {
4004       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4005       if (is_gimple_assign (def_stmt)
4006             && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4007           base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4008     }
4009 
4010   /* The gather and scatter builtins need address of the form
4011      loop_invariant + vector * {1, 2, 4, 8}
4012      or
4013      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4014      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4015      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4016      multiplications and additions in it.  To get a vector, we need
4017      a single SSA_NAME that will be defined in the loop and will
4018      contain everything that is not loop invariant and that can be
4019      vectorized.  The following code attempts to find such a preexistng
4020      SSA_NAME OFF and put the loop invariants into a tree BASE
4021      that can be gimplified before the loop.  */
4022   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4023                                     &punsignedp, &reversep, &pvolatilep);
4024   if (reversep)
4025     return false;
4026 
4027   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4028 
4029   if (TREE_CODE (base) == MEM_REF)
4030     {
4031       if (!integer_zerop (TREE_OPERAND (base, 1)))
4032           {
4033             if (off == NULL_TREE)
4034               off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4035             else
4036               off = size_binop (PLUS_EXPR, off,
4037                                     fold_convert (sizetype, TREE_OPERAND (base, 1)));
4038           }
4039       base = TREE_OPERAND (base, 0);
4040     }
4041   else
4042     base = build_fold_addr_expr (base);
4043 
4044   if (off == NULL_TREE)
4045     off = size_zero_node;
4046 
4047   /* If base is not loop invariant, either off is 0, then we start with just
4048      the constant offset in the loop invariant BASE and continue with base
4049      as OFF, otherwise give up.
4050      We could handle that case by gimplifying the addition of base + off
4051      into some SSA_NAME and use that as off, but for now punt.  */
4052   if (!expr_invariant_in_loop_p (loop, base))
4053     {
4054       if (!integer_zerop (off))
4055           return false;
4056       off = base;
4057       base = size_int (pbytepos);
4058     }
4059   /* Otherwise put base + constant offset into the loop invariant BASE
4060      and continue with OFF.  */
4061   else
4062     {
4063       base = fold_convert (sizetype, base);
4064       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4065     }
4066 
4067   /* OFF at this point may be either a SSA_NAME or some tree expression
4068      from get_inner_reference.  Try to peel off loop invariants from it
4069      into BASE as long as possible.  */
4070   STRIP_NOPS (off);
4071   while (offtype == NULL_TREE)
4072     {
4073       enum tree_code code;
4074       tree op0, op1, add = NULL_TREE;
4075 
4076       if (TREE_CODE (off) == SSA_NAME)
4077           {
4078             gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4079 
4080             if (expr_invariant_in_loop_p (loop, off))
4081               return false;
4082 
4083             if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4084               break;
4085 
4086             op0 = gimple_assign_rhs1 (def_stmt);
4087             code = gimple_assign_rhs_code (def_stmt);
4088             op1 = gimple_assign_rhs2 (def_stmt);
4089           }
4090       else
4091           {
4092             if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4093               return false;
4094             code = TREE_CODE (off);
4095             extract_ops_from_tree (off, &code, &op0, &op1);
4096           }
4097       switch (code)
4098           {
4099           case POINTER_PLUS_EXPR:
4100           case PLUS_EXPR:
4101             if (expr_invariant_in_loop_p (loop, op0))
4102               {
4103                 add = op0;
4104                 off = op1;
4105               do_add:
4106                 add = fold_convert (sizetype, add);
4107                 if (scale != 1)
4108                     add = size_binop (MULT_EXPR, add, size_int (scale));
4109                 base = size_binop (PLUS_EXPR, base, add);
4110                 continue;
4111               }
4112             if (expr_invariant_in_loop_p (loop, op1))
4113               {
4114                 add = op1;
4115                 off = op0;
4116                 goto do_add;
4117               }
4118             break;
4119           case MINUS_EXPR:
4120             if (expr_invariant_in_loop_p (loop, op1))
4121               {
4122                 add = fold_convert (sizetype, op1);
4123                 add = size_binop (MINUS_EXPR, size_zero_node, add);
4124                 off = op0;
4125                 goto do_add;
4126               }
4127             break;
4128           case MULT_EXPR:
4129             if (scale == 1 && tree_fits_shwi_p (op1))
4130               {
4131                 int new_scale = tree_to_shwi (op1);
4132                 /* Only treat this as a scaling operation if the target
4133                      supports it for at least some offset type.  */
4134                 if (use_ifn_p
4135                       && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4136                                                             masked_p, vectype, memory_type,
4137                                                             signed_char_type_node,
4138                                                             new_scale, &ifn,
4139                                                             &offset_vectype)
4140                       && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4141                                                             masked_p, vectype, memory_type,
4142                                                             unsigned_char_type_node,
4143                                                             new_scale, &ifn,
4144                                                             &offset_vectype))
4145                     break;
4146                 scale = new_scale;
4147                 off = op0;
4148                 continue;
4149               }
4150             break;
4151           case SSA_NAME:
4152             off = op0;
4153             continue;
4154           CASE_CONVERT:
4155             if (!POINTER_TYPE_P (TREE_TYPE (op0))
4156                 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4157               break;
4158 
4159             /* Don't include the conversion if the target is happy with
4160                the current offset type.  */
4161             if (use_ifn_p
4162                 && !POINTER_TYPE_P (TREE_TYPE (off))
4163                 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4164                                                      masked_p, vectype, memory_type,
4165                                                      TREE_TYPE (off), scale, &ifn,
4166                                                      &offset_vectype))
4167               break;
4168 
4169             if (TYPE_PRECISION (TREE_TYPE (op0))
4170                 == TYPE_PRECISION (TREE_TYPE (off)))
4171               {
4172                 off = op0;
4173                 continue;
4174               }
4175 
4176             /* Include the conversion if it is widening and we're using
4177                the IFN path or the target can handle the converted from
4178                offset or the current size is not already the same as the
4179                data vector element size.  */
4180             if ((TYPE_PRECISION (TREE_TYPE (op0))
4181                  < TYPE_PRECISION (TREE_TYPE (off)))
4182                 && (use_ifn_p
4183                       || (DR_IS_READ (dr)
4184                           ? (targetm.vectorize.builtin_gather
4185                                && targetm.vectorize.builtin_gather (vectype,
4186                                                                             TREE_TYPE (op0),
4187                                                                             scale))
4188                           : (targetm.vectorize.builtin_scatter
4189                                && targetm.vectorize.builtin_scatter (vectype,
4190                                                                              TREE_TYPE (op0),
4191                                                                              scale)))
4192                       || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4193                                                TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4194               {
4195                 off = op0;
4196                 offtype = TREE_TYPE (off);
4197                 STRIP_NOPS (off);
4198                 continue;
4199               }
4200             break;
4201           default:
4202             break;
4203           }
4204       break;
4205     }
4206 
4207   /* If at the end OFF still isn't a SSA_NAME or isn't
4208      defined in the loop, punt.  */
4209   if (TREE_CODE (off) != SSA_NAME
4210       || expr_invariant_in_loop_p (loop, off))
4211     return false;
4212 
4213   if (offtype == NULL_TREE)
4214     offtype = TREE_TYPE (off);
4215 
4216   if (use_ifn_p)
4217     {
4218       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4219                                              vectype, memory_type, offtype, scale,
4220                                              &ifn, &offset_vectype))
4221           ifn = IFN_LAST;
4222       decl = NULL_TREE;
4223     }
4224   else
4225     {
4226       if (DR_IS_READ (dr))
4227           {
4228             if (targetm.vectorize.builtin_gather)
4229               decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4230           }
4231       else
4232           {
4233             if (targetm.vectorize.builtin_scatter)
4234               decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4235           }
4236       ifn = IFN_LAST;
4237       /* The offset vector type will be read from DECL when needed.  */
4238       offset_vectype = NULL_TREE;
4239     }
4240 
4241   info->ifn = ifn;
4242   info->decl = decl;
4243   info->base = base;
4244   info->offset = off;
4245   info->offset_dt = vect_unknown_def_type;
4246   info->offset_vectype = offset_vectype;
4247   info->scale = scale;
4248   info->element_type = TREE_TYPE (vectype);
4249   info->memory_type = memory_type;
4250   return true;
4251 }
4252 
4253 /* Find the data references in STMT, analyze them with respect to LOOP and
4254    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4255    be handled.  */
4256 
4257 opt_result
vect_find_stmt_data_reference(loop_p loop,gimple * stmt,vec<data_reference_p> * datarefs,vec<int> * dataref_groups,int group_id)4258 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4259                                      vec<data_reference_p> *datarefs,
4260                                      vec<int> *dataref_groups, int group_id)
4261 {
4262   /* We can ignore clobbers for dataref analysis - they are removed during
4263      loop vectorization and BB vectorization checks dependences with a
4264      stmt walk.  */
4265   if (gimple_clobber_p (stmt))
4266     return opt_result::success ();
4267 
4268   if (gimple_has_volatile_ops (stmt))
4269     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4270                                            stmt);
4271 
4272   if (stmt_can_throw_internal (cfun, stmt))
4273     return opt_result::failure_at (stmt,
4274                                            "not vectorized:"
4275                                            " statement can throw an exception: %G",
4276                                            stmt);
4277 
4278   auto_vec<data_reference_p, 2> refs;
4279   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4280   if (!res)
4281     return res;
4282 
4283   if (refs.is_empty ())
4284     return opt_result::success ();
4285 
4286   if (refs.length () > 1)
4287     {
4288       while (!refs.is_empty ())
4289           free_data_ref (refs.pop ());
4290       return opt_result::failure_at (stmt,
4291                                              "not vectorized: more than one "
4292                                              "data ref in stmt: %G", stmt);
4293     }
4294 
4295   data_reference_p dr = refs.pop ();
4296   if (gcall *call = dyn_cast <gcall *> (stmt))
4297     if (!gimple_call_internal_p (call)
4298           || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4299               && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4300       {
4301           free_data_ref (dr);
4302           return opt_result::failure_at (stmt,
4303                                                "not vectorized: dr in a call %G", stmt);
4304       }
4305 
4306   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4307       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4308     {
4309       free_data_ref (dr);
4310       return opt_result::failure_at (stmt,
4311                                              "not vectorized:"
4312                                              " statement is bitfield access %G", stmt);
4313     }
4314 
4315   if (DR_BASE_ADDRESS (dr)
4316       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4317     {
4318       free_data_ref (dr);
4319       return opt_result::failure_at (stmt,
4320                                              "not vectorized:"
4321                                              " base addr of dr is a constant\n");
4322     }
4323 
4324   /* Check whether this may be a SIMD lane access and adjust the
4325      DR to make it easier for us to handle it.  */
4326   if (loop
4327       && loop->simduid
4328       && (!DR_BASE_ADDRESS (dr)
4329             || !DR_OFFSET (dr)
4330             || !DR_INIT (dr)
4331             || !DR_STEP (dr)))
4332     {
4333       struct data_reference *newdr
4334           = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4335                                  DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4336       if (DR_BASE_ADDRESS (newdr)
4337             && DR_OFFSET (newdr)
4338             && DR_INIT (newdr)
4339             && DR_STEP (newdr)
4340             && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4341             && integer_zerop (DR_STEP (newdr)))
4342           {
4343             tree base_address = DR_BASE_ADDRESS (newdr);
4344             tree off = DR_OFFSET (newdr);
4345             tree step = ssize_int (1);
4346             if (integer_zerop (off)
4347                 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4348               {
4349                 off = TREE_OPERAND (base_address, 1);
4350                 base_address = TREE_OPERAND (base_address, 0);
4351               }
4352             STRIP_NOPS (off);
4353             if (TREE_CODE (off) == MULT_EXPR
4354                 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4355               {
4356                 step = TREE_OPERAND (off, 1);
4357                 off = TREE_OPERAND (off, 0);
4358                 STRIP_NOPS (off);
4359               }
4360             if (CONVERT_EXPR_P (off)
4361                 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4362                       < TYPE_PRECISION (TREE_TYPE (off))))
4363               off = TREE_OPERAND (off, 0);
4364             if (TREE_CODE (off) == SSA_NAME)
4365               {
4366                 gimple *def = SSA_NAME_DEF_STMT (off);
4367                 /* Look through widening conversion.  */
4368                 if (is_gimple_assign (def)
4369                       && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4370                     {
4371                       tree rhs1 = gimple_assign_rhs1 (def);
4372                       if (TREE_CODE (rhs1) == SSA_NAME
4373                           && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4374                           && (TYPE_PRECISION (TREE_TYPE (off))
4375                                 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4376                         def = SSA_NAME_DEF_STMT (rhs1);
4377                     }
4378                 if (is_gimple_call (def)
4379                       && gimple_call_internal_p (def)
4380                       && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4381                     {
4382                       tree arg = gimple_call_arg (def, 0);
4383                       tree reft = TREE_TYPE (DR_REF (newdr));
4384                       gcc_assert (TREE_CODE (arg) == SSA_NAME);
4385                       arg = SSA_NAME_VAR (arg);
4386                       if (arg == loop->simduid
4387                           /* For now.  */
4388                           && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4389                         {
4390                           DR_BASE_ADDRESS (newdr) = base_address;
4391                           DR_OFFSET (newdr) = ssize_int (0);
4392                           DR_STEP (newdr) = step;
4393                           DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4394                           DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4395                           /* Mark as simd-lane access.  */
4396                           tree arg2 = gimple_call_arg (def, 1);
4397                           newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4398                           free_data_ref (dr);
4399                           datarefs->safe_push (newdr);
4400                           if (dataref_groups)
4401                               dataref_groups->safe_push (group_id);
4402                           return opt_result::success ();
4403                         }
4404                     }
4405               }
4406           }
4407       free_data_ref (newdr);
4408     }
4409 
4410   datarefs->safe_push (dr);
4411   if (dataref_groups)
4412     dataref_groups->safe_push (group_id);
4413   return opt_result::success ();
4414 }
4415 
4416 /* Function vect_analyze_data_refs.
4417 
4418   Find all the data references in the loop or basic block.
4419 
4420    The general structure of the analysis of data refs in the vectorizer is as
4421    follows:
4422    1- vect_analyze_data_refs(loop/bb): call
4423       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4424       in the loop/bb and their dependences.
4425    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4426    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4427    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4428 
4429 */
4430 
4431 opt_result
vect_analyze_data_refs(vec_info * vinfo,poly_uint64 * min_vf,bool * fatal)4432 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4433 {
4434   class loop *loop = NULL;
4435   unsigned int i;
4436   struct data_reference *dr;
4437   tree scalar_type;
4438 
4439   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4440 
4441   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4442     loop = LOOP_VINFO_LOOP (loop_vinfo);
4443 
4444   /* Go through the data-refs, check that the analysis succeeded.  Update
4445      pointer from stmt_vec_info struct to DR and vectype.  */
4446 
4447   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4448   FOR_EACH_VEC_ELT (datarefs, i, dr)
4449     {
4450       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4451       poly_uint64 vf;
4452 
4453       gcc_assert (DR_REF (dr));
4454       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4455       gcc_assert (!stmt_info->dr_aux.dr);
4456       stmt_info->dr_aux.dr = dr;
4457       stmt_info->dr_aux.stmt = stmt_info;
4458 
4459       /* Check that analysis of the data-ref succeeded.  */
4460       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4461             || !DR_STEP (dr))
4462         {
4463             bool maybe_gather
4464               = DR_IS_READ (dr)
4465                 && !TREE_THIS_VOLATILE (DR_REF (dr));
4466             bool maybe_scatter
4467               = DR_IS_WRITE (dr)
4468                 && !TREE_THIS_VOLATILE (DR_REF (dr))
4469                 && (targetm.vectorize.builtin_scatter != NULL
4470                       || supports_vec_scatter_store_p ());
4471 
4472             /* If target supports vector gather loads or scatter stores,
4473                see if they can't be used.  */
4474             if (is_a <loop_vec_info> (vinfo)
4475                 && !nested_in_vect_loop_p (loop, stmt_info))
4476               {
4477                 if (maybe_gather || maybe_scatter)
4478                     {
4479                       if (maybe_gather)
4480                         gatherscatter = GATHER;
4481                       else
4482                         gatherscatter = SCATTER;
4483                     }
4484               }
4485 
4486             if (gatherscatter == SG_NONE)
4487               {
4488                 if (dump_enabled_p ())
4489                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4490                                          "not vectorized: data ref analysis "
4491                                          "failed %G", stmt_info->stmt);
4492                 if (is_a <bb_vec_info> (vinfo))
4493                     {
4494                       /* In BB vectorization the ref can still participate
4495                          in dependence analysis, we just can't vectorize it.  */
4496                       STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4497                       continue;
4498                     }
4499                 return opt_result::failure_at (stmt_info->stmt,
4500                                                        "not vectorized:"
4501                                                        " data ref analysis failed: %G",
4502                                                        stmt_info->stmt);
4503               }
4504         }
4505 
4506       /* See if this was detected as SIMD lane access.  */
4507       if (dr->aux == (void *)-1
4508             || dr->aux == (void *)-2
4509             || dr->aux == (void *)-3
4510             || dr->aux == (void *)-4)
4511           {
4512             if (nested_in_vect_loop_p (loop, stmt_info))
4513               return opt_result::failure_at (stmt_info->stmt,
4514                                                      "not vectorized:"
4515                                                      " data ref analysis failed: %G",
4516                                                      stmt_info->stmt);
4517             STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4518               = -(uintptr_t) dr->aux;
4519           }
4520 
4521       tree base = get_base_address (DR_REF (dr));
4522       if (base && VAR_P (base) && DECL_NONALIASED (base))
4523           {
4524           if (dump_enabled_p ())
4525               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4526                                    "not vectorized: base object not addressable "
4527                                    "for stmt: %G", stmt_info->stmt);
4528           if (is_a <bb_vec_info> (vinfo))
4529               {
4530                 /* In BB vectorization the ref can still participate
4531                    in dependence analysis, we just can't vectorize it.  */
4532                 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4533                 continue;
4534               }
4535             return opt_result::failure_at (stmt_info->stmt,
4536                                                    "not vectorized: base object not"
4537                                                    " addressable for stmt: %G",
4538                                                    stmt_info->stmt);
4539           }
4540 
4541       if (is_a <loop_vec_info> (vinfo)
4542             && DR_STEP (dr)
4543             && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4544           {
4545             if (nested_in_vect_loop_p (loop, stmt_info))
4546               return opt_result::failure_at (stmt_info->stmt,
4547                                                      "not vectorized: "
4548                                                      "not suitable for strided load %G",
4549                                                      stmt_info->stmt);
4550             STMT_VINFO_STRIDED_P (stmt_info) = true;
4551           }
4552 
4553       /* Update DR field in stmt_vec_info struct.  */
4554 
4555       /* If the dataref is in an inner-loop of the loop that is considered for
4556            for vectorization, we also want to analyze the access relative to
4557            the outer-loop (DR contains information only relative to the
4558            inner-most enclosing loop).  We do that by building a reference to the
4559            first location accessed by the inner-loop, and analyze it relative to
4560            the outer-loop.  */
4561       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4562           {
4563             /* Build a reference to the first location accessed by the
4564                inner loop: *(BASE + INIT + OFFSET).  By construction,
4565                this address must be invariant in the inner loop, so we
4566                can consider it as being used in the outer loop.  */
4567             tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4568             tree offset = unshare_expr (DR_OFFSET (dr));
4569             tree init = unshare_expr (DR_INIT (dr));
4570             tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4571                                                     init, offset);
4572             tree init_addr = fold_build_pointer_plus (base, init_offset);
4573             tree init_ref = build_fold_indirect_ref (init_addr);
4574 
4575             if (dump_enabled_p ())
4576               dump_printf_loc (MSG_NOTE, vect_location,
4577                                    "analyze in outer loop: %T\n", init_ref);
4578 
4579             opt_result res
4580               = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4581                                             init_ref, loop, stmt_info->stmt);
4582             if (!res)
4583               /* dr_analyze_innermost already explained the failure.  */
4584               return res;
4585 
4586           if (dump_enabled_p ())
4587               dump_printf_loc (MSG_NOTE, vect_location,
4588                                    "\touter base_address: %T\n"
4589                                    "\touter offset from base address: %T\n"
4590                                    "\touter constant offset from base address: %T\n"
4591                                    "\touter step: %T\n"
4592                                    "\touter base alignment: %d\n\n"
4593                                    "\touter base misalignment: %d\n"
4594                                    "\touter offset alignment: %d\n"
4595                                    "\touter step alignment: %d\n",
4596                                    STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4597                                    STMT_VINFO_DR_OFFSET (stmt_info),
4598                                    STMT_VINFO_DR_INIT (stmt_info),
4599                                    STMT_VINFO_DR_STEP (stmt_info),
4600                                    STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4601                                    STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4602                                    STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4603                                    STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4604           }
4605 
4606       /* Set vectype for STMT.  */
4607       scalar_type = TREE_TYPE (DR_REF (dr));
4608       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4609       if (!vectype)
4610         {
4611           if (dump_enabled_p ())
4612             {
4613               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4614                                "not vectorized: no vectype for stmt: %G",
4615                                      stmt_info->stmt);
4616               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4617               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4618                                  scalar_type);
4619               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4620             }
4621 
4622           if (is_a <bb_vec_info> (vinfo))
4623               {
4624                 /* No vector type is fine, the ref can still participate
4625                    in dependence analysis, we just can't vectorize it.  */
4626                 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4627                 continue;
4628               }
4629             if (fatal)
4630               *fatal = false;
4631             return opt_result::failure_at (stmt_info->stmt,
4632                                                    "not vectorized:"
4633                                                    " no vectype for stmt: %G"
4634                                                    " scalar_type: %T\n",
4635                                                    stmt_info->stmt, scalar_type);
4636         }
4637       else
4638           {
4639             if (dump_enabled_p ())
4640               dump_printf_loc (MSG_NOTE, vect_location,
4641                                    "got vectype for stmt: %G%T\n",
4642                                    stmt_info->stmt, vectype);
4643           }
4644 
4645       /* Adjust the minimal vectorization factor according to the
4646            vector type.  */
4647       vf = TYPE_VECTOR_SUBPARTS (vectype);
4648       *min_vf = upper_bound (*min_vf, vf);
4649 
4650       /* Leave the BB vectorizer to pick the vector type later, based on
4651            the final dataref group size and SLP node size.  */
4652       if (is_a <loop_vec_info> (vinfo))
4653           STMT_VINFO_VECTYPE (stmt_info) = vectype;
4654 
4655       if (gatherscatter != SG_NONE)
4656           {
4657             gather_scatter_info gs_info;
4658             if (!vect_check_gather_scatter (stmt_info,
4659                                                     as_a <loop_vec_info> (vinfo),
4660                                                     &gs_info)
4661                 || !get_vectype_for_scalar_type (vinfo,
4662                                                          TREE_TYPE (gs_info.offset)))
4663               {
4664                 if (fatal)
4665                     *fatal = false;
4666                 return opt_result::failure_at
4667                               (stmt_info->stmt,
4668                                (gatherscatter == GATHER)
4669                                ? "not vectorized: not suitable for gather load %G"
4670                                : "not vectorized: not suitable for scatter store %G",
4671                                stmt_info->stmt);
4672               }
4673             STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4674           }
4675     }
4676 
4677   /* We used to stop processing and prune the list here.  Verify we no
4678      longer need to.  */
4679   gcc_assert (i == datarefs.length ());
4680 
4681   return opt_result::success ();
4682 }
4683 
4684 
4685 /* Function vect_get_new_vect_var.
4686 
4687    Returns a name for a new variable.  The current naming scheme appends the
4688    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4689    the name of vectorizer generated variables, and appends that to NAME if
4690    provided.  */
4691 
4692 tree
vect_get_new_vect_var(tree type,enum vect_var_kind var_kind,const char * name)4693 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4694 {
4695   const char *prefix;
4696   tree new_vect_var;
4697 
4698   switch (var_kind)
4699   {
4700   case vect_simple_var:
4701     prefix = "vect";
4702     break;
4703   case vect_scalar_var:
4704     prefix = "stmp";
4705     break;
4706   case vect_mask_var:
4707     prefix = "mask";
4708     break;
4709   case vect_pointer_var:
4710     prefix = "vectp";
4711     break;
4712   default:
4713     gcc_unreachable ();
4714   }
4715 
4716   if (name)
4717     {
4718       char* tmp = concat (prefix, "_", name, NULL);
4719       new_vect_var = create_tmp_reg (type, tmp);
4720       free (tmp);
4721     }
4722   else
4723     new_vect_var = create_tmp_reg (type, prefix);
4724 
4725   return new_vect_var;
4726 }
4727 
4728 /* Like vect_get_new_vect_var but return an SSA name.  */
4729 
4730 tree
vect_get_new_ssa_name(tree type,enum vect_var_kind var_kind,const char * name)4731 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4732 {
4733   const char *prefix;
4734   tree new_vect_var;
4735 
4736   switch (var_kind)
4737   {
4738   case vect_simple_var:
4739     prefix = "vect";
4740     break;
4741   case vect_scalar_var:
4742     prefix = "stmp";
4743     break;
4744   case vect_pointer_var:
4745     prefix = "vectp";
4746     break;
4747   default:
4748     gcc_unreachable ();
4749   }
4750 
4751   if (name)
4752     {
4753       char* tmp = concat (prefix, "_", name, NULL);
4754       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4755       free (tmp);
4756     }
4757   else
4758     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4759 
4760   return new_vect_var;
4761 }
4762 
4763 /* Duplicate points-to info on NAME from DR_INFO.  */
4764 
4765 static void
vect_duplicate_ssa_name_ptr_info(tree name,dr_vec_info * dr_info)4766 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4767 {
4768   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4769   /* DR_PTR_INFO is for a base SSA name, not including constant or
4770      variable offsets in the ref so its alignment info does not apply.  */
4771   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4772 }
4773 
4774 /* Function vect_create_addr_base_for_vector_ref.
4775 
4776    Create an expression that computes the address of the first memory location
4777    that will be accessed for a data reference.
4778 
4779    Input:
4780    STMT_INFO: The statement containing the data reference.
4781    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4782    OFFSET: Optional. If supplied, it is be added to the initial address.
4783    LOOP:    Specify relative to which loop-nest should the address be computed.
4784             For example, when the dataref is in an inner-loop nested in an
4785               outer-loop that is now being vectorized, LOOP can be either the
4786               outer-loop, or the inner-loop.  The first memory location accessed
4787               by the following dataref ('in' points to short):
4788 
4789                     for (i=0; i<N; i++)
4790                        for (j=0; j<M; j++)
4791                          s += in[i+j]
4792 
4793               is as follows:
4794               if LOOP=i_loop: &in                 (relative to i_loop)
4795               if LOOP=j_loop:           &in+i*2B  (relative to j_loop)
4796 
4797    Output:
4798    1. Return an SSA_NAME whose value is the address of the memory location of
4799       the first vector of the data reference.
4800    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4801       these statement(s) which define the returned SSA_NAME.
4802 
4803    FORNOW: We are only handling array accesses with step 1.  */
4804 
4805 tree
vect_create_addr_base_for_vector_ref(vec_info * vinfo,stmt_vec_info stmt_info,gimple_seq * new_stmt_list,tree offset)4806 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4807                                               gimple_seq *new_stmt_list,
4808                                               tree offset)
4809 {
4810   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4811   struct data_reference *dr = dr_info->dr;
4812   const char *base_name;
4813   tree addr_base;
4814   tree dest;
4815   gimple_seq seq = NULL;
4816   tree vect_ptr_type;
4817   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4818   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4819 
4820   tree data_ref_base = unshare_expr (drb->base_address);
4821   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4822   tree init = unshare_expr (drb->init);
4823 
4824   if (loop_vinfo)
4825     base_name = get_name (data_ref_base);
4826   else
4827     {
4828       base_offset = ssize_int (0);
4829       init = ssize_int (0);
4830       base_name = get_name (DR_REF (dr));
4831     }
4832 
4833   /* Create base_offset */
4834   base_offset = size_binop (PLUS_EXPR,
4835                                   fold_convert (sizetype, base_offset),
4836                                   fold_convert (sizetype, init));
4837 
4838   if (offset)
4839     {
4840       offset = fold_convert (sizetype, offset);
4841       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4842                                          base_offset, offset);
4843     }
4844 
4845   /* base + base_offset */
4846   if (loop_vinfo)
4847     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4848   else
4849     addr_base = build1 (ADDR_EXPR,
4850                               build_pointer_type (TREE_TYPE (DR_REF (dr))),
4851                               /* Strip zero offset components since we don't need
4852                                  them and they can confuse late diagnostics if
4853                                  we CSE them wrongly.  See PR106904 for example.  */
4854                               unshare_expr (strip_zero_offset_components
4855                                                                                 (DR_REF (dr))));
4856 
4857   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4858   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4859   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4860   gimple_seq_add_seq (new_stmt_list, seq);
4861 
4862   if (DR_PTR_INFO (dr)
4863       && TREE_CODE (addr_base) == SSA_NAME
4864       /* We should only duplicate pointer info to newly created SSA names.  */
4865       && SSA_NAME_VAR (addr_base) == dest)
4866     {
4867       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4868       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4869     }
4870 
4871   if (dump_enabled_p ())
4872     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4873 
4874   return addr_base;
4875 }
4876 
4877 
4878 /* Function vect_create_data_ref_ptr.
4879 
4880    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4881    location accessed in the loop by STMT_INFO, along with the def-use update
4882    chain to appropriately advance the pointer through the loop iterations.
4883    Also set aliasing information for the pointer.  This pointer is used by
4884    the callers to this function to create a memory reference expression for
4885    vector load/store access.
4886 
4887    Input:
4888    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4889          GIMPLE_ASSIGN <name, data-ref> or
4890            GIMPLE_ASSIGN <data-ref, name>.
4891    2. AGGR_TYPE: the type of the reference, which should be either a vector
4892         or an array.
4893    3. AT_LOOP: the loop where the vector memref is to be created.
4894    4. OFFSET (optional): a byte offset to be added to the initial address
4895           accessed by the data-ref in STMT_INFO.
4896    5. BSI: location where the new stmts are to be placed if there is no loop
4897    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4898         pointing to the initial address.
4899    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4900           to the IV during each iteration of the loop.  NULL says to move
4901           by one copy of AGGR_TYPE up or down, depending on the step of the
4902           data reference.
4903 
4904    Output:
4905    1. Declare a new ptr to vector_type, and have it point to the base of the
4906       data reference (initial addressed accessed by the data reference).
4907       For example, for vector of type V8HI, the following code is generated:
4908 
4909       v8hi *ap;
4910       ap = (v8hi *)initial_address;
4911 
4912       if OFFSET is not supplied:
4913          initial_address = &a[init];
4914       if OFFSET is supplied:
4915            initial_address = &a[init] + OFFSET;
4916       if BYTE_OFFSET is supplied:
4917            initial_address = &a[init] + BYTE_OFFSET;
4918 
4919       Return the initial_address in INITIAL_ADDRESS.
4920 
4921    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4922       update the pointer in each iteration of the loop.
4923 
4924       Return the increment stmt that updates the pointer in PTR_INCR.
4925 
4926    3. Return the pointer.  */
4927 
4928 tree
vect_create_data_ref_ptr(vec_info * vinfo,stmt_vec_info stmt_info,tree aggr_type,class loop * at_loop,tree offset,tree * initial_address,gimple_stmt_iterator * gsi,gimple ** ptr_incr,bool only_init,tree iv_step)4929 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4930                                 tree aggr_type, class loop *at_loop, tree offset,
4931                                 tree *initial_address, gimple_stmt_iterator *gsi,
4932                                 gimple **ptr_incr, bool only_init,
4933                                 tree iv_step)
4934 {
4935   const char *base_name;
4936   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4937   class loop *loop = NULL;
4938   bool nested_in_vect_loop = false;
4939   class loop *containing_loop = NULL;
4940   tree aggr_ptr_type;
4941   tree aggr_ptr;
4942   tree new_temp;
4943   gimple_seq new_stmt_list = NULL;
4944   edge pe = NULL;
4945   basic_block new_bb;
4946   tree aggr_ptr_init;
4947   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4948   struct data_reference *dr = dr_info->dr;
4949   tree aptr;
4950   gimple_stmt_iterator incr_gsi;
4951   bool insert_after;
4952   tree indx_before_incr, indx_after_incr;
4953   gimple *incr;
4954   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4955 
4956   gcc_assert (iv_step != NULL_TREE
4957                 || TREE_CODE (aggr_type) == ARRAY_TYPE
4958                 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4959 
4960   if (loop_vinfo)
4961     {
4962       loop = LOOP_VINFO_LOOP (loop_vinfo);
4963       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4964       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4965       pe = loop_preheader_edge (loop);
4966     }
4967   else
4968     {
4969       gcc_assert (bb_vinfo);
4970       only_init = true;
4971       *ptr_incr = NULL;
4972     }
4973 
4974   /* Create an expression for the first address accessed by this load
4975      in LOOP.  */
4976   base_name = get_name (DR_BASE_ADDRESS (dr));
4977 
4978   if (dump_enabled_p ())
4979     {
4980       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4981       dump_printf_loc (MSG_NOTE, vect_location,
4982                        "create %s-pointer variable to type: %T",
4983                            get_tree_code_name (TREE_CODE (aggr_type)),
4984                            aggr_type);
4985       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4986         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4987       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4988         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4989       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4990         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4991       else
4992         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4993       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4994     }
4995 
4996   /* (1) Create the new aggregate-pointer variable.
4997      Vector and array types inherit the alias set of their component
4998      type by default so we need to use a ref-all pointer if the data
4999      reference does not conflict with the created aggregated data
5000      reference because it is not addressable.  */
5001   bool need_ref_all = false;
5002   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5003                                     get_alias_set (DR_REF (dr))))
5004     need_ref_all = true;
5005   /* Likewise for any of the data references in the stmt group.  */
5006   else if (DR_GROUP_SIZE (stmt_info) > 1)
5007     {
5008       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5009       do
5010           {
5011             struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5012             if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5013                                               get_alias_set (DR_REF (sdr))))
5014               {
5015                 need_ref_all = true;
5016                 break;
5017               }
5018             sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5019           }
5020       while (sinfo);
5021     }
5022   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5023                                                          need_ref_all);
5024   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5025 
5026 
5027   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5028      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5029      def-use update cycles for the pointer: one relative to the outer-loop
5030      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5031      to the inner-loop (which is the inner-most loop containing the dataref),
5032      and this is done be step (5) below.
5033 
5034      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5035      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5036      redundant.  Steps (3),(4) create the following:
5037 
5038           vp0 = &base_addr;
5039           LOOP:     vp1 = phi(vp0,vp2)
5040                     ...
5041                     ...
5042                     vp2 = vp1 + step
5043                     goto LOOP
5044 
5045      If there is an inner-loop nested in loop, then step (5) will also be
5046      applied, and an additional update in the inner-loop will be created:
5047 
5048           vp0 = &base_addr;
5049           LOOP:   vp1 = phi(vp0,vp2)
5050                     ...
5051         inner:     vp3 = phi(vp1,vp4)
5052                      vp4 = vp3 + inner_step
5053                      if () goto inner
5054                     ...
5055                     vp2 = vp1 + step
5056                     if () goto LOOP   */
5057 
5058   /* (2) Calculate the initial address of the aggregate-pointer, and set
5059      the aggregate-pointer to point to it before the loop.  */
5060 
5061   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5062 
5063   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5064                                                                stmt_info, &new_stmt_list,
5065                                                                offset);
5066   if (new_stmt_list)
5067     {
5068       if (pe)
5069         {
5070           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5071           gcc_assert (!new_bb);
5072         }
5073       else
5074         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5075     }
5076 
5077   *initial_address = new_temp;
5078   aggr_ptr_init = new_temp;
5079 
5080   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5081      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5082      inner-loop nested in LOOP (during outer-loop vectorization).  */
5083 
5084   /* No update in loop is required.  */
5085   if (only_init && (!loop_vinfo || at_loop == loop))
5086     aptr = aggr_ptr_init;
5087   else
5088     {
5089       /* Accesses to invariant addresses should be handled specially
5090            by the caller.  */
5091       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5092       gcc_assert (!integer_zerop (step));
5093 
5094       if (iv_step == NULL_TREE)
5095           {
5096             /* The step of the aggregate pointer is the type size,
5097                negated for downward accesses.  */
5098             iv_step = TYPE_SIZE_UNIT (aggr_type);
5099             if (tree_int_cst_sgn (step) == -1)
5100               iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5101           }
5102 
5103       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5104 
5105       create_iv (aggr_ptr_init,
5106                      fold_convert (aggr_ptr_type, iv_step),
5107                      aggr_ptr, loop, &incr_gsi, insert_after,
5108                      &indx_before_incr, &indx_after_incr);
5109       incr = gsi_stmt (incr_gsi);
5110 
5111       /* Copy the points-to information if it exists. */
5112       if (DR_PTR_INFO (dr))
5113           {
5114             vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5115             vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5116           }
5117       if (ptr_incr)
5118           *ptr_incr = incr;
5119 
5120       aptr = indx_before_incr;
5121     }
5122 
5123   if (!nested_in_vect_loop || only_init)
5124     return aptr;
5125 
5126 
5127   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5128      nested in LOOP, if exists.  */
5129 
5130   gcc_assert (nested_in_vect_loop);
5131   if (!only_init)
5132     {
5133       standard_iv_increment_position (containing_loop, &incr_gsi,
5134                                               &insert_after);
5135       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
5136                      containing_loop, &incr_gsi, insert_after, &indx_before_incr,
5137                      &indx_after_incr);
5138       incr = gsi_stmt (incr_gsi);
5139 
5140       /* Copy the points-to information if it exists. */
5141       if (DR_PTR_INFO (dr))
5142           {
5143             vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5144             vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5145           }
5146       if (ptr_incr)
5147           *ptr_incr = incr;
5148 
5149       return indx_before_incr;
5150     }
5151   else
5152     gcc_unreachable ();
5153 }
5154 
5155 
5156 /* Function bump_vector_ptr
5157 
5158    Increment a pointer (to a vector type) by vector-size. If requested,
5159    i.e. if PTR-INCR is given, then also connect the new increment stmt
5160    to the existing def-use update-chain of the pointer, by modifying
5161    the PTR_INCR as illustrated below:
5162 
5163    The pointer def-use update-chain before this function:
5164                         DATAREF_PTR = phi (p_0, p_2)
5165                         ....
5166         PTR_INCR:       p_2 = DATAREF_PTR + step
5167 
5168    The pointer def-use update-chain after this function:
5169                         DATAREF_PTR = phi (p_0, p_2)
5170                         ....
5171                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5172                         ....
5173         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5174 
5175    Input:
5176    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5177                  in the loop.
5178    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5179                 the loop.  The increment amount across iterations is expected
5180                 to be vector_size.
5181    BSI - location where the new update stmt is to be placed.
5182    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5183    BUMP - optional. The offset by which to bump the pointer. If not given,
5184             the offset is assumed to be vector_size.
5185 
5186    Output: Return NEW_DATAREF_PTR as illustrated above.
5187 
5188 */
5189 
5190 tree
bump_vector_ptr(vec_info * vinfo,tree dataref_ptr,gimple * ptr_incr,gimple_stmt_iterator * gsi,stmt_vec_info stmt_info,tree bump)5191 bump_vector_ptr (vec_info *vinfo,
5192                      tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5193                      stmt_vec_info stmt_info, tree bump)
5194 {
5195   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5196   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5197   tree update = TYPE_SIZE_UNIT (vectype);
5198   gimple *incr_stmt;
5199   ssa_op_iter iter;
5200   use_operand_p use_p;
5201   tree new_dataref_ptr;
5202 
5203   if (bump)
5204     update = bump;
5205 
5206   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5207     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5208   else
5209     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5210   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5211                                            dataref_ptr, update);
5212   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5213   /* Fold the increment, avoiding excessive chains use-def chains of
5214      those, leading to compile-time issues for passes until the next
5215      forwprop pass which would do this as well.  */
5216   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5217   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5218     {
5219       incr_stmt = gsi_stmt (fold_gsi);
5220       update_stmt (incr_stmt);
5221     }
5222 
5223   /* Copy the points-to information if it exists. */
5224   if (DR_PTR_INFO (dr))
5225     {
5226       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5227       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5228     }
5229 
5230   if (!ptr_incr)
5231     return new_dataref_ptr;
5232 
5233   /* Update the vector-pointer's cross-iteration increment.  */
5234   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5235     {
5236       tree use = USE_FROM_PTR (use_p);
5237 
5238       if (use == dataref_ptr)
5239         SET_USE (use_p, new_dataref_ptr);
5240       else
5241         gcc_assert (operand_equal_p (use, update, 0));
5242     }
5243 
5244   return new_dataref_ptr;
5245 }
5246 
5247 
5248 /* Copy memory reference info such as base/clique from the SRC reference
5249    to the DEST MEM_REF.  */
5250 
5251 void
vect_copy_ref_info(tree dest,tree src)5252 vect_copy_ref_info (tree dest, tree src)
5253 {
5254   if (TREE_CODE (dest) != MEM_REF)
5255     return;
5256 
5257   tree src_base = src;
5258   while (handled_component_p (src_base))
5259     src_base = TREE_OPERAND (src_base, 0);
5260   if (TREE_CODE (src_base) != MEM_REF
5261       && TREE_CODE (src_base) != TARGET_MEM_REF)
5262     return;
5263 
5264   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5265   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5266 }
5267 
5268 
5269 /* Function vect_create_destination_var.
5270 
5271    Create a new temporary of type VECTYPE.  */
5272 
5273 tree
vect_create_destination_var(tree scalar_dest,tree vectype)5274 vect_create_destination_var (tree scalar_dest, tree vectype)
5275 {
5276   tree vec_dest;
5277   const char *name;
5278   char *new_name;
5279   tree type;
5280   enum vect_var_kind kind;
5281 
5282   kind = vectype
5283     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5284     ? vect_mask_var
5285     : vect_simple_var
5286     : vect_scalar_var;
5287   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5288 
5289   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5290 
5291   name = get_name (scalar_dest);
5292   if (name)
5293     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5294   else
5295     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5296   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5297   free (new_name);
5298 
5299   return vec_dest;
5300 }
5301 
5302 /* Function vect_grouped_store_supported.
5303 
5304    Returns TRUE if interleave high and interleave low permutations
5305    are supported, and FALSE otherwise.  */
5306 
5307 bool
vect_grouped_store_supported(tree vectype,unsigned HOST_WIDE_INT count)5308 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5309 {
5310   machine_mode mode = TYPE_MODE (vectype);
5311 
5312   /* vect_permute_store_chain requires the group size to be equal to 3 or
5313      be a power of two.  */
5314   if (count != 3 && exact_log2 (count) == -1)
5315     {
5316       if (dump_enabled_p ())
5317           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5318                                "the size of the group of accesses"
5319                                " is not a power of 2 or not eqaul to 3\n");
5320       return false;
5321     }
5322 
5323   /* Check that the permutation is supported.  */
5324   if (VECTOR_MODE_P (mode))
5325     {
5326       unsigned int i;
5327       if (count == 3)
5328           {
5329             unsigned int j0 = 0, j1 = 0, j2 = 0;
5330             unsigned int i, j;
5331 
5332             unsigned int nelt;
5333             if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5334               {
5335                 if (dump_enabled_p ())
5336                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5337                                          "cannot handle groups of 3 stores for"
5338                                          " variable-length vectors\n");
5339                 return false;
5340               }
5341 
5342             vec_perm_builder sel (nelt, nelt, 1);
5343             sel.quick_grow (nelt);
5344             vec_perm_indices indices;
5345             for (j = 0; j < 3; j++)
5346               {
5347                 int nelt0 = ((3 - j) * nelt) % 3;
5348                 int nelt1 = ((3 - j) * nelt + 1) % 3;
5349                 int nelt2 = ((3 - j) * nelt + 2) % 3;
5350                 for (i = 0; i < nelt; i++)
5351                     {
5352                       if (3 * i + nelt0 < nelt)
5353                         sel[3 * i + nelt0] = j0++;
5354                       if (3 * i + nelt1 < nelt)
5355                         sel[3 * i + nelt1] = nelt + j1++;
5356                       if (3 * i + nelt2 < nelt)
5357                         sel[3 * i + nelt2] = 0;
5358                     }
5359                 indices.new_vector (sel, 2, nelt);
5360                 if (!can_vec_perm_const_p (mode, indices))
5361                     {
5362                       if (dump_enabled_p ())
5363                         dump_printf (MSG_MISSED_OPTIMIZATION,
5364                                          "permutation op not supported by target.\n");
5365                       return false;
5366                     }
5367 
5368                 for (i = 0; i < nelt; i++)
5369                     {
5370                       if (3 * i + nelt0 < nelt)
5371                         sel[3 * i + nelt0] = 3 * i + nelt0;
5372                       if (3 * i + nelt1 < nelt)
5373                         sel[3 * i + nelt1] = 3 * i + nelt1;
5374                       if (3 * i + nelt2 < nelt)
5375                         sel[3 * i + nelt2] = nelt + j2++;
5376                     }
5377                 indices.new_vector (sel, 2, nelt);
5378                 if (!can_vec_perm_const_p (mode, indices))
5379                     {
5380                       if (dump_enabled_p ())
5381                         dump_printf (MSG_MISSED_OPTIMIZATION,
5382                                          "permutation op not supported by target.\n");
5383                       return false;
5384                     }
5385               }
5386             return true;
5387           }
5388       else
5389           {
5390             /* If length is not equal to 3 then only power of 2 is supported.  */
5391             gcc_assert (pow2p_hwi (count));
5392             poly_uint64 nelt = GET_MODE_NUNITS (mode);
5393 
5394             /* The encoding has 2 interleaved stepped patterns.  */
5395             vec_perm_builder sel (nelt, 2, 3);
5396             sel.quick_grow (6);
5397             for (i = 0; i < 3; i++)
5398               {
5399                 sel[i * 2] = i;
5400                 sel[i * 2 + 1] = i + nelt;
5401               }
5402             vec_perm_indices indices (sel, 2, nelt);
5403             if (can_vec_perm_const_p (mode, indices))
5404               {
5405                 for (i = 0; i < 6; i++)
5406                     sel[i] += exact_div (nelt, 2);
5407                 indices.new_vector (sel, 2, nelt);
5408                 if (can_vec_perm_const_p (mode, indices))
5409                     return true;
5410               }
5411           }
5412     }
5413 
5414   if (dump_enabled_p ())
5415     dump_printf (MSG_MISSED_OPTIMIZATION,
5416                      "permutation op not supported by target.\n");
5417   return false;
5418 }
5419 
5420 
5421 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5422    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5423 
5424 bool
vect_store_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5425 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5426                                   bool masked_p)
5427 {
5428   if (masked_p)
5429     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5430                                                    vec_mask_store_lanes_optab,
5431                                                    vectype, count);
5432   else
5433     return vect_lanes_optab_supported_p ("vec_store_lanes",
5434                                                    vec_store_lanes_optab,
5435                                                    vectype, count);
5436 }
5437 
5438 
5439 /* Function vect_permute_store_chain.
5440 
5441    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5442    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5443    the data correctly for the stores.  Return the final references for stores
5444    in RESULT_CHAIN.
5445 
5446    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5447    The input is 4 vectors each containing 8 elements.  We assign a number to
5448    each element, the input sequence is:
5449 
5450    1st vec:   0  1  2  3  4  5  6  7
5451    2nd vec:   8  9 10 11 12 13 14 15
5452    3rd vec:  16 17 18 19 20 21 22 23
5453    4th vec:  24 25 26 27 28 29 30 31
5454 
5455    The output sequence should be:
5456 
5457    1st vec:  0  8 16 24  1  9 17 25
5458    2nd vec:  2 10 18 26  3 11 19 27
5459    3rd vec:  4 12 20 28  5 13 21 30
5460    4th vec:  6 14 22 30  7 15 23 31
5461 
5462    i.e., we interleave the contents of the four vectors in their order.
5463 
5464    We use interleave_high/low instructions to create such output.  The input of
5465    each interleave_high/low operation is two vectors:
5466    1st vec    2nd vec
5467    0 1 2 3    4 5 6 7
5468    the even elements of the result vector are obtained left-to-right from the
5469    high/low elements of the first vector.  The odd elements of the result are
5470    obtained left-to-right from the high/low elements of the second vector.
5471    The output of interleave_high will be:   0 4 1 5
5472    and of interleave_low:                   2 6 3 7
5473 
5474 
5475    The permutation is done in log LENGTH stages.  In each stage interleave_high
5476    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5477    where the first argument is taken from the first half of DR_CHAIN and the
5478    second argument from it's second half.
5479    In our example,
5480 
5481    I1: interleave_high (1st vec, 3rd vec)
5482    I2: interleave_low (1st vec, 3rd vec)
5483    I3: interleave_high (2nd vec, 4th vec)
5484    I4: interleave_low (2nd vec, 4th vec)
5485 
5486    The output for the first stage is:
5487 
5488    I1:  0 16  1 17  2 18  3 19
5489    I2:  4 20  5 21  6 22  7 23
5490    I3:  8 24  9 25 10 26 11 27
5491    I4: 12 28 13 29 14 30 15 31
5492 
5493    The output of the second stage, i.e. the final result is:
5494 
5495    I1:  0  8 16 24  1  9 17 25
5496    I2:  2 10 18 26  3 11 19 27
5497    I3:  4 12 20 28  5 13 21 30
5498    I4:  6 14 22 30  7 15 23 31.  */
5499 
5500 void
vect_permute_store_chain(vec_info * vinfo,vec<tree> & dr_chain,unsigned int length,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,vec<tree> * result_chain)5501 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5502                                 unsigned int length,
5503                                 stmt_vec_info stmt_info,
5504                                 gimple_stmt_iterator *gsi,
5505                                 vec<tree> *result_chain)
5506 {
5507   tree vect1, vect2, high, low;
5508   gimple *perm_stmt;
5509   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5510   tree perm_mask_low, perm_mask_high;
5511   tree data_ref;
5512   tree perm3_mask_low, perm3_mask_high;
5513   unsigned int i, j, n, log_length = exact_log2 (length);
5514 
5515   result_chain->quick_grow (length);
5516   memcpy (result_chain->address (), dr_chain.address (),
5517             length * sizeof (tree));
5518 
5519   if (length == 3)
5520     {
5521       /* vect_grouped_store_supported ensures that this is constant.  */
5522       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5523       unsigned int j0 = 0, j1 = 0, j2 = 0;
5524 
5525       vec_perm_builder sel (nelt, nelt, 1);
5526       sel.quick_grow (nelt);
5527       vec_perm_indices indices;
5528       for (j = 0; j < 3; j++)
5529         {
5530             int nelt0 = ((3 - j) * nelt) % 3;
5531             int nelt1 = ((3 - j) * nelt + 1) % 3;
5532             int nelt2 = ((3 - j) * nelt + 2) % 3;
5533 
5534             for (i = 0; i < nelt; i++)
5535               {
5536                 if (3 * i + nelt0 < nelt)
5537                     sel[3 * i + nelt0] = j0++;
5538                 if (3 * i + nelt1 < nelt)
5539                     sel[3 * i + nelt1] = nelt + j1++;
5540                 if (3 * i + nelt2 < nelt)
5541                     sel[3 * i + nelt2] = 0;
5542               }
5543             indices.new_vector (sel, 2, nelt);
5544             perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5545 
5546             for (i = 0; i < nelt; i++)
5547               {
5548                 if (3 * i + nelt0 < nelt)
5549                     sel[3 * i + nelt0] = 3 * i + nelt0;
5550                 if (3 * i + nelt1 < nelt)
5551                     sel[3 * i + nelt1] = 3 * i + nelt1;
5552                 if (3 * i + nelt2 < nelt)
5553                     sel[3 * i + nelt2] = nelt + j2++;
5554               }
5555             indices.new_vector (sel, 2, nelt);
5556             perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5557 
5558             vect1 = dr_chain[0];
5559             vect2 = dr_chain[1];
5560 
5561             /* Create interleaving stmt:
5562                low = VEC_PERM_EXPR <vect1, vect2,
5563                                           {j, nelt, *, j + 1, nelt + j + 1, *,
5564                                            j + 2, nelt + j + 2, *, ...}>  */
5565             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5566             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5567                                                      vect2, perm3_mask_low);
5568             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5569 
5570             vect1 = data_ref;
5571             vect2 = dr_chain[2];
5572             /* Create interleaving stmt:
5573                low = VEC_PERM_EXPR <vect1, vect2,
5574                                           {0, 1, nelt + j, 3, 4, nelt + j + 1,
5575                                            6, 7, nelt + j + 2, ...}>  */
5576             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5577             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5578                                                      vect2, perm3_mask_high);
5579             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5580             (*result_chain)[j] = data_ref;
5581           }
5582     }
5583   else
5584     {
5585       /* If length is not equal to 3 then only power of 2 is supported.  */
5586       gcc_assert (pow2p_hwi (length));
5587 
5588       /* The encoding has 2 interleaved stepped patterns.  */
5589       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5590       vec_perm_builder sel (nelt, 2, 3);
5591       sel.quick_grow (6);
5592       for (i = 0; i < 3; i++)
5593           {
5594             sel[i * 2] = i;
5595             sel[i * 2 + 1] = i + nelt;
5596           }
5597           vec_perm_indices indices (sel, 2, nelt);
5598           perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5599 
5600           for (i = 0; i < 6; i++)
5601             sel[i] += exact_div (nelt, 2);
5602           indices.new_vector (sel, 2, nelt);
5603           perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5604 
5605           for (i = 0, n = log_length; i < n; i++)
5606             {
5607               for (j = 0; j < length/2; j++)
5608                 {
5609                     vect1 = dr_chain[j];
5610                     vect2 = dr_chain[j+length/2];
5611 
5612                     /* Create interleaving stmt:
5613                        high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5614                                                                       ...}>  */
5615                     high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5616                     perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5617                                                              vect2, perm_mask_high);
5618                     vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5619                     (*result_chain)[2*j] = high;
5620 
5621                     /* Create interleaving stmt:
5622                        low = VEC_PERM_EXPR <vect1, vect2,
5623                                                   {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5624                                                    ...}>  */
5625                     low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5626                     perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5627                                                              vect2, perm_mask_low);
5628                     vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5629                     (*result_chain)[2*j+1] = low;
5630                 }
5631               memcpy (dr_chain.address (), result_chain->address (),
5632                         length * sizeof (tree));
5633             }
5634     }
5635 }
5636 
5637 /* Function vect_setup_realignment
5638 
5639    This function is called when vectorizing an unaligned load using
5640    the dr_explicit_realign[_optimized] scheme.
5641    This function generates the following code at the loop prolog:
5642 
5643       p = initial_addr;
5644    x  msq_init = *(floor(p));   # prolog load
5645       realignment_token = call target_builtin;
5646     loop:
5647    x  msq = phi (msq_init, ---)
5648 
5649    The stmts marked with x are generated only for the case of
5650    dr_explicit_realign_optimized.
5651 
5652    The code above sets up a new (vector) pointer, pointing to the first
5653    location accessed by STMT_INFO, and a "floor-aligned" load using that
5654    pointer.  It also generates code to compute the "realignment-token"
5655    (if the relevant target hook was defined), and creates a phi-node at the
5656    loop-header bb whose arguments are the result of the prolog-load (created
5657    by this function) and the result of a load that takes place in the loop
5658    (to be created by the caller to this function).
5659 
5660    For the case of dr_explicit_realign_optimized:
5661    The caller to this function uses the phi-result (msq) to create the
5662    realignment code inside the loop, and sets up the missing phi argument,
5663    as follows:
5664     loop:
5665       msq = phi (msq_init, lsq)
5666       lsq = *(floor(p'));        # load in loop
5667       result = realign_load (msq, lsq, realignment_token);
5668 
5669    For the case of dr_explicit_realign:
5670     loop:
5671       msq = *(floor(p));      # load in loop
5672       p' = p + (VS-1);
5673       lsq = *(floor(p'));     # load in loop
5674       result = realign_load (msq, lsq, realignment_token);
5675 
5676    Input:
5677    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5678                  a memory location that may be unaligned.
5679    BSI - place where new code is to be inserted.
5680    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5681                                     is used.
5682 
5683    Output:
5684    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5685                        target hook, if defined.
5686    Return value - the result of the loop-header phi node.  */
5687 
5688 tree
vect_setup_realignment(vec_info * vinfo,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,tree * realignment_token,enum dr_alignment_support alignment_support_scheme,tree init_addr,class loop ** at_loop)5689 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5690                               gimple_stmt_iterator *gsi, tree *realignment_token,
5691                               enum dr_alignment_support alignment_support_scheme,
5692                               tree init_addr,
5693                               class loop **at_loop)
5694 {
5695   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5696   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5697   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5698   struct data_reference *dr = dr_info->dr;
5699   class loop *loop = NULL;
5700   edge pe = NULL;
5701   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5702   tree vec_dest;
5703   gimple *inc;
5704   tree ptr;
5705   tree data_ref;
5706   basic_block new_bb;
5707   tree msq_init = NULL_TREE;
5708   tree new_temp;
5709   gphi *phi_stmt;
5710   tree msq = NULL_TREE;
5711   gimple_seq stmts = NULL;
5712   bool compute_in_loop = false;
5713   bool nested_in_vect_loop = false;
5714   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5715   class loop *loop_for_initial_load = NULL;
5716 
5717   if (loop_vinfo)
5718     {
5719       loop = LOOP_VINFO_LOOP (loop_vinfo);
5720       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5721     }
5722 
5723   gcc_assert (alignment_support_scheme == dr_explicit_realign
5724                 || alignment_support_scheme == dr_explicit_realign_optimized);
5725 
5726   /* We need to generate three things:
5727      1. the misalignment computation
5728      2. the extra vector load (for the optimized realignment scheme).
5729      3. the phi node for the two vectors from which the realignment is
5730       done (for the optimized realignment scheme).  */
5731 
5732   /* 1. Determine where to generate the misalignment computation.
5733 
5734      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5735      calculation will be generated by this function, outside the loop (in the
5736      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5737      caller, inside the loop.
5738 
5739      Background: If the misalignment remains fixed throughout the iterations of
5740      the loop, then both realignment schemes are applicable, and also the
5741      misalignment computation can be done outside LOOP.  This is because we are
5742      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5743      are a multiple of VS (the Vector Size), and therefore the misalignment in
5744      different vectorized LOOP iterations is always the same.
5745      The problem arises only if the memory access is in an inner-loop nested
5746      inside LOOP, which is now being vectorized using outer-loop vectorization.
5747      This is the only case when the misalignment of the memory access may not
5748      remain fixed throughout the iterations of the inner-loop (as explained in
5749      detail in vect_supportable_dr_alignment).  In this case, not only is the
5750      optimized realignment scheme not applicable, but also the misalignment
5751      computation (and generation of the realignment token that is passed to
5752      REALIGN_LOAD) have to be done inside the loop.
5753 
5754      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5755      or not, which in turn determines if the misalignment is computed inside
5756      the inner-loop, or outside LOOP.  */
5757 
5758   if (init_addr != NULL_TREE || !loop_vinfo)
5759     {
5760       compute_in_loop = true;
5761       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5762     }
5763 
5764 
5765   /* 2. Determine where to generate the extra vector load.
5766 
5767      For the optimized realignment scheme, instead of generating two vector
5768      loads in each iteration, we generate a single extra vector load in the
5769      preheader of the loop, and in each iteration reuse the result of the
5770      vector load from the previous iteration.  In case the memory access is in
5771      an inner-loop nested inside LOOP, which is now being vectorized using
5772      outer-loop vectorization, we need to determine whether this initial vector
5773      load should be generated at the preheader of the inner-loop, or can be
5774      generated at the preheader of LOOP.  If the memory access has no evolution
5775      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5776      to be generated inside LOOP (in the preheader of the inner-loop).  */
5777 
5778   if (nested_in_vect_loop)
5779     {
5780       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5781       bool invariant_in_outerloop =
5782             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5783       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5784     }
5785   else
5786     loop_for_initial_load = loop;
5787   if (at_loop)
5788     *at_loop = loop_for_initial_load;
5789 
5790   if (loop_for_initial_load)
5791     pe = loop_preheader_edge (loop_for_initial_load);
5792 
5793   /* 3. For the case of the optimized realignment, create the first vector
5794       load at the loop preheader.  */
5795 
5796   if (alignment_support_scheme == dr_explicit_realign_optimized)
5797     {
5798       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5799       gassign *new_stmt;
5800 
5801       gcc_assert (!compute_in_loop);
5802       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5803       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5804                                               loop_for_initial_load, NULL_TREE,
5805                                               &init_addr, NULL, &inc, true);
5806       if (TREE_CODE (ptr) == SSA_NAME)
5807           new_temp = copy_ssa_name (ptr);
5808       else
5809           new_temp = make_ssa_name (TREE_TYPE (ptr));
5810       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5811       tree type = TREE_TYPE (ptr);
5812       new_stmt = gimple_build_assign
5813                        (new_temp, BIT_AND_EXPR, ptr,
5814                         fold_build2 (MINUS_EXPR, type,
5815                                          build_int_cst (type, 0),
5816                                          build_int_cst (type, align)));
5817       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5818       gcc_assert (!new_bb);
5819       data_ref
5820           = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5821                       build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5822       vect_copy_ref_info (data_ref, DR_REF (dr));
5823       new_stmt = gimple_build_assign (vec_dest, data_ref);
5824       new_temp = make_ssa_name (vec_dest, new_stmt);
5825       gimple_assign_set_lhs (new_stmt, new_temp);
5826       if (pe)
5827         {
5828           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5829           gcc_assert (!new_bb);
5830         }
5831       else
5832          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5833 
5834       msq_init = gimple_assign_lhs (new_stmt);
5835     }
5836 
5837   /* 4. Create realignment token using a target builtin, if available.
5838       It is done either inside the containing loop, or before LOOP (as
5839       determined above).  */
5840 
5841   if (targetm.vectorize.builtin_mask_for_load)
5842     {
5843       gcall *new_stmt;
5844       tree builtin_decl;
5845 
5846       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5847       if (!init_addr)
5848           {
5849             /* Generate the INIT_ADDR computation outside LOOP.  */
5850             init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5851                                                                           stmt_info, &stmts,
5852                                                                           NULL_TREE);
5853           if (loop)
5854             {
5855                 pe = loop_preheader_edge (loop);
5856                 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5857                 gcc_assert (!new_bb);
5858             }
5859           else
5860              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5861           }
5862 
5863       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5864       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5865       vec_dest =
5866           vect_create_destination_var (scalar_dest,
5867                                              gimple_call_return_type (new_stmt));
5868       new_temp = make_ssa_name (vec_dest, new_stmt);
5869       gimple_call_set_lhs (new_stmt, new_temp);
5870 
5871       if (compute_in_loop)
5872           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5873       else
5874           {
5875             /* Generate the misalignment computation outside LOOP.  */
5876             pe = loop_preheader_edge (loop);
5877             new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5878             gcc_assert (!new_bb);
5879           }
5880 
5881       *realignment_token = gimple_call_lhs (new_stmt);
5882 
5883       /* The result of the CALL_EXPR to this builtin is determined from
5884          the value of the parameter and no global variables are touched
5885          which makes the builtin a "const" function.  Requiring the
5886          builtin to have the "const" attribute makes it unnecessary
5887          to call mark_call_clobbered.  */
5888       gcc_assert (TREE_READONLY (builtin_decl));
5889     }
5890 
5891   if (alignment_support_scheme == dr_explicit_realign)
5892     return msq;
5893 
5894   gcc_assert (!compute_in_loop);
5895   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5896 
5897 
5898   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5899 
5900   pe = loop_preheader_edge (containing_loop);
5901   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5902   msq = make_ssa_name (vec_dest);
5903   phi_stmt = create_phi_node (msq, containing_loop->header);
5904   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5905 
5906   return msq;
5907 }
5908 
5909 
5910 /* Function vect_grouped_load_supported.
5911 
5912    COUNT is the size of the load group (the number of statements plus the
5913    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5914    only one statement, with a gap of COUNT - 1.
5915 
5916    Returns true if a suitable permute exists.  */
5917 
5918 bool
vect_grouped_load_supported(tree vectype,bool single_element_p,unsigned HOST_WIDE_INT count)5919 vect_grouped_load_supported (tree vectype, bool single_element_p,
5920                                    unsigned HOST_WIDE_INT count)
5921 {
5922   machine_mode mode = TYPE_MODE (vectype);
5923 
5924   /* If this is single-element interleaving with an element distance
5925      that leaves unused vector loads around punt - we at least create
5926      very sub-optimal code in that case (and blow up memory,
5927      see PR65518).  */
5928   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5929     {
5930       if (dump_enabled_p ())
5931           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5932                                "single-element interleaving not supported "
5933                                "for not adjacent vector loads\n");
5934       return false;
5935     }
5936 
5937   /* vect_permute_load_chain requires the group size to be equal to 3 or
5938      be a power of two.  */
5939   if (count != 3 && exact_log2 (count) == -1)
5940     {
5941       if (dump_enabled_p ())
5942           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5943                                "the size of the group of accesses"
5944                                " is not a power of 2 or not equal to 3\n");
5945       return false;
5946     }
5947 
5948   /* Check that the permutation is supported.  */
5949   if (VECTOR_MODE_P (mode))
5950     {
5951       unsigned int i, j;
5952       if (count == 3)
5953           {
5954             unsigned int nelt;
5955             if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5956               {
5957                 if (dump_enabled_p ())
5958                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5959                                          "cannot handle groups of 3 loads for"
5960                                          " variable-length vectors\n");
5961                 return false;
5962               }
5963 
5964             vec_perm_builder sel (nelt, nelt, 1);
5965             sel.quick_grow (nelt);
5966             vec_perm_indices indices;
5967             unsigned int k;
5968             for (k = 0; k < 3; k++)
5969               {
5970                 for (i = 0; i < nelt; i++)
5971                     if (3 * i + k < 2 * nelt)
5972                       sel[i] = 3 * i + k;
5973                     else
5974                       sel[i] = 0;
5975                 indices.new_vector (sel, 2, nelt);
5976                 if (!can_vec_perm_const_p (mode, indices))
5977                     {
5978                       if (dump_enabled_p ())
5979                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5980                                              "shuffle of 3 loads is not supported by"
5981                                              " target\n");
5982                       return false;
5983                     }
5984                 for (i = 0, j = 0; i < nelt; i++)
5985                     if (3 * i + k < 2 * nelt)
5986                       sel[i] = i;
5987                     else
5988                       sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5989                 indices.new_vector (sel, 2, nelt);
5990                 if (!can_vec_perm_const_p (mode, indices))
5991                     {
5992                       if (dump_enabled_p ())
5993                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5994                                              "shuffle of 3 loads is not supported by"
5995                                              " target\n");
5996                       return false;
5997                     }
5998               }
5999             return true;
6000           }
6001       else
6002           {
6003             /* If length is not equal to 3 then only power of 2 is supported.  */
6004             gcc_assert (pow2p_hwi (count));
6005             poly_uint64 nelt = GET_MODE_NUNITS (mode);
6006 
6007             /* The encoding has a single stepped pattern.  */
6008             vec_perm_builder sel (nelt, 1, 3);
6009             sel.quick_grow (3);
6010             for (i = 0; i < 3; i++)
6011               sel[i] = i * 2;
6012             vec_perm_indices indices (sel, 2, nelt);
6013             if (can_vec_perm_const_p (mode, indices))
6014               {
6015                 for (i = 0; i < 3; i++)
6016                     sel[i] = i * 2 + 1;
6017                 indices.new_vector (sel, 2, nelt);
6018                 if (can_vec_perm_const_p (mode, indices))
6019                     return true;
6020               }
6021         }
6022     }
6023 
6024   if (dump_enabled_p ())
6025     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026                          "extract even/odd not supported by target\n");
6027   return false;
6028 }
6029 
6030 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
6031    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6032 
6033 bool
vect_load_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)6034 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6035                                  bool masked_p)
6036 {
6037   if (masked_p)
6038     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6039                                                    vec_mask_load_lanes_optab,
6040                                                    vectype, count);
6041   else
6042     return vect_lanes_optab_supported_p ("vec_load_lanes",
6043                                                    vec_load_lanes_optab,
6044                                                    vectype, count);
6045 }
6046 
6047 /* Function vect_permute_load_chain.
6048 
6049    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6050    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6051    the input data correctly.  Return the final references for loads in
6052    RESULT_CHAIN.
6053 
6054    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6055    The input is 4 vectors each containing 8 elements. We assign a number to each
6056    element, the input sequence is:
6057 
6058    1st vec:   0  1  2  3  4  5  6  7
6059    2nd vec:   8  9 10 11 12 13 14 15
6060    3rd vec:  16 17 18 19 20 21 22 23
6061    4th vec:  24 25 26 27 28 29 30 31
6062 
6063    The output sequence should be:
6064 
6065    1st vec:  0 4  8 12 16 20 24 28
6066    2nd vec:  1 5  9 13 17 21 25 29
6067    3rd vec:  2 6 10 14 18 22 26 30
6068    4th vec:  3 7 11 15 19 23 27 31
6069 
6070    i.e., the first output vector should contain the first elements of each
6071    interleaving group, etc.
6072 
6073    We use extract_even/odd instructions to create such output.  The input of
6074    each extract_even/odd operation is two vectors
6075    1st vec    2nd vec
6076    0 1 2 3    4 5 6 7
6077 
6078    and the output is the vector of extracted even/odd elements.  The output of
6079    extract_even will be:   0 2 4 6
6080    and of extract_odd:     1 3 5 7
6081 
6082 
6083    The permutation is done in log LENGTH stages.  In each stage extract_even
6084    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6085    their order.  In our example,
6086 
6087    E1: extract_even (1st vec, 2nd vec)
6088    E2: extract_odd (1st vec, 2nd vec)
6089    E3: extract_even (3rd vec, 4th vec)
6090    E4: extract_odd (3rd vec, 4th vec)
6091 
6092    The output for the first stage will be:
6093 
6094    E1:  0  2  4  6  8 10 12 14
6095    E2:  1  3  5  7  9 11 13 15
6096    E3: 16 18 20 22 24 26 28 30
6097    E4: 17 19 21 23 25 27 29 31
6098 
6099    In order to proceed and create the correct sequence for the next stage (or
6100    for the correct output, if the second stage is the last one, as in our
6101    example), we first put the output of extract_even operation and then the
6102    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6103    The input for the second stage is:
6104 
6105    1st vec (E1):  0  2  4  6  8 10 12 14
6106    2nd vec (E3): 16 18 20 22 24 26 28 30
6107    3rd vec (E2):  1  3  5  7  9 11 13 15
6108    4th vec (E4): 17 19 21 23 25 27 29 31
6109 
6110    The output of the second stage:
6111 
6112    E1: 0 4  8 12 16 20 24 28
6113    E2: 2 6 10 14 18 22 26 30
6114    E3: 1 5  9 13 17 21 25 29
6115    E4: 3 7 11 15 19 23 27 31
6116 
6117    And RESULT_CHAIN after reordering:
6118 
6119    1st vec (E1):  0 4  8 12 16 20 24 28
6120    2nd vec (E3):  1 5  9 13 17 21 25 29
6121    3rd vec (E2):  2 6 10 14 18 22 26 30
6122    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6123 
6124 static void
vect_permute_load_chain(vec_info * vinfo,vec<tree> dr_chain,unsigned int length,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,vec<tree> * result_chain)6125 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6126                                unsigned int length,
6127                                stmt_vec_info stmt_info,
6128                                gimple_stmt_iterator *gsi,
6129                                vec<tree> *result_chain)
6130 {
6131   tree data_ref, first_vect, second_vect;
6132   tree perm_mask_even, perm_mask_odd;
6133   tree perm3_mask_low, perm3_mask_high;
6134   gimple *perm_stmt;
6135   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6136   unsigned int i, j, log_length = exact_log2 (length);
6137 
6138   result_chain->quick_grow (length);
6139   memcpy (result_chain->address (), dr_chain.address (),
6140             length * sizeof (tree));
6141 
6142   if (length == 3)
6143     {
6144       /* vect_grouped_load_supported ensures that this is constant.  */
6145       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6146       unsigned int k;
6147 
6148       vec_perm_builder sel (nelt, nelt, 1);
6149       sel.quick_grow (nelt);
6150       vec_perm_indices indices;
6151       for (k = 0; k < 3; k++)
6152           {
6153             for (i = 0; i < nelt; i++)
6154               if (3 * i + k < 2 * nelt)
6155                 sel[i] = 3 * i + k;
6156               else
6157                 sel[i] = 0;
6158             indices.new_vector (sel, 2, nelt);
6159             perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6160 
6161             for (i = 0, j = 0; i < nelt; i++)
6162               if (3 * i + k < 2 * nelt)
6163                 sel[i] = i;
6164               else
6165                 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6166             indices.new_vector (sel, 2, nelt);
6167             perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6168 
6169             first_vect = dr_chain[0];
6170             second_vect = dr_chain[1];
6171 
6172             /* Create interleaving stmt (low part of):
6173                low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6174                                                                            ...}>  */
6175             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6176             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6177                                                      second_vect, perm3_mask_low);
6178             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6179 
6180             /* Create interleaving stmt (high part of):
6181                high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6182                                                                             ...}>  */
6183             first_vect = data_ref;
6184             second_vect = dr_chain[2];
6185             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6186             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6187                                                      second_vect, perm3_mask_high);
6188             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6189             (*result_chain)[k] = data_ref;
6190           }
6191     }
6192   else
6193     {
6194       /* If length is not equal to 3 then only power of 2 is supported.  */
6195       gcc_assert (pow2p_hwi (length));
6196 
6197       /* The encoding has a single stepped pattern.  */
6198       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6199       vec_perm_builder sel (nelt, 1, 3);
6200       sel.quick_grow (3);
6201       for (i = 0; i < 3; ++i)
6202           sel[i] = i * 2;
6203       vec_perm_indices indices (sel, 2, nelt);
6204       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6205 
6206       for (i = 0; i < 3; ++i)
6207           sel[i] = i * 2 + 1;
6208       indices.new_vector (sel, 2, nelt);
6209       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6210 
6211       for (i = 0; i < log_length; i++)
6212           {
6213             for (j = 0; j < length; j += 2)
6214               {
6215                 first_vect = dr_chain[j];
6216                 second_vect = dr_chain[j+1];
6217 
6218                 /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6219                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6220                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6221                                                          first_vect, second_vect,
6222                                                          perm_mask_even);
6223                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6224                 (*result_chain)[j/2] = data_ref;
6225 
6226                 /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6227                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6228                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6229                                                          first_vect, second_vect,
6230                                                          perm_mask_odd);
6231                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6232                 (*result_chain)[j/2+length/2] = data_ref;
6233               }
6234             memcpy (dr_chain.address (), result_chain->address (),
6235                       length * sizeof (tree));
6236           }
6237     }
6238 }
6239 
6240 /* Function vect_shift_permute_load_chain.
6241 
6242    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6243    sequence of stmts to reorder the input data accordingly.
6244    Return the final references for loads in RESULT_CHAIN.
6245    Return true if successed, false otherwise.
6246 
6247    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6248    The input is 3 vectors each containing 8 elements.  We assign a
6249    number to each element, the input sequence is:
6250 
6251    1st vec:   0  1  2  3  4  5  6  7
6252    2nd vec:   8  9 10 11 12 13 14 15
6253    3rd vec:  16 17 18 19 20 21 22 23
6254 
6255    The output sequence should be:
6256 
6257    1st vec:  0 3 6  9 12 15 18 21
6258    2nd vec:  1 4 7 10 13 16 19 22
6259    3rd vec:  2 5 8 11 14 17 20 23
6260 
6261    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6262 
6263    First we shuffle all 3 vectors to get correct elements order:
6264 
6265    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6266    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6267    3rd vec:  (16 19 22) (17 20 23) (18 21)
6268 
6269    Next we unite and shift vector 3 times:
6270 
6271    1st step:
6272      shift right by 6 the concatenation of:
6273      "1st vec" and  "2nd vec"
6274        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6275      "2nd vec" and  "3rd vec"
6276        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6277      "3rd vec" and  "1st vec"
6278        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6279                                    | New vectors                   |
6280 
6281      So that now new vectors are:
6282 
6283      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6284      2nd vec:  (10 13) (16 19 22) (17 20 23)
6285      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6286 
6287    2nd step:
6288      shift right by 5 the concatenation of:
6289      "1st vec" and  "3rd vec"
6290        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6291      "2nd vec" and  "1st vec"
6292        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6293      "3rd vec" and  "2nd vec"
6294        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6295                                 | New vectors                   |
6296 
6297      So that now new vectors are:
6298 
6299      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6300      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6301      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6302 
6303    3rd step:
6304      shift right by 5 the concatenation of:
6305      "1st vec" and  "1st vec"
6306        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6307      shift right by 3 the concatenation of:
6308      "2nd vec" and  "2nd vec"
6309                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6310                                 | New vectors                   |
6311 
6312      So that now all vectors are READY:
6313      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6314      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6315      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6316 
6317    This algorithm is faster than one in vect_permute_load_chain if:
6318      1.  "shift of a concatination" is faster than general permutation.
6319            This is usually so.
6320      2.  The TARGET machine can't execute vector instructions in parallel.
6321            This is because each step of the algorithm depends on previous.
6322            The algorithm in vect_permute_load_chain is much more parallel.
6323 
6324    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6325 */
6326 
6327 static bool
vect_shift_permute_load_chain(vec_info * vinfo,vec<tree> dr_chain,unsigned int length,stmt_vec_info stmt_info,gimple_stmt_iterator * gsi,vec<tree> * result_chain)6328 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6329                                      unsigned int length,
6330                                      stmt_vec_info stmt_info,
6331                                      gimple_stmt_iterator *gsi,
6332                                      vec<tree> *result_chain)
6333 {
6334   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6335   tree perm2_mask1, perm2_mask2, perm3_mask;
6336   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6337   gimple *perm_stmt;
6338 
6339   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6340   unsigned int i;
6341   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6342 
6343   unsigned HOST_WIDE_INT nelt, vf;
6344   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6345       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6346     /* Not supported for variable-length vectors.  */
6347     return false;
6348 
6349   vec_perm_builder sel (nelt, nelt, 1);
6350   sel.quick_grow (nelt);
6351 
6352   result_chain->quick_grow (length);
6353   memcpy (result_chain->address (), dr_chain.address (),
6354             length * sizeof (tree));
6355 
6356   if (pow2p_hwi (length) && vf > 4)
6357     {
6358       unsigned int j, log_length = exact_log2 (length);
6359       for (i = 0; i < nelt / 2; ++i)
6360           sel[i] = i * 2;
6361       for (i = 0; i < nelt / 2; ++i)
6362           sel[nelt / 2 + i] = i * 2 + 1;
6363       vec_perm_indices indices (sel, 2, nelt);
6364       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6365           {
6366             if (dump_enabled_p ())
6367               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368                                    "shuffle of 2 fields structure is not \
6369                                     supported by target\n");
6370             return false;
6371           }
6372       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6373 
6374       for (i = 0; i < nelt / 2; ++i)
6375           sel[i] = i * 2 + 1;
6376       for (i = 0; i < nelt / 2; ++i)
6377           sel[nelt / 2 + i] = i * 2;
6378       indices.new_vector (sel, 2, nelt);
6379       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6380           {
6381             if (dump_enabled_p ())
6382               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6383                                    "shuffle of 2 fields structure is not \
6384                                     supported by target\n");
6385             return false;
6386           }
6387       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6388 
6389       /* Generating permutation constant to shift all elements.
6390            For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6391       for (i = 0; i < nelt; i++)
6392           sel[i] = nelt / 2 + i;
6393       indices.new_vector (sel, 2, nelt);
6394       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6395           {
6396             if (dump_enabled_p ())
6397               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398                                    "shift permutation is not supported by target\n");
6399             return false;
6400           }
6401       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6402 
6403       /* Generating permutation constant to select vector from 2.
6404            For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6405       for (i = 0; i < nelt / 2; i++)
6406           sel[i] = i;
6407       for (i = nelt / 2; i < nelt; i++)
6408           sel[i] = nelt + i;
6409       indices.new_vector (sel, 2, nelt);
6410       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6411           {
6412             if (dump_enabled_p ())
6413               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6414                                    "select is not supported by target\n");
6415             return false;
6416           }
6417       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6418 
6419       for (i = 0; i < log_length; i++)
6420           {
6421             for (j = 0; j < length; j += 2)
6422               {
6423                 first_vect = dr_chain[j];
6424                 second_vect = dr_chain[j + 1];
6425 
6426                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6427                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6428                                                          first_vect, first_vect,
6429                                                          perm2_mask1);
6430                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6431                 vect[0] = data_ref;
6432 
6433                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6434                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6435                                                          second_vect, second_vect,
6436                                                          perm2_mask2);
6437                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6438                 vect[1] = data_ref;
6439 
6440                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6441                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6442                                                          vect[0], vect[1], shift1_mask);
6443                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6444                 (*result_chain)[j/2 + length/2] = data_ref;
6445 
6446                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6447                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6448                                                          vect[0], vect[1], select_mask);
6449                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6450                 (*result_chain)[j/2] = data_ref;
6451               }
6452             memcpy (dr_chain.address (), result_chain->address (),
6453                       length * sizeof (tree));
6454           }
6455       return true;
6456     }
6457   if (length == 3 && vf > 2)
6458     {
6459       unsigned int k = 0, l = 0;
6460 
6461       /* Generating permutation constant to get all elements in rigth order.
6462            For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6463       for (i = 0; i < nelt; i++)
6464           {
6465             if (3 * k + (l % 3) >= nelt)
6466               {
6467                 k = 0;
6468                 l += (3 - (nelt % 3));
6469               }
6470             sel[i] = 3 * k + (l % 3);
6471             k++;
6472           }
6473       vec_perm_indices indices (sel, 2, nelt);
6474       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6475           {
6476             if (dump_enabled_p ())
6477               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6478                                    "shuffle of 3 fields structure is not \
6479                                     supported by target\n");
6480             return false;
6481           }
6482       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6483 
6484       /* Generating permutation constant to shift all elements.
6485            For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6486       for (i = 0; i < nelt; i++)
6487           sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6488       indices.new_vector (sel, 2, nelt);
6489       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6490           {
6491             if (dump_enabled_p ())
6492               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493                                    "shift permutation is not supported by target\n");
6494             return false;
6495           }
6496       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6497 
6498       /* Generating permutation constant to shift all elements.
6499            For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6500       for (i = 0; i < nelt; i++)
6501           sel[i] = 2 * (nelt / 3) + 1 + i;
6502       indices.new_vector (sel, 2, nelt);
6503       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6504           {
6505             if (dump_enabled_p ())
6506               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6507                                    "shift permutation is not supported by target\n");
6508             return false;
6509           }
6510       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6511 
6512       /* Generating permutation constant to shift all elements.
6513            For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6514       for (i = 0; i < nelt; i++)
6515           sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6516       indices.new_vector (sel, 2, nelt);
6517       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6518           {
6519             if (dump_enabled_p ())
6520               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6521                                    "shift permutation is not supported by target\n");
6522             return false;
6523           }
6524       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6525 
6526       /* Generating permutation constant to shift all elements.
6527            For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6528       for (i = 0; i < nelt; i++)
6529           sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6530       indices.new_vector (sel, 2, nelt);
6531       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6532           {
6533             if (dump_enabled_p ())
6534               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6535                                    "shift permutation is not supported by target\n");
6536             return false;
6537           }
6538       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6539 
6540       for (k = 0; k < 3; k++)
6541           {
6542             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6543             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6544                                                      dr_chain[k], dr_chain[k],
6545                                                      perm3_mask);
6546             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6547             vect[k] = data_ref;
6548           }
6549 
6550       for (k = 0; k < 3; k++)
6551           {
6552             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6553             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6554                                                      vect[k % 3], vect[(k + 1) % 3],
6555                                                      shift1_mask);
6556             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6557             vect_shift[k] = data_ref;
6558           }
6559 
6560       for (k = 0; k < 3; k++)
6561           {
6562             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6563             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6564                                                      vect_shift[(4 - k) % 3],
6565                                                      vect_shift[(3 - k) % 3],
6566                                                      shift2_mask);
6567             vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6568             vect[k] = data_ref;
6569           }
6570 
6571       (*result_chain)[3 - (nelt % 3)] = vect[2];
6572 
6573       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6574       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6575                                                vect[0], shift3_mask);
6576       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6577       (*result_chain)[nelt % 3] = data_ref;
6578 
6579       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6580       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6581                                                vect[1], shift4_mask);
6582       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6583       (*result_chain)[0] = data_ref;
6584       return true;
6585     }
6586   return false;
6587 }
6588 
6589 /* Function vect_transform_grouped_load.
6590 
6591    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6592    to perform their permutation and ascribe the result vectorized statements to
6593    the scalar statements.
6594 */
6595 
6596 void
vect_transform_grouped_load(vec_info * vinfo,stmt_vec_info stmt_info,vec<tree> dr_chain,int size,gimple_stmt_iterator * gsi)6597 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6598                                    vec<tree> dr_chain,
6599                                    int size, gimple_stmt_iterator *gsi)
6600 {
6601   machine_mode mode;
6602   vec<tree> result_chain = vNULL;
6603 
6604   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6605      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6606      vectors, that are ready for vector computation.  */
6607   result_chain.create (size);
6608 
6609   /* If reassociation width for vector type is 2 or greater target machine can
6610      execute 2 or more vector instructions in parallel.  Otherwise try to
6611      get chain for loads group using vect_shift_permute_load_chain.  */
6612   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6613   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6614       || pow2p_hwi (size)
6615       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6616                                                    gsi, &result_chain))
6617     vect_permute_load_chain (vinfo, dr_chain,
6618                                    size, stmt_info, gsi, &result_chain);
6619   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6620   result_chain.release ();
6621 }
6622 
6623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6624    generated as part of the vectorization of STMT_INFO.  Assign the statement
6625    for each vector to the associated scalar statement.  */
6626 
6627 void
vect_record_grouped_load_vectors(vec_info *,stmt_vec_info stmt_info,vec<tree> result_chain)6628 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6629                                           vec<tree> result_chain)
6630 {
6631   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6632   unsigned int i, gap_count;
6633   tree tmp_data_ref;
6634 
6635   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6636      Since we scan the chain starting from it's first node, their order
6637      corresponds the order of data-refs in RESULT_CHAIN.  */
6638   stmt_vec_info next_stmt_info = first_stmt_info;
6639   gap_count = 1;
6640   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6641     {
6642       if (!next_stmt_info)
6643           break;
6644 
6645       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6646        code elimination pass later.  No need to check for the first stmt in
6647        the group, since it always exists.
6648        DR_GROUP_GAP is the number of steps in elements from the previous
6649        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6650        correspond to the gaps.  */
6651       if (next_stmt_info != first_stmt_info
6652             && gap_count < DR_GROUP_GAP (next_stmt_info))
6653           {
6654             gap_count++;
6655             continue;
6656           }
6657 
6658       /* ???  The following needs cleanup after the removal of
6659          DR_GROUP_SAME_DR_STMT.  */
6660       if (next_stmt_info)
6661         {
6662             gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6663             /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6664                copies, and we put the new vector statement last.  */
6665             STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6666 
6667             next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6668             gap_count = 1;
6669         }
6670     }
6671 }
6672 
6673 /* Function vect_force_dr_alignment_p.
6674 
6675    Returns whether the alignment of a DECL can be forced to be aligned
6676    on ALIGNMENT bit boundary.  */
6677 
6678 bool
vect_can_force_dr_alignment_p(const_tree decl,poly_uint64 alignment)6679 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6680 {
6681   if (!VAR_P (decl))
6682     return false;
6683 
6684   if (decl_in_symtab_p (decl)
6685       && !symtab_node::get (decl)->can_increase_alignment_p ())
6686     return false;
6687 
6688   if (TREE_STATIC (decl))
6689     return (known_le (alignment,
6690                           (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6691   else
6692     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6693 }
6694 
6695 /* Return whether the data reference DR_INFO is supported with respect to its
6696    alignment.
6697    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6698    it is aligned, i.e., check if it is possible to vectorize it with different
6699    alignment.  */
6700 
6701 enum dr_alignment_support
vect_supportable_dr_alignment(vec_info * vinfo,dr_vec_info * dr_info,tree vectype,int misalignment)6702 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6703                                      tree vectype, int misalignment)
6704 {
6705   data_reference *dr = dr_info->dr;
6706   stmt_vec_info stmt_info = dr_info->stmt;
6707   machine_mode mode = TYPE_MODE (vectype);
6708   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6709   class loop *vect_loop = NULL;
6710   bool nested_in_vect_loop = false;
6711 
6712   if (misalignment == 0)
6713     return dr_aligned;
6714 
6715   /* For now assume all conditional loads/stores support unaligned
6716      access without any special code.  */
6717   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6718     if (gimple_call_internal_p (stmt)
6719           && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6720               || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6721       return dr_unaligned_supported;
6722 
6723   if (loop_vinfo)
6724     {
6725       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6726       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6727     }
6728 
6729   /* Possibly unaligned access.  */
6730 
6731   /* We can choose between using the implicit realignment scheme (generating
6732      a misaligned_move stmt) and the explicit realignment scheme (generating
6733      aligned loads with a REALIGN_LOAD).  There are two variants to the
6734      explicit realignment scheme: optimized, and unoptimized.
6735      We can optimize the realignment only if the step between consecutive
6736      vector loads is equal to the vector size.  Since the vector memory
6737      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6738      is guaranteed that the misalignment amount remains the same throughout the
6739      execution of the vectorized loop.  Therefore, we can create the
6740      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6741      at the loop preheader.
6742 
6743      However, in the case of outer-loop vectorization, when vectorizing a
6744      memory access in the inner-loop nested within the LOOP that is now being
6745      vectorized, while it is guaranteed that the misalignment of the
6746      vectorized memory access will remain the same in different outer-loop
6747      iterations, it is *not* guaranteed that is will remain the same throughout
6748      the execution of the inner-loop.  This is because the inner-loop advances
6749      with the original scalar step (and not in steps of VS).  If the inner-loop
6750      step happens to be a multiple of VS, then the misalignment remains fixed
6751      and we can use the optimized realignment scheme.  For example:
6752 
6753       for (i=0; i<N; i++)
6754         for (j=0; j<M; j++)
6755           s += a[i+j];
6756 
6757      When vectorizing the i-loop in the above example, the step between
6758      consecutive vector loads is 1, and so the misalignment does not remain
6759      fixed across the execution of the inner-loop, and the realignment cannot
6760      be optimized (as illustrated in the following pseudo vectorized loop):
6761 
6762       for (i=0; i<N; i+=4)
6763         for (j=0; j<M; j++){
6764           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6765                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6766                          // (assuming that we start from an aligned address).
6767           }
6768 
6769      We therefore have to use the unoptimized realignment scheme:
6770 
6771       for (i=0; i<N; i+=4)
6772           for (j=k; j<M; j+=4)
6773           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6774                            // that the misalignment of the initial address is
6775                            // 0).
6776 
6777      The loop can then be vectorized as follows:
6778 
6779       for (k=0; k<4; k++){
6780         rt = get_realignment_token (&vp[k]);
6781         for (i=0; i<N; i+=4){
6782           v1 = vp[i+k];
6783           for (j=k; j<M; j+=4){
6784             v2 = vp[i+j+VS-1];
6785             va = REALIGN_LOAD <v1,v2,rt>;
6786             vs += va;
6787             v1 = v2;
6788           }
6789         }
6790     } */
6791 
6792   if (DR_IS_READ (dr))
6793     {
6794       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6795             && (!targetm.vectorize.builtin_mask_for_load
6796                 || targetm.vectorize.builtin_mask_for_load ()))
6797           {
6798             /* If we are doing SLP then the accesses need not have the
6799                same alignment, instead it depends on the SLP group size.  */
6800             if (loop_vinfo
6801                 && STMT_SLP_TYPE (stmt_info)
6802                 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6803                                     * (DR_GROUP_SIZE
6804                                          (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6805                                     TYPE_VECTOR_SUBPARTS (vectype)))
6806               ;
6807             else if (!loop_vinfo
6808                        || (nested_in_vect_loop
6809                            && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6810                                             GET_MODE_SIZE (TYPE_MODE (vectype)))))
6811               return dr_explicit_realign;
6812             else
6813               return dr_explicit_realign_optimized;
6814           }
6815     }
6816 
6817   bool is_packed = false;
6818   tree type = TREE_TYPE (DR_REF (dr));
6819   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6820     is_packed = not_size_aligned (DR_REF (dr));
6821   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6822                                                                  is_packed))
6823     return dr_unaligned_supported;
6824 
6825   /* Unsupported.  */
6826   return dr_unaligned_unsupported;
6827 }
6828