xref: /dragonfly/contrib/gcc-8.0/gcc/tree-vect-data-refs.c (revision 95059079af47f9a66a175f374f2da1a5020e3255)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57 
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
60 
61 static bool
vect_lanes_optab_supported_p(const char * name,convert_optab optab,tree vectype,unsigned HOST_WIDE_INT count)62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63                                     tree vectype, unsigned HOST_WIDE_INT count)
64 {
65   machine_mode mode, array_mode;
66   bool limit_p;
67 
68   mode = TYPE_MODE (vectype);
69   if (!targetm.array_mode (mode, count).exists (&array_mode))
70     {
71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72       limit_p = !targetm.array_mode_supported_p (mode, count);
73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74           {
75             if (dump_enabled_p ())
76               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77                                    "no array mode for %s["
78                                    HOST_WIDE_INT_PRINT_DEC "]\n",
79                                    GET_MODE_NAME (mode), count);
80             return false;
81           }
82     }
83 
84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85     {
86       if (dump_enabled_p ())
87           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88                          "cannot use %s<%s><%s>\n", name,
89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90       return false;
91     }
92 
93   if (dump_enabled_p ())
94     dump_printf_loc (MSG_NOTE, vect_location,
95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96                      GET_MODE_NAME (mode));
97 
98   return true;
99 }
100 
101 
102 /* Return the smallest scalar part of STMT.
103    This is used to determine the vectype of the stmt.  We generally set the
104    vectype according to the type of the result (lhs).  For stmts whose
105    result-type is different than the type of the arguments (e.g., demotion,
106    promotion), vectype will be reset appropriately (later).  Note that we have
107    to visit the smallest datatype in this function, because that determines the
108    VF.  If the smallest datatype in the loop is present only as the rhs of a
109    promotion operation - we'd miss it.
110    Such a case, where a variable of this datatype does not appear in the lhs
111    anywhere in the loop, can only occur if it's an invariant: e.g.:
112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
113    invariant motion.  However, we cannot rely on invariant motion to always
114    take invariants out of the loop, and so in the case of promotion we also
115    have to check the rhs.
116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117    types.  */
118 
119 tree
vect_get_smallest_scalar_type(gimple * stmt,HOST_WIDE_INT * lhs_size_unit,HOST_WIDE_INT * rhs_size_unit)120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121                                HOST_WIDE_INT *rhs_size_unit)
122 {
123   tree scalar_type = gimple_expr_type (stmt);
124   HOST_WIDE_INT lhs, rhs;
125 
126   /* During the analysis phase, this function is called on arbitrary
127      statements that might not have scalar results.  */
128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129     return scalar_type;
130 
131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132 
133   if (is_gimple_assign (stmt)
134       && (gimple_assign_cast_p (stmt)
135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
140     {
141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
142 
143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144       if (rhs < lhs)
145         scalar_type = rhs_type;
146     }
147 
148   *lhs_size_unit = lhs;
149   *rhs_size_unit = rhs;
150   return scalar_type;
151 }
152 
153 
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155    tested at run-time.  Return TRUE if DDR was successfully inserted.
156    Return false if versioning is not supported.  */
157 
158 static bool
vect_mark_for_runtime_alias_test(ddr_p ddr,loop_vec_info loop_vinfo)159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
160 {
161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
162 
163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164     return false;
165 
166   if (!runtime_alias_check_p (ddr, loop,
167                                     optimize_loop_nest_for_speed_p (loop)))
168     return false;
169 
170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171   return true;
172 }
173 
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
175 
176 static void
vect_check_nonzero_value(loop_vec_info loop_vinfo,tree value)177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
178 {
179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180   for (unsigned int i = 0; i < checks.length(); ++i)
181     if (checks[i] == value)
182       return;
183 
184   if (dump_enabled_p ())
185     {
186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188       dump_printf (MSG_NOTE, " is nonzero\n");
189     }
190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
191 }
192 
193 /* Return true if we know that the order of vectorized STMT_A and
194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195    At least one of the statements is a write.  */
196 
197 static bool
vect_preserves_scalar_order_p(gimple * stmt_a,gimple * stmt_b)198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
199 {
200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
202 
203   /* Single statements are always kept in their original order.  */
204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206     return true;
207 
208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
209      SLP group are emitted at the position of the last scalar load and
210      all loads in an interleaving group are emitted at the position
211      of the first scalar load.
212      Stores in a group are emitted at the position of the last scalar store.
213      Compute that position and check whether the resulting order matches
214      the current one.
215      We have not yet decided between SLP and interleaving so we have
216      to conservatively assume both.  */
217   gimple *il_a;
218   gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
219   if (last_a)
220     {
221       for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
222              s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
223           last_a = get_later_stmt (last_a, s);
224       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
225           {
226             for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
227                  s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
228               if (get_later_stmt (il_a, s) == il_a)
229                 il_a = s;
230           }
231       else
232           il_a = last_a;
233     }
234   else
235     last_a = il_a = stmt_a;
236   gimple *il_b;
237   gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
238   if (last_b)
239     {
240       for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
241              s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
242           last_b = get_later_stmt (last_b, s);
243       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
244           {
245             for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
246                  s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
247               if (get_later_stmt (il_b, s) == il_b)
248                 il_b = s;
249           }
250       else
251           il_b = last_b;
252     }
253   else
254     last_b = il_b = stmt_b;
255   bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
256   return (/* SLP */
257             (get_later_stmt (last_a, last_b) == last_a) == a_after_b
258             /* Interleaving */
259             && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
260             /* Mixed */
261             && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
262             && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
263 }
264 
265 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
266    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
267    distances.  These distances are conservatively correct but they don't
268    reflect a guaranteed dependence.
269 
270    Return true if this function does all the work necessary to avoid
271    an alias or false if the caller should use the dependence distances
272    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
273    the depth of the loop described by LOOP_VINFO and the other arguments
274    are as for vect_analyze_data_ref_dependence.  */
275 
276 static bool
vect_analyze_possibly_independent_ddr(data_dependence_relation * ddr,loop_vec_info loop_vinfo,int loop_depth,unsigned int * max_vf)277 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
278                                                loop_vec_info loop_vinfo,
279                                                int loop_depth, unsigned int *max_vf)
280 {
281   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282   lambda_vector dist_v;
283   unsigned int i;
284   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
285     {
286       int dist = dist_v[loop_depth];
287       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
288           {
289             /* If the user asserted safelen >= DIST consecutive iterations
290                can be executed concurrently, assume independence.
291 
292                ??? An alternative would be to add the alias check even
293                in this case, and vectorize the fallback loop with the
294                maximum VF set to safelen.  However, if the user has
295                explicitly given a length, it's less likely that that
296                would be a win.  */
297             if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
298               {
299                 if ((unsigned int) loop->safelen < *max_vf)
300                     *max_vf = loop->safelen;
301                 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
302                 continue;
303               }
304 
305             /* For dependence distances of 2 or more, we have the option
306                of limiting VF or checking for an alias at runtime.
307                Prefer to check at runtime if we can, to avoid limiting
308                the VF unnecessarily when the bases are in fact independent.
309 
310                Note that the alias checks will be removed if the VF ends up
311                being small enough.  */
312             return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
313           }
314     }
315   return true;
316 }
317 
318 
319 /* Function vect_analyze_data_ref_dependence.
320 
321    Return TRUE if there (might) exist a dependence between a memory-reference
322    DRA and a memory-reference DRB.  When versioning for alias may check a
323    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
324    the data dependence.  */
325 
326 static bool
vect_analyze_data_ref_dependence(struct data_dependence_relation * ddr,loop_vec_info loop_vinfo,unsigned int * max_vf)327 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
328                                           loop_vec_info loop_vinfo,
329                                           unsigned int *max_vf)
330 {
331   unsigned int i;
332   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
333   struct data_reference *dra = DDR_A (ddr);
334   struct data_reference *drb = DDR_B (ddr);
335   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
336   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
337   lambda_vector dist_v;
338   unsigned int loop_depth;
339 
340   /* In loop analysis all data references should be vectorizable.  */
341   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
342       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
343     gcc_unreachable ();
344 
345   /* Independent data accesses.  */
346   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
347     return false;
348 
349   if (dra == drb
350       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
351     return false;
352 
353   /* We do not have to consider dependences between accesses that belong
354      to the same group, unless the stride could be smaller than the
355      group size.  */
356   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
357       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b)
358       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
359     return false;
360 
361   /* Even if we have an anti-dependence then, as the vectorized loop covers at
362      least two scalar iterations, there is always also a true dependence.
363      As the vectorizer does not re-order loads and stores we can ignore
364      the anti-dependence if TBAA can disambiguate both DRs similar to the
365      case with known negative distance anti-dependences (positive
366      distance anti-dependences would violate TBAA constraints).  */
367   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
368        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
369       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
370                                          get_alias_set (DR_REF (drb))))
371     return false;
372 
373   /* Unknown data dependence.  */
374   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
375     {
376       /* If user asserted safelen consecutive iterations can be
377            executed concurrently, assume independence.  */
378       if (loop->safelen >= 2)
379           {
380             if ((unsigned int) loop->safelen < *max_vf)
381               *max_vf = loop->safelen;
382             LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
383             return false;
384           }
385 
386       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
387             || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
388           {
389             if (dump_enabled_p ())
390               {
391                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392                                      "versioning for alias not supported for: "
393                                      "can't determine dependence between ");
394                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
395                                          DR_REF (dra));
396                 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
397                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
398                                          DR_REF (drb));
399                 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
400               }
401             return true;
402           }
403 
404       if (dump_enabled_p ())
405           {
406             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407                                  "versioning for alias required: "
408                                  "can't determine dependence between ");
409             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410                                    DR_REF (dra));
411             dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
412             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
413                                    DR_REF (drb));
414             dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
415           }
416 
417       /* Add to list of ddrs that need to be tested at run-time.  */
418       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
419     }
420 
421   /* Known data dependence.  */
422   if (DDR_NUM_DIST_VECTS (ddr) == 0)
423     {
424       /* If user asserted safelen consecutive iterations can be
425            executed concurrently, assume independence.  */
426       if (loop->safelen >= 2)
427           {
428             if ((unsigned int) loop->safelen < *max_vf)
429               *max_vf = loop->safelen;
430             LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
431             return false;
432           }
433 
434       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
435             || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
436           {
437             if (dump_enabled_p ())
438               {
439                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
440                                      "versioning for alias not supported for: "
441                                      "bad dist vector for ");
442                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
443                                          DR_REF (dra));
444                 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
445                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
446                                          DR_REF (drb));
447                 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448               }
449             return true;
450           }
451 
452       if (dump_enabled_p ())
453         {
454           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
455                            "versioning for alias required: "
456                            "bad dist vector for ");
457           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
458           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
459           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
460           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
461         }
462       /* Add to list of ddrs that need to be tested at run-time.  */
463       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
464     }
465 
466   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
467 
468   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
469       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
470                                                             loop_depth, max_vf))
471     return false;
472 
473   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
474     {
475       int dist = dist_v[loop_depth];
476 
477       if (dump_enabled_p ())
478           dump_printf_loc (MSG_NOTE, vect_location,
479                          "dependence distance  = %d.\n", dist);
480 
481       if (dist == 0)
482           {
483             if (dump_enabled_p ())
484               {
485                 dump_printf_loc (MSG_NOTE, vect_location,
486                                  "dependence distance == 0 between ");
487                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
488                 dump_printf (MSG_NOTE, " and ");
489                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
490                 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
491               }
492 
493             /* When we perform grouped accesses and perform implicit CSE
494                by detecting equal accesses and doing disambiguation with
495                runtime alias tests like for
496                   .. = a[i];
497                     .. = a[i+1];
498                     a[i] = ..;
499                     a[i+1] = ..;
500                     *p = ..;
501                     .. = a[i];
502                     .. = a[i+1];
503                where we will end up loading { a[i], a[i+1] } once, make
504                sure that inserting group loads before the first load and
505                stores after the last store will do the right thing.
506                Similar for groups like
507                   a[i] = ...;
508                     ... = a[i];
509                     a[i+1] = ...;
510                where loads from the group interleave with the store.  */
511             if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
512               {
513                 if (dump_enabled_p ())
514                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
515                                          "READ_WRITE dependence in interleaving.\n");
516                 return true;
517               }
518 
519             if (loop->safelen < 2)
520               {
521                 tree indicator = dr_zero_step_indicator (dra);
522                 if (TREE_CODE (indicator) != INTEGER_CST)
523                     vect_check_nonzero_value (loop_vinfo, indicator);
524                 else if (integer_zerop (indicator))
525                     {
526                       if (dump_enabled_p ())
527                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528                                          "access also has a zero step\n");
529                       return true;
530                     }
531               }
532             continue;
533           }
534 
535       if (dist > 0 && DDR_REVERSED_P (ddr))
536           {
537             /* If DDR_REVERSED_P the order of the data-refs in DDR was
538                reversed (to make distance vector positive), and the actual
539                distance is negative.  */
540             if (dump_enabled_p ())
541               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542                                "dependence distance negative.\n");
543             /* Record a negative dependence distance to later limit the
544                amount of stmt copying / unrolling we can perform.
545                Only need to handle read-after-write dependence.  */
546             if (DR_IS_READ (drb)
547                 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
548                       || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
549               STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
550             continue;
551           }
552 
553       unsigned int abs_dist = abs (dist);
554       if (abs_dist >= 2 && abs_dist < *max_vf)
555           {
556             /* The dependence distance requires reduction of the maximal
557                vectorization factor.  */
558             *max_vf = abs (dist);
559             if (dump_enabled_p ())
560               dump_printf_loc (MSG_NOTE, vect_location,
561                                "adjusting maximal vectorization factor to %i\n",
562                                *max_vf);
563           }
564 
565       if (abs_dist >= *max_vf)
566           {
567             /* Dependence distance does not create dependence, as far as
568                vectorization is concerned, in this case.  */
569             if (dump_enabled_p ())
570               dump_printf_loc (MSG_NOTE, vect_location,
571                                "dependence distance >= VF.\n");
572             continue;
573           }
574 
575       if (dump_enabled_p ())
576           {
577             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
578                          "not vectorized, possible dependence "
579                          "between data-refs ");
580             dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
581             dump_printf (MSG_NOTE,  " and ");
582             dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
583             dump_printf (MSG_NOTE,  "\n");
584           }
585 
586       return true;
587     }
588 
589   return false;
590 }
591 
592 /* Function vect_analyze_data_ref_dependences.
593 
594    Examine all the data references in the loop, and make sure there do not
595    exist any data dependences between them.  Set *MAX_VF according to
596    the maximum vectorization factor the data dependences allow.  */
597 
598 bool
vect_analyze_data_ref_dependences(loop_vec_info loop_vinfo,unsigned int * max_vf)599 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
600                                            unsigned int *max_vf)
601 {
602   unsigned int i;
603   struct data_dependence_relation *ddr;
604 
605   if (dump_enabled_p ())
606     dump_printf_loc (MSG_NOTE, vect_location,
607                      "=== vect_analyze_data_ref_dependences ===\n");
608 
609   LOOP_VINFO_DDRS (loop_vinfo)
610     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
611                * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
612   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
613   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
614   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
615                                         &LOOP_VINFO_DDRS (loop_vinfo),
616                                         LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
617     return false;
618 
619   /* For epilogues we either have no aliases or alias versioning
620      was applied to original loop.  Therefore we may just get max_vf
621      using VF of original loop.  */
622   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
623     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
624   else
625     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
626       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
627           return false;
628 
629   return true;
630 }
631 
632 
633 /* Function vect_slp_analyze_data_ref_dependence.
634 
635    Return TRUE if there (might) exist a dependence between a memory-reference
636    DRA and a memory-reference DRB.  When versioning for alias may check a
637    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
638    the data dependence.  */
639 
640 static bool
vect_slp_analyze_data_ref_dependence(struct data_dependence_relation * ddr)641 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
642 {
643   struct data_reference *dra = DDR_A (ddr);
644   struct data_reference *drb = DDR_B (ddr);
645 
646   /* We need to check dependences of statements marked as unvectorizable
647      as well, they still can prohibit vectorization.  */
648 
649   /* Independent data accesses.  */
650   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
651     return false;
652 
653   if (dra == drb)
654     return false;
655 
656   /* Read-read is OK.  */
657   if (DR_IS_READ (dra) && DR_IS_READ (drb))
658     return false;
659 
660   /* If dra and drb are part of the same interleaving chain consider
661      them independent.  */
662   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
663       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
664             == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
665     return false;
666 
667   /* Unknown data dependence.  */
668   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
669     {
670       if  (dump_enabled_p ())
671           {
672             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
673                                  "can't determine dependence between ");
674             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
675             dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
676             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
677             dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
678           }
679     }
680   else if (dump_enabled_p ())
681     {
682       dump_printf_loc (MSG_NOTE, vect_location,
683                            "determined dependence between ");
684       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
685       dump_printf (MSG_NOTE, " and ");
686       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
687       dump_printf (MSG_NOTE,  "\n");
688     }
689 
690   return true;
691 }
692 
693 
694 /* Analyze dependences involved in the transform of SLP NODE.  STORES
695    contain the vector of scalar stores of this instance if we are
696    disambiguating the loads.  */
697 
698 static bool
vect_slp_analyze_node_dependences(slp_instance instance,slp_tree node,vec<gimple * > stores,gimple * last_store)699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
700                                            vec<gimple *> stores, gimple *last_store)
701 {
702   /* This walks over all stmts involved in the SLP load/store done
703      in NODE verifying we can sink them up to the last stmt in the
704      group.  */
705   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
706   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
707     {
708       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
709       if (access == last_access)
710           continue;
711       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
712       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
713              gsi_stmt (gsi) != last_access; gsi_next (&gsi))
714           {
715             gimple *stmt = gsi_stmt (gsi);
716             if (! gimple_vuse (stmt)
717                 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
718               continue;
719 
720             /* If we couldn't record a (single) data reference for this
721                stmt we have to give up.  */
722             /* ???  Here and below if dependence analysis fails we can resort
723                to the alias oracle which can handle more kinds of stmts.  */
724             data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
725             if (!dr_b)
726               return false;
727 
728             bool dependent = false;
729             /* If we run into a store of this same instance (we've just
730                marked those) then delay dependence checking until we run
731                into the last store because this is where it will have
732                been sunk to (and we verify if we can do that as well).  */
733             if (gimple_visited_p (stmt))
734               {
735                 if (stmt != last_store)
736                     continue;
737                 unsigned i;
738                 gimple *store;
739                 FOR_EACH_VEC_ELT (stores, i, store)
740                     {
741                       data_reference *store_dr
742                         = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
743                       ddr_p ddr = initialize_data_dependence_relation
744                                         (dr_a, store_dr, vNULL);
745                       dependent = vect_slp_analyze_data_ref_dependence (ddr);
746                       free_dependence_relation (ddr);
747                       if (dependent)
748                         break;
749                     }
750               }
751             else
752               {
753                 ddr_p ddr = initialize_data_dependence_relation (dr_a,
754                                                                              dr_b, vNULL);
755                 dependent = vect_slp_analyze_data_ref_dependence (ddr);
756                 free_dependence_relation (ddr);
757               }
758             if (dependent)
759               return false;
760           }
761     }
762   return true;
763 }
764 
765 
766 /* Function vect_analyze_data_ref_dependences.
767 
768    Examine all the data references in the basic-block, and make sure there
769    do not exist any data dependences between them.  Set *MAX_VF according to
770    the maximum vectorization factor the data dependences allow.  */
771 
772 bool
vect_slp_analyze_instance_dependence(slp_instance instance)773 vect_slp_analyze_instance_dependence (slp_instance instance)
774 {
775   if (dump_enabled_p ())
776     dump_printf_loc (MSG_NOTE, vect_location,
777                      "=== vect_slp_analyze_instance_dependence ===\n");
778 
779   /* The stores of this instance are at the root of the SLP tree.  */
780   slp_tree store = SLP_INSTANCE_TREE (instance);
781   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
782     store = NULL;
783 
784   /* Verify we can sink stores to the vectorized stmt insert location.  */
785   gimple *last_store = NULL;
786   if (store)
787     {
788       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
789           return false;
790 
791       /* Mark stores in this instance and remember the last one.  */
792       last_store = vect_find_last_scalar_stmt_in_slp (store);
793       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
794           gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
795     }
796 
797   bool res = true;
798 
799   /* Verify we can sink loads to the vectorized stmt insert location,
800      special-casing stores of this instance.  */
801   slp_tree load;
802   unsigned int i;
803   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
804     if (! vect_slp_analyze_node_dependences (instance, load,
805                                                        store
806                                                        ? SLP_TREE_SCALAR_STMTS (store)
807                                                        : vNULL, last_store))
808       {
809           res = false;
810           break;
811       }
812 
813   /* Unset the visited flag.  */
814   if (store)
815     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
816       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
817 
818   return res;
819 }
820 
821 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
822    the statement that contains DRB, which is useful for recording in the
823    dump file.  */
824 
825 static void
vect_record_base_alignment(vec_info * vinfo,gimple * stmt,innermost_loop_behavior * drb)826 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
827                                   innermost_loop_behavior *drb)
828 {
829   bool existed;
830   innermost_loop_behavior *&entry
831     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
832   if (!existed || entry->base_alignment < drb->base_alignment)
833     {
834       entry = drb;
835       if (dump_enabled_p ())
836           {
837             dump_printf_loc (MSG_NOTE, vect_location,
838                                  "recording new base alignment for ");
839             dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
840             dump_printf (MSG_NOTE, "\n");
841             dump_printf_loc (MSG_NOTE, vect_location,
842                                  "  alignment:    %d\n", drb->base_alignment);
843             dump_printf_loc (MSG_NOTE, vect_location,
844                                  "  misalignment: %d\n", drb->base_misalignment);
845             dump_printf_loc (MSG_NOTE, vect_location,
846                                  "  based on:     ");
847             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
848           }
849     }
850 }
851 
852 /* If the region we're going to vectorize is reached, all unconditional
853    data references occur at least once.  We can therefore pool the base
854    alignment guarantees from each unconditional reference.  Do this by
855    going through all the data references in VINFO and checking whether
856    the containing statement makes the reference unconditionally.  If so,
857    record the alignment of the base address in VINFO so that it can be
858    used for all other references with the same base.  */
859 
860 void
vect_record_base_alignments(vec_info * vinfo)861 vect_record_base_alignments (vec_info *vinfo)
862 {
863   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
864   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
865   data_reference *dr;
866   unsigned int i;
867   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
868     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
869       {
870           gimple *stmt = DR_STMT (dr);
871           vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
872 
873           /* If DR is nested in the loop that is being vectorized, we can also
874              record the alignment of the base wrt the outer loop.  */
875           if (loop && nested_in_vect_loop_p (loop, stmt))
876             {
877               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
878               vect_record_base_alignment
879                 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
880             }
881       }
882 }
883 
884 /* Return the target alignment for the vectorized form of DR.  */
885 
886 static unsigned int
vect_calculate_target_alignment(struct data_reference * dr)887 vect_calculate_target_alignment (struct data_reference *dr)
888 {
889   gimple *stmt = DR_STMT (dr);
890   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
891   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
892   return targetm.vectorize.preferred_vector_alignment (vectype);
893 }
894 
895 /* Function vect_compute_data_ref_alignment
896 
897    Compute the misalignment of the data reference DR.
898 
899    Output:
900    1. If during the misalignment computation it is found that the data reference
901       cannot be vectorized then false is returned.
902    2. DR_MISALIGNMENT (DR) is defined.
903 
904    FOR NOW: No analysis is actually performed. Misalignment is calculated
905    only for trivial cases. TODO.  */
906 
907 bool
vect_compute_data_ref_alignment(struct data_reference * dr)908 vect_compute_data_ref_alignment (struct data_reference *dr)
909 {
910   gimple *stmt = DR_STMT (dr);
911   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
912   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
913   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
914   struct loop *loop = NULL;
915   tree ref = DR_REF (dr);
916   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
917 
918   if (dump_enabled_p ())
919     dump_printf_loc (MSG_NOTE, vect_location,
920                      "vect_compute_data_ref_alignment:\n");
921 
922   if (loop_vinfo)
923     loop = LOOP_VINFO_LOOP (loop_vinfo);
924 
925   /* Initialize misalignment to unknown.  */
926   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
927 
928   innermost_loop_behavior *drb = vect_dr_behavior (dr);
929   bool step_preserves_misalignment_p;
930 
931   unsigned HOST_WIDE_INT vector_alignment
932     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
933   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
934 
935   /* No step for BB vectorization.  */
936   if (!loop)
937     {
938       gcc_assert (integer_zerop (drb->step));
939       step_preserves_misalignment_p = true;
940     }
941 
942   /* In case the dataref is in an inner-loop of the loop that is being
943      vectorized (LOOP), we use the base and misalignment information
944      relative to the outer-loop (LOOP).  This is ok only if the misalignment
945      stays the same throughout the execution of the inner-loop, which is why
946      we have to check that the stride of the dataref in the inner-loop evenly
947      divides by the vector alignment.  */
948   else if (nested_in_vect_loop_p (loop, stmt))
949     {
950       step_preserves_misalignment_p
951           = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
952 
953       if (dump_enabled_p ())
954           {
955             if (step_preserves_misalignment_p)
956               dump_printf_loc (MSG_NOTE, vect_location,
957                                    "inner step divides the vector alignment.\n");
958             else
959               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
960                                    "inner step doesn't divide the vector"
961                                    " alignment.\n");
962           }
963     }
964 
965   /* Similarly we can only use base and misalignment information relative to
966      an innermost loop if the misalignment stays the same throughout the
967      execution of the loop.  As above, this is the case if the stride of
968      the dataref evenly divides by the alignment.  */
969   else
970     {
971       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
972       step_preserves_misalignment_p
973           = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
974 
975       if (!step_preserves_misalignment_p && dump_enabled_p ())
976           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977                                "step doesn't divide the vector alignment.\n");
978     }
979 
980   unsigned int base_alignment = drb->base_alignment;
981   unsigned int base_misalignment = drb->base_misalignment;
982 
983   /* Calculate the maximum of the pooled base address alignment and the
984      alignment that we can compute for DR itself.  */
985   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
986   if (entry && base_alignment < (*entry)->base_alignment)
987     {
988       base_alignment = (*entry)->base_alignment;
989       base_misalignment = (*entry)->base_misalignment;
990     }
991 
992   if (drb->offset_alignment < vector_alignment
993       || !step_preserves_misalignment_p
994       /* We need to know whether the step wrt the vectorized loop is
995            negative when computing the starting misalignment below.  */
996       || TREE_CODE (drb->step) != INTEGER_CST)
997     {
998       if (dump_enabled_p ())
999           {
1000             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1001                              "Unknown alignment for access: ");
1002             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1003             dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1004           }
1005       return true;
1006     }
1007 
1008   if (base_alignment < vector_alignment)
1009     {
1010       unsigned int max_alignment;
1011       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1012       if (max_alignment < vector_alignment
1013             || !vect_can_force_dr_alignment_p (base,
1014                                                        vector_alignment * BITS_PER_UNIT))
1015           {
1016             if (dump_enabled_p ())
1017               {
1018                 dump_printf_loc (MSG_NOTE, vect_location,
1019                                  "can't force alignment of ref: ");
1020                 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1021                 dump_printf (MSG_NOTE, "\n");
1022               }
1023             return true;
1024           }
1025 
1026       /* Force the alignment of the decl.
1027            NOTE: This is the only change to the code we make during
1028            the analysis phase, before deciding to vectorize the loop.  */
1029       if (dump_enabled_p ())
1030         {
1031           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1032           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1033           dump_printf (MSG_NOTE, "\n");
1034         }
1035 
1036       DR_VECT_AUX (dr)->base_decl = base;
1037       DR_VECT_AUX (dr)->base_misaligned = true;
1038       base_misalignment = 0;
1039     }
1040   poly_int64 misalignment
1041     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1042 
1043   /* If this is a backward running DR then first access in the larger
1044      vectype actually is N-1 elements before the address in the DR.
1045      Adjust misalign accordingly.  */
1046   if (tree_int_cst_sgn (drb->step) < 0)
1047     /* PLUS because STEP is negative.  */
1048     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1049                          * TREE_INT_CST_LOW (drb->step));
1050 
1051   unsigned int const_misalignment;
1052   if (!known_misalignment (misalignment, vector_alignment,
1053                                  &const_misalignment))
1054     {
1055       if (dump_enabled_p ())
1056           {
1057             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058                                  "Non-constant misalignment for access: ");
1059             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1060             dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1061           }
1062       return true;
1063     }
1064 
1065   SET_DR_MISALIGNMENT (dr, const_misalignment);
1066 
1067   if (dump_enabled_p ())
1068     {
1069       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1070                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1071       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1072       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1073     }
1074 
1075   return true;
1076 }
1077 
1078 /* Function vect_update_misalignment_for_peel.
1079    Sets DR's misalignment
1080    - to 0 if it has the same alignment as DR_PEEL,
1081    - to the misalignment computed using NPEEL if DR's salignment is known,
1082    - to -1 (unknown) otherwise.
1083 
1084    DR - the data reference whose misalignment is to be adjusted.
1085    DR_PEEL - the data reference whose misalignment is being made
1086              zero in the vector loop by the peel.
1087    NPEEL - the number of iterations in the peel loop if the misalignment
1088            of DR_PEEL is known at compile time.  */
1089 
1090 static void
vect_update_misalignment_for_peel(struct data_reference * dr,struct data_reference * dr_peel,int npeel)1091 vect_update_misalignment_for_peel (struct data_reference *dr,
1092                                    struct data_reference *dr_peel, int npeel)
1093 {
1094   unsigned int i;
1095   vec<dr_p> same_aligned_drs;
1096   struct data_reference *current_dr;
1097   int dr_size = vect_get_scalar_dr_size (dr);
1098   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1099   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1100   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1101 
1102  /* For interleaved data accesses the step in the loop must be multiplied by
1103      the size of the interleaving group.  */
1104   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1105     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1106   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1107     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1108 
1109   /* It can be assumed that the data refs with the same alignment as dr_peel
1110      are aligned in the vector loop.  */
1111   same_aligned_drs
1112     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1113   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1114     {
1115       if (current_dr != dr)
1116         continue;
1117       gcc_assert (!known_alignment_for_access_p (dr)
1118                       || !known_alignment_for_access_p (dr_peel)
1119                       || (DR_MISALIGNMENT (dr) / dr_size
1120                           == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1121       SET_DR_MISALIGNMENT (dr, 0);
1122       return;
1123     }
1124 
1125   if (known_alignment_for_access_p (dr)
1126       && known_alignment_for_access_p (dr_peel))
1127     {
1128       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1129       int misal = DR_MISALIGNMENT (dr);
1130       misal += negative ? -npeel * dr_size : npeel * dr_size;
1131       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1132       SET_DR_MISALIGNMENT (dr, misal);
1133       return;
1134     }
1135 
1136   if (dump_enabled_p ())
1137     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1138                          "to unknown (-1).\n");
1139   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1140 }
1141 
1142 
1143 /* Function verify_data_ref_alignment
1144 
1145    Return TRUE if DR can be handled with respect to alignment.  */
1146 
1147 static bool
verify_data_ref_alignment(data_reference_p dr)1148 verify_data_ref_alignment (data_reference_p dr)
1149 {
1150   enum dr_alignment_support supportable_dr_alignment
1151     = vect_supportable_dr_alignment (dr, false);
1152   if (!supportable_dr_alignment)
1153     {
1154       if (dump_enabled_p ())
1155           {
1156             if (DR_IS_READ (dr))
1157               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1158                                    "not vectorized: unsupported unaligned load.");
1159             else
1160               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161                                    "not vectorized: unsupported unaligned "
1162                                    "store.");
1163 
1164             dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1165                                    DR_REF (dr));
1166             dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1167           }
1168       return false;
1169     }
1170 
1171   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1172     dump_printf_loc (MSG_NOTE, vect_location,
1173                          "Vectorizing an unaligned access.\n");
1174 
1175   return true;
1176 }
1177 
1178 /* Function vect_verify_datarefs_alignment
1179 
1180    Return TRUE if all data references in the loop can be
1181    handled with respect to alignment.  */
1182 
1183 bool
vect_verify_datarefs_alignment(loop_vec_info vinfo)1184 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1185 {
1186   vec<data_reference_p> datarefs = vinfo->datarefs;
1187   struct data_reference *dr;
1188   unsigned int i;
1189 
1190   FOR_EACH_VEC_ELT (datarefs, i, dr)
1191     {
1192       gimple *stmt = DR_STMT (dr);
1193       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1194 
1195       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1196           continue;
1197 
1198       /* For interleaving, only the alignment of the first access matters.   */
1199       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1200             && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1201           continue;
1202 
1203       /* Strided accesses perform only component accesses, alignment is
1204            irrelevant for them.  */
1205       if (STMT_VINFO_STRIDED_P (stmt_info)
1206             && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1207           continue;
1208 
1209       if (! verify_data_ref_alignment (dr))
1210           return false;
1211     }
1212 
1213   return true;
1214 }
1215 
1216 /* Given an memory reference EXP return whether its alignment is less
1217    than its size.  */
1218 
1219 static bool
not_size_aligned(tree exp)1220 not_size_aligned (tree exp)
1221 {
1222   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1223     return true;
1224 
1225   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1226             > get_object_alignment (exp));
1227 }
1228 
1229 /* Function vector_alignment_reachable_p
1230 
1231    Return true if vector alignment for DR is reachable by peeling
1232    a few loop iterations.  Return false otherwise.  */
1233 
1234 static bool
vector_alignment_reachable_p(struct data_reference * dr)1235 vector_alignment_reachable_p (struct data_reference *dr)
1236 {
1237   gimple *stmt = DR_STMT (dr);
1238   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1239   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1240 
1241   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1242     {
1243       /* For interleaved access we peel only if number of iterations in
1244            the prolog loop ({VF - misalignment}), is a multiple of the
1245            number of the interleaved accesses.  */
1246       int elem_size, mis_in_elements;
1247 
1248       /* FORNOW: handle only known alignment.  */
1249       if (!known_alignment_for_access_p (dr))
1250           return false;
1251 
1252       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1253       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1254       elem_size = vector_element_size (vector_size, nelements);
1255       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1256 
1257       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1258           return false;
1259     }
1260 
1261   /* If misalignment is known at the compile time then allow peeling
1262      only if natural alignment is reachable through peeling.  */
1263   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1264     {
1265       HOST_WIDE_INT elmsize =
1266                     int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1267       if (dump_enabled_p ())
1268           {
1269             dump_printf_loc (MSG_NOTE, vect_location,
1270                              "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1271             dump_printf (MSG_NOTE,
1272                          ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1273           }
1274       if (DR_MISALIGNMENT (dr) % elmsize)
1275           {
1276             if (dump_enabled_p ())
1277               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1278                                "data size does not divide the misalignment.\n");
1279             return false;
1280           }
1281     }
1282 
1283   if (!known_alignment_for_access_p (dr))
1284     {
1285       tree type = TREE_TYPE (DR_REF (dr));
1286       bool is_packed = not_size_aligned (DR_REF (dr));
1287       if (dump_enabled_p ())
1288           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289                            "Unknown misalignment, %snaturally aligned\n",
1290                                is_packed ? "not " : "");
1291       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1292     }
1293 
1294   return true;
1295 }
1296 
1297 
1298 /* Calculate the cost of the memory access represented by DR.  */
1299 
1300 static void
vect_get_data_access_cost(struct data_reference * dr,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec)1301 vect_get_data_access_cost (struct data_reference *dr,
1302                            unsigned int *inside_cost,
1303                            unsigned int *outside_cost,
1304                                  stmt_vector_for_cost *body_cost_vec)
1305 {
1306   gimple *stmt = DR_STMT (dr);
1307   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1308   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1309   int ncopies;
1310 
1311   if (PURE_SLP_STMT (stmt_info))
1312     ncopies = 1;
1313   else
1314     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1315 
1316   if (DR_IS_READ (dr))
1317     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1318                               NULL, body_cost_vec, false);
1319   else
1320     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1321 
1322   if (dump_enabled_p ())
1323     dump_printf_loc (MSG_NOTE, vect_location,
1324                      "vect_get_data_access_cost: inside_cost = %d, "
1325                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1326 }
1327 
1328 
1329 typedef struct _vect_peel_info
1330 {
1331   struct data_reference *dr;
1332   int npeel;
1333   unsigned int count;
1334 } *vect_peel_info;
1335 
1336 typedef struct _vect_peel_extended_info
1337 {
1338   struct _vect_peel_info peel_info;
1339   unsigned int inside_cost;
1340   unsigned int outside_cost;
1341 } *vect_peel_extended_info;
1342 
1343 
1344 /* Peeling hashtable helpers.  */
1345 
1346 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1347 {
1348   static inline hashval_t hash (const _vect_peel_info *);
1349   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1350 };
1351 
1352 inline hashval_t
hash(const _vect_peel_info * peel_info)1353 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1354 {
1355   return (hashval_t) peel_info->npeel;
1356 }
1357 
1358 inline bool
equal(const _vect_peel_info * a,const _vect_peel_info * b)1359 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1360 {
1361   return (a->npeel == b->npeel);
1362 }
1363 
1364 
1365 /* Insert DR into peeling hash table with NPEEL as key.  */
1366 
1367 static void
vect_peeling_hash_insert(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo,struct data_reference * dr,int npeel)1368 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1369                                 loop_vec_info loop_vinfo, struct data_reference *dr,
1370                           int npeel)
1371 {
1372   struct _vect_peel_info elem, *slot;
1373   _vect_peel_info **new_slot;
1374   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1375 
1376   elem.npeel = npeel;
1377   slot = peeling_htab->find (&elem);
1378   if (slot)
1379     slot->count++;
1380   else
1381     {
1382       slot = XNEW (struct _vect_peel_info);
1383       slot->npeel = npeel;
1384       slot->dr = dr;
1385       slot->count = 1;
1386       new_slot = peeling_htab->find_slot (slot, INSERT);
1387       *new_slot = slot;
1388     }
1389 
1390   if (!supportable_dr_alignment
1391       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1392     slot->count += VECT_MAX_COST;
1393 }
1394 
1395 
1396 /* Traverse peeling hash table to find peeling option that aligns maximum
1397    number of data accesses.  */
1398 
1399 int
vect_peeling_hash_get_most_frequent(_vect_peel_info ** slot,_vect_peel_extended_info * max)1400 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1401                                              _vect_peel_extended_info *max)
1402 {
1403   vect_peel_info elem = *slot;
1404 
1405   if (elem->count > max->peel_info.count
1406       || (elem->count == max->peel_info.count
1407           && max->peel_info.npeel > elem->npeel))
1408     {
1409       max->peel_info.npeel = elem->npeel;
1410       max->peel_info.count = elem->count;
1411       max->peel_info.dr = elem->dr;
1412     }
1413 
1414   return 1;
1415 }
1416 
1417 /* Get the costs of peeling NPEEL iterations checking data access costs
1418    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1419    misalignment will be zero after peeling.  */
1420 
1421 static void
vect_get_peeling_costs_all_drs(vec<data_reference_p> datarefs,struct data_reference * dr0,unsigned int * inside_cost,unsigned int * outside_cost,stmt_vector_for_cost * body_cost_vec,unsigned int npeel,bool unknown_misalignment)1422 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1423                                         struct data_reference *dr0,
1424                                         unsigned int *inside_cost,
1425                                         unsigned int *outside_cost,
1426                                         stmt_vector_for_cost *body_cost_vec,
1427                                         unsigned int npeel,
1428                                         bool unknown_misalignment)
1429 {
1430   unsigned i;
1431   data_reference *dr;
1432 
1433   FOR_EACH_VEC_ELT (datarefs, i, dr)
1434     {
1435       gimple *stmt = DR_STMT (dr);
1436       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1437       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1438           continue;
1439 
1440       /* For interleaving, only the alignment of the first access
1441          matters.  */
1442       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1443           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1444         continue;
1445 
1446       /* Strided accesses perform only component accesses, alignment is
1447          irrelevant for them.  */
1448       if (STMT_VINFO_STRIDED_P (stmt_info)
1449             && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1450           continue;
1451 
1452       int save_misalignment;
1453       save_misalignment = DR_MISALIGNMENT (dr);
1454       if (npeel == 0)
1455           ;
1456       else if (unknown_misalignment && dr == dr0)
1457           SET_DR_MISALIGNMENT (dr, 0);
1458       else
1459           vect_update_misalignment_for_peel (dr, dr0, npeel);
1460       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1461                                          body_cost_vec);
1462       SET_DR_MISALIGNMENT (dr, save_misalignment);
1463     }
1464 }
1465 
1466 /* Traverse peeling hash table and calculate cost for each peeling option.
1467    Find the one with the lowest cost.  */
1468 
1469 int
vect_peeling_hash_get_lowest_cost(_vect_peel_info ** slot,_vect_peel_extended_info * min)1470 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1471                                            _vect_peel_extended_info *min)
1472 {
1473   vect_peel_info elem = *slot;
1474   int dummy;
1475   unsigned int inside_cost = 0, outside_cost = 0;
1476   gimple *stmt = DR_STMT (elem->dr);
1477   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1478   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1479   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1480                            epilogue_cost_vec;
1481 
1482   prologue_cost_vec.create (2);
1483   body_cost_vec.create (2);
1484   epilogue_cost_vec.create (2);
1485 
1486   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1487                                           elem->dr, &inside_cost, &outside_cost,
1488                                           &body_cost_vec, elem->npeel, false);
1489 
1490   body_cost_vec.release ();
1491 
1492   outside_cost += vect_get_known_peeling_cost
1493     (loop_vinfo, elem->npeel, &dummy,
1494      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1495      &prologue_cost_vec, &epilogue_cost_vec);
1496 
1497   /* Prologue and epilogue costs are added to the target model later.
1498      These costs depend only on the scalar iteration cost, the
1499      number of peeling iterations finally chosen, and the number of
1500      misaligned statements.  So discard the information found here.  */
1501   prologue_cost_vec.release ();
1502   epilogue_cost_vec.release ();
1503 
1504   if (inside_cost < min->inside_cost
1505       || (inside_cost == min->inside_cost
1506             && outside_cost < min->outside_cost))
1507     {
1508       min->inside_cost = inside_cost;
1509       min->outside_cost = outside_cost;
1510       min->peel_info.dr = elem->dr;
1511       min->peel_info.npeel = elem->npeel;
1512       min->peel_info.count = elem->count;
1513     }
1514 
1515   return 1;
1516 }
1517 
1518 
1519 /* Choose best peeling option by traversing peeling hash table and either
1520    choosing an option with the lowest cost (if cost model is enabled) or the
1521    option that aligns as many accesses as possible.  */
1522 
1523 static struct _vect_peel_extended_info
vect_peeling_hash_choose_best_peeling(hash_table<peel_info_hasher> * peeling_htab,loop_vec_info loop_vinfo)1524 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1525                                                loop_vec_info loop_vinfo)
1526 {
1527    struct _vect_peel_extended_info res;
1528 
1529    res.peel_info.dr = NULL;
1530 
1531    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1532      {
1533        res.inside_cost = INT_MAX;
1534        res.outside_cost = INT_MAX;
1535        peeling_htab->traverse <_vect_peel_extended_info *,
1536                                      vect_peeling_hash_get_lowest_cost> (&res);
1537      }
1538    else
1539      {
1540        res.peel_info.count = 0;
1541        peeling_htab->traverse <_vect_peel_extended_info *,
1542                                      vect_peeling_hash_get_most_frequent> (&res);
1543        res.inside_cost = 0;
1544        res.outside_cost = 0;
1545      }
1546 
1547    return res;
1548 }
1549 
1550 /* Return true if the new peeling NPEEL is supported.  */
1551 
1552 static bool
vect_peeling_supportable(loop_vec_info loop_vinfo,struct data_reference * dr0,unsigned npeel)1553 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1554                                 unsigned npeel)
1555 {
1556   unsigned i;
1557   struct data_reference *dr = NULL;
1558   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1559   gimple *stmt;
1560   stmt_vec_info stmt_info;
1561   enum dr_alignment_support supportable_dr_alignment;
1562 
1563   /* Ensure that all data refs can be vectorized after the peel.  */
1564   FOR_EACH_VEC_ELT (datarefs, i, dr)
1565     {
1566       int save_misalignment;
1567 
1568       if (dr == dr0)
1569           continue;
1570 
1571       stmt = DR_STMT (dr);
1572       stmt_info = vinfo_for_stmt (stmt);
1573       /* For interleaving, only the alignment of the first access
1574            matters.  */
1575       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1576             && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1577           continue;
1578 
1579       /* Strided accesses perform only component accesses, alignment is
1580            irrelevant for them.  */
1581       if (STMT_VINFO_STRIDED_P (stmt_info)
1582             && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1583           continue;
1584 
1585       save_misalignment = DR_MISALIGNMENT (dr);
1586       vect_update_misalignment_for_peel (dr, dr0, npeel);
1587       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1588       SET_DR_MISALIGNMENT (dr, save_misalignment);
1589 
1590       if (!supportable_dr_alignment)
1591           return false;
1592     }
1593 
1594   return true;
1595 }
1596 
1597 /* Function vect_enhance_data_refs_alignment
1598 
1599    This pass will use loop versioning and loop peeling in order to enhance
1600    the alignment of data references in the loop.
1601 
1602    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1603    original loop is to be vectorized.  Any other loops that are created by
1604    the transformations performed in this pass - are not supposed to be
1605    vectorized.  This restriction will be relaxed.
1606 
1607    This pass will require a cost model to guide it whether to apply peeling
1608    or versioning or a combination of the two.  For example, the scheme that
1609    intel uses when given a loop with several memory accesses, is as follows:
1610    choose one memory access ('p') which alignment you want to force by doing
1611    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1612    other accesses are not necessarily aligned, or (2) use loop versioning to
1613    generate one loop in which all accesses are aligned, and another loop in
1614    which only 'p' is necessarily aligned.
1615 
1616    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1617    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1618    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1619 
1620    Devising a cost model is the most critical aspect of this work.  It will
1621    guide us on which access to peel for, whether to use loop versioning, how
1622    many versions to create, etc.  The cost model will probably consist of
1623    generic considerations as well as target specific considerations (on
1624    powerpc for example, misaligned stores are more painful than misaligned
1625    loads).
1626 
1627    Here are the general steps involved in alignment enhancements:
1628 
1629      -- original loop, before alignment analysis:
1630           for (i=0; i<N; i++){
1631             x = q[i];                             # DR_MISALIGNMENT(q) = unknown
1632             p[i] = y;                             # DR_MISALIGNMENT(p) = unknown
1633           }
1634 
1635      -- After vect_compute_data_refs_alignment:
1636           for (i=0; i<N; i++){
1637             x = q[i];                             # DR_MISALIGNMENT(q) = 3
1638             p[i] = y;                             # DR_MISALIGNMENT(p) = unknown
1639           }
1640 
1641      -- Possibility 1: we do loop versioning:
1642      if (p is aligned) {
1643           for (i=0; i<N; i++){          # loop 1A
1644             x = q[i];                             # DR_MISALIGNMENT(q) = 3
1645             p[i] = y;                             # DR_MISALIGNMENT(p) = 0
1646           }
1647      }
1648      else {
1649           for (i=0; i<N; i++){          # loop 1B
1650             x = q[i];                             # DR_MISALIGNMENT(q) = 3
1651             p[i] = y;                             # DR_MISALIGNMENT(p) = unaligned
1652           }
1653      }
1654 
1655      -- Possibility 2: we do loop peeling:
1656      for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1657           x = q[i];
1658           p[i] = y;
1659      }
1660      for (i = 3; i < N; i++){ # loop 2A
1661           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1662           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1663      }
1664 
1665      -- Possibility 3: combination of loop peeling and versioning:
1666      for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1667           x = q[i];
1668           p[i] = y;
1669      }
1670      if (p is aligned) {
1671           for (i = 3; i<N; i++){        # loop 3A
1672             x = q[i];                             # DR_MISALIGNMENT(q) = 0
1673             p[i] = y;                             # DR_MISALIGNMENT(p) = 0
1674           }
1675      }
1676      else {
1677           for (i = 3; i<N; i++){        # loop 3B
1678             x = q[i];                             # DR_MISALIGNMENT(q) = 0
1679             p[i] = y;                             # DR_MISALIGNMENT(p) = unaligned
1680           }
1681      }
1682 
1683      These loops are later passed to loop_transform to be vectorized.  The
1684      vectorizer will use the alignment information to guide the transformation
1685      (whether to generate regular loads/stores, or with special handling for
1686      misalignment).  */
1687 
1688 bool
vect_enhance_data_refs_alignment(loop_vec_info loop_vinfo)1689 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1690 {
1691   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1692   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1693   enum dr_alignment_support supportable_dr_alignment;
1694   struct data_reference *dr0 = NULL, *first_store = NULL;
1695   struct data_reference *dr;
1696   unsigned int i, j;
1697   bool do_peeling = false;
1698   bool do_versioning = false;
1699   bool stat;
1700   gimple *stmt;
1701   stmt_vec_info stmt_info;
1702   unsigned int npeel = 0;
1703   bool one_misalignment_known = false;
1704   bool one_misalignment_unknown = false;
1705   bool one_dr_unsupportable = false;
1706   struct data_reference *unsupportable_dr = NULL;
1707   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1708   unsigned possible_npeel_number = 1;
1709   tree vectype;
1710   unsigned int mis, same_align_drs_max = 0;
1711   hash_table<peel_info_hasher> peeling_htab (1);
1712 
1713   if (dump_enabled_p ())
1714     dump_printf_loc (MSG_NOTE, vect_location,
1715                      "=== vect_enhance_data_refs_alignment ===\n");
1716 
1717   /* Reset data so we can safely be called multiple times.  */
1718   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1719   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1720 
1721   /* While cost model enhancements are expected in the future, the high level
1722      view of the code at this time is as follows:
1723 
1724      A) If there is a misaligned access then see if peeling to align
1725         this access can make all data references satisfy
1726         vect_supportable_dr_alignment.  If so, update data structures
1727         as needed and return true.
1728 
1729      B) If peeling wasn't possible and there is a data reference with an
1730         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1731         then see if loop versioning checks can be used to make all data
1732         references satisfy vect_supportable_dr_alignment.  If so, update
1733         data structures as needed and return true.
1734 
1735      C) If neither peeling nor versioning were successful then return false if
1736         any data reference does not satisfy vect_supportable_dr_alignment.
1737 
1738      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1739 
1740      Note, Possibility 3 above (which is peeling and versioning together) is not
1741      being done at this time.  */
1742 
1743   /* (1) Peeling to force alignment.  */
1744 
1745   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1746      Considerations:
1747      + How many accesses will become aligned due to the peeling
1748      - How many accesses will become unaligned due to the peeling,
1749        and the cost of misaligned accesses.
1750      - The cost of peeling (the extra runtime checks, the increase
1751        in code size).  */
1752 
1753   FOR_EACH_VEC_ELT (datarefs, i, dr)
1754     {
1755       stmt = DR_STMT (dr);
1756       stmt_info = vinfo_for_stmt (stmt);
1757 
1758       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1759           continue;
1760 
1761       /* For interleaving, only the alignment of the first access
1762          matters.  */
1763       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1764           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1765         continue;
1766 
1767       /* For invariant accesses there is nothing to enhance.  */
1768       if (integer_zerop (DR_STEP (dr)))
1769           continue;
1770 
1771       /* Strided accesses perform only component accesses, alignment is
1772            irrelevant for them.  */
1773       if (STMT_VINFO_STRIDED_P (stmt_info)
1774             && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1775           continue;
1776 
1777       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1778       do_peeling = vector_alignment_reachable_p (dr);
1779       if (do_peeling)
1780         {
1781           if (known_alignment_for_access_p (dr))
1782             {
1783                 unsigned int npeel_tmp = 0;
1784                 bool negative = tree_int_cst_compare (DR_STEP (dr),
1785                                                                 size_zero_node) < 0;
1786 
1787                 vectype = STMT_VINFO_VECTYPE (stmt_info);
1788                 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1789                 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1790                 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1791                 if (DR_MISALIGNMENT (dr) != 0)
1792                     npeel_tmp = (mis & (target_align - 1)) / dr_size;
1793 
1794               /* For multiple types, it is possible that the bigger type access
1795                  will have more than one peeling option.  E.g., a loop with two
1796                  types: one of size (vector size / 4), and the other one of
1797                  size (vector size / 8).  Vectorization factor will 8.  If both
1798                  accesses are misaligned by 3, the first one needs one scalar
1799                  iteration to be aligned, and the second one needs 5.  But the
1800                      first one will be aligned also by peeling 5 scalar
1801                  iterations, and in that case both accesses will be aligned.
1802                  Hence, except for the immediate peeling amount, we also want
1803                  to try to add full vector size, while we don't exceed
1804                  vectorization factor.
1805                  We do this automatically for cost model, since we calculate
1806                      cost for every peeling option.  */
1807               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1808                     {
1809                       poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1810                                                     ? vf * GROUP_SIZE (stmt_info) : vf);
1811                       possible_npeel_number
1812                         = vect_get_num_vectors (nscalars, vectype);
1813 
1814                       /* NPEEL_TMP is 0 when there is no misalignment, but also
1815                          allow peeling NELEMENTS.  */
1816                       if (DR_MISALIGNMENT (dr) == 0)
1817                         possible_npeel_number++;
1818                     }
1819 
1820                 /* Save info about DR in the hash table.  Also include peeling
1821                    amounts according to the explanation above.  */
1822               for (j = 0; j < possible_npeel_number; j++)
1823                 {
1824                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1825                                                       dr, npeel_tmp);
1826                       npeel_tmp += target_align / dr_size;
1827                 }
1828 
1829                 one_misalignment_known = true;
1830             }
1831           else
1832             {
1833               /* If we don't know any misalignment values, we prefer
1834                  peeling for data-ref that has the maximum number of data-refs
1835                  with the same alignment, unless the target prefers to align
1836                  stores over load.  */
1837                 unsigned same_align_drs
1838                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1839                 if (!dr0
1840                       || same_align_drs_max < same_align_drs)
1841                     {
1842                       same_align_drs_max = same_align_drs;
1843                       dr0 = dr;
1844                     }
1845                 /* For data-refs with the same number of related
1846                      accesses prefer the one where the misalign
1847                      computation will be invariant in the outermost loop.  */
1848                 else if (same_align_drs_max == same_align_drs)
1849                     {
1850                       struct loop *ivloop0, *ivloop;
1851                       ivloop0 = outermost_invariant_loop_for_expr
1852                         (loop, DR_BASE_ADDRESS (dr0));
1853                       ivloop = outermost_invariant_loop_for_expr
1854                         (loop, DR_BASE_ADDRESS (dr));
1855                       if ((ivloop && !ivloop0)
1856                           || (ivloop && ivloop0
1857                                 && flow_loop_nested_p (ivloop, ivloop0)))
1858                         dr0 = dr;
1859                     }
1860 
1861                 one_misalignment_unknown = true;
1862 
1863                 /* Check for data refs with unsupportable alignment that
1864                    can be peeled.  */
1865                 if (!supportable_dr_alignment)
1866                 {
1867                     one_dr_unsupportable = true;
1868                     unsupportable_dr = dr;
1869                 }
1870 
1871                 if (!first_store && DR_IS_WRITE (dr))
1872                     first_store = dr;
1873             }
1874         }
1875       else
1876         {
1877           if (!aligned_access_p (dr))
1878             {
1879               if (dump_enabled_p ())
1880                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1881                                  "vector alignment may not be reachable\n");
1882               break;
1883             }
1884         }
1885     }
1886 
1887   /* Check if we can possibly peel the loop.  */
1888   if (!vect_can_advance_ivs_p (loop_vinfo)
1889       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1890       || loop->inner)
1891     do_peeling = false;
1892 
1893   struct _vect_peel_extended_info peel_for_known_alignment;
1894   struct _vect_peel_extended_info peel_for_unknown_alignment;
1895   struct _vect_peel_extended_info best_peel;
1896 
1897   peel_for_unknown_alignment.inside_cost = INT_MAX;
1898   peel_for_unknown_alignment.outside_cost = INT_MAX;
1899   peel_for_unknown_alignment.peel_info.count = 0;
1900 
1901   if (do_peeling
1902       && one_misalignment_unknown)
1903     {
1904       /* Check if the target requires to prefer stores over loads, i.e., if
1905          misaligned stores are more expensive than misaligned loads (taking
1906          drs with same alignment into account).  */
1907       unsigned int load_inside_cost = 0;
1908       unsigned int load_outside_cost = 0;
1909       unsigned int store_inside_cost = 0;
1910       unsigned int store_outside_cost = 0;
1911       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1912 
1913       stmt_vector_for_cost dummy;
1914       dummy.create (2);
1915       vect_get_peeling_costs_all_drs (datarefs, dr0,
1916                                               &load_inside_cost,
1917                                               &load_outside_cost,
1918                                               &dummy, estimated_npeels, true);
1919       dummy.release ();
1920 
1921       if (first_store)
1922           {
1923             dummy.create (2);
1924             vect_get_peeling_costs_all_drs (datarefs, first_store,
1925                                                     &store_inside_cost,
1926                                                     &store_outside_cost,
1927                                                     &dummy, estimated_npeels, true);
1928             dummy.release ();
1929           }
1930       else
1931           {
1932             store_inside_cost = INT_MAX;
1933             store_outside_cost = INT_MAX;
1934           }
1935 
1936       if (load_inside_cost > store_inside_cost
1937             || (load_inside_cost == store_inside_cost
1938                 && load_outside_cost > store_outside_cost))
1939           {
1940             dr0 = first_store;
1941             peel_for_unknown_alignment.inside_cost = store_inside_cost;
1942             peel_for_unknown_alignment.outside_cost = store_outside_cost;
1943           }
1944       else
1945           {
1946             peel_for_unknown_alignment.inside_cost = load_inside_cost;
1947             peel_for_unknown_alignment.outside_cost = load_outside_cost;
1948           }
1949 
1950       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1951       prologue_cost_vec.create (2);
1952       epilogue_cost_vec.create (2);
1953 
1954       int dummy2;
1955       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1956           (loop_vinfo, estimated_npeels, &dummy2,
1957            &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1958            &prologue_cost_vec, &epilogue_cost_vec);
1959 
1960       prologue_cost_vec.release ();
1961       epilogue_cost_vec.release ();
1962 
1963       peel_for_unknown_alignment.peel_info.count = 1
1964           + STMT_VINFO_SAME_ALIGN_REFS
1965           (vinfo_for_stmt (DR_STMT (dr0))).length ();
1966     }
1967 
1968   peel_for_unknown_alignment.peel_info.npeel = 0;
1969   peel_for_unknown_alignment.peel_info.dr = dr0;
1970 
1971   best_peel = peel_for_unknown_alignment;
1972 
1973   peel_for_known_alignment.inside_cost = INT_MAX;
1974   peel_for_known_alignment.outside_cost = INT_MAX;
1975   peel_for_known_alignment.peel_info.count = 0;
1976   peel_for_known_alignment.peel_info.dr = NULL;
1977 
1978   if (do_peeling && one_misalignment_known)
1979     {
1980       /* Peeling is possible, but there is no data access that is not supported
1981          unless aligned.  So we try to choose the best possible peeling from
1982            the hash table.  */
1983       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1984           (&peeling_htab, loop_vinfo);
1985     }
1986 
1987   /* Compare costs of peeling for known and unknown alignment. */
1988   if (peel_for_known_alignment.peel_info.dr != NULL
1989       && peel_for_unknown_alignment.inside_cost
1990       >= peel_for_known_alignment.inside_cost)
1991     {
1992       best_peel = peel_for_known_alignment;
1993 
1994       /* If the best peeling for known alignment has NPEEL == 0, perform no
1995          peeling at all except if there is an unsupportable dr that we can
1996          align.  */
1997       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1998           do_peeling = false;
1999     }
2000 
2001   /* If there is an unsupportable data ref, prefer this over all choices so far
2002      since we'd have to discard a chosen peeling except when it accidentally
2003      aligned the unsupportable data ref.  */
2004   if (one_dr_unsupportable)
2005     dr0 = unsupportable_dr;
2006   else if (do_peeling)
2007     {
2008       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2009            TODO: Use nopeel_outside_cost or get rid of it?  */
2010       unsigned nopeel_inside_cost = 0;
2011       unsigned nopeel_outside_cost = 0;
2012 
2013       stmt_vector_for_cost dummy;
2014       dummy.create (2);
2015       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
2016                                               &nopeel_outside_cost, &dummy, 0, false);
2017       dummy.release ();
2018 
2019       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2020            costs will be recorded.  */
2021       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2022       prologue_cost_vec.create (2);
2023       epilogue_cost_vec.create (2);
2024 
2025       int dummy2;
2026       nopeel_outside_cost += vect_get_known_peeling_cost
2027           (loop_vinfo, 0, &dummy2,
2028            &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2029            &prologue_cost_vec, &epilogue_cost_vec);
2030 
2031       prologue_cost_vec.release ();
2032       epilogue_cost_vec.release ();
2033 
2034       npeel = best_peel.peel_info.npeel;
2035       dr0 = best_peel.peel_info.dr;
2036 
2037       /* If no peeling is not more expensive than the best peeling we
2038            have so far, don't perform any peeling.  */
2039       if (nopeel_inside_cost <= best_peel.inside_cost)
2040           do_peeling = false;
2041     }
2042 
2043   if (do_peeling)
2044     {
2045       stmt = DR_STMT (dr0);
2046       stmt_info = vinfo_for_stmt (stmt);
2047       vectype = STMT_VINFO_VECTYPE (stmt_info);
2048 
2049       if (known_alignment_for_access_p (dr0))
2050         {
2051             bool negative = tree_int_cst_compare (DR_STEP (dr0),
2052                                                             size_zero_node) < 0;
2053           if (!npeel)
2054             {
2055               /* Since it's known at compile time, compute the number of
2056                  iterations in the peeled loop (the peeling factor) for use in
2057                  updating DR_MISALIGNMENT values.  The peeling factor is the
2058                  vectorization factor minus the misalignment as an element
2059                  count.  */
2060                 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2061                 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2062                 npeel = ((mis & (target_align - 1))
2063                            / vect_get_scalar_dr_size (dr0));
2064             }
2065 
2066             /* For interleaved data access every iteration accesses all the
2067                members of the group, therefore we divide the number of iterations
2068                by the group size.  */
2069             stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2070             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2071               npeel /= GROUP_SIZE (stmt_info);
2072 
2073           if (dump_enabled_p ())
2074             dump_printf_loc (MSG_NOTE, vect_location,
2075                              "Try peeling by %d\n", npeel);
2076         }
2077 
2078       /* Ensure that all datarefs can be vectorized after the peel.  */
2079       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2080           do_peeling = false;
2081 
2082       /* Check if all datarefs are supportable and log.  */
2083       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2084         {
2085           stat = vect_verify_datarefs_alignment (loop_vinfo);
2086           if (!stat)
2087             do_peeling = false;
2088           else
2089               return stat;
2090         }
2091 
2092       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2093       if (do_peeling)
2094         {
2095           unsigned max_allowed_peel
2096             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2097           if (max_allowed_peel != (unsigned)-1)
2098             {
2099               unsigned max_peel = npeel;
2100               if (max_peel == 0)
2101                 {
2102                       unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2103                       max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2104                 }
2105               if (max_peel > max_allowed_peel)
2106                 {
2107                   do_peeling = false;
2108                   if (dump_enabled_p ())
2109                     dump_printf_loc (MSG_NOTE, vect_location,
2110                         "Disable peeling, max peels reached: %d\n", max_peel);
2111                 }
2112             }
2113         }
2114 
2115       /* Cost model #2 - if peeling may result in a remaining loop not
2116            iterating enough to be vectorized then do not peel.  Since this
2117            is a cost heuristic rather than a correctness decision, use the
2118            most likely runtime value for variable vectorization factors.  */
2119       if (do_peeling
2120             && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2121           {
2122             unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2123             unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2124             if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2125                 < assumed_vf + max_peel)
2126               do_peeling = false;
2127           }
2128 
2129       if (do_peeling)
2130         {
2131           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2132              If the misalignment of DR_i is identical to that of dr0 then set
2133              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2134              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2135              by the peeling factor times the element size of DR_i (MOD the
2136              vectorization factor times the size).  Otherwise, the
2137              misalignment of DR_i must be set to unknown.  */
2138             FOR_EACH_VEC_ELT (datarefs, i, dr)
2139               if (dr != dr0)
2140                 {
2141                     /* Strided accesses perform only component accesses, alignment
2142                        is irrelevant for them.  */
2143                     stmt_info = vinfo_for_stmt (DR_STMT (dr));
2144                     if (STMT_VINFO_STRIDED_P (stmt_info)
2145                         && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146                       continue;
2147 
2148                     vect_update_misalignment_for_peel (dr, dr0, npeel);
2149                 }
2150 
2151           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2152           if (npeel)
2153             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2154           else
2155             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2156                 = DR_MISALIGNMENT (dr0);
2157             SET_DR_MISALIGNMENT (dr0, 0);
2158             if (dump_enabled_p ())
2159             {
2160               dump_printf_loc (MSG_NOTE, vect_location,
2161                                "Alignment of access forced using peeling.\n");
2162               dump_printf_loc (MSG_NOTE, vect_location,
2163                                "Peeling for alignment will be applied.\n");
2164             }
2165 
2166             /* The inside-loop cost will be accounted for in vectorizable_load
2167                and vectorizable_store correctly with adjusted alignments.
2168                Drop the body_cst_vec on the floor here.  */
2169             stat = vect_verify_datarefs_alignment (loop_vinfo);
2170             gcc_assert (stat);
2171           return stat;
2172         }
2173     }
2174 
2175   /* (2) Versioning to force alignment.  */
2176 
2177   /* Try versioning if:
2178      1) optimize loop for speed
2179      2) there is at least one unsupported misaligned data ref with an unknown
2180         misalignment, and
2181      3) all misaligned data refs with a known misalignment are supported, and
2182      4) the number of runtime alignment checks is within reason.  */
2183 
2184   do_versioning =
2185           optimize_loop_nest_for_speed_p (loop)
2186           && (!loop->inner); /* FORNOW */
2187 
2188   if (do_versioning)
2189     {
2190       FOR_EACH_VEC_ELT (datarefs, i, dr)
2191         {
2192             stmt = DR_STMT (dr);
2193             stmt_info = vinfo_for_stmt (stmt);
2194 
2195             /* For interleaving, only the alignment of the first access
2196                matters.  */
2197             if (aligned_access_p (dr)
2198                 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2199                       && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2200               continue;
2201 
2202             if (STMT_VINFO_STRIDED_P (stmt_info))
2203               {
2204                 /* Strided loads perform only component accesses, alignment is
2205                      irrelevant for them.  */
2206                 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2207                     continue;
2208                 do_versioning = false;
2209                 break;
2210               }
2211 
2212             supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2213 
2214           if (!supportable_dr_alignment)
2215             {
2216                 gimple *stmt;
2217               int mask;
2218               tree vectype;
2219 
2220               if (known_alignment_for_access_p (dr)
2221                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2222                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2223                 {
2224                   do_versioning = false;
2225                   break;
2226                 }
2227 
2228               stmt = DR_STMT (dr);
2229               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2230               gcc_assert (vectype);
2231 
2232                 /* At present we don't support versioning for alignment
2233                      with variable VF, since there's no guarantee that the
2234                      VF is a power of two.  We could relax this if we added
2235                      a way of enforcing a power-of-two size.  */
2236                 unsigned HOST_WIDE_INT size;
2237                 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2238                     {
2239                       do_versioning = false;
2240                       break;
2241                     }
2242 
2243               /* The rightmost bits of an aligned address must be zeros.
2244                  Construct the mask needed for this test.  For example,
2245                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2246                  mask must be 15 = 0xf. */
2247                 mask = size - 1;
2248 
2249               /* FORNOW: use the same mask to test all potentially unaligned
2250                  references in the loop.  The vectorizer currently supports
2251                  a single vector size, see the reference to
2252                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2253                  vectorization factor is computed.  */
2254               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2255                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2256               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2257               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2258                           DR_STMT (dr));
2259             }
2260         }
2261 
2262       /* Versioning requires at least one misaligned data reference.  */
2263       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2264         do_versioning = false;
2265       else if (!do_versioning)
2266         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2267     }
2268 
2269   if (do_versioning)
2270     {
2271       vec<gimple *> may_misalign_stmts
2272         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2273       gimple *stmt;
2274 
2275       /* It can now be assumed that the data references in the statements
2276          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2277          of the loop being vectorized.  */
2278       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2279         {
2280           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2281           dr = STMT_VINFO_DATA_REF (stmt_info);
2282             SET_DR_MISALIGNMENT (dr, 0);
2283             if (dump_enabled_p ())
2284             dump_printf_loc (MSG_NOTE, vect_location,
2285                              "Alignment of access forced using versioning.\n");
2286         }
2287 
2288       if (dump_enabled_p ())
2289         dump_printf_loc (MSG_NOTE, vect_location,
2290                          "Versioning for alignment will be applied.\n");
2291 
2292       /* Peeling and versioning can't be done together at this time.  */
2293       gcc_assert (! (do_peeling && do_versioning));
2294 
2295       stat = vect_verify_datarefs_alignment (loop_vinfo);
2296       gcc_assert (stat);
2297       return stat;
2298     }
2299 
2300   /* This point is reached if neither peeling nor versioning is being done.  */
2301   gcc_assert (! (do_peeling || do_versioning));
2302 
2303   stat = vect_verify_datarefs_alignment (loop_vinfo);
2304   return stat;
2305 }
2306 
2307 
2308 /* Function vect_find_same_alignment_drs.
2309 
2310    Update group and alignment relations according to the chosen
2311    vectorization factor.  */
2312 
2313 static void
vect_find_same_alignment_drs(struct data_dependence_relation * ddr)2314 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2315 {
2316   struct data_reference *dra = DDR_A (ddr);
2317   struct data_reference *drb = DDR_B (ddr);
2318   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2319   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2320 
2321   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2322     return;
2323 
2324   if (dra == drb)
2325     return;
2326 
2327   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2328       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2329       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2330     return;
2331 
2332   /* Two references with distance zero have the same alignment.  */
2333   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2334                                 - wi::to_poly_offset (DR_INIT (drb)));
2335   if (maybe_ne (diff, 0))
2336     {
2337       /* Get the wider of the two alignments.  */
2338       unsigned int align_a = (vect_calculate_target_alignment (dra)
2339                                     / BITS_PER_UNIT);
2340       unsigned int align_b = (vect_calculate_target_alignment (drb)
2341                                     / BITS_PER_UNIT);
2342       unsigned int max_align = MAX (align_a, align_b);
2343 
2344       /* Require the gap to be a multiple of the larger vector alignment.  */
2345       if (!multiple_p (diff, max_align))
2346           return;
2347     }
2348 
2349   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2350   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2351   if (dump_enabled_p ())
2352     {
2353       dump_printf_loc (MSG_NOTE, vect_location,
2354                            "accesses have the same alignment: ");
2355       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2356       dump_printf (MSG_NOTE,  " and ");
2357       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2358       dump_printf (MSG_NOTE, "\n");
2359     }
2360 }
2361 
2362 
2363 /* Function vect_analyze_data_refs_alignment
2364 
2365    Analyze the alignment of the data-references in the loop.
2366    Return FALSE if a data reference is found that cannot be vectorized.  */
2367 
2368 bool
vect_analyze_data_refs_alignment(loop_vec_info vinfo)2369 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2370 {
2371   if (dump_enabled_p ())
2372     dump_printf_loc (MSG_NOTE, vect_location,
2373                      "=== vect_analyze_data_refs_alignment ===\n");
2374 
2375   /* Mark groups of data references with same alignment using
2376      data dependence information.  */
2377   vec<ddr_p> ddrs = vinfo->ddrs;
2378   struct data_dependence_relation *ddr;
2379   unsigned int i;
2380 
2381   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2382     vect_find_same_alignment_drs (ddr);
2383 
2384   vec<data_reference_p> datarefs = vinfo->datarefs;
2385   struct data_reference *dr;
2386 
2387   vect_record_base_alignments (vinfo);
2388   FOR_EACH_VEC_ELT (datarefs, i, dr)
2389     {
2390       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2391       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2392             && !vect_compute_data_ref_alignment (dr))
2393           {
2394             /* Strided accesses perform only component accesses, misalignment
2395                information is irrelevant for them.  */
2396             if (STMT_VINFO_STRIDED_P (stmt_info)
2397                 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2398               continue;
2399 
2400             if (dump_enabled_p ())
2401               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2402                                    "not vectorized: can't calculate alignment "
2403                                    "for data ref.\n");
2404 
2405             return false;
2406           }
2407     }
2408 
2409   return true;
2410 }
2411 
2412 
2413 /* Analyze alignment of DRs of stmts in NODE.  */
2414 
2415 static bool
vect_slp_analyze_and_verify_node_alignment(slp_tree node)2416 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2417 {
2418   /* We vectorize from the first scalar stmt in the node unless
2419      the node is permuted in which case we start from the first
2420      element in the group.  */
2421   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2422   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2423   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2424     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2425 
2426   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2427   if (! vect_compute_data_ref_alignment (dr)
2428       /* For creating the data-ref pointer we need alignment of the
2429            first element anyway.  */
2430       || (dr != first_dr
2431             && ! vect_compute_data_ref_alignment (first_dr))
2432       || ! verify_data_ref_alignment (dr))
2433     {
2434       if (dump_enabled_p ())
2435           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2436                                "not vectorized: bad data alignment in basic "
2437                                "block.\n");
2438       return false;
2439     }
2440 
2441   return true;
2442 }
2443 
2444 /* Function vect_slp_analyze_instance_alignment
2445 
2446    Analyze the alignment of the data-references in the SLP instance.
2447    Return FALSE if a data reference is found that cannot be vectorized.  */
2448 
2449 bool
vect_slp_analyze_and_verify_instance_alignment(slp_instance instance)2450 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2451 {
2452   if (dump_enabled_p ())
2453     dump_printf_loc (MSG_NOTE, vect_location,
2454                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2455 
2456   slp_tree node;
2457   unsigned i;
2458   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2459     if (! vect_slp_analyze_and_verify_node_alignment (node))
2460       return false;
2461 
2462   node = SLP_INSTANCE_TREE (instance);
2463   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2464       && ! vect_slp_analyze_and_verify_node_alignment
2465                (SLP_INSTANCE_TREE (instance)))
2466     return false;
2467 
2468   return true;
2469 }
2470 
2471 
2472 /* Analyze groups of accesses: check that DR belongs to a group of
2473    accesses of legal size, step, etc.  Detect gaps, single element
2474    interleaving, and other special cases. Set grouped access info.
2475    Collect groups of strided stores for further use in SLP analysis.
2476    Worker for vect_analyze_group_access.  */
2477 
2478 static bool
vect_analyze_group_access_1(struct data_reference * dr)2479 vect_analyze_group_access_1 (struct data_reference *dr)
2480 {
2481   tree step = DR_STEP (dr);
2482   tree scalar_type = TREE_TYPE (DR_REF (dr));
2483   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2484   gimple *stmt = DR_STMT (dr);
2485   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2486   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2487   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2488   HOST_WIDE_INT dr_step = -1;
2489   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2490   bool slp_impossible = false;
2491 
2492   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2493      size of the interleaving group (including gaps).  */
2494   if (tree_fits_shwi_p (step))
2495     {
2496       dr_step = tree_to_shwi (step);
2497       /* Check that STEP is a multiple of type size.  Otherwise there is
2498          a non-element-sized gap at the end of the group which we
2499            cannot represent in GROUP_GAP or GROUP_SIZE.
2500            ???  As we can handle non-constant step fine here we should
2501            simply remove uses of GROUP_GAP between the last and first
2502            element and instead rely on DR_STEP.  GROUP_SIZE then would
2503            simply not include that gap.  */
2504       if ((dr_step % type_size) != 0)
2505           {
2506             if (dump_enabled_p ())
2507               {
2508                 dump_printf_loc (MSG_NOTE, vect_location,
2509                                  "Step ");
2510                 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2511                 dump_printf (MSG_NOTE,
2512                                  " is not a multiple of the element size for ");
2513                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2514                 dump_printf (MSG_NOTE, "\n");
2515               }
2516             return false;
2517           }
2518       groupsize = absu_hwi (dr_step) / type_size;
2519     }
2520   else
2521     groupsize = 0;
2522 
2523   /* Not consecutive access is possible only if it is a part of interleaving.  */
2524   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2525     {
2526       /* Check if it this DR is a part of interleaving, and is a single
2527            element of the group that is accessed in the loop.  */
2528 
2529       /* Gaps are supported only for loads. STEP must be a multiple of the type
2530            size.  */
2531       if (DR_IS_READ (dr)
2532             && (dr_step % type_size) == 0
2533             && groupsize > 0)
2534           {
2535             GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2536             GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2537             GROUP_GAP (stmt_info) = groupsize - 1;
2538             if (dump_enabled_p ())
2539               {
2540                 dump_printf_loc (MSG_NOTE, vect_location,
2541                                  "Detected single element interleaving ");
2542                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2543                 dump_printf (MSG_NOTE, " step ");
2544                 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2545                 dump_printf (MSG_NOTE, "\n");
2546               }
2547 
2548             return true;
2549           }
2550 
2551       if (dump_enabled_p ())
2552         {
2553             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554                              "not consecutive access ");
2555             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2556         }
2557 
2558       if (bb_vinfo)
2559         {
2560           /* Mark the statement as unvectorizable.  */
2561           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2562           return true;
2563         }
2564 
2565       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2566       STMT_VINFO_STRIDED_P (stmt_info) = true;
2567       return true;
2568     }
2569 
2570   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2571     {
2572       /* First stmt in the interleaving chain. Check the chain.  */
2573       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2574       struct data_reference *data_ref = dr;
2575       unsigned int count = 1;
2576       tree prev_init = DR_INIT (data_ref);
2577       gimple *prev = stmt;
2578       HOST_WIDE_INT diff, gaps = 0;
2579 
2580       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2581       while (next)
2582         {
2583           /* Skip same data-refs.  In case that two or more stmts share
2584              data-ref (supported only for loads), we vectorize only the first
2585              stmt, and the rest get their vectorized loads from the first
2586              one.  */
2587           if (!tree_int_cst_compare (DR_INIT (data_ref),
2588                                      DR_INIT (STMT_VINFO_DATA_REF (
2589                                                                vinfo_for_stmt (next)))))
2590             {
2591               if (DR_IS_WRITE (data_ref))
2592                 {
2593                   if (dump_enabled_p ())
2594                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2595                                      "Two store stmts share the same dr.\n");
2596                   return false;
2597                 }
2598 
2599                 if (dump_enabled_p ())
2600                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601                                          "Two or more load stmts share the same dr.\n");
2602 
2603               /* For load use the same data-ref load.  */
2604               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2605 
2606               prev = next;
2607               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2608               continue;
2609             }
2610 
2611           prev = next;
2612           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2613 
2614             /* All group members have the same STEP by construction.  */
2615             gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2616 
2617           /* Check that the distance between two accesses is equal to the type
2618              size. Otherwise, we have gaps.  */
2619           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2620                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2621             if (diff != 1)
2622               {
2623                 /* FORNOW: SLP of accesses with gaps is not supported.  */
2624                 slp_impossible = true;
2625                 if (DR_IS_WRITE (data_ref))
2626                     {
2627                   if (dump_enabled_p ())
2628                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2629                                      "interleaved store with gaps\n");
2630                       return false;
2631                     }
2632 
2633               gaps += diff - 1;
2634               }
2635 
2636             last_accessed_element += diff;
2637 
2638           /* Store the gap from the previous member of the group. If there is no
2639              gap in the access, GROUP_GAP is always 1.  */
2640           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2641 
2642           prev_init = DR_INIT (data_ref);
2643           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2644           /* Count the number of data-refs in the chain.  */
2645           count++;
2646         }
2647 
2648       if (groupsize == 0)
2649         groupsize = count + gaps;
2650 
2651       /* This could be UINT_MAX but as we are generating code in a very
2652          inefficient way we have to cap earlier.  See PR78699 for example.  */
2653       if (groupsize > 4096)
2654           {
2655             if (dump_enabled_p ())
2656               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2657                                    "group is too large\n");
2658             return false;
2659           }
2660 
2661       /* Check that the size of the interleaving is equal to count for stores,
2662          i.e., that there are no gaps.  */
2663       if (groupsize != count
2664             && !DR_IS_READ (dr))
2665         {
2666             if (dump_enabled_p ())
2667               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2668                                    "interleaved store with gaps\n");
2669             return false;
2670           }
2671 
2672       /* If there is a gap after the last load in the group it is the
2673            difference between the groupsize and the last accessed
2674            element.
2675            When there is no gap, this difference should be 0.  */
2676       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2677 
2678       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2679       if (dump_enabled_p ())
2680           {
2681             dump_printf_loc (MSG_NOTE, vect_location,
2682                                  "Detected interleaving ");
2683             if (DR_IS_READ (dr))
2684               dump_printf (MSG_NOTE, "load ");
2685             else
2686               dump_printf (MSG_NOTE, "store ");
2687             dump_printf (MSG_NOTE, "of size %u starting with ",
2688                            (unsigned)groupsize);
2689             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2690             if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2691               dump_printf_loc (MSG_NOTE, vect_location,
2692                                    "There is a gap of %u elements after the group\n",
2693                                    GROUP_GAP (vinfo_for_stmt (stmt)));
2694           }
2695 
2696       /* SLP: create an SLP data structure for every interleaving group of
2697            stores for further analysis in vect_analyse_slp.  */
2698       if (DR_IS_WRITE (dr) && !slp_impossible)
2699         {
2700           if (loop_vinfo)
2701             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2702           if (bb_vinfo)
2703             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2704         }
2705     }
2706 
2707   return true;
2708 }
2709 
2710 /* Analyze groups of accesses: check that DR belongs to a group of
2711    accesses of legal size, step, etc.  Detect gaps, single element
2712    interleaving, and other special cases. Set grouped access info.
2713    Collect groups of strided stores for further use in SLP analysis.  */
2714 
2715 static bool
vect_analyze_group_access(struct data_reference * dr)2716 vect_analyze_group_access (struct data_reference *dr)
2717 {
2718   if (!vect_analyze_group_access_1 (dr))
2719     {
2720       /* Dissolve the group if present.  */
2721       gimple *next;
2722       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2723       while (stmt)
2724           {
2725             stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2726             next = GROUP_NEXT_ELEMENT (vinfo);
2727             GROUP_FIRST_ELEMENT (vinfo) = NULL;
2728             GROUP_NEXT_ELEMENT (vinfo) = NULL;
2729             stmt = next;
2730           }
2731       return false;
2732     }
2733   return true;
2734 }
2735 
2736 /* Analyze the access pattern of the data-reference DR.
2737    In case of non-consecutive accesses call vect_analyze_group_access() to
2738    analyze groups of accesses.  */
2739 
2740 static bool
vect_analyze_data_ref_access(struct data_reference * dr)2741 vect_analyze_data_ref_access (struct data_reference *dr)
2742 {
2743   tree step = DR_STEP (dr);
2744   tree scalar_type = TREE_TYPE (DR_REF (dr));
2745   gimple *stmt = DR_STMT (dr);
2746   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2747   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2748   struct loop *loop = NULL;
2749 
2750   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2751     return true;
2752 
2753   if (loop_vinfo)
2754     loop = LOOP_VINFO_LOOP (loop_vinfo);
2755 
2756   if (loop_vinfo && !step)
2757     {
2758       if (dump_enabled_p ())
2759           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760                            "bad data-ref access in loop\n");
2761       return false;
2762     }
2763 
2764   /* Allow loads with zero step in inner-loop vectorization.  */
2765   if (loop_vinfo && integer_zerop (step))
2766     {
2767       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2768       if (!nested_in_vect_loop_p (loop, stmt))
2769           return DR_IS_READ (dr);
2770       /* Allow references with zero step for outer loops marked
2771            with pragma omp simd only - it guarantees absence of
2772            loop-carried dependencies between inner loop iterations.  */
2773       if (loop->safelen < 2)
2774           {
2775             if (dump_enabled_p ())
2776               dump_printf_loc (MSG_NOTE, vect_location,
2777                                    "zero step in inner loop of nest\n");
2778             return false;
2779           }
2780     }
2781 
2782   if (loop && nested_in_vect_loop_p (loop, stmt))
2783     {
2784       /* Interleaved accesses are not yet supported within outer-loop
2785         vectorization for references in the inner-loop.  */
2786       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2787 
2788       /* For the rest of the analysis we use the outer-loop step.  */
2789       step = STMT_VINFO_DR_STEP (stmt_info);
2790       if (integer_zerop (step))
2791           {
2792             if (dump_enabled_p ())
2793               dump_printf_loc (MSG_NOTE, vect_location,
2794                                "zero step in outer loop.\n");
2795             return DR_IS_READ (dr);
2796           }
2797     }
2798 
2799   /* Consecutive?  */
2800   if (TREE_CODE (step) == INTEGER_CST)
2801     {
2802       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2803       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2804             || (dr_step < 0
2805                 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2806           {
2807             /* Mark that it is not interleaving.  */
2808             GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2809             return true;
2810           }
2811     }
2812 
2813   if (loop && nested_in_vect_loop_p (loop, stmt))
2814     {
2815       if (dump_enabled_p ())
2816           dump_printf_loc (MSG_NOTE, vect_location,
2817                            "grouped access in outer loop.\n");
2818       return false;
2819     }
2820 
2821 
2822   /* Assume this is a DR handled by non-constant strided load case.  */
2823   if (TREE_CODE (step) != INTEGER_CST)
2824     return (STMT_VINFO_STRIDED_P (stmt_info)
2825               && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2826                     || vect_analyze_group_access (dr)));
2827 
2828   /* Not consecutive access - check if it's a part of interleaving group.  */
2829   return vect_analyze_group_access (dr);
2830 }
2831 
2832 /* Compare two data-references DRA and DRB to group them into chunks
2833    suitable for grouping.  */
2834 
2835 static int
dr_group_sort_cmp(const void * dra_,const void * drb_)2836 dr_group_sort_cmp (const void *dra_, const void *drb_)
2837 {
2838   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2839   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2840   int cmp;
2841 
2842   /* Stabilize sort.  */
2843   if (dra == drb)
2844     return 0;
2845 
2846   /* DRs in different loops never belong to the same group.  */
2847   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2848   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2849   if (loopa != loopb)
2850     return loopa->num < loopb->num ? -1 : 1;
2851 
2852   /* Ordering of DRs according to base.  */
2853   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2854                                      DR_BASE_ADDRESS (drb));
2855   if (cmp != 0)
2856     return cmp;
2857 
2858   /* And according to DR_OFFSET.  */
2859   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2860   if (cmp != 0)
2861     return cmp;
2862 
2863   /* Put reads before writes.  */
2864   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2865     return DR_IS_READ (dra) ? -1 : 1;
2866 
2867   /* Then sort after access size.  */
2868   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2869                                      TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2870   if (cmp != 0)
2871     return cmp;
2872 
2873   /* And after step.  */
2874   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2875   if (cmp != 0)
2876     return cmp;
2877 
2878   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2879   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2880   if (cmp == 0)
2881     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2882   return cmp;
2883 }
2884 
2885 /* If OP is the result of a conversion, return the unconverted value,
2886    otherwise return null.  */
2887 
2888 static tree
strip_conversion(tree op)2889 strip_conversion (tree op)
2890 {
2891   if (TREE_CODE (op) != SSA_NAME)
2892     return NULL_TREE;
2893   gimple *stmt = SSA_NAME_DEF_STMT (op);
2894   if (!is_gimple_assign (stmt)
2895       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2896     return NULL_TREE;
2897   return gimple_assign_rhs1 (stmt);
2898 }
2899 
2900 /* Return true if vectorizable_* routines can handle statements STMT1
2901    and STMT2 being in a single group.  */
2902 
2903 static bool
can_group_stmts_p(gimple * stmt1,gimple * stmt2)2904 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2905 {
2906   if (gimple_assign_single_p (stmt1))
2907     return gimple_assign_single_p (stmt2);
2908 
2909   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2910     {
2911       /* Check for two masked loads or two masked stores.  */
2912       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2913           return false;
2914       internal_fn ifn = gimple_call_internal_fn (stmt1);
2915       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2916           return false;
2917       if (ifn != gimple_call_internal_fn (stmt2))
2918           return false;
2919 
2920       /* Check that the masks are the same.  Cope with casts of masks,
2921            like those created by build_mask_conversion.  */
2922       tree mask1 = gimple_call_arg (stmt1, 2);
2923       tree mask2 = gimple_call_arg (stmt2, 2);
2924       if (!operand_equal_p (mask1, mask2, 0))
2925           {
2926             mask1 = strip_conversion (mask1);
2927             if (!mask1)
2928               return false;
2929             mask2 = strip_conversion (mask2);
2930             if (!mask2)
2931               return false;
2932             if (!operand_equal_p (mask1, mask2, 0))
2933               return false;
2934           }
2935       return true;
2936     }
2937 
2938   return false;
2939 }
2940 
2941 /* Function vect_analyze_data_ref_accesses.
2942 
2943    Analyze the access pattern of all the data references in the loop.
2944 
2945    FORNOW: the only access pattern that is considered vectorizable is a
2946              simple step 1 (consecutive) access.
2947 
2948    FORNOW: handle only arrays and pointer accesses.  */
2949 
2950 bool
vect_analyze_data_ref_accesses(vec_info * vinfo)2951 vect_analyze_data_ref_accesses (vec_info *vinfo)
2952 {
2953   unsigned int i;
2954   vec<data_reference_p> datarefs = vinfo->datarefs;
2955   struct data_reference *dr;
2956 
2957   if (dump_enabled_p ())
2958     dump_printf_loc (MSG_NOTE, vect_location,
2959                      "=== vect_analyze_data_ref_accesses ===\n");
2960 
2961   if (datarefs.is_empty ())
2962     return true;
2963 
2964   /* Sort the array of datarefs to make building the interleaving chains
2965      linear.  Don't modify the original vector's order, it is needed for
2966      determining what dependencies are reversed.  */
2967   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2968   datarefs_copy.qsort (dr_group_sort_cmp);
2969 
2970   /* Build the interleaving chains.  */
2971   for (i = 0; i < datarefs_copy.length () - 1;)
2972     {
2973       data_reference_p dra = datarefs_copy[i];
2974       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2975       stmt_vec_info lastinfo = NULL;
2976       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2977             || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2978           {
2979             ++i;
2980             continue;
2981           }
2982       for (i = i + 1; i < datarefs_copy.length (); ++i)
2983           {
2984             data_reference_p drb = datarefs_copy[i];
2985             stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2986             if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2987                 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2988               break;
2989 
2990             /* ???  Imperfect sorting (non-compatible types, non-modulo
2991                accesses, same accesses) can lead to a group to be artificially
2992                split here as we don't just skip over those.  If it really
2993                matters we can push those to a worklist and re-iterate
2994                over them.  The we can just skip ahead to the next DR here.  */
2995 
2996             /* DRs in a different loop should not be put into the same
2997                interleaving group.  */
2998             if (gimple_bb (DR_STMT (dra))->loop_father
2999                 != gimple_bb (DR_STMT (drb))->loop_father)
3000               break;
3001 
3002             /* Check that the data-refs have same first location (except init)
3003                and they are both either store or load (not load and store,
3004                not masked loads or stores).  */
3005             if (DR_IS_READ (dra) != DR_IS_READ (drb)
3006                 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3007                                                   DR_BASE_ADDRESS (drb)) != 0
3008                 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3009                 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
3010               break;
3011 
3012             /* Check that the data-refs have the same constant size.  */
3013             tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3014             tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3015             if (!tree_fits_uhwi_p (sza)
3016                 || !tree_fits_uhwi_p (szb)
3017                 || !tree_int_cst_equal (sza, szb))
3018               break;
3019 
3020             /* Check that the data-refs have the same step.  */
3021             if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3022               break;
3023 
3024             /* Check the types are compatible.
3025                ???  We don't distinguish this during sorting.  */
3026             if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3027                                            TREE_TYPE (DR_REF (drb))))
3028               break;
3029 
3030             /* Check that the DR_INITs are compile-time constants.  */
3031             if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3032                 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3033               break;
3034 
3035             /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3036             HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3037             HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3038             HOST_WIDE_INT init_prev
3039               = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3040             gcc_assert (init_a <= init_b
3041                           && init_a <= init_prev
3042                           && init_prev <= init_b);
3043 
3044             /* Do not place the same access in the interleaving chain twice.  */
3045             if (init_b == init_prev)
3046               {
3047                 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3048                                 < gimple_uid (DR_STMT (drb)));
3049                 /* ???  For now we simply "drop" the later reference which is
3050                    otherwise the same rather than finishing off this group.
3051                      In the end we'd want to re-process duplicates forming
3052                      multiple groups from the refs, likely by just collecting
3053                      all candidates (including duplicates and split points
3054                      below) in a vector and then process them together.  */
3055                 continue;
3056               }
3057 
3058             /* If init_b == init_a + the size of the type * k, we have an
3059                interleaving, and DRA is accessed before DRB.  */
3060             HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3061             if (type_size_a == 0
3062                 || (init_b - init_a) % type_size_a != 0)
3063               break;
3064 
3065             /* If we have a store, the accesses are adjacent.  This splits
3066                groups into chunks we support (we don't support vectorization
3067                of stores with gaps).  */
3068             if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3069               break;
3070 
3071             /* If the step (if not zero or non-constant) is greater than the
3072                difference between data-refs' inits this splits groups into
3073                suitable sizes.  */
3074             if (tree_fits_shwi_p (DR_STEP (dra)))
3075               {
3076                 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3077                 if (step != 0 && step <= (init_b - init_a))
3078                     break;
3079               }
3080 
3081             if (dump_enabled_p ())
3082               {
3083                 dump_printf_loc (MSG_NOTE, vect_location,
3084                                      "Detected interleaving ");
3085                 if (DR_IS_READ (dra))
3086                     dump_printf (MSG_NOTE, "load ");
3087                 else
3088                     dump_printf (MSG_NOTE, "store ");
3089                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3090                 dump_printf (MSG_NOTE,  " and ");
3091                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3092                 dump_printf (MSG_NOTE, "\n");
3093               }
3094 
3095             /* Link the found element into the group list.  */
3096             if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3097               {
3098                 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3099                 lastinfo = stmtinfo_a;
3100               }
3101             GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3102             GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3103             lastinfo = stmtinfo_b;
3104           }
3105     }
3106 
3107   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3108     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3109         && !vect_analyze_data_ref_access (dr))
3110       {
3111           if (dump_enabled_p ())
3112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3113                              "not vectorized: complicated access pattern.\n");
3114 
3115         if (is_a <bb_vec_info> (vinfo))
3116           {
3117             /* Mark the statement as not vectorizable.  */
3118             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3119             continue;
3120           }
3121         else
3122             {
3123               datarefs_copy.release ();
3124               return false;
3125             }
3126       }
3127 
3128   datarefs_copy.release ();
3129   return true;
3130 }
3131 
3132 /* Function vect_vfa_segment_size.
3133 
3134    Input:
3135      DR: The data reference.
3136      LENGTH_FACTOR: segment length to consider.
3137 
3138    Return a value suitable for the dr_with_seg_len::seg_len field.
3139    This is the "distance travelled" by the pointer from the first
3140    iteration in the segment to the last.  Note that it does not include
3141    the size of the access; in effect it only describes the first byte.  */
3142 
3143 static tree
vect_vfa_segment_size(struct data_reference * dr,tree length_factor)3144 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3145 {
3146   length_factor = size_binop (MINUS_EXPR,
3147                                     fold_convert (sizetype, length_factor),
3148                                     size_one_node);
3149   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3150                          length_factor);
3151 }
3152 
3153 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3154    gives the worst-case number of bytes covered by the segment.  */
3155 
3156 static unsigned HOST_WIDE_INT
vect_vfa_access_size(data_reference * dr)3157 vect_vfa_access_size (data_reference *dr)
3158 {
3159   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3160   tree ref_type = TREE_TYPE (DR_REF (dr));
3161   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3162   unsigned HOST_WIDE_INT access_size = ref_size;
3163   if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3164     {
3165       gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3166       access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3167     }
3168   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3169       && (vect_supportable_dr_alignment (dr, false)
3170             == dr_explicit_realign_optimized))
3171     {
3172       /* We might access a full vector's worth.  */
3173       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3174       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3175     }
3176   return access_size;
3177 }
3178 
3179 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3180 
3181 static unsigned int
vect_vfa_align(const data_reference * dr)3182 vect_vfa_align (const data_reference *dr)
3183 {
3184   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3185 }
3186 
3187 /* Function vect_no_alias_p.
3188 
3189    Given data references A and B with equal base and offset, see whether
3190    the alias relation can be decided at compilation time.  Return 1 if
3191    it can and the references alias, 0 if it can and the references do
3192    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3193    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3194    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3195 
3196 static int
vect_compile_time_alias(struct data_reference * a,struct data_reference * b,tree segment_length_a,tree segment_length_b,unsigned HOST_WIDE_INT access_size_a,unsigned HOST_WIDE_INT access_size_b)3197 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3198                                tree segment_length_a, tree segment_length_b,
3199                                unsigned HOST_WIDE_INT access_size_a,
3200                                unsigned HOST_WIDE_INT access_size_b)
3201 {
3202   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3203   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3204   poly_uint64 const_length_a;
3205   poly_uint64 const_length_b;
3206 
3207   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3208      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3209      [a, a+12) */
3210   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3211     {
3212       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3213       offset_a = (offset_a + access_size_a) - const_length_a;
3214     }
3215   else
3216     const_length_a = tree_to_poly_uint64 (segment_length_a);
3217   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3218     {
3219       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3220       offset_b = (offset_b + access_size_b) - const_length_b;
3221     }
3222   else
3223     const_length_b = tree_to_poly_uint64 (segment_length_b);
3224 
3225   const_length_a += access_size_a;
3226   const_length_b += access_size_b;
3227 
3228   if (ranges_known_overlap_p (offset_a, const_length_a,
3229                                     offset_b, const_length_b))
3230     return 1;
3231 
3232   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3233                                      offset_b, const_length_b))
3234     return 0;
3235 
3236   return -1;
3237 }
3238 
3239 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3240    in DDR is >= VF.  */
3241 
3242 static bool
dependence_distance_ge_vf(data_dependence_relation * ddr,unsigned int loop_depth,poly_uint64 vf)3243 dependence_distance_ge_vf (data_dependence_relation *ddr,
3244                                  unsigned int loop_depth, poly_uint64 vf)
3245 {
3246   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3247       || DDR_NUM_DIST_VECTS (ddr) == 0)
3248     return false;
3249 
3250   /* If the dependence is exact, we should have limited the VF instead.  */
3251   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3252 
3253   unsigned int i;
3254   lambda_vector dist_v;
3255   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3256     {
3257       HOST_WIDE_INT dist = dist_v[loop_depth];
3258       if (dist != 0
3259             && !(dist > 0 && DDR_REVERSED_P (ddr))
3260             && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3261           return false;
3262     }
3263 
3264   if (dump_enabled_p ())
3265     {
3266       dump_printf_loc (MSG_NOTE, vect_location,
3267                            "dependence distance between ");
3268       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3269       dump_printf (MSG_NOTE,  " and ");
3270       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3271       dump_printf (MSG_NOTE,  " is >= VF\n");
3272     }
3273 
3274   return true;
3275 }
3276 
3277 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3278 
3279 static void
dump_lower_bound(int dump_kind,const vec_lower_bound & lower_bound)3280 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3281 {
3282   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3283   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3284   dump_printf (dump_kind, ") >= ");
3285   dump_dec (dump_kind, lower_bound.min_value);
3286 }
3287 
3288 /* Record that the vectorized loop requires the vec_lower_bound described
3289    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3290 
3291 static void
vect_check_lower_bound(loop_vec_info loop_vinfo,tree expr,bool unsigned_p,poly_uint64 min_value)3292 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3293                               poly_uint64 min_value)
3294 {
3295   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3296   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3297     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3298       {
3299           unsigned_p &= lower_bounds[i].unsigned_p;
3300           min_value = upper_bound (lower_bounds[i].min_value, min_value);
3301           if (lower_bounds[i].unsigned_p != unsigned_p
3302               || maybe_lt (lower_bounds[i].min_value, min_value))
3303             {
3304               lower_bounds[i].unsigned_p = unsigned_p;
3305               lower_bounds[i].min_value = min_value;
3306               if (dump_enabled_p ())
3307                 {
3308                     dump_printf_loc (MSG_NOTE, vect_location,
3309                                          "updating run-time check to ");
3310                     dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3311                     dump_printf (MSG_NOTE, "\n");
3312                 }
3313             }
3314           return;
3315       }
3316 
3317   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3318   if (dump_enabled_p ())
3319     {
3320       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3321       dump_lower_bound (MSG_NOTE, lower_bound);
3322       dump_printf (MSG_NOTE, "\n");
3323     }
3324   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3325 }
3326 
3327 /* Return true if it's unlikely that the step of the vectorized form of DR
3328    will span fewer than GAP bytes.  */
3329 
3330 static bool
vect_small_gap_p(loop_vec_info loop_vinfo,data_reference * dr,poly_int64 gap)3331 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3332 {
3333   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3334   HOST_WIDE_INT count
3335     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3336   if (GROUP_FIRST_ELEMENT (stmt_info))
3337     count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3338   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3339 }
3340 
3341 /* Return true if we know that there is no alias between DR_A and DR_B
3342    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3343    *LOWER_BOUND_OUT to this N.  */
3344 
3345 static bool
vectorizable_with_step_bound_p(data_reference * dr_a,data_reference * dr_b,poly_uint64 * lower_bound_out)3346 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3347                                         poly_uint64 *lower_bound_out)
3348 {
3349   /* Check that there is a constant gap of known sign between DR_A
3350      and DR_B.  */
3351   poly_int64 init_a, init_b;
3352   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3353       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3354       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3355       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3356       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3357       || !ordered_p (init_a, init_b))
3358     return false;
3359 
3360   /* Sort DR_A and DR_B by the address they access.  */
3361   if (maybe_lt (init_b, init_a))
3362     {
3363       std::swap (init_a, init_b);
3364       std::swap (dr_a, dr_b);
3365     }
3366 
3367   /* If the two accesses could be dependent within a scalar iteration,
3368      make sure that we'd retain their order.  */
3369   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3370       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3371     return false;
3372 
3373   /* There is no alias if abs (DR_STEP) is greater than or equal to
3374      the bytes spanned by the combination of the two accesses.  */
3375   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3376   return true;
3377 }
3378 
3379 /* Function vect_prune_runtime_alias_test_list.
3380 
3381    Prune a list of ddrs to be tested at run-time by versioning for alias.
3382    Merge several alias checks into one if possible.
3383    Return FALSE if resulting list of ddrs is longer then allowed by
3384    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3385 
3386 bool
vect_prune_runtime_alias_test_list(loop_vec_info loop_vinfo)3387 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3388 {
3389   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3390   hash_set <tree_pair_hash> compared_objects;
3391 
3392   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3393   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3394     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3395   vec<vec_object_pair> &check_unequal_addrs
3396     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3397   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3398   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3399 
3400   ddr_p ddr;
3401   unsigned int i;
3402   tree length_factor;
3403 
3404   if (dump_enabled_p ())
3405     dump_printf_loc (MSG_NOTE, vect_location,
3406                      "=== vect_prune_runtime_alias_test_list ===\n");
3407 
3408   /* Step values are irrelevant for aliasing if the number of vector
3409      iterations is equal to the number of scalar iterations (which can
3410      happen for fully-SLP loops).  */
3411   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3412 
3413   if (!ignore_step_p)
3414     {
3415       /* Convert the checks for nonzero steps into bound tests.  */
3416       tree value;
3417       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3418           vect_check_lower_bound (loop_vinfo, value, true, 1);
3419     }
3420 
3421   if (may_alias_ddrs.is_empty ())
3422     return true;
3423 
3424   comp_alias_ddrs.create (may_alias_ddrs.length ());
3425 
3426   unsigned int loop_depth
3427     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3428                                 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3429 
3430   /* First, we collect all data ref pairs for aliasing checks.  */
3431   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3432     {
3433       int comp_res;
3434       poly_uint64 lower_bound;
3435       struct data_reference *dr_a, *dr_b;
3436       gimple *dr_group_first_a, *dr_group_first_b;
3437       tree segment_length_a, segment_length_b;
3438       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3439       unsigned int align_a, align_b;
3440       gimple *stmt_a, *stmt_b;
3441 
3442       /* Ignore the alias if the VF we chose ended up being no greater
3443            than the dependence distance.  */
3444       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3445           continue;
3446 
3447       if (DDR_OBJECT_A (ddr))
3448           {
3449             vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3450             if (!compared_objects.add (new_pair))
3451               {
3452                 if (dump_enabled_p ())
3453                     {
3454                       dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3455                       dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3456                       dump_printf (MSG_NOTE, " and ");
3457                       dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3458                       dump_printf (MSG_NOTE, " have different addresses\n");
3459                     }
3460                 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3461               }
3462             continue;
3463           }
3464 
3465       dr_a = DDR_A (ddr);
3466       stmt_a = DR_STMT (DDR_A (ddr));
3467 
3468       dr_b = DDR_B (ddr);
3469       stmt_b = DR_STMT (DDR_B (ddr));
3470 
3471       /* Skip the pair if inter-iteration dependencies are irrelevant
3472            and intra-iteration dependencies are guaranteed to be honored.  */
3473       if (ignore_step_p
3474             && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3475                 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3476           {
3477             if (dump_enabled_p ())
3478               {
3479                 dump_printf_loc (MSG_NOTE, vect_location,
3480                                      "no need for alias check between ");
3481                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3482                 dump_printf (MSG_NOTE, " and ");
3483                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3484                 dump_printf (MSG_NOTE, " when VF is 1\n");
3485               }
3486             continue;
3487           }
3488 
3489       /* See whether we can handle the alias using a bounds check on
3490            the step, and whether that's likely to be the best approach.
3491            (It might not be, for example, if the minimum step is much larger
3492            than the number of bytes handled by one vector iteration.)  */
3493       if (!ignore_step_p
3494             && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3495             && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3496             && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3497                 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3498           {
3499             bool unsigned_p = dr_known_forward_stride_p (dr_a);
3500             if (dump_enabled_p ())
3501               {
3502                 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3503                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3504                 dump_printf (MSG_NOTE, " and ");
3505                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3506                 dump_printf (MSG_NOTE, " when the step ");
3507                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3508                 dump_printf (MSG_NOTE, " is outside ");
3509                 if (unsigned_p)
3510                     dump_printf (MSG_NOTE, "[0");
3511                 else
3512                     {
3513                       dump_printf (MSG_NOTE, "(");
3514                       dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3515                     }
3516                 dump_printf (MSG_NOTE, ", ");
3517                 dump_dec (MSG_NOTE, lower_bound);
3518                 dump_printf (MSG_NOTE, ")\n");
3519               }
3520             vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3521                                           lower_bound);
3522             continue;
3523           }
3524 
3525       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3526       if (dr_group_first_a)
3527           {
3528             stmt_a = dr_group_first_a;
3529             dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3530           }
3531 
3532       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3533       if (dr_group_first_b)
3534           {
3535             stmt_b = dr_group_first_b;
3536             dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3537           }
3538 
3539       if (ignore_step_p)
3540           {
3541             segment_length_a = size_zero_node;
3542             segment_length_b = size_zero_node;
3543           }
3544       else
3545           {
3546             if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3547               length_factor = scalar_loop_iters;
3548             else
3549               length_factor = size_int (vect_factor);
3550             segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3551             segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3552           }
3553       access_size_a = vect_vfa_access_size (dr_a);
3554       access_size_b = vect_vfa_access_size (dr_b);
3555       align_a = vect_vfa_align (dr_a);
3556       align_b = vect_vfa_align (dr_b);
3557 
3558       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3559                                                   DR_BASE_ADDRESS (dr_b));
3560       if (comp_res == 0)
3561           comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3562                                                     DR_OFFSET (dr_b));
3563 
3564       /* See whether the alias is known at compilation time.  */
3565       if (comp_res == 0
3566             && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3567             && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3568             && poly_int_tree_p (segment_length_a)
3569             && poly_int_tree_p (segment_length_b))
3570           {
3571             int res = vect_compile_time_alias (dr_a, dr_b,
3572                                                        segment_length_a,
3573                                                        segment_length_b,
3574                                                        access_size_a,
3575                                                        access_size_b);
3576             if (res >= 0 && dump_enabled_p ())
3577               {
3578                 dump_printf_loc (MSG_NOTE, vect_location,
3579                                      "can tell at compile time that ");
3580                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3581                 dump_printf (MSG_NOTE, " and ");
3582                 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3583                 if (res == 0)
3584                     dump_printf (MSG_NOTE, " do not alias\n");
3585                 else
3586                     dump_printf (MSG_NOTE, " alias\n");
3587               }
3588 
3589             if (res == 0)
3590               continue;
3591 
3592             if (res == 1)
3593               {
3594                 if (dump_enabled_p ())
3595                     dump_printf_loc (MSG_NOTE, vect_location,
3596                                          "not vectorized: compilation time alias.\n");
3597                 return false;
3598               }
3599           }
3600 
3601       dr_with_seg_len_pair_t dr_with_seg_len_pair
3602           (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3603            dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3604 
3605       /* Canonicalize pairs by sorting the two DR members.  */
3606       if (comp_res > 0)
3607           std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3608 
3609       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3610     }
3611 
3612   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3613 
3614   unsigned int count = (comp_alias_ddrs.length ()
3615                               + check_unequal_addrs.length ());
3616 
3617   dump_printf_loc (MSG_NOTE, vect_location,
3618                        "improved number of alias checks from %d to %d\n",
3619                        may_alias_ddrs.length (), count);
3620   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3621     {
3622       if (dump_enabled_p ())
3623           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624                                "number of versioning for alias "
3625                                "run-time tests exceeds %d "
3626                                "(--param vect-max-version-for-alias-checks)\n",
3627                                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3628       return false;
3629     }
3630 
3631   return true;
3632 }
3633 
3634 /* Check whether we can use an internal function for a gather load
3635    or scatter store.  READ_P is true for loads and false for stores.
3636    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3637    the type of the memory elements being loaded or stored.  OFFSET_BITS
3638    is the number of bits in each scalar offset and OFFSET_SIGN is the
3639    sign of the offset.  SCALE is the amount by which the offset should
3640    be multiplied *after* it has been converted to address width.
3641 
3642    Return true if the function is supported, storing the function
3643    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3644 
3645 bool
vect_gather_scatter_fn_p(bool read_p,bool masked_p,tree vectype,tree memory_type,unsigned int offset_bits,signop offset_sign,int scale,internal_fn * ifn_out,tree * element_type_out)3646 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3647                                 tree memory_type, unsigned int offset_bits,
3648                                 signop offset_sign, int scale,
3649                                 internal_fn *ifn_out, tree *element_type_out)
3650 {
3651   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3652   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3653   if (offset_bits > element_bits)
3654     /* Internal functions require the offset to be the same width as
3655        the vector elements.  We can extend narrower offsets, but it isn't
3656        safe to truncate wider offsets.  */
3657     return false;
3658 
3659   if (element_bits != memory_bits)
3660     /* For now the vector elements must be the same width as the
3661        memory elements.  */
3662     return false;
3663 
3664   /* Work out which function we need.  */
3665   internal_fn ifn;
3666   if (read_p)
3667     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3668   else
3669     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3670 
3671   /* Test whether the target supports this combination.  */
3672   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3673                                                          offset_sign, scale))
3674     return false;
3675 
3676   *ifn_out = ifn;
3677   *element_type_out = TREE_TYPE (vectype);
3678   return true;
3679 }
3680 
3681 /* CALL is a call to an internal gather load or scatter store function.
3682    Describe the operation in INFO.  */
3683 
3684 static void
vect_describe_gather_scatter_call(gcall * call,gather_scatter_info * info)3685 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3686 {
3687   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3688   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3689   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3690 
3691   info->ifn = gimple_call_internal_fn (call);
3692   info->decl = NULL_TREE;
3693   info->base = gimple_call_arg (call, 0);
3694   info->offset = gimple_call_arg (call, 1);
3695   info->offset_dt = vect_unknown_def_type;
3696   info->offset_vectype = NULL_TREE;
3697   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3698   info->element_type = TREE_TYPE (vectype);
3699   info->memory_type = TREE_TYPE (DR_REF (dr));
3700 }
3701 
3702 /* Return true if a non-affine read or write in STMT is suitable for a
3703    gather load or scatter store.  Describe the operation in *INFO if so.  */
3704 
3705 bool
vect_check_gather_scatter(gimple * stmt,loop_vec_info loop_vinfo,gather_scatter_info * info)3706 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3707                                  gather_scatter_info *info)
3708 {
3709   HOST_WIDE_INT scale = 1;
3710   poly_int64 pbitpos, pbitsize;
3711   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3712   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3713   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3714   tree offtype = NULL_TREE;
3715   tree decl = NULL_TREE, base, off;
3716   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3717   tree memory_type = TREE_TYPE (DR_REF (dr));
3718   machine_mode pmode;
3719   int punsignedp, reversep, pvolatilep = 0;
3720   internal_fn ifn;
3721   tree element_type;
3722   bool masked_p = false;
3723 
3724   /* See whether this is already a call to a gather/scatter internal function.
3725      If not, see whether it's a masked load or store.  */
3726   gcall *call = dyn_cast <gcall *> (stmt);
3727   if (call && gimple_call_internal_p (call))
3728     {
3729       ifn = gimple_call_internal_fn (stmt);
3730       if (internal_gather_scatter_fn_p (ifn))
3731           {
3732             vect_describe_gather_scatter_call (call, info);
3733             return true;
3734           }
3735       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3736     }
3737 
3738   /* True if we should aim to use internal functions rather than
3739      built-in functions.  */
3740   bool use_ifn_p = (DR_IS_READ (dr)
3741                         ? supports_vec_gather_load_p ()
3742                         : supports_vec_scatter_store_p ());
3743 
3744   base = DR_REF (dr);
3745   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3746      see if we can use the def stmt of the address.  */
3747   if (masked_p
3748       && TREE_CODE (base) == MEM_REF
3749       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3750       && integer_zerop (TREE_OPERAND (base, 1))
3751       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3752     {
3753       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3754       if (is_gimple_assign (def_stmt)
3755             && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3756           base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3757     }
3758 
3759   /* The gather and scatter builtins need address of the form
3760      loop_invariant + vector * {1, 2, 4, 8}
3761      or
3762      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3763      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3764      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3765      multiplications and additions in it.  To get a vector, we need
3766      a single SSA_NAME that will be defined in the loop and will
3767      contain everything that is not loop invariant and that can be
3768      vectorized.  The following code attempts to find such a preexistng
3769      SSA_NAME OFF and put the loop invariants into a tree BASE
3770      that can be gimplified before the loop.  */
3771   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3772                                     &punsignedp, &reversep, &pvolatilep);
3773   gcc_assert (base && !reversep);
3774   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3775 
3776   if (TREE_CODE (base) == MEM_REF)
3777     {
3778       if (!integer_zerop (TREE_OPERAND (base, 1)))
3779           {
3780             if (off == NULL_TREE)
3781               off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3782             else
3783               off = size_binop (PLUS_EXPR, off,
3784                                     fold_convert (sizetype, TREE_OPERAND (base, 1)));
3785           }
3786       base = TREE_OPERAND (base, 0);
3787     }
3788   else
3789     base = build_fold_addr_expr (base);
3790 
3791   if (off == NULL_TREE)
3792     off = size_zero_node;
3793 
3794   /* If base is not loop invariant, either off is 0, then we start with just
3795      the constant offset in the loop invariant BASE and continue with base
3796      as OFF, otherwise give up.
3797      We could handle that case by gimplifying the addition of base + off
3798      into some SSA_NAME and use that as off, but for now punt.  */
3799   if (!expr_invariant_in_loop_p (loop, base))
3800     {
3801       if (!integer_zerop (off))
3802           return false;
3803       off = base;
3804       base = size_int (pbytepos);
3805     }
3806   /* Otherwise put base + constant offset into the loop invariant BASE
3807      and continue with OFF.  */
3808   else
3809     {
3810       base = fold_convert (sizetype, base);
3811       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3812     }
3813 
3814   /* OFF at this point may be either a SSA_NAME or some tree expression
3815      from get_inner_reference.  Try to peel off loop invariants from it
3816      into BASE as long as possible.  */
3817   STRIP_NOPS (off);
3818   while (offtype == NULL_TREE)
3819     {
3820       enum tree_code code;
3821       tree op0, op1, add = NULL_TREE;
3822 
3823       if (TREE_CODE (off) == SSA_NAME)
3824           {
3825             gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3826 
3827             if (expr_invariant_in_loop_p (loop, off))
3828               return false;
3829 
3830             if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3831               break;
3832 
3833             op0 = gimple_assign_rhs1 (def_stmt);
3834             code = gimple_assign_rhs_code (def_stmt);
3835             op1 = gimple_assign_rhs2 (def_stmt);
3836           }
3837       else
3838           {
3839             if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3840               return false;
3841             code = TREE_CODE (off);
3842             extract_ops_from_tree (off, &code, &op0, &op1);
3843           }
3844       switch (code)
3845           {
3846           case POINTER_PLUS_EXPR:
3847           case PLUS_EXPR:
3848             if (expr_invariant_in_loop_p (loop, op0))
3849               {
3850                 add = op0;
3851                 off = op1;
3852               do_add:
3853                 add = fold_convert (sizetype, add);
3854                 if (scale != 1)
3855                     add = size_binop (MULT_EXPR, add, size_int (scale));
3856                 base = size_binop (PLUS_EXPR, base, add);
3857                 continue;
3858               }
3859             if (expr_invariant_in_loop_p (loop, op1))
3860               {
3861                 add = op1;
3862                 off = op0;
3863                 goto do_add;
3864               }
3865             break;
3866           case MINUS_EXPR:
3867             if (expr_invariant_in_loop_p (loop, op1))
3868               {
3869                 add = fold_convert (sizetype, op1);
3870                 add = size_binop (MINUS_EXPR, size_zero_node, add);
3871                 off = op0;
3872                 goto do_add;
3873               }
3874             break;
3875           case MULT_EXPR:
3876             if (scale == 1 && tree_fits_shwi_p (op1))
3877               {
3878                 int new_scale = tree_to_shwi (op1);
3879                 /* Only treat this as a scaling operation if the target
3880                      supports it.  */
3881                 if (use_ifn_p
3882                       && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3883                                                             vectype, memory_type, 1,
3884                                                             TYPE_SIGN (TREE_TYPE (op0)),
3885                                                             new_scale, &ifn,
3886                                                             &element_type))
3887                     break;
3888                 scale = new_scale;
3889                 off = op0;
3890                 continue;
3891               }
3892             break;
3893           case SSA_NAME:
3894             off = op0;
3895             continue;
3896           CASE_CONVERT:
3897             if (!POINTER_TYPE_P (TREE_TYPE (op0))
3898                 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3899               break;
3900             if (TYPE_PRECISION (TREE_TYPE (op0))
3901                 == TYPE_PRECISION (TREE_TYPE (off)))
3902               {
3903                 off = op0;
3904                 continue;
3905               }
3906 
3907             /* The internal functions need the offset to be the same width
3908                as the elements of VECTYPE.  Don't include operations that
3909                cast the offset from that width to a different width.  */
3910             if (use_ifn_p
3911                 && (int_size_in_bytes (TREE_TYPE (vectype))
3912                       == int_size_in_bytes (TREE_TYPE (off))))
3913               break;
3914 
3915             if (TYPE_PRECISION (TREE_TYPE (op0))
3916                 < TYPE_PRECISION (TREE_TYPE (off)))
3917               {
3918                 off = op0;
3919                 offtype = TREE_TYPE (off);
3920                 STRIP_NOPS (off);
3921                 continue;
3922               }
3923             break;
3924           default:
3925             break;
3926           }
3927       break;
3928     }
3929 
3930   /* If at the end OFF still isn't a SSA_NAME or isn't
3931      defined in the loop, punt.  */
3932   if (TREE_CODE (off) != SSA_NAME
3933       || expr_invariant_in_loop_p (loop, off))
3934     return false;
3935 
3936   if (offtype == NULL_TREE)
3937     offtype = TREE_TYPE (off);
3938 
3939   if (use_ifn_p)
3940     {
3941       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3942                                              memory_type, TYPE_PRECISION (offtype),
3943                                              TYPE_SIGN (offtype), scale, &ifn,
3944                                              &element_type))
3945           return false;
3946     }
3947   else
3948     {
3949       if (DR_IS_READ (dr))
3950           {
3951             if (targetm.vectorize.builtin_gather)
3952               decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3953           }
3954       else
3955           {
3956             if (targetm.vectorize.builtin_scatter)
3957               decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3958           }
3959 
3960       if (!decl)
3961           return false;
3962 
3963       ifn = IFN_LAST;
3964       element_type = TREE_TYPE (vectype);
3965     }
3966 
3967   info->ifn = ifn;
3968   info->decl = decl;
3969   info->base = base;
3970   info->offset = off;
3971   info->offset_dt = vect_unknown_def_type;
3972   info->offset_vectype = NULL_TREE;
3973   info->scale = scale;
3974   info->element_type = element_type;
3975   info->memory_type = memory_type;
3976   return true;
3977 }
3978 
3979 /* Function vect_analyze_data_refs.
3980 
3981   Find all the data references in the loop or basic block.
3982 
3983    The general structure of the analysis of data refs in the vectorizer is as
3984    follows:
3985    1- vect_analyze_data_refs(loop/bb): call
3986       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3987       in the loop/bb and their dependences.
3988    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3989    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3990    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3991 
3992 */
3993 
3994 bool
vect_analyze_data_refs(vec_info * vinfo,poly_uint64 * min_vf)3995 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3996 {
3997   struct loop *loop = NULL;
3998   unsigned int i;
3999   struct data_reference *dr;
4000   tree scalar_type;
4001 
4002   if (dump_enabled_p ())
4003     dump_printf_loc (MSG_NOTE, vect_location,
4004                          "=== vect_analyze_data_refs ===\n");
4005 
4006   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4007     loop = LOOP_VINFO_LOOP (loop_vinfo);
4008 
4009   /* Go through the data-refs, check that the analysis succeeded.  Update
4010      pointer from stmt_vec_info struct to DR and vectype.  */
4011 
4012   vec<data_reference_p> datarefs = vinfo->datarefs;
4013   FOR_EACH_VEC_ELT (datarefs, i, dr)
4014     {
4015       gimple *stmt;
4016       stmt_vec_info stmt_info;
4017       tree base, offset, init;
4018       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4019       bool simd_lane_access = false;
4020       poly_uint64 vf;
4021 
4022 again:
4023       if (!dr || !DR_REF (dr))
4024         {
4025           if (dump_enabled_p ())
4026               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4027                                "not vectorized: unhandled data-ref\n");
4028           return false;
4029         }
4030 
4031       stmt = DR_STMT (dr);
4032       stmt_info = vinfo_for_stmt (stmt);
4033 
4034       /* Discard clobbers from the dataref vector.  We will remove
4035          clobber stmts during vectorization.  */
4036       if (gimple_clobber_p (stmt))
4037           {
4038             free_data_ref (dr);
4039             if (i == datarefs.length () - 1)
4040               {
4041                 datarefs.pop ();
4042                 break;
4043               }
4044             datarefs.ordered_remove (i);
4045             dr = datarefs[i];
4046             goto again;
4047           }
4048 
4049       /* Check that analysis of the data-ref succeeded.  */
4050       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4051             || !DR_STEP (dr))
4052         {
4053             bool maybe_gather
4054               = DR_IS_READ (dr)
4055                 && !TREE_THIS_VOLATILE (DR_REF (dr))
4056                 && (targetm.vectorize.builtin_gather != NULL
4057                       || supports_vec_gather_load_p ());
4058             bool maybe_scatter
4059               = DR_IS_WRITE (dr)
4060                 && !TREE_THIS_VOLATILE (DR_REF (dr))
4061                 && (targetm.vectorize.builtin_scatter != NULL
4062                       || supports_vec_scatter_store_p ());
4063             bool maybe_simd_lane_access
4064               = is_a <loop_vec_info> (vinfo) && loop->simduid;
4065 
4066             /* If target supports vector gather loads or scatter stores, or if
4067                this might be a SIMD lane access, see if they can't be used.  */
4068             if (is_a <loop_vec_info> (vinfo)
4069                 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4070                 && !nested_in_vect_loop_p (loop, stmt))
4071               {
4072                 struct data_reference *newdr
4073                     = create_data_ref (NULL, loop_containing_stmt (stmt),
4074                                            DR_REF (dr), stmt, !maybe_scatter,
4075                                            DR_IS_CONDITIONAL_IN_STMT (dr));
4076                 gcc_assert (newdr != NULL && DR_REF (newdr));
4077                 if (DR_BASE_ADDRESS (newdr)
4078                       && DR_OFFSET (newdr)
4079                       && DR_INIT (newdr)
4080                       && DR_STEP (newdr)
4081                       && integer_zerop (DR_STEP (newdr)))
4082                     {
4083                       if (maybe_simd_lane_access)
4084                         {
4085                           tree off = DR_OFFSET (newdr);
4086                           STRIP_NOPS (off);
4087                           if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4088                                 && TREE_CODE (off) == MULT_EXPR
4089                                 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4090                               {
4091                                 tree step = TREE_OPERAND (off, 1);
4092                                 off = TREE_OPERAND (off, 0);
4093                                 STRIP_NOPS (off);
4094                                 if (CONVERT_EXPR_P (off)
4095                                     && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4096                                                                                             0)))
4097                                          < TYPE_PRECISION (TREE_TYPE (off)))
4098                                   off = TREE_OPERAND (off, 0);
4099                                 if (TREE_CODE (off) == SSA_NAME)
4100                                   {
4101                                     gimple *def = SSA_NAME_DEF_STMT (off);
4102                                     tree reft = TREE_TYPE (DR_REF (newdr));
4103                                     if (is_gimple_call (def)
4104                                           && gimple_call_internal_p (def)
4105                                           && (gimple_call_internal_fn (def)
4106                                               == IFN_GOMP_SIMD_LANE))
4107                                         {
4108                                           tree arg = gimple_call_arg (def, 0);
4109                                           gcc_assert (TREE_CODE (arg) == SSA_NAME);
4110                                           arg = SSA_NAME_VAR (arg);
4111                                           if (arg == loop->simduid
4112                                               /* For now.  */
4113                                               && tree_int_cst_equal
4114                                                      (TYPE_SIZE_UNIT (reft),
4115                                                       step))
4116                                             {
4117                                               DR_OFFSET (newdr) = ssize_int (0);
4118                                               DR_STEP (newdr) = step;
4119                                               DR_OFFSET_ALIGNMENT (newdr)
4120                                                   = BIGGEST_ALIGNMENT;
4121                                               DR_STEP_ALIGNMENT (newdr)
4122                                                   = highest_pow2_factor (step);
4123                                               dr = newdr;
4124                                               simd_lane_access = true;
4125                                             }
4126                                         }
4127                                   }
4128                               }
4129                         }
4130                       if (!simd_lane_access && (maybe_gather || maybe_scatter))
4131                         {
4132                           dr = newdr;
4133                           if (maybe_gather)
4134                               gatherscatter = GATHER;
4135                           else
4136                               gatherscatter = SCATTER;
4137                         }
4138                     }
4139                 if (gatherscatter == SG_NONE && !simd_lane_access)
4140                     free_data_ref (newdr);
4141               }
4142 
4143             if (gatherscatter == SG_NONE && !simd_lane_access)
4144               {
4145                 if (dump_enabled_p ())
4146                     {
4147                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4148                                    "not vectorized: data ref analysis "
4149                                    "failed ");
4150                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4151                     }
4152 
4153                 if (is_a <bb_vec_info> (vinfo))
4154                     break;
4155 
4156                 return false;
4157               }
4158         }
4159 
4160       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4161         {
4162           if (dump_enabled_p ())
4163             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4164                              "not vectorized: base addr of dr is a "
4165                              "constant\n");
4166 
4167           if (is_a <bb_vec_info> (vinfo))
4168               break;
4169 
4170             if (gatherscatter != SG_NONE || simd_lane_access)
4171               free_data_ref (dr);
4172             return false;
4173         }
4174 
4175       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4176         {
4177           if (dump_enabled_p ())
4178             {
4179               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4180                                "not vectorized: volatile type ");
4181               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4182             }
4183 
4184           if (is_a <bb_vec_info> (vinfo))
4185               break;
4186 
4187           return false;
4188         }
4189 
4190       if (stmt_can_throw_internal (stmt))
4191         {
4192           if (dump_enabled_p ())
4193             {
4194               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4195                                "not vectorized: statement can throw an "
4196                                "exception ");
4197               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4198             }
4199 
4200           if (is_a <bb_vec_info> (vinfo))
4201               break;
4202 
4203             if (gatherscatter != SG_NONE || simd_lane_access)
4204               free_data_ref (dr);
4205           return false;
4206         }
4207 
4208       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4209             && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4210           {
4211           if (dump_enabled_p ())
4212             {
4213               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4214                                "not vectorized: statement is bitfield "
4215                                "access ");
4216               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4217             }
4218 
4219           if (is_a <bb_vec_info> (vinfo))
4220               break;
4221 
4222             if (gatherscatter != SG_NONE || simd_lane_access)
4223               free_data_ref (dr);
4224           return false;
4225           }
4226 
4227       base = unshare_expr (DR_BASE_ADDRESS (dr));
4228       offset = unshare_expr (DR_OFFSET (dr));
4229       init = unshare_expr (DR_INIT (dr));
4230 
4231       if (is_gimple_call (stmt)
4232             && (!gimple_call_internal_p (stmt)
4233                 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4234                       && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4235           {
4236             if (dump_enabled_p ())
4237               {
4238                 dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4239                                  "not vectorized: dr in a call ");
4240                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4241               }
4242 
4243             if (is_a <bb_vec_info> (vinfo))
4244               break;
4245 
4246             if (gatherscatter != SG_NONE || simd_lane_access)
4247               free_data_ref (dr);
4248             return false;
4249           }
4250 
4251       /* Update DR field in stmt_vec_info struct.  */
4252 
4253       /* If the dataref is in an inner-loop of the loop that is considered for
4254            for vectorization, we also want to analyze the access relative to
4255            the outer-loop (DR contains information only relative to the
4256            inner-most enclosing loop).  We do that by building a reference to the
4257            first location accessed by the inner-loop, and analyze it relative to
4258            the outer-loop.  */
4259       if (loop && nested_in_vect_loop_p (loop, stmt))
4260           {
4261             /* Build a reference to the first location accessed by the
4262                inner loop: *(BASE + INIT + OFFSET).  By construction,
4263                this address must be invariant in the inner loop, so we
4264                can consider it as being used in the outer loop.  */
4265             tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4266                                                     init, offset);
4267             tree init_addr = fold_build_pointer_plus (base, init_offset);
4268             tree init_ref = build_fold_indirect_ref (init_addr);
4269 
4270             if (dump_enabled_p ())
4271               {
4272                 dump_printf_loc (MSG_NOTE, vect_location,
4273                                "analyze in outer loop: ");
4274                 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4275                 dump_printf (MSG_NOTE, "\n");
4276               }
4277 
4278             if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4279                                              init_ref, loop))
4280               /* dr_analyze_innermost already explained the failure.  */
4281               return false;
4282 
4283           if (dump_enabled_p ())
4284               {
4285                 dump_printf_loc (MSG_NOTE, vect_location,
4286                                "\touter base_address: ");
4287                 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4288                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4289                 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4290                 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4291                                  STMT_VINFO_DR_OFFSET (stmt_info));
4292                 dump_printf (MSG_NOTE,
4293                            "\n\touter constant offset from base address: ");
4294                 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4295                                  STMT_VINFO_DR_INIT (stmt_info));
4296                 dump_printf (MSG_NOTE, "\n\touter step: ");
4297                 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4298                                  STMT_VINFO_DR_STEP (stmt_info));
4299                 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4300                                  STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4301                 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4302                                  STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4303                 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4304                                  STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4305                 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4306                                  STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4307               }
4308           }
4309 
4310       if (STMT_VINFO_DATA_REF (stmt_info))
4311         {
4312           if (dump_enabled_p ())
4313             {
4314               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4315                                "not vectorized: more than one data ref "
4316                                "in stmt: ");
4317               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4318             }
4319 
4320           if (is_a <bb_vec_info> (vinfo))
4321               break;
4322 
4323             if (gatherscatter != SG_NONE || simd_lane_access)
4324               free_data_ref (dr);
4325           return false;
4326         }
4327 
4328       STMT_VINFO_DATA_REF (stmt_info) = dr;
4329       if (simd_lane_access)
4330           {
4331             STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4332             free_data_ref (datarefs[i]);
4333             datarefs[i] = dr;
4334           }
4335 
4336       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4337             && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4338             && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4339           {
4340           if (dump_enabled_p ())
4341             {
4342               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4343                                "not vectorized: base object not addressable "
4344                                      "for stmt: ");
4345               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4346             }
4347           if (is_a <bb_vec_info> (vinfo))
4348               {
4349                 /* In BB vectorization the ref can still participate
4350                    in dependence analysis, we just can't vectorize it.  */
4351                 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4352                 continue;
4353               }
4354             return false;
4355           }
4356 
4357       /* Set vectype for STMT.  */
4358       scalar_type = TREE_TYPE (DR_REF (dr));
4359       STMT_VINFO_VECTYPE (stmt_info)
4360           = get_vectype_for_scalar_type (scalar_type);
4361       if (!STMT_VINFO_VECTYPE (stmt_info))
4362         {
4363           if (dump_enabled_p ())
4364             {
4365               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4366                                "not vectorized: no vectype for stmt: ");
4367               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4368               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4369               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4370                                  scalar_type);
4371               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4372             }
4373 
4374           if (is_a <bb_vec_info> (vinfo))
4375               {
4376                 /* No vector type is fine, the ref can still participate
4377                    in dependence analysis, we just can't vectorize it.  */
4378                 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4379                 continue;
4380               }
4381 
4382             if (gatherscatter != SG_NONE || simd_lane_access)
4383               {
4384                 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4385                 if (gatherscatter != SG_NONE)
4386                     free_data_ref (dr);
4387               }
4388             return false;
4389         }
4390       else
4391           {
4392             if (dump_enabled_p ())
4393               {
4394                 dump_printf_loc (MSG_NOTE, vect_location,
4395                                      "got vectype for stmt: ");
4396                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4397                 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4398                                          STMT_VINFO_VECTYPE (stmt_info));
4399                 dump_printf (MSG_NOTE, "\n");
4400               }
4401           }
4402 
4403       /* Adjust the minimal vectorization factor according to the
4404            vector type.  */
4405       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4406       *min_vf = upper_bound (*min_vf, vf);
4407 
4408       if (gatherscatter != SG_NONE)
4409           {
4410             gather_scatter_info gs_info;
4411             if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4412                                                     &gs_info)
4413                 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4414               {
4415                 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4416                 free_data_ref (dr);
4417                 if (dump_enabled_p ())
4418                     {
4419                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4420                                            (gatherscatter == GATHER) ?
4421                                            "not vectorized: not suitable for gather "
4422                                            "load " :
4423                                            "not vectorized: not suitable for scatter "
4424                                            "store ");
4425                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4426                     }
4427                 return false;
4428               }
4429 
4430             free_data_ref (datarefs[i]);
4431             datarefs[i] = dr;
4432             STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4433           }
4434 
4435       else if (is_a <loop_vec_info> (vinfo)
4436                  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4437           {
4438             if (nested_in_vect_loop_p (loop, stmt))
4439               {
4440                 if (dump_enabled_p ())
4441                     {
4442                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4443                                    "not vectorized: not suitable for strided "
4444                                    "load ");
4445                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4446                     }
4447                 return false;
4448               }
4449             STMT_VINFO_STRIDED_P (stmt_info) = true;
4450           }
4451     }
4452 
4453   /* If we stopped analysis at the first dataref we could not analyze
4454      when trying to vectorize a basic-block mark the rest of the datarefs
4455      as not vectorizable and truncate the vector of datarefs.  That
4456      avoids spending useless time in analyzing their dependence.  */
4457   if (i != datarefs.length ())
4458     {
4459       gcc_assert (is_a <bb_vec_info> (vinfo));
4460       for (unsigned j = i; j < datarefs.length (); ++j)
4461           {
4462             data_reference_p dr = datarefs[j];
4463           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4464             free_data_ref (dr);
4465           }
4466       datarefs.truncate (i);
4467     }
4468 
4469   return true;
4470 }
4471 
4472 
4473 /* Function vect_get_new_vect_var.
4474 
4475    Returns a name for a new variable.  The current naming scheme appends the
4476    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4477    the name of vectorizer generated variables, and appends that to NAME if
4478    provided.  */
4479 
4480 tree
vect_get_new_vect_var(tree type,enum vect_var_kind var_kind,const char * name)4481 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4482 {
4483   const char *prefix;
4484   tree new_vect_var;
4485 
4486   switch (var_kind)
4487   {
4488   case vect_simple_var:
4489     prefix = "vect";
4490     break;
4491   case vect_scalar_var:
4492     prefix = "stmp";
4493     break;
4494   case vect_mask_var:
4495     prefix = "mask";
4496     break;
4497   case vect_pointer_var:
4498     prefix = "vectp";
4499     break;
4500   default:
4501     gcc_unreachable ();
4502   }
4503 
4504   if (name)
4505     {
4506       char* tmp = concat (prefix, "_", name, NULL);
4507       new_vect_var = create_tmp_reg (type, tmp);
4508       free (tmp);
4509     }
4510   else
4511     new_vect_var = create_tmp_reg (type, prefix);
4512 
4513   return new_vect_var;
4514 }
4515 
4516 /* Like vect_get_new_vect_var but return an SSA name.  */
4517 
4518 tree
vect_get_new_ssa_name(tree type,enum vect_var_kind var_kind,const char * name)4519 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4520 {
4521   const char *prefix;
4522   tree new_vect_var;
4523 
4524   switch (var_kind)
4525   {
4526   case vect_simple_var:
4527     prefix = "vect";
4528     break;
4529   case vect_scalar_var:
4530     prefix = "stmp";
4531     break;
4532   case vect_pointer_var:
4533     prefix = "vectp";
4534     break;
4535   default:
4536     gcc_unreachable ();
4537   }
4538 
4539   if (name)
4540     {
4541       char* tmp = concat (prefix, "_", name, NULL);
4542       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4543       free (tmp);
4544     }
4545   else
4546     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4547 
4548   return new_vect_var;
4549 }
4550 
4551 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4552 
4553 static void
vect_duplicate_ssa_name_ptr_info(tree name,data_reference * dr)4554 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4555 {
4556   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4557   int misalign = DR_MISALIGNMENT (dr);
4558   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4559     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4560   else
4561     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4562                                   DR_TARGET_ALIGNMENT (dr), misalign);
4563 }
4564 
4565 /* Function vect_create_addr_base_for_vector_ref.
4566 
4567    Create an expression that computes the address of the first memory location
4568    that will be accessed for a data reference.
4569 
4570    Input:
4571    STMT: The statement containing the data reference.
4572    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4573    OFFSET: Optional. If supplied, it is be added to the initial address.
4574    LOOP:    Specify relative to which loop-nest should the address be computed.
4575             For example, when the dataref is in an inner-loop nested in an
4576               outer-loop that is now being vectorized, LOOP can be either the
4577               outer-loop, or the inner-loop.  The first memory location accessed
4578               by the following dataref ('in' points to short):
4579 
4580                     for (i=0; i<N; i++)
4581                        for (j=0; j<M; j++)
4582                          s += in[i+j]
4583 
4584               is as follows:
4585               if LOOP=i_loop: &in                 (relative to i_loop)
4586               if LOOP=j_loop:           &in+i*2B  (relative to j_loop)
4587    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4588               initial address.  Unlike OFFSET, which is number of elements to
4589               be added, BYTE_OFFSET is measured in bytes.
4590 
4591    Output:
4592    1. Return an SSA_NAME whose value is the address of the memory location of
4593       the first vector of the data reference.
4594    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4595       these statement(s) which define the returned SSA_NAME.
4596 
4597    FORNOW: We are only handling array accesses with step 1.  */
4598 
4599 tree
vect_create_addr_base_for_vector_ref(gimple * stmt,gimple_seq * new_stmt_list,tree offset,tree byte_offset)4600 vect_create_addr_base_for_vector_ref (gimple *stmt,
4601                                               gimple_seq *new_stmt_list,
4602                                               tree offset,
4603                                               tree byte_offset)
4604 {
4605   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4606   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4607   const char *base_name;
4608   tree addr_base;
4609   tree dest;
4610   gimple_seq seq = NULL;
4611   tree vect_ptr_type;
4612   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4613   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4614   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4615 
4616   tree data_ref_base = unshare_expr (drb->base_address);
4617   tree base_offset = unshare_expr (drb->offset);
4618   tree init = unshare_expr (drb->init);
4619 
4620   if (loop_vinfo)
4621     base_name = get_name (data_ref_base);
4622   else
4623     {
4624       base_offset = ssize_int (0);
4625       init = ssize_int (0);
4626       base_name = get_name (DR_REF (dr));
4627     }
4628 
4629   /* Create base_offset */
4630   base_offset = size_binop (PLUS_EXPR,
4631                                   fold_convert (sizetype, base_offset),
4632                                   fold_convert (sizetype, init));
4633 
4634   if (offset)
4635     {
4636       offset = fold_build2 (MULT_EXPR, sizetype,
4637                                   fold_convert (sizetype, offset), step);
4638       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4639                                          base_offset, offset);
4640     }
4641   if (byte_offset)
4642     {
4643       byte_offset = fold_convert (sizetype, byte_offset);
4644       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4645                                          base_offset, byte_offset);
4646     }
4647 
4648   /* base + base_offset */
4649   if (loop_vinfo)
4650     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4651   else
4652     {
4653       addr_base = build1 (ADDR_EXPR,
4654                                 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4655                                 unshare_expr (DR_REF (dr)));
4656     }
4657 
4658   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4659   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4660   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4661   gimple_seq_add_seq (new_stmt_list, seq);
4662 
4663   if (DR_PTR_INFO (dr)
4664       && TREE_CODE (addr_base) == SSA_NAME
4665       && !SSA_NAME_PTR_INFO (addr_base))
4666     {
4667       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4668       if (offset || byte_offset)
4669           mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4670     }
4671 
4672   if (dump_enabled_p ())
4673     {
4674       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4675       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4676       dump_printf (MSG_NOTE, "\n");
4677     }
4678 
4679   return addr_base;
4680 }
4681 
4682 
4683 /* Function vect_create_data_ref_ptr.
4684 
4685    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4686    location accessed in the loop by STMT, along with the def-use update
4687    chain to appropriately advance the pointer through the loop iterations.
4688    Also set aliasing information for the pointer.  This pointer is used by
4689    the callers to this function to create a memory reference expression for
4690    vector load/store access.
4691 
4692    Input:
4693    1. STMT: a stmt that references memory. Expected to be of the form
4694          GIMPLE_ASSIGN <name, data-ref> or
4695            GIMPLE_ASSIGN <data-ref, name>.
4696    2. AGGR_TYPE: the type of the reference, which should be either a vector
4697         or an array.
4698    3. AT_LOOP: the loop where the vector memref is to be created.
4699    4. OFFSET (optional): an offset to be added to the initial address accessed
4700         by the data-ref in STMT.
4701    5. BSI: location where the new stmts are to be placed if there is no loop
4702    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4703         pointing to the initial address.
4704    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4705           to the initial address accessed by the data-ref in STMT.  This is
4706           similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4707           in bytes.
4708    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4709           to the IV during each iteration of the loop.  NULL says to move
4710           by one copy of AGGR_TYPE up or down, depending on the step of the
4711           data reference.
4712 
4713    Output:
4714    1. Declare a new ptr to vector_type, and have it point to the base of the
4715       data reference (initial addressed accessed by the data reference).
4716       For example, for vector of type V8HI, the following code is generated:
4717 
4718       v8hi *ap;
4719       ap = (v8hi *)initial_address;
4720 
4721       if OFFSET is not supplied:
4722          initial_address = &a[init];
4723       if OFFSET is supplied:
4724          initial_address = &a[init + OFFSET];
4725       if BYTE_OFFSET is supplied:
4726            initial_address = &a[init] + BYTE_OFFSET;
4727 
4728       Return the initial_address in INITIAL_ADDRESS.
4729 
4730    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4731       update the pointer in each iteration of the loop.
4732 
4733       Return the increment stmt that updates the pointer in PTR_INCR.
4734 
4735    3. Set INV_P to true if the access pattern of the data reference in the
4736       vectorized loop is invariant.  Set it to false otherwise.
4737 
4738    4. Return the pointer.  */
4739 
4740 tree
vect_create_data_ref_ptr(gimple * stmt,tree aggr_type,struct loop * at_loop,tree offset,tree * initial_address,gimple_stmt_iterator * gsi,gimple ** ptr_incr,bool only_init,bool * inv_p,tree byte_offset,tree iv_step)4741 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4742                                 tree offset, tree *initial_address,
4743                                 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4744                                 bool only_init, bool *inv_p, tree byte_offset,
4745                                 tree iv_step)
4746 {
4747   const char *base_name;
4748   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4749   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4750   struct loop *loop = NULL;
4751   bool nested_in_vect_loop = false;
4752   struct loop *containing_loop = NULL;
4753   tree aggr_ptr_type;
4754   tree aggr_ptr;
4755   tree new_temp;
4756   gimple_seq new_stmt_list = NULL;
4757   edge pe = NULL;
4758   basic_block new_bb;
4759   tree aggr_ptr_init;
4760   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4761   tree aptr;
4762   gimple_stmt_iterator incr_gsi;
4763   bool insert_after;
4764   tree indx_before_incr, indx_after_incr;
4765   gimple *incr;
4766   tree step;
4767   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4768 
4769   gcc_assert (iv_step != NULL_TREE
4770                 || TREE_CODE (aggr_type) == ARRAY_TYPE
4771                 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4772 
4773   if (loop_vinfo)
4774     {
4775       loop = LOOP_VINFO_LOOP (loop_vinfo);
4776       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4777       containing_loop = (gimple_bb (stmt))->loop_father;
4778       pe = loop_preheader_edge (loop);
4779     }
4780   else
4781     {
4782       gcc_assert (bb_vinfo);
4783       only_init = true;
4784       *ptr_incr = NULL;
4785     }
4786 
4787   /* Check the step (evolution) of the load in LOOP, and record
4788      whether it's invariant.  */
4789   step = vect_dr_behavior (dr)->step;
4790   if (integer_zerop (step))
4791     *inv_p = true;
4792   else
4793     *inv_p = false;
4794 
4795   /* Create an expression for the first address accessed by this load
4796      in LOOP.  */
4797   base_name = get_name (DR_BASE_ADDRESS (dr));
4798 
4799   if (dump_enabled_p ())
4800     {
4801       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4802       dump_printf_loc (MSG_NOTE, vect_location,
4803                        "create %s-pointer variable to type: ",
4804                            get_tree_code_name (TREE_CODE (aggr_type)));
4805       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4806       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4807         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4808       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4809         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4810       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4811         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4812       else
4813         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4814       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4815       dump_printf (MSG_NOTE, "\n");
4816     }
4817 
4818   /* (1) Create the new aggregate-pointer variable.
4819      Vector and array types inherit the alias set of their component
4820      type by default so we need to use a ref-all pointer if the data
4821      reference does not conflict with the created aggregated data
4822      reference because it is not addressable.  */
4823   bool need_ref_all = false;
4824   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4825                                     get_alias_set (DR_REF (dr))))
4826     need_ref_all = true;
4827   /* Likewise for any of the data references in the stmt group.  */
4828   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4829     {
4830       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4831       do
4832           {
4833             stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4834             struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4835             if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4836                                               get_alias_set (DR_REF (sdr))))
4837               {
4838                 need_ref_all = true;
4839                 break;
4840               }
4841             orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4842           }
4843       while (orig_stmt);
4844     }
4845   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4846                                                          need_ref_all);
4847   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4848 
4849 
4850   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4851      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4852      def-use update cycles for the pointer: one relative to the outer-loop
4853      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4854      to the inner-loop (which is the inner-most loop containing the dataref),
4855      and this is done be step (5) below.
4856 
4857      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4858      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4859      redundant.  Steps (3),(4) create the following:
4860 
4861           vp0 = &base_addr;
4862           LOOP:     vp1 = phi(vp0,vp2)
4863                     ...
4864                     ...
4865                     vp2 = vp1 + step
4866                     goto LOOP
4867 
4868      If there is an inner-loop nested in loop, then step (5) will also be
4869      applied, and an additional update in the inner-loop will be created:
4870 
4871           vp0 = &base_addr;
4872           LOOP:   vp1 = phi(vp0,vp2)
4873                     ...
4874         inner:     vp3 = phi(vp1,vp4)
4875                      vp4 = vp3 + inner_step
4876                      if () goto inner
4877                     ...
4878                     vp2 = vp1 + step
4879                     if () goto LOOP   */
4880 
4881   /* (2) Calculate the initial address of the aggregate-pointer, and set
4882      the aggregate-pointer to point to it before the loop.  */
4883 
4884   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4885 
4886   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4887                                                                offset, byte_offset);
4888   if (new_stmt_list)
4889     {
4890       if (pe)
4891         {
4892           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4893           gcc_assert (!new_bb);
4894         }
4895       else
4896         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4897     }
4898 
4899   *initial_address = new_temp;
4900   aggr_ptr_init = new_temp;
4901 
4902   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4903      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4904      inner-loop nested in LOOP (during outer-loop vectorization).  */
4905 
4906   /* No update in loop is required.  */
4907   if (only_init && (!loop_vinfo || at_loop == loop))
4908     aptr = aggr_ptr_init;
4909   else
4910     {
4911       if (iv_step == NULL_TREE)
4912           {
4913             /* The step of the aggregate pointer is the type size.  */
4914             iv_step = TYPE_SIZE_UNIT (aggr_type);
4915             /* One exception to the above is when the scalar step of the load in
4916                LOOP is zero. In this case the step here is also zero.  */
4917             if (*inv_p)
4918               iv_step = size_zero_node;
4919             else if (tree_int_cst_sgn (step) == -1)
4920               iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4921           }
4922 
4923       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4924 
4925       create_iv (aggr_ptr_init,
4926                      fold_convert (aggr_ptr_type, iv_step),
4927                      aggr_ptr, loop, &incr_gsi, insert_after,
4928                      &indx_before_incr, &indx_after_incr);
4929       incr = gsi_stmt (incr_gsi);
4930       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4931 
4932       /* Copy the points-to information if it exists. */
4933       if (DR_PTR_INFO (dr))
4934           {
4935             vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4936             vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4937           }
4938       if (ptr_incr)
4939           *ptr_incr = incr;
4940 
4941       aptr = indx_before_incr;
4942     }
4943 
4944   if (!nested_in_vect_loop || only_init)
4945     return aptr;
4946 
4947 
4948   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4949      nested in LOOP, if exists.  */
4950 
4951   gcc_assert (nested_in_vect_loop);
4952   if (!only_init)
4953     {
4954       standard_iv_increment_position (containing_loop, &incr_gsi,
4955                                               &insert_after);
4956       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4957                      containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4958                      &indx_after_incr);
4959       incr = gsi_stmt (incr_gsi);
4960       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4961 
4962       /* Copy the points-to information if it exists. */
4963       if (DR_PTR_INFO (dr))
4964           {
4965             vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4966             vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4967           }
4968       if (ptr_incr)
4969           *ptr_incr = incr;
4970 
4971       return indx_before_incr;
4972     }
4973   else
4974     gcc_unreachable ();
4975 }
4976 
4977 
4978 /* Function bump_vector_ptr
4979 
4980    Increment a pointer (to a vector type) by vector-size. If requested,
4981    i.e. if PTR-INCR is given, then also connect the new increment stmt
4982    to the existing def-use update-chain of the pointer, by modifying
4983    the PTR_INCR as illustrated below:
4984 
4985    The pointer def-use update-chain before this function:
4986                         DATAREF_PTR = phi (p_0, p_2)
4987                         ....
4988         PTR_INCR:       p_2 = DATAREF_PTR + step
4989 
4990    The pointer def-use update-chain after this function:
4991                         DATAREF_PTR = phi (p_0, p_2)
4992                         ....
4993                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4994                         ....
4995         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4996 
4997    Input:
4998    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4999                  in the loop.
5000    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5001                 the loop.  The increment amount across iterations is expected
5002                 to be vector_size.
5003    BSI - location where the new update stmt is to be placed.
5004    STMT - the original scalar memory-access stmt that is being vectorized.
5005    BUMP - optional. The offset by which to bump the pointer. If not given,
5006             the offset is assumed to be vector_size.
5007 
5008    Output: Return NEW_DATAREF_PTR as illustrated above.
5009 
5010 */
5011 
5012 tree
bump_vector_ptr(tree dataref_ptr,gimple * ptr_incr,gimple_stmt_iterator * gsi,gimple * stmt,tree bump)5013 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5014                      gimple *stmt, tree bump)
5015 {
5016   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5017   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5018   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5019   tree update = TYPE_SIZE_UNIT (vectype);
5020   gassign *incr_stmt;
5021   ssa_op_iter iter;
5022   use_operand_p use_p;
5023   tree new_dataref_ptr;
5024 
5025   if (bump)
5026     update = bump;
5027 
5028   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5029     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5030   else
5031     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5032   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5033                                            dataref_ptr, update);
5034   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
5035 
5036   /* Copy the points-to information if it exists. */
5037   if (DR_PTR_INFO (dr))
5038     {
5039       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5040       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5041     }
5042 
5043   if (!ptr_incr)
5044     return new_dataref_ptr;
5045 
5046   /* Update the vector-pointer's cross-iteration increment.  */
5047   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5048     {
5049       tree use = USE_FROM_PTR (use_p);
5050 
5051       if (use == dataref_ptr)
5052         SET_USE (use_p, new_dataref_ptr);
5053       else
5054         gcc_assert (operand_equal_p (use, update, 0));
5055     }
5056 
5057   return new_dataref_ptr;
5058 }
5059 
5060 
5061 /* Copy memory reference info such as base/clique from the SRC reference
5062    to the DEST MEM_REF.  */
5063 
5064 void
vect_copy_ref_info(tree dest,tree src)5065 vect_copy_ref_info (tree dest, tree src)
5066 {
5067   if (TREE_CODE (dest) != MEM_REF)
5068     return;
5069 
5070   tree src_base = src;
5071   while (handled_component_p (src_base))
5072     src_base = TREE_OPERAND (src_base, 0);
5073   if (TREE_CODE (src_base) != MEM_REF
5074       && TREE_CODE (src_base) != TARGET_MEM_REF)
5075     return;
5076 
5077   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5078   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5079 }
5080 
5081 
5082 /* Function vect_create_destination_var.
5083 
5084    Create a new temporary of type VECTYPE.  */
5085 
5086 tree
vect_create_destination_var(tree scalar_dest,tree vectype)5087 vect_create_destination_var (tree scalar_dest, tree vectype)
5088 {
5089   tree vec_dest;
5090   const char *name;
5091   char *new_name;
5092   tree type;
5093   enum vect_var_kind kind;
5094 
5095   kind = vectype
5096     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5097     ? vect_mask_var
5098     : vect_simple_var
5099     : vect_scalar_var;
5100   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5101 
5102   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5103 
5104   name = get_name (scalar_dest);
5105   if (name)
5106     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5107   else
5108     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5109   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5110   free (new_name);
5111 
5112   return vec_dest;
5113 }
5114 
5115 /* Function vect_grouped_store_supported.
5116 
5117    Returns TRUE if interleave high and interleave low permutations
5118    are supported, and FALSE otherwise.  */
5119 
5120 bool
vect_grouped_store_supported(tree vectype,unsigned HOST_WIDE_INT count)5121 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5122 {
5123   machine_mode mode = TYPE_MODE (vectype);
5124 
5125   /* vect_permute_store_chain requires the group size to be equal to 3 or
5126      be a power of two.  */
5127   if (count != 3 && exact_log2 (count) == -1)
5128     {
5129       if (dump_enabled_p ())
5130           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5131                                "the size of the group of accesses"
5132                                " is not a power of 2 or not eqaul to 3\n");
5133       return false;
5134     }
5135 
5136   /* Check that the permutation is supported.  */
5137   if (VECTOR_MODE_P (mode))
5138     {
5139       unsigned int i;
5140       if (count == 3)
5141           {
5142             unsigned int j0 = 0, j1 = 0, j2 = 0;
5143             unsigned int i, j;
5144 
5145             unsigned int nelt;
5146             if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5147               {
5148                 if (dump_enabled_p ())
5149                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5150                                          "cannot handle groups of 3 stores for"
5151                                          " variable-length vectors\n");
5152                 return false;
5153               }
5154 
5155             vec_perm_builder sel (nelt, nelt, 1);
5156             sel.quick_grow (nelt);
5157             vec_perm_indices indices;
5158             for (j = 0; j < 3; j++)
5159               {
5160                 int nelt0 = ((3 - j) * nelt) % 3;
5161                 int nelt1 = ((3 - j) * nelt + 1) % 3;
5162                 int nelt2 = ((3 - j) * nelt + 2) % 3;
5163                 for (i = 0; i < nelt; i++)
5164                     {
5165                       if (3 * i + nelt0 < nelt)
5166                         sel[3 * i + nelt0] = j0++;
5167                       if (3 * i + nelt1 < nelt)
5168                         sel[3 * i + nelt1] = nelt + j1++;
5169                       if (3 * i + nelt2 < nelt)
5170                         sel[3 * i + nelt2] = 0;
5171                     }
5172                 indices.new_vector (sel, 2, nelt);
5173                 if (!can_vec_perm_const_p (mode, indices))
5174                     {
5175                       if (dump_enabled_p ())
5176                         dump_printf (MSG_MISSED_OPTIMIZATION,
5177                                          "permutation op not supported by target.\n");
5178                       return false;
5179                     }
5180 
5181                 for (i = 0; i < nelt; i++)
5182                     {
5183                       if (3 * i + nelt0 < nelt)
5184                         sel[3 * i + nelt0] = 3 * i + nelt0;
5185                       if (3 * i + nelt1 < nelt)
5186                         sel[3 * i + nelt1] = 3 * i + nelt1;
5187                       if (3 * i + nelt2 < nelt)
5188                         sel[3 * i + nelt2] = nelt + j2++;
5189                     }
5190                 indices.new_vector (sel, 2, nelt);
5191                 if (!can_vec_perm_const_p (mode, indices))
5192                     {
5193                       if (dump_enabled_p ())
5194                         dump_printf (MSG_MISSED_OPTIMIZATION,
5195                                          "permutation op not supported by target.\n");
5196                       return false;
5197                     }
5198               }
5199             return true;
5200           }
5201       else
5202           {
5203             /* If length is not equal to 3 then only power of 2 is supported.  */
5204             gcc_assert (pow2p_hwi (count));
5205             poly_uint64 nelt = GET_MODE_NUNITS (mode);
5206 
5207             /* The encoding has 2 interleaved stepped patterns.  */
5208             vec_perm_builder sel (nelt, 2, 3);
5209             sel.quick_grow (6);
5210             for (i = 0; i < 3; i++)
5211               {
5212                 sel[i * 2] = i;
5213                 sel[i * 2 + 1] = i + nelt;
5214               }
5215             vec_perm_indices indices (sel, 2, nelt);
5216             if (can_vec_perm_const_p (mode, indices))
5217               {
5218                 for (i = 0; i < 6; i++)
5219                     sel[i] += exact_div (nelt, 2);
5220                 indices.new_vector (sel, 2, nelt);
5221                 if (can_vec_perm_const_p (mode, indices))
5222                     return true;
5223               }
5224           }
5225     }
5226 
5227   if (dump_enabled_p ())
5228     dump_printf (MSG_MISSED_OPTIMIZATION,
5229                      "permutaion op not supported by target.\n");
5230   return false;
5231 }
5232 
5233 
5234 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5235    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5236 
5237 bool
vect_store_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5238 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5239                                   bool masked_p)
5240 {
5241   if (masked_p)
5242     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5243                                                    vec_mask_store_lanes_optab,
5244                                                    vectype, count);
5245   else
5246     return vect_lanes_optab_supported_p ("vec_store_lanes",
5247                                                    vec_store_lanes_optab,
5248                                                    vectype, count);
5249 }
5250 
5251 
5252 /* Function vect_permute_store_chain.
5253 
5254    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5255    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5256    the data correctly for the stores.  Return the final references for stores
5257    in RESULT_CHAIN.
5258 
5259    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5260    The input is 4 vectors each containing 8 elements.  We assign a number to
5261    each element, the input sequence is:
5262 
5263    1st vec:   0  1  2  3  4  5  6  7
5264    2nd vec:   8  9 10 11 12 13 14 15
5265    3rd vec:  16 17 18 19 20 21 22 23
5266    4th vec:  24 25 26 27 28 29 30 31
5267 
5268    The output sequence should be:
5269 
5270    1st vec:  0  8 16 24  1  9 17 25
5271    2nd vec:  2 10 18 26  3 11 19 27
5272    3rd vec:  4 12 20 28  5 13 21 30
5273    4th vec:  6 14 22 30  7 15 23 31
5274 
5275    i.e., we interleave the contents of the four vectors in their order.
5276 
5277    We use interleave_high/low instructions to create such output.  The input of
5278    each interleave_high/low operation is two vectors:
5279    1st vec    2nd vec
5280    0 1 2 3    4 5 6 7
5281    the even elements of the result vector are obtained left-to-right from the
5282    high/low elements of the first vector.  The odd elements of the result are
5283    obtained left-to-right from the high/low elements of the second vector.
5284    The output of interleave_high will be:   0 4 1 5
5285    and of interleave_low:                   2 6 3 7
5286 
5287 
5288    The permutation is done in log LENGTH stages.  In each stage interleave_high
5289    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5290    where the first argument is taken from the first half of DR_CHAIN and the
5291    second argument from it's second half.
5292    In our example,
5293 
5294    I1: interleave_high (1st vec, 3rd vec)
5295    I2: interleave_low (1st vec, 3rd vec)
5296    I3: interleave_high (2nd vec, 4th vec)
5297    I4: interleave_low (2nd vec, 4th vec)
5298 
5299    The output for the first stage is:
5300 
5301    I1:  0 16  1 17  2 18  3 19
5302    I2:  4 20  5 21  6 22  7 23
5303    I3:  8 24  9 25 10 26 11 27
5304    I4: 12 28 13 29 14 30 15 31
5305 
5306    The output of the second stage, i.e. the final result is:
5307 
5308    I1:  0  8 16 24  1  9 17 25
5309    I2:  2 10 18 26  3 11 19 27
5310    I3:  4 12 20 28  5 13 21 30
5311    I4:  6 14 22 30  7 15 23 31.  */
5312 
5313 void
vect_permute_store_chain(vec<tree> dr_chain,unsigned int length,gimple * stmt,gimple_stmt_iterator * gsi,vec<tree> * result_chain)5314 vect_permute_store_chain (vec<tree> dr_chain,
5315                                 unsigned int length,
5316                                 gimple *stmt,
5317                                 gimple_stmt_iterator *gsi,
5318                                 vec<tree> *result_chain)
5319 {
5320   tree vect1, vect2, high, low;
5321   gimple *perm_stmt;
5322   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5323   tree perm_mask_low, perm_mask_high;
5324   tree data_ref;
5325   tree perm3_mask_low, perm3_mask_high;
5326   unsigned int i, j, n, log_length = exact_log2 (length);
5327 
5328   result_chain->quick_grow (length);
5329   memcpy (result_chain->address (), dr_chain.address (),
5330             length * sizeof (tree));
5331 
5332   if (length == 3)
5333     {
5334       /* vect_grouped_store_supported ensures that this is constant.  */
5335       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5336       unsigned int j0 = 0, j1 = 0, j2 = 0;
5337 
5338       vec_perm_builder sel (nelt, nelt, 1);
5339       sel.quick_grow (nelt);
5340       vec_perm_indices indices;
5341       for (j = 0; j < 3; j++)
5342         {
5343             int nelt0 = ((3 - j) * nelt) % 3;
5344             int nelt1 = ((3 - j) * nelt + 1) % 3;
5345             int nelt2 = ((3 - j) * nelt + 2) % 3;
5346 
5347             for (i = 0; i < nelt; i++)
5348               {
5349                 if (3 * i + nelt0 < nelt)
5350                     sel[3 * i + nelt0] = j0++;
5351                 if (3 * i + nelt1 < nelt)
5352                     sel[3 * i + nelt1] = nelt + j1++;
5353                 if (3 * i + nelt2 < nelt)
5354                     sel[3 * i + nelt2] = 0;
5355               }
5356             indices.new_vector (sel, 2, nelt);
5357             perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5358 
5359             for (i = 0; i < nelt; i++)
5360               {
5361                 if (3 * i + nelt0 < nelt)
5362                     sel[3 * i + nelt0] = 3 * i + nelt0;
5363                 if (3 * i + nelt1 < nelt)
5364                     sel[3 * i + nelt1] = 3 * i + nelt1;
5365                 if (3 * i + nelt2 < nelt)
5366                     sel[3 * i + nelt2] = nelt + j2++;
5367               }
5368             indices.new_vector (sel, 2, nelt);
5369             perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5370 
5371             vect1 = dr_chain[0];
5372             vect2 = dr_chain[1];
5373 
5374             /* Create interleaving stmt:
5375                low = VEC_PERM_EXPR <vect1, vect2,
5376                                           {j, nelt, *, j + 1, nelt + j + 1, *,
5377                                            j + 2, nelt + j + 2, *, ...}>  */
5378             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5379             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5380                                                      vect2, perm3_mask_low);
5381             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5382 
5383             vect1 = data_ref;
5384             vect2 = dr_chain[2];
5385             /* Create interleaving stmt:
5386                low = VEC_PERM_EXPR <vect1, vect2,
5387                                           {0, 1, nelt + j, 3, 4, nelt + j + 1,
5388                                            6, 7, nelt + j + 2, ...}>  */
5389             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5390             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5391                                                      vect2, perm3_mask_high);
5392             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5393             (*result_chain)[j] = data_ref;
5394           }
5395     }
5396   else
5397     {
5398       /* If length is not equal to 3 then only power of 2 is supported.  */
5399       gcc_assert (pow2p_hwi (length));
5400 
5401       /* The encoding has 2 interleaved stepped patterns.  */
5402       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5403       vec_perm_builder sel (nelt, 2, 3);
5404       sel.quick_grow (6);
5405       for (i = 0; i < 3; i++)
5406           {
5407             sel[i * 2] = i;
5408             sel[i * 2 + 1] = i + nelt;
5409           }
5410           vec_perm_indices indices (sel, 2, nelt);
5411           perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5412 
5413           for (i = 0; i < 6; i++)
5414             sel[i] += exact_div (nelt, 2);
5415           indices.new_vector (sel, 2, nelt);
5416           perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5417 
5418           for (i = 0, n = log_length; i < n; i++)
5419             {
5420               for (j = 0; j < length/2; j++)
5421                 {
5422                     vect1 = dr_chain[j];
5423                     vect2 = dr_chain[j+length/2];
5424 
5425                     /* Create interleaving stmt:
5426                        high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5427                                                                       ...}>  */
5428                     high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5429                     perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5430                                                              vect2, perm_mask_high);
5431                     vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5432                     (*result_chain)[2*j] = high;
5433 
5434                     /* Create interleaving stmt:
5435                        low = VEC_PERM_EXPR <vect1, vect2,
5436                                                   {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5437                                                    ...}>  */
5438                     low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5439                     perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5440                                                              vect2, perm_mask_low);
5441                     vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5442                     (*result_chain)[2*j+1] = low;
5443                 }
5444               memcpy (dr_chain.address (), result_chain->address (),
5445                         length * sizeof (tree));
5446             }
5447     }
5448 }
5449 
5450 /* Function vect_setup_realignment
5451 
5452    This function is called when vectorizing an unaligned load using
5453    the dr_explicit_realign[_optimized] scheme.
5454    This function generates the following code at the loop prolog:
5455 
5456       p = initial_addr;
5457    x  msq_init = *(floor(p));   # prolog load
5458       realignment_token = call target_builtin;
5459     loop:
5460    x  msq = phi (msq_init, ---)
5461 
5462    The stmts marked with x are generated only for the case of
5463    dr_explicit_realign_optimized.
5464 
5465    The code above sets up a new (vector) pointer, pointing to the first
5466    location accessed by STMT, and a "floor-aligned" load using that pointer.
5467    It also generates code to compute the "realignment-token" (if the relevant
5468    target hook was defined), and creates a phi-node at the loop-header bb
5469    whose arguments are the result of the prolog-load (created by this
5470    function) and the result of a load that takes place in the loop (to be
5471    created by the caller to this function).
5472 
5473    For the case of dr_explicit_realign_optimized:
5474    The caller to this function uses the phi-result (msq) to create the
5475    realignment code inside the loop, and sets up the missing phi argument,
5476    as follows:
5477     loop:
5478       msq = phi (msq_init, lsq)
5479       lsq = *(floor(p'));        # load in loop
5480       result = realign_load (msq, lsq, realignment_token);
5481 
5482    For the case of dr_explicit_realign:
5483     loop:
5484       msq = *(floor(p));      # load in loop
5485       p' = p + (VS-1);
5486       lsq = *(floor(p'));     # load in loop
5487       result = realign_load (msq, lsq, realignment_token);
5488 
5489    Input:
5490    STMT - (scalar) load stmt to be vectorized. This load accesses
5491           a memory location that may be unaligned.
5492    BSI - place where new code is to be inserted.
5493    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5494                                     is used.
5495 
5496    Output:
5497    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5498                        target hook, if defined.
5499    Return value - the result of the loop-header phi node.  */
5500 
5501 tree
vect_setup_realignment(gimple * stmt,gimple_stmt_iterator * gsi,tree * realignment_token,enum dr_alignment_support alignment_support_scheme,tree init_addr,struct loop ** at_loop)5502 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5503                         tree *realignment_token,
5504                               enum dr_alignment_support alignment_support_scheme,
5505                               tree init_addr,
5506                               struct loop **at_loop)
5507 {
5508   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5509   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5510   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5511   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5512   struct loop *loop = NULL;
5513   edge pe = NULL;
5514   tree scalar_dest = gimple_assign_lhs (stmt);
5515   tree vec_dest;
5516   gimple *inc;
5517   tree ptr;
5518   tree data_ref;
5519   basic_block new_bb;
5520   tree msq_init = NULL_TREE;
5521   tree new_temp;
5522   gphi *phi_stmt;
5523   tree msq = NULL_TREE;
5524   gimple_seq stmts = NULL;
5525   bool inv_p;
5526   bool compute_in_loop = false;
5527   bool nested_in_vect_loop = false;
5528   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5529   struct loop *loop_for_initial_load = NULL;
5530 
5531   if (loop_vinfo)
5532     {
5533       loop = LOOP_VINFO_LOOP (loop_vinfo);
5534       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5535     }
5536 
5537   gcc_assert (alignment_support_scheme == dr_explicit_realign
5538                 || alignment_support_scheme == dr_explicit_realign_optimized);
5539 
5540   /* We need to generate three things:
5541      1. the misalignment computation
5542      2. the extra vector load (for the optimized realignment scheme).
5543      3. the phi node for the two vectors from which the realignment is
5544       done (for the optimized realignment scheme).  */
5545 
5546   /* 1. Determine where to generate the misalignment computation.
5547 
5548      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5549      calculation will be generated by this function, outside the loop (in the
5550      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5551      caller, inside the loop.
5552 
5553      Background: If the misalignment remains fixed throughout the iterations of
5554      the loop, then both realignment schemes are applicable, and also the
5555      misalignment computation can be done outside LOOP.  This is because we are
5556      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5557      are a multiple of VS (the Vector Size), and therefore the misalignment in
5558      different vectorized LOOP iterations is always the same.
5559      The problem arises only if the memory access is in an inner-loop nested
5560      inside LOOP, which is now being vectorized using outer-loop vectorization.
5561      This is the only case when the misalignment of the memory access may not
5562      remain fixed throughout the iterations of the inner-loop (as explained in
5563      detail in vect_supportable_dr_alignment).  In this case, not only is the
5564      optimized realignment scheme not applicable, but also the misalignment
5565      computation (and generation of the realignment token that is passed to
5566      REALIGN_LOAD) have to be done inside the loop.
5567 
5568      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5569      or not, which in turn determines if the misalignment is computed inside
5570      the inner-loop, or outside LOOP.  */
5571 
5572   if (init_addr != NULL_TREE || !loop_vinfo)
5573     {
5574       compute_in_loop = true;
5575       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5576     }
5577 
5578 
5579   /* 2. Determine where to generate the extra vector load.
5580 
5581      For the optimized realignment scheme, instead of generating two vector
5582      loads in each iteration, we generate a single extra vector load in the
5583      preheader of the loop, and in each iteration reuse the result of the
5584      vector load from the previous iteration.  In case the memory access is in
5585      an inner-loop nested inside LOOP, which is now being vectorized using
5586      outer-loop vectorization, we need to determine whether this initial vector
5587      load should be generated at the preheader of the inner-loop, or can be
5588      generated at the preheader of LOOP.  If the memory access has no evolution
5589      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5590      to be generated inside LOOP (in the preheader of the inner-loop).  */
5591 
5592   if (nested_in_vect_loop)
5593     {
5594       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5595       bool invariant_in_outerloop =
5596             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5597       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5598     }
5599   else
5600     loop_for_initial_load = loop;
5601   if (at_loop)
5602     *at_loop = loop_for_initial_load;
5603 
5604   if (loop_for_initial_load)
5605     pe = loop_preheader_edge (loop_for_initial_load);
5606 
5607   /* 3. For the case of the optimized realignment, create the first vector
5608       load at the loop preheader.  */
5609 
5610   if (alignment_support_scheme == dr_explicit_realign_optimized)
5611     {
5612       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5613       gassign *new_stmt;
5614 
5615       gcc_assert (!compute_in_loop);
5616       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5617       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5618                                               NULL_TREE, &init_addr, NULL, &inc,
5619                                               true, &inv_p);
5620       if (TREE_CODE (ptr) == SSA_NAME)
5621           new_temp = copy_ssa_name (ptr);
5622       else
5623           new_temp = make_ssa_name (TREE_TYPE (ptr));
5624       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5625       new_stmt = gimple_build_assign
5626                        (new_temp, BIT_AND_EXPR, ptr,
5627                         build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5628       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5629       gcc_assert (!new_bb);
5630       data_ref
5631           = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5632                       build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5633       vect_copy_ref_info (data_ref, DR_REF (dr));
5634       new_stmt = gimple_build_assign (vec_dest, data_ref);
5635       new_temp = make_ssa_name (vec_dest, new_stmt);
5636       gimple_assign_set_lhs (new_stmt, new_temp);
5637       if (pe)
5638         {
5639           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5640           gcc_assert (!new_bb);
5641         }
5642       else
5643          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5644 
5645       msq_init = gimple_assign_lhs (new_stmt);
5646     }
5647 
5648   /* 4. Create realignment token using a target builtin, if available.
5649       It is done either inside the containing loop, or before LOOP (as
5650       determined above).  */
5651 
5652   if (targetm.vectorize.builtin_mask_for_load)
5653     {
5654       gcall *new_stmt;
5655       tree builtin_decl;
5656 
5657       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5658       if (!init_addr)
5659           {
5660             /* Generate the INIT_ADDR computation outside LOOP.  */
5661             init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5662                                                                           NULL_TREE);
5663           if (loop)
5664             {
5665                 pe = loop_preheader_edge (loop);
5666                 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5667                 gcc_assert (!new_bb);
5668             }
5669           else
5670              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5671           }
5672 
5673       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5674       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5675       vec_dest =
5676           vect_create_destination_var (scalar_dest,
5677                                              gimple_call_return_type (new_stmt));
5678       new_temp = make_ssa_name (vec_dest, new_stmt);
5679       gimple_call_set_lhs (new_stmt, new_temp);
5680 
5681       if (compute_in_loop)
5682           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5683       else
5684           {
5685             /* Generate the misalignment computation outside LOOP.  */
5686             pe = loop_preheader_edge (loop);
5687             new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5688             gcc_assert (!new_bb);
5689           }
5690 
5691       *realignment_token = gimple_call_lhs (new_stmt);
5692 
5693       /* The result of the CALL_EXPR to this builtin is determined from
5694          the value of the parameter and no global variables are touched
5695          which makes the builtin a "const" function.  Requiring the
5696          builtin to have the "const" attribute makes it unnecessary
5697          to call mark_call_clobbered.  */
5698       gcc_assert (TREE_READONLY (builtin_decl));
5699     }
5700 
5701   if (alignment_support_scheme == dr_explicit_realign)
5702     return msq;
5703 
5704   gcc_assert (!compute_in_loop);
5705   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5706 
5707 
5708   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5709 
5710   pe = loop_preheader_edge (containing_loop);
5711   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5712   msq = make_ssa_name (vec_dest);
5713   phi_stmt = create_phi_node (msq, containing_loop->header);
5714   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5715 
5716   return msq;
5717 }
5718 
5719 
5720 /* Function vect_grouped_load_supported.
5721 
5722    COUNT is the size of the load group (the number of statements plus the
5723    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5724    only one statement, with a gap of COUNT - 1.
5725 
5726    Returns true if a suitable permute exists.  */
5727 
5728 bool
vect_grouped_load_supported(tree vectype,bool single_element_p,unsigned HOST_WIDE_INT count)5729 vect_grouped_load_supported (tree vectype, bool single_element_p,
5730                                    unsigned HOST_WIDE_INT count)
5731 {
5732   machine_mode mode = TYPE_MODE (vectype);
5733 
5734   /* If this is single-element interleaving with an element distance
5735      that leaves unused vector loads around punt - we at least create
5736      very sub-optimal code in that case (and blow up memory,
5737      see PR65518).  */
5738   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5739     {
5740       if (dump_enabled_p ())
5741           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5742                                "single-element interleaving not supported "
5743                                "for not adjacent vector loads\n");
5744       return false;
5745     }
5746 
5747   /* vect_permute_load_chain requires the group size to be equal to 3 or
5748      be a power of two.  */
5749   if (count != 3 && exact_log2 (count) == -1)
5750     {
5751       if (dump_enabled_p ())
5752           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5753                                "the size of the group of accesses"
5754                                " is not a power of 2 or not equal to 3\n");
5755       return false;
5756     }
5757 
5758   /* Check that the permutation is supported.  */
5759   if (VECTOR_MODE_P (mode))
5760     {
5761       unsigned int i, j;
5762       if (count == 3)
5763           {
5764             unsigned int nelt;
5765             if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5766               {
5767                 if (dump_enabled_p ())
5768                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5769                                          "cannot handle groups of 3 loads for"
5770                                          " variable-length vectors\n");
5771                 return false;
5772               }
5773 
5774             vec_perm_builder sel (nelt, nelt, 1);
5775             sel.quick_grow (nelt);
5776             vec_perm_indices indices;
5777             unsigned int k;
5778             for (k = 0; k < 3; k++)
5779               {
5780                 for (i = 0; i < nelt; i++)
5781                     if (3 * i + k < 2 * nelt)
5782                       sel[i] = 3 * i + k;
5783                     else
5784                       sel[i] = 0;
5785                 indices.new_vector (sel, 2, nelt);
5786                 if (!can_vec_perm_const_p (mode, indices))
5787                     {
5788                       if (dump_enabled_p ())
5789                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790                                              "shuffle of 3 loads is not supported by"
5791                                              " target\n");
5792                       return false;
5793                     }
5794                 for (i = 0, j = 0; i < nelt; i++)
5795                     if (3 * i + k < 2 * nelt)
5796                       sel[i] = i;
5797                     else
5798                       sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5799                 indices.new_vector (sel, 2, nelt);
5800                 if (!can_vec_perm_const_p (mode, indices))
5801                     {
5802                       if (dump_enabled_p ())
5803                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804                                              "shuffle of 3 loads is not supported by"
5805                                              " target\n");
5806                       return false;
5807                     }
5808               }
5809             return true;
5810           }
5811       else
5812           {
5813             /* If length is not equal to 3 then only power of 2 is supported.  */
5814             gcc_assert (pow2p_hwi (count));
5815             poly_uint64 nelt = GET_MODE_NUNITS (mode);
5816 
5817             /* The encoding has a single stepped pattern.  */
5818             vec_perm_builder sel (nelt, 1, 3);
5819             sel.quick_grow (3);
5820             for (i = 0; i < 3; i++)
5821               sel[i] = i * 2;
5822             vec_perm_indices indices (sel, 2, nelt);
5823             if (can_vec_perm_const_p (mode, indices))
5824               {
5825                 for (i = 0; i < 3; i++)
5826                     sel[i] = i * 2 + 1;
5827                 indices.new_vector (sel, 2, nelt);
5828                 if (can_vec_perm_const_p (mode, indices))
5829                     return true;
5830               }
5831         }
5832     }
5833 
5834   if (dump_enabled_p ())
5835     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5836                          "extract even/odd not supported by target\n");
5837   return false;
5838 }
5839 
5840 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5841    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5842 
5843 bool
vect_load_lanes_supported(tree vectype,unsigned HOST_WIDE_INT count,bool masked_p)5844 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5845                                  bool masked_p)
5846 {
5847   if (masked_p)
5848     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5849                                                    vec_mask_load_lanes_optab,
5850                                                    vectype, count);
5851   else
5852     return vect_lanes_optab_supported_p ("vec_load_lanes",
5853                                                    vec_load_lanes_optab,
5854                                                    vectype, count);
5855 }
5856 
5857 /* Function vect_permute_load_chain.
5858 
5859    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5860    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5861    the input data correctly.  Return the final references for loads in
5862    RESULT_CHAIN.
5863 
5864    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5865    The input is 4 vectors each containing 8 elements. We assign a number to each
5866    element, the input sequence is:
5867 
5868    1st vec:   0  1  2  3  4  5  6  7
5869    2nd vec:   8  9 10 11 12 13 14 15
5870    3rd vec:  16 17 18 19 20 21 22 23
5871    4th vec:  24 25 26 27 28 29 30 31
5872 
5873    The output sequence should be:
5874 
5875    1st vec:  0 4  8 12 16 20 24 28
5876    2nd vec:  1 5  9 13 17 21 25 29
5877    3rd vec:  2 6 10 14 18 22 26 30
5878    4th vec:  3 7 11 15 19 23 27 31
5879 
5880    i.e., the first output vector should contain the first elements of each
5881    interleaving group, etc.
5882 
5883    We use extract_even/odd instructions to create such output.  The input of
5884    each extract_even/odd operation is two vectors
5885    1st vec    2nd vec
5886    0 1 2 3    4 5 6 7
5887 
5888    and the output is the vector of extracted even/odd elements.  The output of
5889    extract_even will be:   0 2 4 6
5890    and of extract_odd:     1 3 5 7
5891 
5892 
5893    The permutation is done in log LENGTH stages.  In each stage extract_even
5894    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5895    their order.  In our example,
5896 
5897    E1: extract_even (1st vec, 2nd vec)
5898    E2: extract_odd (1st vec, 2nd vec)
5899    E3: extract_even (3rd vec, 4th vec)
5900    E4: extract_odd (3rd vec, 4th vec)
5901 
5902    The output for the first stage will be:
5903 
5904    E1:  0  2  4  6  8 10 12 14
5905    E2:  1  3  5  7  9 11 13 15
5906    E3: 16 18 20 22 24 26 28 30
5907    E4: 17 19 21 23 25 27 29 31
5908 
5909    In order to proceed and create the correct sequence for the next stage (or
5910    for the correct output, if the second stage is the last one, as in our
5911    example), we first put the output of extract_even operation and then the
5912    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5913    The input for the second stage is:
5914 
5915    1st vec (E1):  0  2  4  6  8 10 12 14
5916    2nd vec (E3): 16 18 20 22 24 26 28 30
5917    3rd vec (E2):  1  3  5  7  9 11 13 15
5918    4th vec (E4): 17 19 21 23 25 27 29 31
5919 
5920    The output of the second stage:
5921 
5922    E1: 0 4  8 12 16 20 24 28
5923    E2: 2 6 10 14 18 22 26 30
5924    E3: 1 5  9 13 17 21 25 29
5925    E4: 3 7 11 15 19 23 27 31
5926 
5927    And RESULT_CHAIN after reordering:
5928 
5929    1st vec (E1):  0 4  8 12 16 20 24 28
5930    2nd vec (E3):  1 5  9 13 17 21 25 29
5931    3rd vec (E2):  2 6 10 14 18 22 26 30
5932    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5933 
5934 static void
vect_permute_load_chain(vec<tree> dr_chain,unsigned int length,gimple * stmt,gimple_stmt_iterator * gsi,vec<tree> * result_chain)5935 vect_permute_load_chain (vec<tree> dr_chain,
5936                                unsigned int length,
5937                                gimple *stmt,
5938                                gimple_stmt_iterator *gsi,
5939                                vec<tree> *result_chain)
5940 {
5941   tree data_ref, first_vect, second_vect;
5942   tree perm_mask_even, perm_mask_odd;
5943   tree perm3_mask_low, perm3_mask_high;
5944   gimple *perm_stmt;
5945   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5946   unsigned int i, j, log_length = exact_log2 (length);
5947 
5948   result_chain->quick_grow (length);
5949   memcpy (result_chain->address (), dr_chain.address (),
5950             length * sizeof (tree));
5951 
5952   if (length == 3)
5953     {
5954       /* vect_grouped_load_supported ensures that this is constant.  */
5955       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5956       unsigned int k;
5957 
5958       vec_perm_builder sel (nelt, nelt, 1);
5959       sel.quick_grow (nelt);
5960       vec_perm_indices indices;
5961       for (k = 0; k < 3; k++)
5962           {
5963             for (i = 0; i < nelt; i++)
5964               if (3 * i + k < 2 * nelt)
5965                 sel[i] = 3 * i + k;
5966               else
5967                 sel[i] = 0;
5968             indices.new_vector (sel, 2, nelt);
5969             perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5970 
5971             for (i = 0, j = 0; i < nelt; i++)
5972               if (3 * i + k < 2 * nelt)
5973                 sel[i] = i;
5974               else
5975                 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5976             indices.new_vector (sel, 2, nelt);
5977             perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5978 
5979             first_vect = dr_chain[0];
5980             second_vect = dr_chain[1];
5981 
5982             /* Create interleaving stmt (low part of):
5983                low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5984                                                                            ...}>  */
5985             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5986             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5987                                                      second_vect, perm3_mask_low);
5988             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5989 
5990             /* Create interleaving stmt (high part of):
5991                high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5992                                                                             ...}>  */
5993             first_vect = data_ref;
5994             second_vect = dr_chain[2];
5995             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5996             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5997                                                      second_vect, perm3_mask_high);
5998             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5999             (*result_chain)[k] = data_ref;
6000           }
6001     }
6002   else
6003     {
6004       /* If length is not equal to 3 then only power of 2 is supported.  */
6005       gcc_assert (pow2p_hwi (length));
6006 
6007       /* The encoding has a single stepped pattern.  */
6008       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6009       vec_perm_builder sel (nelt, 1, 3);
6010       sel.quick_grow (3);
6011       for (i = 0; i < 3; ++i)
6012           sel[i] = i * 2;
6013       vec_perm_indices indices (sel, 2, nelt);
6014       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6015 
6016       for (i = 0; i < 3; ++i)
6017           sel[i] = i * 2 + 1;
6018       indices.new_vector (sel, 2, nelt);
6019       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6020 
6021       for (i = 0; i < log_length; i++)
6022           {
6023             for (j = 0; j < length; j += 2)
6024               {
6025                 first_vect = dr_chain[j];
6026                 second_vect = dr_chain[j+1];
6027 
6028                 /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6029                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6030                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6031                                                          first_vect, second_vect,
6032                                                          perm_mask_even);
6033                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6034                 (*result_chain)[j/2] = data_ref;
6035 
6036                 /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6037                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6038                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6039                                                          first_vect, second_vect,
6040                                                          perm_mask_odd);
6041                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6042                 (*result_chain)[j/2+length/2] = data_ref;
6043               }
6044             memcpy (dr_chain.address (), result_chain->address (),
6045                       length * sizeof (tree));
6046           }
6047     }
6048 }
6049 
6050 /* Function vect_shift_permute_load_chain.
6051 
6052    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6053    sequence of stmts to reorder the input data accordingly.
6054    Return the final references for loads in RESULT_CHAIN.
6055    Return true if successed, false otherwise.
6056 
6057    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6058    The input is 3 vectors each containing 8 elements.  We assign a
6059    number to each element, the input sequence is:
6060 
6061    1st vec:   0  1  2  3  4  5  6  7
6062    2nd vec:   8  9 10 11 12 13 14 15
6063    3rd vec:  16 17 18 19 20 21 22 23
6064 
6065    The output sequence should be:
6066 
6067    1st vec:  0 3 6  9 12 15 18 21
6068    2nd vec:  1 4 7 10 13 16 19 22
6069    3rd vec:  2 5 8 11 14 17 20 23
6070 
6071    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6072 
6073    First we shuffle all 3 vectors to get correct elements order:
6074 
6075    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6076    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6077    3rd vec:  (16 19 22) (17 20 23) (18 21)
6078 
6079    Next we unite and shift vector 3 times:
6080 
6081    1st step:
6082      shift right by 6 the concatenation of:
6083      "1st vec" and  "2nd vec"
6084        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6085      "2nd vec" and  "3rd vec"
6086        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6087      "3rd vec" and  "1st vec"
6088        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6089                                    | New vectors                   |
6090 
6091      So that now new vectors are:
6092 
6093      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6094      2nd vec:  (10 13) (16 19 22) (17 20 23)
6095      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6096 
6097    2nd step:
6098      shift right by 5 the concatenation of:
6099      "1st vec" and  "3rd vec"
6100        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6101      "2nd vec" and  "1st vec"
6102        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6103      "3rd vec" and  "2nd vec"
6104        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6105                                 | New vectors                   |
6106 
6107      So that now new vectors are:
6108 
6109      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6110      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6111      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6112 
6113    3rd step:
6114      shift right by 5 the concatenation of:
6115      "1st vec" and  "1st vec"
6116        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6117      shift right by 3 the concatenation of:
6118      "2nd vec" and  "2nd vec"
6119                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6120                                 | New vectors                   |
6121 
6122      So that now all vectors are READY:
6123      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6124      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6125      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6126 
6127    This algorithm is faster than one in vect_permute_load_chain if:
6128      1.  "shift of a concatination" is faster than general permutation.
6129            This is usually so.
6130      2.  The TARGET machine can't execute vector instructions in parallel.
6131            This is because each step of the algorithm depends on previous.
6132            The algorithm in vect_permute_load_chain is much more parallel.
6133 
6134    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6135 */
6136 
6137 static bool
vect_shift_permute_load_chain(vec<tree> dr_chain,unsigned int length,gimple * stmt,gimple_stmt_iterator * gsi,vec<tree> * result_chain)6138 vect_shift_permute_load_chain (vec<tree> dr_chain,
6139                                      unsigned int length,
6140                                      gimple *stmt,
6141                                      gimple_stmt_iterator *gsi,
6142                                      vec<tree> *result_chain)
6143 {
6144   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6145   tree perm2_mask1, perm2_mask2, perm3_mask;
6146   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6147   gimple *perm_stmt;
6148 
6149   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6150   unsigned int i;
6151   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6152   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6153 
6154   unsigned HOST_WIDE_INT nelt, vf;
6155   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6156       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6157     /* Not supported for variable-length vectors.  */
6158     return false;
6159 
6160   vec_perm_builder sel (nelt, nelt, 1);
6161   sel.quick_grow (nelt);
6162 
6163   result_chain->quick_grow (length);
6164   memcpy (result_chain->address (), dr_chain.address (),
6165             length * sizeof (tree));
6166 
6167   if (pow2p_hwi (length) && vf > 4)
6168     {
6169       unsigned int j, log_length = exact_log2 (length);
6170       for (i = 0; i < nelt / 2; ++i)
6171           sel[i] = i * 2;
6172       for (i = 0; i < nelt / 2; ++i)
6173           sel[nelt / 2 + i] = i * 2 + 1;
6174       vec_perm_indices indices (sel, 2, nelt);
6175       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6176           {
6177             if (dump_enabled_p ())
6178               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6179                                    "shuffle of 2 fields structure is not \
6180                                     supported by target\n");
6181             return false;
6182           }
6183       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6184 
6185       for (i = 0; i < nelt / 2; ++i)
6186           sel[i] = i * 2 + 1;
6187       for (i = 0; i < nelt / 2; ++i)
6188           sel[nelt / 2 + i] = i * 2;
6189       indices.new_vector (sel, 2, nelt);
6190       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6191           {
6192             if (dump_enabled_p ())
6193               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6194                                    "shuffle of 2 fields structure is not \
6195                                     supported by target\n");
6196             return false;
6197           }
6198       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6199 
6200       /* Generating permutation constant to shift all elements.
6201            For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6202       for (i = 0; i < nelt; i++)
6203           sel[i] = nelt / 2 + i;
6204       indices.new_vector (sel, 2, nelt);
6205       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6206           {
6207             if (dump_enabled_p ())
6208               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6209                                    "shift permutation is not supported by target\n");
6210             return false;
6211           }
6212       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6213 
6214       /* Generating permutation constant to select vector from 2.
6215            For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6216       for (i = 0; i < nelt / 2; i++)
6217           sel[i] = i;
6218       for (i = nelt / 2; i < nelt; i++)
6219           sel[i] = nelt + i;
6220       indices.new_vector (sel, 2, nelt);
6221       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6222           {
6223             if (dump_enabled_p ())
6224               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6225                                    "select is not supported by target\n");
6226             return false;
6227           }
6228       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6229 
6230       for (i = 0; i < log_length; i++)
6231           {
6232             for (j = 0; j < length; j += 2)
6233               {
6234                 first_vect = dr_chain[j];
6235                 second_vect = dr_chain[j + 1];
6236 
6237                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6238                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6239                                                          first_vect, first_vect,
6240                                                          perm2_mask1);
6241                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6242                 vect[0] = data_ref;
6243 
6244                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6245                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6246                                                          second_vect, second_vect,
6247                                                          perm2_mask2);
6248                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6249                 vect[1] = data_ref;
6250 
6251                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6252                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6253                                                          vect[0], vect[1], shift1_mask);
6254                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6255                 (*result_chain)[j/2 + length/2] = data_ref;
6256 
6257                 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6258                 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6259                                                          vect[0], vect[1], select_mask);
6260                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6261                 (*result_chain)[j/2] = data_ref;
6262               }
6263             memcpy (dr_chain.address (), result_chain->address (),
6264                       length * sizeof (tree));
6265           }
6266       return true;
6267     }
6268   if (length == 3 && vf > 2)
6269     {
6270       unsigned int k = 0, l = 0;
6271 
6272       /* Generating permutation constant to get all elements in rigth order.
6273            For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6274       for (i = 0; i < nelt; i++)
6275           {
6276             if (3 * k + (l % 3) >= nelt)
6277               {
6278                 k = 0;
6279                 l += (3 - (nelt % 3));
6280               }
6281             sel[i] = 3 * k + (l % 3);
6282             k++;
6283           }
6284       vec_perm_indices indices (sel, 2, nelt);
6285       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6286           {
6287             if (dump_enabled_p ())
6288               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289                                    "shuffle of 3 fields structure is not \
6290                                     supported by target\n");
6291             return false;
6292           }
6293       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6294 
6295       /* Generating permutation constant to shift all elements.
6296            For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6297       for (i = 0; i < nelt; i++)
6298           sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6299       indices.new_vector (sel, 2, nelt);
6300       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6301           {
6302             if (dump_enabled_p ())
6303               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304                                    "shift permutation is not supported by target\n");
6305             return false;
6306           }
6307       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6308 
6309       /* Generating permutation constant to shift all elements.
6310            For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6311       for (i = 0; i < nelt; i++)
6312           sel[i] = 2 * (nelt / 3) + 1 + i;
6313       indices.new_vector (sel, 2, nelt);
6314       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6315           {
6316             if (dump_enabled_p ())
6317               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6318                                    "shift permutation is not supported by target\n");
6319             return false;
6320           }
6321       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6322 
6323       /* Generating permutation constant to shift all elements.
6324            For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6325       for (i = 0; i < nelt; i++)
6326           sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6327       indices.new_vector (sel, 2, nelt);
6328       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6329           {
6330             if (dump_enabled_p ())
6331               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6332                                    "shift permutation is not supported by target\n");
6333             return false;
6334           }
6335       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6336 
6337       /* Generating permutation constant to shift all elements.
6338            For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6339       for (i = 0; i < nelt; i++)
6340           sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6341       indices.new_vector (sel, 2, nelt);
6342       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6343           {
6344             if (dump_enabled_p ())
6345               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6346                                    "shift permutation is not supported by target\n");
6347             return false;
6348           }
6349       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6350 
6351       for (k = 0; k < 3; k++)
6352           {
6353             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6354             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6355                                                      dr_chain[k], dr_chain[k],
6356                                                      perm3_mask);
6357             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6358             vect[k] = data_ref;
6359           }
6360 
6361       for (k = 0; k < 3; k++)
6362           {
6363             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6364             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6365                                                      vect[k % 3], vect[(k + 1) % 3],
6366                                                      shift1_mask);
6367             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6368             vect_shift[k] = data_ref;
6369           }
6370 
6371       for (k = 0; k < 3; k++)
6372           {
6373             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6374             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6375                                                      vect_shift[(4 - k) % 3],
6376                                                      vect_shift[(3 - k) % 3],
6377                                                      shift2_mask);
6378             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6379             vect[k] = data_ref;
6380           }
6381 
6382       (*result_chain)[3 - (nelt % 3)] = vect[2];
6383 
6384       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6385       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6386                                                vect[0], shift3_mask);
6387       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6388       (*result_chain)[nelt % 3] = data_ref;
6389 
6390       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6391       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6392                                                vect[1], shift4_mask);
6393       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6394       (*result_chain)[0] = data_ref;
6395       return true;
6396     }
6397   return false;
6398 }
6399 
6400 /* Function vect_transform_grouped_load.
6401 
6402    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6403    to perform their permutation and ascribe the result vectorized statements to
6404    the scalar statements.
6405 */
6406 
6407 void
vect_transform_grouped_load(gimple * stmt,vec<tree> dr_chain,int size,gimple_stmt_iterator * gsi)6408 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6409                                    gimple_stmt_iterator *gsi)
6410 {
6411   machine_mode mode;
6412   vec<tree> result_chain = vNULL;
6413 
6414   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6415      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6416      vectors, that are ready for vector computation.  */
6417   result_chain.create (size);
6418 
6419   /* If reassociation width for vector type is 2 or greater target machine can
6420      execute 2 or more vector instructions in parallel.  Otherwise try to
6421      get chain for loads group using vect_shift_permute_load_chain.  */
6422   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6423   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6424       || pow2p_hwi (size)
6425       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6426                                                    gsi, &result_chain))
6427     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6428   vect_record_grouped_load_vectors (stmt, result_chain);
6429   result_chain.release ();
6430 }
6431 
6432 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6433    generated as part of the vectorization of STMT.  Assign the statement
6434    for each vector to the associated scalar statement.  */
6435 
6436 void
vect_record_grouped_load_vectors(gimple * stmt,vec<tree> result_chain)6437 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6438 {
6439   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6440   gimple *next_stmt, *new_stmt;
6441   unsigned int i, gap_count;
6442   tree tmp_data_ref;
6443 
6444   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6445      Since we scan the chain starting from it's first node, their order
6446      corresponds the order of data-refs in RESULT_CHAIN.  */
6447   next_stmt = first_stmt;
6448   gap_count = 1;
6449   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6450     {
6451       if (!next_stmt)
6452           break;
6453 
6454       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6455        code elimination pass later.  No need to check for the first stmt in
6456        the group, since it always exists.
6457        GROUP_GAP is the number of steps in elements from the previous
6458        access (if there is no gap GROUP_GAP is 1).  We skip loads that
6459        correspond to the gaps.  */
6460       if (next_stmt != first_stmt
6461           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6462       {
6463         gap_count++;
6464         continue;
6465       }
6466 
6467       while (next_stmt)
6468         {
6469             new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6470             /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6471                copies, and we put the new vector statement in the first available
6472                RELATED_STMT.  */
6473             if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6474               STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6475             else
6476             {
6477               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6478                 {
6479                       gimple *prev_stmt =
6480                         STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6481                       gimple *rel_stmt =
6482                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6483                     while (rel_stmt)
6484                         {
6485                           prev_stmt = rel_stmt;
6486                           rel_stmt =
6487                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6488                         }
6489 
6490                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6491                     new_stmt;
6492                 }
6493             }
6494 
6495             next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6496             gap_count = 1;
6497             /* If NEXT_STMT accesses the same DR as the previous statement,
6498                put the same TMP_DATA_REF as its vectorized statement; otherwise
6499                get the next data-ref from RESULT_CHAIN.  */
6500             if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6501               break;
6502         }
6503     }
6504 }
6505 
6506 /* Function vect_force_dr_alignment_p.
6507 
6508    Returns whether the alignment of a DECL can be forced to be aligned
6509    on ALIGNMENT bit boundary.  */
6510 
6511 bool
vect_can_force_dr_alignment_p(const_tree decl,unsigned int alignment)6512 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6513 {
6514   if (!VAR_P (decl))
6515     return false;
6516 
6517   if (decl_in_symtab_p (decl)
6518       && !symtab_node::get (decl)->can_increase_alignment_p ())
6519     return false;
6520 
6521   if (TREE_STATIC (decl))
6522     return (alignment <= MAX_OFILE_ALIGNMENT);
6523   else
6524     return (alignment <= MAX_STACK_ALIGNMENT);
6525 }
6526 
6527 
6528 /* Return whether the data reference DR is supported with respect to its
6529    alignment.
6530    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6531    it is aligned, i.e., check if it is possible to vectorize it with different
6532    alignment.  */
6533 
6534 enum dr_alignment_support
vect_supportable_dr_alignment(struct data_reference * dr,bool check_aligned_accesses)6535 vect_supportable_dr_alignment (struct data_reference *dr,
6536                                bool check_aligned_accesses)
6537 {
6538   gimple *stmt = DR_STMT (dr);
6539   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6540   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6541   machine_mode mode = TYPE_MODE (vectype);
6542   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6543   struct loop *vect_loop = NULL;
6544   bool nested_in_vect_loop = false;
6545 
6546   if (aligned_access_p (dr) && !check_aligned_accesses)
6547     return dr_aligned;
6548 
6549   /* For now assume all conditional loads/stores support unaligned
6550      access without any special code.  */
6551   if (is_gimple_call (stmt)
6552       && gimple_call_internal_p (stmt)
6553       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6554             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6555     return dr_unaligned_supported;
6556 
6557   if (loop_vinfo)
6558     {
6559       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6560       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6561     }
6562 
6563   /* Possibly unaligned access.  */
6564 
6565   /* We can choose between using the implicit realignment scheme (generating
6566      a misaligned_move stmt) and the explicit realignment scheme (generating
6567      aligned loads with a REALIGN_LOAD).  There are two variants to the
6568      explicit realignment scheme: optimized, and unoptimized.
6569      We can optimize the realignment only if the step between consecutive
6570      vector loads is equal to the vector size.  Since the vector memory
6571      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6572      is guaranteed that the misalignment amount remains the same throughout the
6573      execution of the vectorized loop.  Therefore, we can create the
6574      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6575      at the loop preheader.
6576 
6577      However, in the case of outer-loop vectorization, when vectorizing a
6578      memory access in the inner-loop nested within the LOOP that is now being
6579      vectorized, while it is guaranteed that the misalignment of the
6580      vectorized memory access will remain the same in different outer-loop
6581      iterations, it is *not* guaranteed that is will remain the same throughout
6582      the execution of the inner-loop.  This is because the inner-loop advances
6583      with the original scalar step (and not in steps of VS).  If the inner-loop
6584      step happens to be a multiple of VS, then the misalignment remains fixed
6585      and we can use the optimized realignment scheme.  For example:
6586 
6587       for (i=0; i<N; i++)
6588         for (j=0; j<M; j++)
6589           s += a[i+j];
6590 
6591      When vectorizing the i-loop in the above example, the step between
6592      consecutive vector loads is 1, and so the misalignment does not remain
6593      fixed across the execution of the inner-loop, and the realignment cannot
6594      be optimized (as illustrated in the following pseudo vectorized loop):
6595 
6596       for (i=0; i<N; i+=4)
6597         for (j=0; j<M; j++){
6598           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6599                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6600                          // (assuming that we start from an aligned address).
6601           }
6602 
6603      We therefore have to use the unoptimized realignment scheme:
6604 
6605       for (i=0; i<N; i+=4)
6606           for (j=k; j<M; j+=4)
6607           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6608                            // that the misalignment of the initial address is
6609                            // 0).
6610 
6611      The loop can then be vectorized as follows:
6612 
6613       for (k=0; k<4; k++){
6614         rt = get_realignment_token (&vp[k]);
6615         for (i=0; i<N; i+=4){
6616           v1 = vp[i+k];
6617           for (j=k; j<M; j+=4){
6618             v2 = vp[i+j+VS-1];
6619             va = REALIGN_LOAD <v1,v2,rt>;
6620             vs += va;
6621             v1 = v2;
6622           }
6623         }
6624     } */
6625 
6626   if (DR_IS_READ (dr))
6627     {
6628       bool is_packed = false;
6629       tree type = (TREE_TYPE (DR_REF (dr)));
6630 
6631       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6632             && (!targetm.vectorize.builtin_mask_for_load
6633                 || targetm.vectorize.builtin_mask_for_load ()))
6634           {
6635             tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6636 
6637             /* If we are doing SLP then the accesses need not have the
6638                same alignment, instead it depends on the SLP group size.  */
6639             if (loop_vinfo
6640                 && STMT_SLP_TYPE (stmt_info)
6641                 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6642                                     * GROUP_SIZE (vinfo_for_stmt
6643                                                       (GROUP_FIRST_ELEMENT (stmt_info))),
6644                                     TYPE_VECTOR_SUBPARTS (vectype)))
6645               ;
6646             else if (!loop_vinfo
6647                        || (nested_in_vect_loop
6648                            && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6649                                             GET_MODE_SIZE (TYPE_MODE (vectype)))))
6650               return dr_explicit_realign;
6651             else
6652               return dr_explicit_realign_optimized;
6653           }
6654       if (!known_alignment_for_access_p (dr))
6655           is_packed = not_size_aligned (DR_REF (dr));
6656 
6657       if (targetm.vectorize.support_vector_misalignment
6658               (mode, type, DR_MISALIGNMENT (dr), is_packed))
6659           /* Can't software pipeline the loads, but can at least do them.  */
6660           return dr_unaligned_supported;
6661     }
6662   else
6663     {
6664       bool is_packed = false;
6665       tree type = (TREE_TYPE (DR_REF (dr)));
6666 
6667       if (!known_alignment_for_access_p (dr))
6668           is_packed = not_size_aligned (DR_REF (dr));
6669 
6670      if (targetm.vectorize.support_vector_misalignment
6671              (mode, type, DR_MISALIGNMENT (dr), is_packed))
6672        return dr_unaligned_supported;
6673     }
6674 
6675   /* Unsupported.  */
6676   return dr_unaligned_unsupported;
6677 }
6678