1 /* Loop distribution.
2    Copyright (C) 2006-2022 Free Software Foundation, Inc.
3    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4    and Sebastian Pop <sebastian.pop@amd.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 /* This pass performs loop distribution: for example, the loop
23 
24    |DO I = 2, N
25    |    A(I) = B(I) + C
26    |    D(I) = A(I-1)*E
27    |ENDDO
28 
29    is transformed to
30 
31    |DOALL I = 2, N
32    |   A(I) = B(I) + C
33    |ENDDO
34    |
35    |DOALL I = 2, N
36    |   D(I) = A(I-1)*E
37    |ENDDO
38 
39    Loop distribution is the dual of loop fusion.  It separates statements
40    of a loop (or loop nest) into multiple loops (or loop nests) with the
41    same loop header.  The major goal is to separate statements which may
42    be vectorized from those that can't.  This pass implements distribution
43    in the following steps:
44 
45      1) Seed partitions with specific type statements.  For now we support
46           two types seed statements: statement defining variable used outside
47           of loop; statement storing to memory.
48      2) Build reduced dependence graph (RDG) for loop to be distributed.
49           The vertices (RDG:V) model all statements in the loop and the edges
50           (RDG:E) model flow and control dependencies between statements.
51      3) Apart from RDG, compute data dependencies between memory references.
52      4) Starting from seed statement, build up partition by adding depended
53           statements according to RDG's dependence information.  Partition is
54           classified as parallel type if it can be executed paralleled; or as
55           sequential type if it can't.  Parallel type partition is further
56           classified as different builtin kinds if it can be implemented as
57           builtin function calls.
58      5) Build partition dependence graph (PG) based on data dependencies.
59           The vertices (PG:V) model all partitions and the edges (PG:E) model
60           all data dependencies between every partitions pair.  In general,
61           data dependence is either compilation time known or unknown.  In C
62           family languages, there exists quite amount compilation time unknown
63           dependencies because of possible alias relation of data references.
64           We categorize PG's edge to two types: "true" edge that represents
65           compilation time known data dependencies; "alias" edge for all other
66           data dependencies.
67      6) Traverse subgraph of PG as if all "alias" edges don't exist.  Merge
68           partitions in each strong connected component (SCC) correspondingly.
69           Build new PG for merged partitions.
70      7) Traverse PG again and this time with both "true" and "alias" edges
71           included.  We try to break SCCs by removing some edges.  Because
72           SCCs by "true" edges are all fused in step 6), we can break SCCs
73           by removing some "alias" edges.  It's NP-hard to choose optimal
74           edge set, fortunately simple approximation is good enough for us
75           given the small problem scale.
76      8) Collect all data dependencies of the removed "alias" edges.  Create
77           runtime alias checks for collected data dependencies.
78      9) Version loop under the condition of runtime alias checks.  Given
79           loop distribution generally introduces additional overhead, it is
80           only useful if vectorization is achieved in distributed loop.  We
81           version loop with internal function call IFN_LOOP_DIST_ALIAS.  If
82           no distributed loop can be vectorized, we simply remove distributed
83           loops and recover to the original one.
84 
85    TODO:
86      1) We only distribute innermost two-level loop nest now.  We should
87           extend it for arbitrary loop nests in the future.
88      2) We only fuse partitions in SCC now.  A better fusion algorithm is
89           desired to minimize loop overhead, maximize parallelism and maximize
90           data reuse.  */
91 
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
113 #include "cfgloop.h"
114 #include "tree-scalar-evolution.h"
115 #include "tree-vectorizer.h"
116 #include "tree-eh.h"
117 #include "gimple-fold.h"
118 #include "tree-affine.h"
119 #include "intl.h"
120 #include "rtl.h"
121 #include "memmodel.h"
122 #include "optabs.h"
123 
124 
125 #define MAX_DATAREFS_NUM \
126           ((unsigned) param_loop_max_datarefs_for_datadeps)
127 
128 /* Threshold controlling number of distributed partitions.  Given it may
129    be unnecessary if a memory stream cost model is invented in the future,
130    we define it as a temporary macro, rather than a parameter.  */
131 #define NUM_PARTITION_THRESHOLD (4)
132 
133 /* Hashtable helpers.  */
134 
135 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
136 {
137   static inline hashval_t hash (const data_dependence_relation *);
138   static inline bool equal (const data_dependence_relation *,
139                                   const data_dependence_relation *);
140 };
141 
142 /* Hash function for data dependence.  */
143 
144 inline hashval_t
hash(const data_dependence_relation * ddr)145 ddr_hasher::hash (const data_dependence_relation *ddr)
146 {
147   inchash::hash h;
148   h.add_ptr (DDR_A (ddr));
149   h.add_ptr (DDR_B (ddr));
150   return h.end ();
151 }
152 
153 /* Hash table equality function for data dependence.  */
154 
155 inline bool
equal(const data_dependence_relation * ddr1,const data_dependence_relation * ddr2)156 ddr_hasher::equal (const data_dependence_relation *ddr1,
157                        const data_dependence_relation *ddr2)
158 {
159   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
160 }
161 
162 
163 
164 #define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)
165 
166 /* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
167 struct rdg_vertex
168 {
169   /* The statement represented by this vertex.  */
170   gimple *stmt;
171 
172   /* Vector of data-references in this statement.  */
173   vec<data_reference_p> datarefs;
174 
175   /* True when the statement contains a write to memory.  */
176   bool has_mem_write;
177 
178   /* True when the statement contains a read from memory.  */
179   bool has_mem_reads;
180 };
181 
182 #define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
183 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
184 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
185 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
186 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
187 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
188 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
189 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
190 
191 /* Data dependence type.  */
192 
193 enum rdg_dep_type
194 {
195   /* Read After Write (RAW).  */
196   flow_dd = 'f',
197 
198   /* Control dependence (execute conditional on).  */
199   control_dd = 'c'
200 };
201 
202 /* Dependence information attached to an edge of the RDG.  */
203 
204 struct rdg_edge
205 {
206   /* Type of the dependence.  */
207   enum rdg_dep_type type;
208 };
209 
210 #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
211 
212 /* Kind of distributed loop.  */
213 enum partition_kind {
214     PKIND_NORMAL,
215     /* Partial memset stands for a paritition can be distributed into a loop
216        of memset calls, rather than a single memset call.  It's handled just
217        like a normal parition, i.e, distributed as separate loop, no memset
218        call is generated.
219 
220        Note: This is a hacking fix trying to distribute ZERO-ing stmt in a
221        loop nest as deep as possible.  As a result, parloop achieves better
222        parallelization by parallelizing deeper loop nest.  This hack should
223        be unnecessary and removed once distributed memset can be understood
224        and analyzed in data reference analysis.  See PR82604 for more.  */
225     PKIND_PARTIAL_MEMSET,
226     PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
227 };
228 
229 /* Type of distributed loop.  */
230 enum partition_type {
231     /* The distributed loop can be executed parallelly.  */
232     PTYPE_PARALLEL = 0,
233     /* The distributed loop has to be executed sequentially.  */
234     PTYPE_SEQUENTIAL
235 };
236 
237 /* Builtin info for loop distribution.  */
238 struct builtin_info
239 {
240   /* data-references a kind != PKIND_NORMAL partition is about.  */
241   data_reference_p dst_dr;
242   data_reference_p src_dr;
243   /* Base address and size of memory objects operated by the builtin.  Note
244      both dest and source memory objects must have the same size.  */
245   tree dst_base;
246   tree src_base;
247   tree size;
248   /* Base and offset part of dst_base after stripping constant offset.  This
249      is only used in memset builtin distribution for now.  */
250   tree dst_base_base;
251   unsigned HOST_WIDE_INT dst_base_offset;
252 };
253 
254 /* Partition for loop distribution.  */
255 struct partition
256 {
257   /* Statements of the partition.  */
258   bitmap stmts;
259   /* True if the partition defines variable which is used outside of loop.  */
260   bool reduction_p;
261   location_t loc;
262   enum partition_kind kind;
263   enum partition_type type;
264   /* Data references in the partition.  */
265   bitmap datarefs;
266   /* Information of builtin parition.  */
267   struct builtin_info *builtin;
268 };
269 
270 /* Partitions are fused because of different reasons.  */
271 enum fuse_type
272 {
273   FUSE_NON_BUILTIN = 0,
274   FUSE_REDUCTION = 1,
275   FUSE_SHARE_REF = 2,
276   FUSE_SAME_SCC = 3,
277   FUSE_FINALIZE = 4
278 };
279 
280 /* Description on different fusing reason.  */
281 static const char *fuse_message[] = {
282   "they are non-builtins",
283   "they have reductions",
284   "they have shared memory refs",
285   "they are in the same dependence scc",
286   "there is no point to distribute loop"};
287 
288 
289 /* Dump vertex I in RDG to FILE.  */
290 
291 static void
dump_rdg_vertex(FILE * file,struct graph * rdg,int i)292 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
293 {
294   struct vertex *v = &(rdg->vertices[i]);
295   struct graph_edge *e;
296 
297   fprintf (file, "(vertex %d: (%s%s) (in:", i,
298              RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
299              RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
300 
301   if (v->pred)
302     for (e = v->pred; e; e = e->pred_next)
303       fprintf (file, " %d", e->src);
304 
305   fprintf (file, ") (out:");
306 
307   if (v->succ)
308     for (e = v->succ; e; e = e->succ_next)
309       fprintf (file, " %d", e->dest);
310 
311   fprintf (file, ")\n");
312   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
313   fprintf (file, ")\n");
314 }
315 
316 /* Call dump_rdg_vertex on stderr.  */
317 
318 DEBUG_FUNCTION void
debug_rdg_vertex(struct graph * rdg,int i)319 debug_rdg_vertex (struct graph *rdg, int i)
320 {
321   dump_rdg_vertex (stderr, rdg, i);
322 }
323 
324 /* Dump the reduced dependence graph RDG to FILE.  */
325 
326 static void
dump_rdg(FILE * file,struct graph * rdg)327 dump_rdg (FILE *file, struct graph *rdg)
328 {
329   fprintf (file, "(rdg\n");
330   for (int i = 0; i < rdg->n_vertices; i++)
331     dump_rdg_vertex (file, rdg, i);
332   fprintf (file, ")\n");
333 }
334 
335 /* Call dump_rdg on stderr.  */
336 
337 DEBUG_FUNCTION void
debug_rdg(struct graph * rdg)338 debug_rdg (struct graph *rdg)
339 {
340   dump_rdg (stderr, rdg);
341 }
342 
343 static void
dot_rdg_1(FILE * file,struct graph * rdg)344 dot_rdg_1 (FILE *file, struct graph *rdg)
345 {
346   int i;
347   pretty_printer buffer;
348   pp_needs_newline (&buffer) = false;
349   buffer.buffer->stream = file;
350 
351   fprintf (file, "digraph RDG {\n");
352 
353   for (i = 0; i < rdg->n_vertices; i++)
354     {
355       struct vertex *v = &(rdg->vertices[i]);
356       struct graph_edge *e;
357 
358       fprintf (file, "%d [label=\"[%d] ", i, i);
359       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
360       pp_flush (&buffer);
361       fprintf (file, "\"]\n");
362 
363       /* Highlight reads from memory.  */
364       if (RDG_MEM_READS_STMT (rdg, i))
365        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
366 
367       /* Highlight stores to memory.  */
368       if (RDG_MEM_WRITE_STMT (rdg, i))
369        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
370 
371       if (v->succ)
372        for (e = v->succ; e; e = e->succ_next)
373          switch (RDGE_TYPE (e))
374            {
375            case flow_dd:
376              /* These are the most common dependences: don't print these. */
377              fprintf (file, "%d -> %d \n", i, e->dest);
378              break;
379 
380              case control_dd:
381              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
382              break;
383 
384            default:
385              gcc_unreachable ();
386            }
387     }
388 
389   fprintf (file, "}\n\n");
390 }
391 
392 /* Display the Reduced Dependence Graph using dotty.  */
393 
394 DEBUG_FUNCTION void
dot_rdg(struct graph * rdg)395 dot_rdg (struct graph *rdg)
396 {
397   /* When debugging, you may want to enable the following code.  */
398 #ifdef HAVE_POPEN
399   FILE *file = popen ("dot -Tx11", "w");
400   if (!file)
401     return;
402   dot_rdg_1 (file, rdg);
403   fflush (file);
404   close (fileno (file));
405   pclose (file);
406 #else
407   dot_rdg_1 (stderr, rdg);
408 #endif
409 }
410 
411 /* Returns the index of STMT in RDG.  */
412 
413 static int
rdg_vertex_for_stmt(struct graph * rdg ATTRIBUTE_UNUSED,gimple * stmt)414 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
415 {
416   int index = gimple_uid (stmt);
417   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
418   return index;
419 }
420 
421 /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
422    the index of DEF in RDG.  */
423 
424 static void
create_rdg_edges_for_scalar(struct graph * rdg,tree def,int idef)425 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
426 {
427   use_operand_p imm_use_p;
428   imm_use_iterator iterator;
429 
430   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
431     {
432       struct graph_edge *e;
433       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
434 
435       if (use < 0)
436           continue;
437 
438       e = add_edge (rdg, idef, use);
439       e->data = XNEW (struct rdg_edge);
440       RDGE_TYPE (e) = flow_dd;
441     }
442 }
443 
444 /* Creates an edge for the control dependences of BB to the vertex V.  */
445 
446 static void
create_edge_for_control_dependence(struct graph * rdg,basic_block bb,int v,control_dependences * cd)447 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
448                                             int v, control_dependences *cd)
449 {
450   bitmap_iterator bi;
451   unsigned edge_n;
452   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
453                                   0, edge_n, bi)
454     {
455       basic_block cond_bb = cd->get_edge_src (edge_n);
456       gimple *stmt = last_stmt (cond_bb);
457       if (stmt && is_ctrl_stmt (stmt))
458           {
459             struct graph_edge *e;
460             int c = rdg_vertex_for_stmt (rdg, stmt);
461             if (c < 0)
462               continue;
463 
464             e = add_edge (rdg, c, v);
465             e->data = XNEW (struct rdg_edge);
466             RDGE_TYPE (e) = control_dd;
467           }
468     }
469 }
470 
471 /* Creates the edges of the reduced dependence graph RDG.  */
472 
473 static void
create_rdg_flow_edges(struct graph * rdg)474 create_rdg_flow_edges (struct graph *rdg)
475 {
476   int i;
477   def_operand_p def_p;
478   ssa_op_iter iter;
479 
480   for (i = 0; i < rdg->n_vertices; i++)
481     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
482                                     iter, SSA_OP_DEF)
483       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
484 }
485 
486 /* Creates the edges of the reduced dependence graph RDG.  */
487 
488 static void
create_rdg_cd_edges(struct graph * rdg,control_dependences * cd,loop_p loop)489 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
490 {
491   int i;
492 
493   for (i = 0; i < rdg->n_vertices; i++)
494     {
495       gimple *stmt = RDG_STMT (rdg, i);
496       if (gimple_code (stmt) == GIMPLE_PHI)
497           {
498             edge_iterator ei;
499             edge e;
500             FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
501               if (flow_bb_inside_loop_p (loop, e->src))
502                 create_edge_for_control_dependence (rdg, e->src, i, cd);
503           }
504       else
505           create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
506     }
507 }
508 
509 
510 class loop_distribution
511 {
512   private:
513   /* The loop (nest) to be distributed.  */
514   vec<loop_p> loop_nest;
515 
516   /* Vector of data references in the loop to be distributed.  */
517   vec<data_reference_p> datarefs_vec;
518 
519   /* If there is nonaddressable data reference in above vector.  */
520   bool has_nonaddressable_dataref_p;
521 
522   /* Store index of data reference in aux field.  */
523 
524   /* Hash table for data dependence relation in the loop to be distributed.  */
525   hash_table<ddr_hasher> *ddrs_table;
526 
527   /* Array mapping basic block's index to its topological order.  */
528   int *bb_top_order_index;
529   /* And size of the array.  */
530   int bb_top_order_index_size;
531 
532   /* Build the vertices of the reduced dependence graph RDG.  Return false
533      if that failed.  */
534   bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
535                                   loop_p loop);
536 
537   /* Initialize STMTS with all the statements of LOOP.  We use topological
538      order to discover all statements.  The order is important because
539      generate_loops_for_partition is using the same traversal for identifying
540      statements in loop copies.  */
541   void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
542 
543 
544   /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
545      LOOP, and one edge per flow dependence or control dependence from control
546      dependence CD.  During visiting each statement, data references are also
547      collected and recorded in global data DATAREFS_VEC.  */
548   struct graph * build_rdg (class loop *loop, control_dependences *cd);
549 
550 /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
551    graph and we update type for result partition if it is non-NULL.  */
552   void partition_merge_into (struct graph *rdg,
553                                    partition *dest, partition *partition,
554                                    enum fuse_type ft);
555 
556 
557   /* Return data dependence relation for data references A and B.  The two
558      data references must be in lexicographic order wrto reduced dependence
559      graph RDG.  We firstly try to find ddr from global ddr hash table.  If
560      it doesn't exist, compute the ddr and cache it.  */
561   data_dependence_relation * get_data_dependence (struct graph *rdg,
562                                                               data_reference_p a,
563                                                               data_reference_p b);
564 
565 
566   /* In reduced dependence graph RDG for loop distribution, return true if
567      dependence between references DR1 and DR2 leads to a dependence cycle
568      and such dependence cycle can't be resolved by runtime alias check.  */
569   bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
570                                   data_reference_p dr2);
571 
572 
573   /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
574      PARTITION1's type after merging PARTITION2 into PARTITION1.  */
575   void update_type_for_merge (struct graph *rdg,
576                                     partition *partition1, partition *partition2);
577 
578 
579   /* Returns a partition with all the statements needed for computing
580      the vertex V of the RDG, also including the loop exit conditions.  */
581   partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
582 
583   /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
584      if it forms builtin memcpy or memmove call.  */
585   void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
586                                     data_reference_p dst_dr, data_reference_p src_dr);
587 
588   /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
589      For the moment we detect memset, memcpy and memmove patterns.  Bitmap
590      STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
591      Returns true if there is a reduction in all partitions and we
592      possibly did not mark PARTITION as having one for this reason.  */
593 
594   bool
595   classify_partition (loop_p loop,
596                           struct graph *rdg, partition *partition,
597                           bitmap stmt_in_all_partitions);
598 
599 
600   /* Returns true when PARTITION1 and PARTITION2 access the same memory
601      object in RDG.  */
602   bool share_memory_accesses (struct graph *rdg,
603                                     partition *partition1, partition *partition2);
604 
605   /* For each seed statement in STARTING_STMTS, this function builds
606      partition for it by adding depended statements according to RDG.
607      All partitions are recorded in PARTITIONS.  */
608   void rdg_build_partitions (struct graph *rdg,
609                                    vec<gimple *> starting_stmts,
610                                    vec<partition *> *partitions);
611 
612   /* Compute partition dependence created by the data references in DRS1
613      and DRS2, modify and return DIR according to that.  IF ALIAS_DDR is
614      not NULL, we record dependence introduced by possible alias between
615      two data references in ALIAS_DDRS; otherwise, we simply ignore such
616      dependence as if it doesn't exist at all.  */
617   int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
618                                      bitmap drs2, vec<ddr_p> *alias_ddrs);
619 
620 
621   /* Build and return partition dependence graph for PARTITIONS.  RDG is
622      reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
623      is true, data dependence caused by possible alias between references
624      is ignored, as if it doesn't exist at all; otherwise all depdendences
625      are considered.  */
626   struct graph *build_partition_graph (struct graph *rdg,
627                                                vec<struct partition *> *partitions,
628                                                bool ignore_alias_p);
629 
630   /* Given reduced dependence graph RDG merge strong connected components
631      of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
632      possible alias between references is ignored, as if it doesn't exist
633      at all; otherwise all depdendences are considered.  */
634   void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
635                                          *partitions, bool ignore_alias_p);
636 
637 /* This is the main function breaking strong conected components in
638    PARTITIONS giving reduced depdendence graph RDG.  Store data dependence
639    relations for runtime alias check in ALIAS_DDRS.  */
640   void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
641                                            *partitions, vec<ddr_p> *alias_ddrs);
642 
643 
644   /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
645      ALIAS_DDRS contains ddrs which need runtime alias check.  */
646   void finalize_partitions (class loop *loop, vec<struct partition *>
647                                   *partitions, vec<ddr_p> *alias_ddrs);
648 
649   /* Distributes the code from LOOP in such a way that producer statements
650      are placed before consumer statements.  Tries to separate only the
651      statements from STMTS into separate loops.  Returns the number of
652      distributed loops.  Set NB_CALLS to number of generated builtin calls.
653      Set *DESTROY_P to whether LOOP needs to be destroyed.  */
654   int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
655                            control_dependences *cd, int *nb_calls, bool *destroy_p,
656                            bool only_patterns_p);
657 
658   /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
659      replace them accordingly.  */
660   bool transform_reduction_loop (loop_p loop);
661 
662   /* Compute topological order for basic blocks.  Topological order is
663      needed because data dependence is computed for data references in
664      lexicographical order.  */
665   void bb_top_order_init (void);
666 
667   void bb_top_order_destroy (void);
668 
669   public:
670 
671   /* Getter for bb_top_order.  */
672 
get_bb_top_order_index_size(void)673   inline int get_bb_top_order_index_size (void)
674     {
675       return bb_top_order_index_size;
676     }
677 
get_bb_top_order_index(int i)678   inline int get_bb_top_order_index (int i)
679     {
680       return bb_top_order_index[i];
681     }
682 
683   unsigned int execute (function *fun);
684 };
685 
686 
687 /* If X has a smaller topological sort number than Y, returns -1;
688    if greater, returns 1.  */
689 static int
bb_top_order_cmp_r(const void * x,const void * y,void * loop)690 bb_top_order_cmp_r (const void *x, const void *y, void *loop)
691 {
692   loop_distribution *_loop =
693     (loop_distribution *) loop;
694 
695   basic_block bb1 = *(const basic_block *) x;
696   basic_block bb2 = *(const basic_block *) y;
697 
698   int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
699 
700   gcc_assert (bb1->index < bb_top_order_index_size
701                 && bb2->index < bb_top_order_index_size);
702   gcc_assert (bb1 == bb2
703                 || _loop->get_bb_top_order_index(bb1->index)
704                      != _loop->get_bb_top_order_index(bb2->index));
705 
706   return (_loop->get_bb_top_order_index(bb1->index) -
707             _loop->get_bb_top_order_index(bb2->index));
708 }
709 
710 bool
create_rdg_vertices(struct graph * rdg,const vec<gimple * > & stmts,loop_p loop)711 loop_distribution::create_rdg_vertices (struct graph *rdg,
712                                                   const vec<gimple *> &stmts,
713                                                   loop_p loop)
714 {
715   int i;
716   gimple *stmt;
717 
718   FOR_EACH_VEC_ELT (stmts, i, stmt)
719     {
720       struct vertex *v = &(rdg->vertices[i]);
721 
722       /* Record statement to vertex mapping.  */
723       gimple_set_uid (stmt, i);
724 
725       v->data = XNEW (struct rdg_vertex);
726       RDGV_STMT (v) = stmt;
727       RDGV_DATAREFS (v).create (0);
728       RDGV_HAS_MEM_WRITE (v) = false;
729       RDGV_HAS_MEM_READS (v) = false;
730       if (gimple_code (stmt) == GIMPLE_PHI)
731           continue;
732 
733       unsigned drp = datarefs_vec.length ();
734       if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
735           return false;
736       for (unsigned j = drp; j < datarefs_vec.length (); ++j)
737           {
738             data_reference_p dr = datarefs_vec[j];
739             if (DR_IS_READ (dr))
740               RDGV_HAS_MEM_READS (v) = true;
741             else
742               RDGV_HAS_MEM_WRITE (v) = true;
743             RDGV_DATAREFS (v).safe_push (dr);
744             has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
745           }
746     }
747   return true;
748 }
749 
750 void
stmts_from_loop(class loop * loop,vec<gimple * > * stmts)751 loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
752 {
753   unsigned int i;
754   basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
755 
756   for (i = 0; i < loop->num_nodes; i++)
757     {
758       basic_block bb = bbs[i];
759 
760       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
761              gsi_next (&bsi))
762           if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
763             stmts->safe_push (bsi.phi ());
764 
765       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
766              gsi_next (&bsi))
767           {
768             gimple *stmt = gsi_stmt (bsi);
769             if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
770               stmts->safe_push (stmt);
771           }
772     }
773 
774   free (bbs);
775 }
776 
777 /* Free the reduced dependence graph RDG.  */
778 
779 static void
free_rdg(struct graph * rdg)780 free_rdg (struct graph *rdg)
781 {
782   int i;
783 
784   for (i = 0; i < rdg->n_vertices; i++)
785     {
786       struct vertex *v = &(rdg->vertices[i]);
787       struct graph_edge *e;
788 
789       for (e = v->succ; e; e = e->succ_next)
790           free (e->data);
791 
792       if (v->data)
793           {
794             gimple_set_uid (RDGV_STMT (v), -1);
795             (RDGV_DATAREFS (v)).release ();
796             free (v->data);
797           }
798     }
799 
800   free_graph (rdg);
801 }
802 
803 struct graph *
build_rdg(class loop * loop,control_dependences * cd)804 loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
805 {
806   struct graph *rdg;
807 
808   /* Create the RDG vertices from the stmts of the loop nest.  */
809   auto_vec<gimple *, 10> stmts;
810   stmts_from_loop (loop, &stmts);
811   rdg = new_graph (stmts.length ());
812   if (!create_rdg_vertices (rdg, stmts, loop))
813     {
814       free_rdg (rdg);
815       return NULL;
816     }
817   stmts.release ();
818 
819   create_rdg_flow_edges (rdg);
820   if (cd)
821     create_rdg_cd_edges (rdg, cd, loop);
822 
823   return rdg;
824 }
825 
826 
827 /* Allocate and initialize a partition from BITMAP.  */
828 
829 static partition *
partition_alloc(void)830 partition_alloc (void)
831 {
832   partition *partition = XCNEW (struct partition);
833   partition->stmts = BITMAP_ALLOC (NULL);
834   partition->reduction_p = false;
835   partition->loc = UNKNOWN_LOCATION;
836   partition->kind = PKIND_NORMAL;
837   partition->type = PTYPE_PARALLEL;
838   partition->datarefs = BITMAP_ALLOC (NULL);
839   return partition;
840 }
841 
842 /* Free PARTITION.  */
843 
844 static void
partition_free(partition * partition)845 partition_free (partition *partition)
846 {
847   BITMAP_FREE (partition->stmts);
848   BITMAP_FREE (partition->datarefs);
849   if (partition->builtin)
850     free (partition->builtin);
851 
852   free (partition);
853 }
854 
855 /* Returns true if the partition can be generated as a builtin.  */
856 
857 static bool
partition_builtin_p(partition * partition)858 partition_builtin_p (partition *partition)
859 {
860   return partition->kind > PKIND_PARTIAL_MEMSET;
861 }
862 
863 /* Returns true if the partition contains a reduction.  */
864 
865 static bool
partition_reduction_p(partition * partition)866 partition_reduction_p (partition *partition)
867 {
868   return partition->reduction_p;
869 }
870 
871 void
partition_merge_into(struct graph * rdg,partition * dest,partition * partition,enum fuse_type ft)872 loop_distribution::partition_merge_into (struct graph *rdg,
873                           partition *dest, partition *partition, enum fuse_type ft)
874 {
875   if (dump_file && (dump_flags & TDF_DETAILS))
876     {
877       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
878       fprintf (dump_file, "  Part 1: ");
879       dump_bitmap (dump_file, dest->stmts);
880       fprintf (dump_file, "  Part 2: ");
881       dump_bitmap (dump_file, partition->stmts);
882     }
883 
884   dest->kind = PKIND_NORMAL;
885   if (dest->type == PTYPE_PARALLEL)
886     dest->type = partition->type;
887 
888   bitmap_ior_into (dest->stmts, partition->stmts);
889   if (partition_reduction_p (partition))
890     dest->reduction_p = true;
891 
892   /* Further check if any data dependence prevents us from executing the
893      new partition parallelly.  */
894   if (dest->type == PTYPE_PARALLEL && rdg != NULL)
895     update_type_for_merge (rdg, dest, partition);
896 
897   bitmap_ior_into (dest->datarefs, partition->datarefs);
898 }
899 
900 
901 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
902    the LOOP.  */
903 
904 static bool
ssa_name_has_uses_outside_loop_p(tree def,loop_p loop)905 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
906 {
907   imm_use_iterator imm_iter;
908   use_operand_p use_p;
909 
910   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
911     {
912       if (is_gimple_debug (USE_STMT (use_p)))
913           continue;
914 
915       basic_block use_bb = gimple_bb (USE_STMT (use_p));
916       if (!flow_bb_inside_loop_p (loop, use_bb))
917           return true;
918     }
919 
920   return false;
921 }
922 
923 /* Returns true when STMT defines a scalar variable used after the
924    loop LOOP.  */
925 
926 static bool
stmt_has_scalar_dependences_outside_loop(loop_p loop,gimple * stmt)927 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
928 {
929   def_operand_p def_p;
930   ssa_op_iter op_iter;
931 
932   if (gimple_code (stmt) == GIMPLE_PHI)
933     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
934 
935   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
936     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
937       return true;
938 
939   return false;
940 }
941 
942 /* Return a copy of LOOP placed before LOOP.  */
943 
944 static class loop *
copy_loop_before(class loop * loop)945 copy_loop_before (class loop *loop)
946 {
947   class loop *res;
948   edge preheader = loop_preheader_edge (loop);
949 
950   initialize_original_copy_tables ();
951   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
952   gcc_assert (res != NULL);
953   free_original_copy_tables ();
954   delete_update_ssa ();
955 
956   return res;
957 }
958 
959 /* Creates an empty basic block after LOOP.  */
960 
961 static void
create_bb_after_loop(class loop * loop)962 create_bb_after_loop (class loop *loop)
963 {
964   edge exit = single_exit (loop);
965 
966   if (!exit)
967     return;
968 
969   split_edge (exit);
970 }
971 
972 /* Generate code for PARTITION from the code in LOOP.  The loop is
973    copied when COPY_P is true.  All the statements not flagged in the
974    PARTITION bitmap are removed from the loop or from its copy.  The
975    statements are indexed in sequence inside a basic block, and the
976    basic blocks of a loop are taken in dom order.  */
977 
978 static void
generate_loops_for_partition(class loop * loop,partition * partition,bool copy_p)979 generate_loops_for_partition (class loop *loop, partition *partition,
980                                     bool copy_p)
981 {
982   unsigned i;
983   basic_block *bbs;
984 
985   if (copy_p)
986     {
987       int orig_loop_num = loop->orig_loop_num;
988       loop = copy_loop_before (loop);
989       gcc_assert (loop != NULL);
990       loop->orig_loop_num = orig_loop_num;
991       create_preheader (loop, CP_SIMPLE_PREHEADERS);
992       create_bb_after_loop (loop);
993     }
994   else
995     {
996       /* Origin number is set to the new versioned loop's num.  */
997       gcc_assert (loop->orig_loop_num != loop->num);
998     }
999 
1000   /* Remove stmts not in the PARTITION bitmap.  */
1001   bbs = get_loop_body_in_dom_order (loop);
1002 
1003   if (MAY_HAVE_DEBUG_BIND_STMTS)
1004     for (i = 0; i < loop->num_nodes; i++)
1005       {
1006           basic_block bb = bbs[i];
1007 
1008           for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
1009                gsi_next (&bsi))
1010             {
1011               gphi *phi = bsi.phi ();
1012               if (!virtual_operand_p (gimple_phi_result (phi))
1013                     && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1014                 reset_debug_uses (phi);
1015             }
1016 
1017           for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
1018             {
1019               gimple *stmt = gsi_stmt (bsi);
1020               if (gimple_code (stmt) != GIMPLE_LABEL
1021                     && !is_gimple_debug (stmt)
1022                     && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1023                 reset_debug_uses (stmt);
1024             }
1025       }
1026 
1027   for (i = 0; i < loop->num_nodes; i++)
1028     {
1029       basic_block bb = bbs[i];
1030       edge inner_exit = NULL;
1031 
1032       if (loop != bb->loop_father)
1033           inner_exit = single_exit (bb->loop_father);
1034 
1035       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
1036           {
1037             gphi *phi = bsi.phi ();
1038             if (!virtual_operand_p (gimple_phi_result (phi))
1039                 && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1040               remove_phi_node (&bsi, true);
1041             else
1042               gsi_next (&bsi);
1043           }
1044 
1045       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
1046           {
1047             gimple *stmt = gsi_stmt (bsi);
1048             if (gimple_code (stmt) != GIMPLE_LABEL
1049                 && !is_gimple_debug (stmt)
1050                 && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1051               {
1052                 /* In distribution of loop nest, if bb is inner loop's exit_bb,
1053                      we choose its exit edge/path in order to avoid generating
1054                      infinite loop.  For all other cases, we choose an arbitrary
1055                      path through the empty CFG part that this unnecessary
1056                      control stmt controls.  */
1057                 if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
1058                     {
1059                       if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
1060                         gimple_cond_make_true (cond_stmt);
1061                       else
1062                         gimple_cond_make_false (cond_stmt);
1063                       update_stmt (stmt);
1064                     }
1065                 else if (gimple_code (stmt) == GIMPLE_SWITCH)
1066                     {
1067                       gswitch *switch_stmt = as_a <gswitch *> (stmt);
1068                       gimple_switch_set_index
1069                           (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
1070                       update_stmt (stmt);
1071                     }
1072                 else
1073                     {
1074                       unlink_stmt_vdef (stmt);
1075                       gsi_remove (&bsi, true);
1076                       release_defs (stmt);
1077                       continue;
1078                     }
1079               }
1080             gsi_next (&bsi);
1081           }
1082     }
1083 
1084   free (bbs);
1085 }
1086 
1087 /* If VAL memory representation contains the same value in all bytes,
1088    return that value, otherwise return -1.
1089    E.g. for 0x24242424 return 0x24, for IEEE double
1090    747708026454360457216.0 return 0x44, etc.  */
1091 
1092 static int
const_with_all_bytes_same(tree val)1093 const_with_all_bytes_same (tree val)
1094 {
1095   unsigned char buf[64];
1096   int i, len;
1097 
1098   if (integer_zerop (val)
1099       || (TREE_CODE (val) == CONSTRUCTOR
1100           && !TREE_CLOBBER_P (val)
1101           && CONSTRUCTOR_NELTS (val) == 0))
1102     return 0;
1103 
1104   if (real_zerop (val))
1105     {
1106       /* Only return 0 for +0.0, not for -0.0, which doesn't have
1107            an all bytes same memory representation.  Don't transform
1108            -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
1109       switch (TREE_CODE (val))
1110           {
1111           case REAL_CST:
1112             if (!real_isneg (TREE_REAL_CST_PTR (val)))
1113               return 0;
1114             break;
1115           case COMPLEX_CST:
1116             if (!const_with_all_bytes_same (TREE_REALPART (val))
1117                 && !const_with_all_bytes_same (TREE_IMAGPART (val)))
1118               return 0;
1119             break;
1120           case VECTOR_CST:
1121             {
1122               unsigned int count = vector_cst_encoded_nelts (val);
1123               unsigned int j;
1124               for (j = 0; j < count; ++j)
1125                 if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
1126                     break;
1127               if (j == count)
1128                 return 0;
1129               break;
1130             }
1131           default:
1132             break;
1133           }
1134     }
1135 
1136   if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
1137     return -1;
1138 
1139   len = native_encode_expr (val, buf, sizeof (buf));
1140   if (len == 0)
1141     return -1;
1142   for (i = 1; i < len; i++)
1143     if (buf[i] != buf[0])
1144       return -1;
1145   return buf[0];
1146 }
1147 
1148 /* Generate a call to memset for PARTITION in LOOP.  */
1149 
1150 static void
generate_memset_builtin(class loop * loop,partition * partition)1151 generate_memset_builtin (class loop *loop, partition *partition)
1152 {
1153   gimple_stmt_iterator gsi;
1154   tree mem, fn, nb_bytes;
1155   tree val;
1156   struct builtin_info *builtin = partition->builtin;
1157   gimple *fn_call;
1158 
1159   /* The new statements will be placed before LOOP.  */
1160   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1161 
1162   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1163   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1164                                                false, GSI_CONTINUE_LINKING);
1165   mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
1166   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1167                                           false, GSI_CONTINUE_LINKING);
1168 
1169   /* This exactly matches the pattern recognition in classify_partition.  */
1170   val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1171   /* Handle constants like 0x15151515 and similarly
1172      floating point constants etc. where all bytes are the same.  */
1173   int bytev = const_with_all_bytes_same (val);
1174   if (bytev != -1)
1175     val = build_int_cst (integer_type_node, bytev);
1176   else if (TREE_CODE (val) == INTEGER_CST)
1177     val = fold_convert (integer_type_node, val);
1178   else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1179     {
1180       tree tem = make_ssa_name (integer_type_node);
1181       gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1182       gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1183       val = tem;
1184     }
1185 
1186   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1187   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1188   gimple_set_location (fn_call, partition->loc);
1189   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1190   fold_stmt (&gsi);
1191 
1192   if (dump_file && (dump_flags & TDF_DETAILS))
1193     {
1194       fprintf (dump_file, "generated memset");
1195       if (bytev == 0)
1196           fprintf (dump_file, " zero\n");
1197       else
1198           fprintf (dump_file, "\n");
1199     }
1200 }
1201 
1202 /* Generate a call to memcpy for PARTITION in LOOP.  */
1203 
1204 static void
generate_memcpy_builtin(class loop * loop,partition * partition)1205 generate_memcpy_builtin (class loop *loop, partition *partition)
1206 {
1207   gimple_stmt_iterator gsi;
1208   gimple *fn_call;
1209   tree dest, src, fn, nb_bytes;
1210   enum built_in_function kind;
1211   struct builtin_info *builtin = partition->builtin;
1212 
1213   /* The new statements will be placed before LOOP.  */
1214   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1215 
1216   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1217   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1218                                                false, GSI_CONTINUE_LINKING);
1219   dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
1220   src = rewrite_to_non_trapping_overflow (builtin->src_base);
1221   if (partition->kind == PKIND_MEMCPY
1222       || ! ptr_derefs_may_alias_p (dest, src))
1223     kind = BUILT_IN_MEMCPY;
1224   else
1225     kind = BUILT_IN_MEMMOVE;
1226   /* Try harder if we're copying a constant size.  */
1227   if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
1228     {
1229       aff_tree asrc, adest;
1230       tree_to_aff_combination (src, ptr_type_node, &asrc);
1231       tree_to_aff_combination (dest, ptr_type_node, &adest);
1232       aff_combination_scale (&adest, -1);
1233       aff_combination_add (&asrc, &adest);
1234       if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
1235                                              wi::to_poly_widest (nb_bytes)))
1236           kind = BUILT_IN_MEMCPY;
1237     }
1238 
1239   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1240                                            false, GSI_CONTINUE_LINKING);
1241   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1242                                           false, GSI_CONTINUE_LINKING);
1243   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1244   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1245   gimple_set_location (fn_call, partition->loc);
1246   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1247   fold_stmt (&gsi);
1248 
1249   if (dump_file && (dump_flags & TDF_DETAILS))
1250     {
1251       if (kind == BUILT_IN_MEMCPY)
1252           fprintf (dump_file, "generated memcpy\n");
1253       else
1254           fprintf (dump_file, "generated memmove\n");
1255     }
1256 }
1257 
1258 /* Remove and destroy the loop LOOP.  */
1259 
1260 static void
destroy_loop(class loop * loop)1261 destroy_loop (class loop *loop)
1262 {
1263   unsigned nbbs = loop->num_nodes;
1264   edge exit = single_exit (loop);
1265   basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1266   basic_block *bbs;
1267   unsigned i;
1268 
1269   bbs = get_loop_body_in_dom_order (loop);
1270 
1271   gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1272   bool safe_p = single_pred_p (exit->dest);
1273   for (unsigned i = 0; i < nbbs; ++i)
1274     {
1275       /* We have made sure to not leave any dangling uses of SSA
1276          names defined in the loop.  With the exception of virtuals.
1277            Make sure we replace all uses of virtual defs that will remain
1278            outside of the loop with the bare symbol as delete_basic_block
1279            will release them.  */
1280       for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1281              gsi_next (&gsi))
1282           {
1283             gphi *phi = gsi.phi ();
1284             if (virtual_operand_p (gimple_phi_result (phi)))
1285               mark_virtual_phi_result_for_renaming (phi);
1286           }
1287       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1288           {
1289             gimple *stmt = gsi_stmt (gsi);
1290             tree vdef = gimple_vdef (stmt);
1291             if (vdef && TREE_CODE (vdef) == SSA_NAME)
1292               mark_virtual_operand_for_renaming (vdef);
1293             /* Also move and eventually reset debug stmts.  We can leave
1294                constant values in place in case the stmt dominates the exit.
1295                ???  Non-constant values from the last iteration can be
1296                replaced with final values if we can compute them.  */
1297             if (gimple_debug_bind_p (stmt))
1298               {
1299                 tree val = gimple_debug_bind_get_value (stmt);
1300                 gsi_move_before (&gsi, &dst_gsi);
1301                 if (val
1302                       && (!safe_p
1303                           || !is_gimple_min_invariant (val)
1304                           || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1305                     {
1306                       gimple_debug_bind_reset_value (stmt);
1307                       update_stmt (stmt);
1308                     }
1309               }
1310             else
1311               gsi_next (&gsi);
1312           }
1313     }
1314 
1315   redirect_edge_pred (exit, src);
1316   exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1317   exit->flags |= EDGE_FALLTHRU;
1318   cancel_loop_tree (loop);
1319   rescan_loop_exit (exit, false, true);
1320 
1321   i = nbbs;
1322   do
1323     {
1324       --i;
1325       delete_basic_block (bbs[i]);
1326     }
1327   while (i != 0);
1328 
1329   free (bbs);
1330 
1331   set_immediate_dominator (CDI_DOMINATORS, dest,
1332                                  recompute_dominator (CDI_DOMINATORS, dest));
1333 }
1334 
1335 /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1336 
1337 static bool
generate_code_for_partition(class loop * loop,partition * partition,bool copy_p)1338 generate_code_for_partition (class loop *loop,
1339                                    partition *partition, bool copy_p)
1340 {
1341   switch (partition->kind)
1342     {
1343     case PKIND_NORMAL:
1344     case PKIND_PARTIAL_MEMSET:
1345       /* Reductions all have to be in the last partition.  */
1346       gcc_assert (!partition_reduction_p (partition)
1347                       || !copy_p);
1348       generate_loops_for_partition (loop, partition, copy_p);
1349       return false;
1350 
1351     case PKIND_MEMSET:
1352       generate_memset_builtin (loop, partition);
1353       break;
1354 
1355     case PKIND_MEMCPY:
1356     case PKIND_MEMMOVE:
1357       generate_memcpy_builtin (loop, partition);
1358       break;
1359 
1360     default:
1361       gcc_unreachable ();
1362     }
1363 
1364   /* Common tail for partitions we turn into a call.  If this was the last
1365      partition for which we generate code, we have to destroy the loop.  */
1366   if (!copy_p)
1367     return true;
1368   return false;
1369 }
1370 
1371 data_dependence_relation *
get_data_dependence(struct graph * rdg,data_reference_p a,data_reference_p b)1372 loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
1373                                                   data_reference_p b)
1374 {
1375   struct data_dependence_relation ent, **slot;
1376   struct data_dependence_relation *ddr;
1377 
1378   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1379   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1380                 <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1381   ent.a = a;
1382   ent.b = b;
1383   slot = ddrs_table->find_slot (&ent, INSERT);
1384   if (*slot == NULL)
1385     {
1386       ddr = initialize_data_dependence_relation (a, b, loop_nest);
1387       compute_affine_dependence (ddr, loop_nest[0]);
1388       *slot = ddr;
1389     }
1390 
1391   return *slot;
1392 }
1393 
1394 bool
data_dep_in_cycle_p(struct graph * rdg,data_reference_p dr1,data_reference_p dr2)1395 loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
1396                                                   data_reference_p dr1,
1397                                                   data_reference_p dr2)
1398 {
1399   struct data_dependence_relation *ddr;
1400 
1401   /* Re-shuffle data-refs to be in topological order.  */
1402   if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1403       > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1404     std::swap (dr1, dr2);
1405 
1406   ddr = get_data_dependence (rdg, dr1, dr2);
1407 
1408   /* In case of no data dependence.  */
1409   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1410     return false;
1411   /* For unknown data dependence or known data dependence which can't be
1412      expressed in classic distance vector, we check if it can be resolved
1413      by runtime alias check.  If yes, we still consider data dependence
1414      as won't introduce data dependence cycle.  */
1415   else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1416              || DDR_NUM_DIST_VECTS (ddr) == 0)
1417     return !runtime_alias_check_p (ddr, NULL, true);
1418   else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1419     return true;
1420   else if (DDR_REVERSED_P (ddr)
1421              || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1422     return false;
1423 
1424   return true;
1425 }
1426 
1427 void
update_type_for_merge(struct graph * rdg,partition * partition1,partition * partition2)1428 loop_distribution::update_type_for_merge (struct graph *rdg,
1429                                                      partition *partition1,
1430                                                      partition *partition2)
1431 {
1432   unsigned i, j;
1433   bitmap_iterator bi, bj;
1434   data_reference_p dr1, dr2;
1435 
1436   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1437     {
1438       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1439 
1440       dr1 = datarefs_vec[i];
1441       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1442           {
1443             dr2 = datarefs_vec[j];
1444             if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1445               continue;
1446 
1447             /* Partition can only be executed sequentially if there is any
1448                data dependence cycle.  */
1449             if (data_dep_in_cycle_p (rdg, dr1, dr2))
1450               {
1451                 partition1->type = PTYPE_SEQUENTIAL;
1452                 return;
1453               }
1454           }
1455     }
1456 }
1457 
1458 partition *
build_rdg_partition_for_vertex(struct graph * rdg,int v)1459 loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
1460 {
1461   partition *partition = partition_alloc ();
1462   auto_vec<int, 3> nodes;
1463   unsigned i, j;
1464   int x;
1465   data_reference_p dr;
1466 
1467   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1468 
1469   FOR_EACH_VEC_ELT (nodes, i, x)
1470     {
1471       bitmap_set_bit (partition->stmts, x);
1472 
1473       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1474           {
1475             unsigned idx = (unsigned) DR_INDEX (dr);
1476             gcc_assert (idx < datarefs_vec.length ());
1477 
1478             /* Partition can only be executed sequentially if there is any
1479                unknown data reference.  */
1480             if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1481                 || !DR_INIT (dr) || !DR_STEP (dr))
1482               partition->type = PTYPE_SEQUENTIAL;
1483 
1484             bitmap_set_bit (partition->datarefs, idx);
1485           }
1486     }
1487 
1488   if (partition->type == PTYPE_SEQUENTIAL)
1489     return partition;
1490 
1491   /* Further check if any data dependence prevents us from executing the
1492      partition parallelly.  */
1493   update_type_for_merge (rdg, partition, partition);
1494 
1495   return partition;
1496 }
1497 
1498 /* Given PARTITION of LOOP and RDG, record single load/store data references
1499    for builtin partition in SRC_DR/DST_DR, return false if there is no such
1500    data references.  */
1501 
1502 static bool
find_single_drs(class loop * loop,struct graph * rdg,const bitmap & partition_stmts,data_reference_p * dst_dr,data_reference_p * src_dr)1503 find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
1504                      data_reference_p *dst_dr, data_reference_p *src_dr)
1505 {
1506   unsigned i;
1507   data_reference_p single_ld = NULL, single_st = NULL;
1508   bitmap_iterator bi;
1509 
1510   EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
1511     {
1512       gimple *stmt = RDG_STMT (rdg, i);
1513       data_reference_p dr;
1514 
1515       if (gimple_code (stmt) == GIMPLE_PHI)
1516           continue;
1517 
1518       /* Any scalar stmts are ok.  */
1519       if (!gimple_vuse (stmt))
1520           continue;
1521 
1522       /* Otherwise just regular loads/stores.  */
1523       if (!gimple_assign_single_p (stmt))
1524           return false;
1525 
1526       /* But exactly one store and/or load.  */
1527       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1528           {
1529             tree type = TREE_TYPE (DR_REF (dr));
1530 
1531             /* The memset, memcpy and memmove library calls are only
1532                able to deal with generic address space.  */
1533             if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1534               return false;
1535 
1536             if (DR_IS_READ (dr))
1537               {
1538                 if (single_ld != NULL)
1539                     return false;
1540                 single_ld = dr;
1541               }
1542             else
1543               {
1544                 if (single_st != NULL)
1545                     return false;
1546                 single_st = dr;
1547               }
1548           }
1549     }
1550 
1551   if (!single_ld && !single_st)
1552     return false;
1553 
1554   basic_block bb_ld = NULL;
1555   basic_block bb_st = NULL;
1556 
1557   if (single_ld)
1558     {
1559       /* Bail out if this is a bitfield memory reference.  */
1560       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1561             && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1562           return false;
1563 
1564       /* Data reference must be executed exactly once per iteration of each
1565            loop in the loop nest.  We only need to check dominance information
1566            against the outermost one in a perfect loop nest because a bb can't
1567            dominate outermost loop's latch without dominating inner loop's.  */
1568       bb_ld = gimple_bb (DR_STMT (single_ld));
1569       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1570           return false;
1571     }
1572 
1573   if (single_st)
1574     {
1575       /* Bail out if this is a bitfield memory reference.  */
1576       if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1577             && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1578           return false;
1579 
1580       /* Data reference must be executed exactly once per iteration.
1581            Same as single_ld, we only need to check against the outermost
1582            loop.  */
1583       bb_st = gimple_bb (DR_STMT (single_st));
1584       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1585           return false;
1586     }
1587 
1588   if (single_ld && single_st)
1589     {
1590       /* Load and store must be in the same loop nest.  */
1591       if (bb_st->loop_father != bb_ld->loop_father)
1592           return false;
1593 
1594       edge e = single_exit (bb_st->loop_father);
1595       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1596       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1597       if (dom_ld != dom_st)
1598           return false;
1599     }
1600 
1601   *src_dr = single_ld;
1602   *dst_dr = single_st;
1603   return true;
1604 }
1605 
1606 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1607    loops from inner to outer to see if loop's step equals to access size at
1608    each level of loop.  Return 2 if we can prove this at all level loops;
1609    record access base and size in BASE and SIZE; save loop's step at each
1610    level of loop in STEPS if it is not null.  For example:
1611 
1612      int arr[100][100][100];
1613      for (i = 0; i < 100; i++)       ;steps[2] = 40000
1614        for (j = 100; j > 0; j--)     ;steps[1] = -400
1615            for (k = 0; k < 100; k++)   ;steps[0] = 4
1616              arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1617 
1618    Return 1 if we can prove the equality at the innermost loop, but not all
1619    level loops.  In this case, no information is recorded.
1620 
1621    Return 0 if no equality can be proven at any level loops.  */
1622 
1623 static int
compute_access_range(loop_p loop_nest,data_reference_p dr,tree * base,tree * size,vec<tree> * steps=NULL)1624 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1625                           tree *size, vec<tree> *steps = NULL)
1626 {
1627   location_t loc = gimple_location (DR_STMT (dr));
1628   basic_block bb = gimple_bb (DR_STMT (dr));
1629   class loop *loop = bb->loop_father;
1630   tree ref = DR_REF (dr);
1631   tree access_base = build_fold_addr_expr (ref);
1632   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1633   int res = 0;
1634 
1635   do {
1636       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1637       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1638           return res;
1639 
1640       access_base = CHREC_LEFT (scev_fn);
1641       if (tree_contains_chrecs (access_base, NULL))
1642           return res;
1643 
1644       tree scev_step = CHREC_RIGHT (scev_fn);
1645       /* Only support constant steps.  */
1646       if (TREE_CODE (scev_step) != INTEGER_CST)
1647           return res;
1648 
1649       enum ev_direction access_dir = scev_direction (scev_fn);
1650       if (access_dir == EV_DIR_UNKNOWN)
1651           return res;
1652 
1653       if (steps != NULL)
1654           steps->safe_push (scev_step);
1655 
1656       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1657       /* Compute absolute value of scev step.  */
1658       if (access_dir == EV_DIR_DECREASES)
1659           scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1660 
1661       /* At each level of loop, scev step must equal to access size.  In other
1662            words, DR must access consecutive memory between loop iterations.  */
1663       if (!operand_equal_p (scev_step, access_size, 0))
1664           return res;
1665 
1666       /* Access stride can be computed for data reference at least for the
1667            innermost loop.  */
1668       res = 1;
1669 
1670       /* Compute DR's execution times in loop.  */
1671       tree niters = number_of_latch_executions (loop);
1672       niters = fold_convert_loc (loc, sizetype, niters);
1673       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1674           niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1675 
1676       /* Compute DR's overall access size in loop.  */
1677       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1678                                              niters, scev_step);
1679       /* Adjust base address in case of negative step.  */
1680       if (access_dir == EV_DIR_DECREASES)
1681           {
1682             tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1683                                               scev_step, access_size);
1684             access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1685           }
1686   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1687 
1688   *base = access_base;
1689   *size = access_size;
1690   /* Access stride can be computed for data reference at each level loop.  */
1691   return 2;
1692 }
1693 
1694 /* Allocate and return builtin struct.  Record information like DST_DR,
1695    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1696 
1697 static struct builtin_info *
alloc_builtin(data_reference_p dst_dr,data_reference_p src_dr,tree dst_base,tree src_base,tree size)1698 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1699                  tree dst_base, tree src_base, tree size)
1700 {
1701   struct builtin_info *builtin = XNEW (struct builtin_info);
1702   builtin->dst_dr = dst_dr;
1703   builtin->src_dr = src_dr;
1704   builtin->dst_base = dst_base;
1705   builtin->src_base = src_base;
1706   builtin->size = size;
1707   return builtin;
1708 }
1709 
1710 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1711    memset call.  */
1712 
1713 static void
classify_builtin_st(loop_p loop,partition * partition,data_reference_p dr)1714 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1715 {
1716   gimple *stmt = DR_STMT (dr);
1717   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1718 
1719   if (const_with_all_bytes_same (rhs) == -1
1720       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1721             || (TYPE_MODE (TREE_TYPE (rhs))
1722                 != TYPE_MODE (unsigned_char_type_node))))
1723     return;
1724 
1725   if (TREE_CODE (rhs) == SSA_NAME
1726       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1727       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1728     return;
1729 
1730   int res = compute_access_range (loop, dr, &base, &size);
1731   if (res == 0)
1732     return;
1733   if (res == 1)
1734     {
1735       partition->kind = PKIND_PARTIAL_MEMSET;
1736       return;
1737     }
1738 
1739   poly_uint64 base_offset;
1740   unsigned HOST_WIDE_INT const_base_offset;
1741   tree base_base = strip_offset (base, &base_offset);
1742   if (!base_offset.is_constant (&const_base_offset))
1743     return;
1744 
1745   struct builtin_info *builtin;
1746   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1747   builtin->dst_base_base = base_base;
1748   builtin->dst_base_offset = const_base_offset;
1749   partition->builtin = builtin;
1750   partition->kind = PKIND_MEMSET;
1751 }
1752 
1753 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1754    if it forms builtin memcpy or memmove call.  */
1755 
1756 void
classify_builtin_ldst(loop_p loop,struct graph * rdg,partition * partition,data_reference_p dst_dr,data_reference_p src_dr)1757 loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
1758                                                     partition *partition,
1759                                                     data_reference_p dst_dr,
1760                                                     data_reference_p src_dr)
1761 {
1762   tree base, size, src_base, src_size;
1763   auto_vec<tree> dst_steps, src_steps;
1764 
1765   /* Compute access range of both load and store.  */
1766   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1767   if (res != 2)
1768     return;
1769   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1770   if (res != 2)
1771     return;
1772 
1773   /* They must have the same access size.  */
1774   if (!operand_equal_p (size, src_size, 0))
1775     return;
1776 
1777   /* They must have the same storage order.  */
1778   if (reverse_storage_order_for_component_p (DR_REF (dst_dr))
1779       != reverse_storage_order_for_component_p (DR_REF (src_dr)))
1780     return;
1781 
1782   /* Load and store in loop nest must access memory in the same way, i.e,
1783      their must have the same steps in each loop of the nest.  */
1784   if (dst_steps.length () != src_steps.length ())
1785     return;
1786   for (unsigned i = 0; i < dst_steps.length (); ++i)
1787     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1788       return;
1789 
1790   /* Now check that if there is a dependence.  */
1791   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1792 
1793   /* Classify as memmove if no dependence between load and store.  */
1794   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1795     {
1796       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1797       partition->kind = PKIND_MEMMOVE;
1798       return;
1799     }
1800 
1801   /* Can't do memmove in case of unknown dependence or dependence without
1802      classical distance vector.  */
1803   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1804       || DDR_NUM_DIST_VECTS (ddr) == 0)
1805     return;
1806 
1807   unsigned i;
1808   lambda_vector dist_v;
1809   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1810   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1811     {
1812       unsigned dep_lev = dependence_level (dist_v, num_lev);
1813       /* Can't do memmove if load depends on store.  */
1814       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1815           return;
1816     }
1817 
1818   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1819   partition->kind = PKIND_MEMMOVE;
1820   return;
1821 }
1822 
1823 bool
classify_partition(loop_p loop,struct graph * rdg,partition * partition,bitmap stmt_in_all_partitions)1824 loop_distribution::classify_partition (loop_p loop,
1825                                                struct graph *rdg, partition *partition,
1826                                                bitmap stmt_in_all_partitions)
1827 {
1828   bitmap_iterator bi;
1829   unsigned i;
1830   data_reference_p single_ld = NULL, single_st = NULL;
1831   bool volatiles_p = false, has_reduction = false;
1832 
1833   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1834     {
1835       gimple *stmt = RDG_STMT (rdg, i);
1836 
1837       if (gimple_has_volatile_ops (stmt))
1838           volatiles_p = true;
1839 
1840       /* If the stmt is not included by all partitions and there is uses
1841            outside of the loop, then mark the partition as reduction.  */
1842       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1843           {
1844             /* Due to limitation in the transform phase we have to fuse all
1845                reduction partitions.  As a result, this could cancel valid
1846                loop distribution especially for loop that induction variable
1847                is used outside of loop.  To workaround this issue, we skip
1848                marking partition as reudction if the reduction stmt belongs
1849                to all partitions.  In such case, reduction will be computed
1850                correctly no matter how partitions are fused/distributed.  */
1851             if (!bitmap_bit_p (stmt_in_all_partitions, i))
1852               partition->reduction_p = true;
1853             else
1854               has_reduction = true;
1855           }
1856     }
1857 
1858   /* Simple workaround to prevent classifying the partition as builtin
1859      if it contains any use outside of loop.  For the case where all
1860      partitions have the reduction this simple workaround is delayed
1861      to only affect the last partition.  */
1862   if (partition->reduction_p)
1863      return has_reduction;
1864 
1865   /* Perform general partition disqualification for builtins.  */
1866   if (volatiles_p
1867       || !flag_tree_loop_distribute_patterns)
1868     return has_reduction;
1869 
1870   /* Find single load/store data references for builtin partition.  */
1871   if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
1872       || !single_st)
1873     return has_reduction;
1874 
1875   if (single_ld && single_st)
1876     {
1877       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1878       /* Direct aggregate copy or via an SSA name temporary.  */
1879       if (load != store
1880             && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1881           return has_reduction;
1882     }
1883 
1884   partition->loc = gimple_location (DR_STMT (single_st));
1885 
1886   /* Classify the builtin kind.  */
1887   if (single_ld == NULL)
1888     classify_builtin_st (loop, partition, single_st);
1889   else
1890     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1891   return has_reduction;
1892 }
1893 
1894 bool
share_memory_accesses(struct graph * rdg,partition * partition1,partition * partition2)1895 loop_distribution::share_memory_accesses (struct graph *rdg,
1896                            partition *partition1, partition *partition2)
1897 {
1898   unsigned i, j;
1899   bitmap_iterator bi, bj;
1900   data_reference_p dr1, dr2;
1901 
1902   /* First check whether in the intersection of the two partitions are
1903      any loads or stores.  Common loads are the situation that happens
1904      most often.  */
1905   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1906     if (RDG_MEM_WRITE_STMT (rdg, i)
1907           || RDG_MEM_READS_STMT (rdg, i))
1908       return true;
1909 
1910   /* Then check whether the two partitions access the same memory object.  */
1911   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1912     {
1913       dr1 = datarefs_vec[i];
1914 
1915       if (!DR_BASE_ADDRESS (dr1)
1916             || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1917           continue;
1918 
1919       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1920           {
1921             dr2 = datarefs_vec[j];
1922 
1923             if (!DR_BASE_ADDRESS (dr2)
1924                 || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1925               continue;
1926 
1927             if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1928                 && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1929                 && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1930                 && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1931               return true;
1932           }
1933     }
1934 
1935   return false;
1936 }
1937 
1938 /* For each seed statement in STARTING_STMTS, this function builds
1939    partition for it by adding depended statements according to RDG.
1940    All partitions are recorded in PARTITIONS.  */
1941 
1942 void
rdg_build_partitions(struct graph * rdg,vec<gimple * > starting_stmts,vec<partition * > * partitions)1943 loop_distribution::rdg_build_partitions (struct graph *rdg,
1944                                                    vec<gimple *> starting_stmts,
1945                                                    vec<partition *> *partitions)
1946 {
1947   auto_bitmap processed;
1948   int i;
1949   gimple *stmt;
1950 
1951   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1952     {
1953       int v = rdg_vertex_for_stmt (rdg, stmt);
1954 
1955       if (dump_file && (dump_flags & TDF_DETAILS))
1956           fprintf (dump_file,
1957                      "ldist asked to generate code for vertex %d\n", v);
1958 
1959       /* If the vertex is already contained in another partition so
1960          is the partition rooted at it.  */
1961       if (bitmap_bit_p (processed, v))
1962           continue;
1963 
1964       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1965       bitmap_ior_into (processed, partition->stmts);
1966 
1967       if (dump_file && (dump_flags & TDF_DETAILS))
1968           {
1969             fprintf (dump_file, "ldist creates useful %s partition:\n",
1970                        partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1971             bitmap_print (dump_file, partition->stmts, "  ", "\n");
1972           }
1973 
1974       partitions->safe_push (partition);
1975     }
1976 
1977   /* All vertices should have been assigned to at least one partition now,
1978      other than vertices belonging to dead code.  */
1979 }
1980 
1981 /* Dump to FILE the PARTITIONS.  */
1982 
1983 static void
dump_rdg_partitions(FILE * file,const vec<partition * > & partitions)1984 dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
1985 {
1986   int i;
1987   partition *partition;
1988 
1989   FOR_EACH_VEC_ELT (partitions, i, partition)
1990     debug_bitmap_file (file, partition->stmts);
1991 }
1992 
1993 /* Debug PARTITIONS.  */
1994 extern void debug_rdg_partitions (const vec<partition *> &);
1995 
1996 DEBUG_FUNCTION void
debug_rdg_partitions(const vec<partition * > & partitions)1997 debug_rdg_partitions (const vec<partition *> &partitions)
1998 {
1999   dump_rdg_partitions (stderr, partitions);
2000 }
2001 
2002 /* Returns the number of read and write operations in the RDG.  */
2003 
2004 static int
number_of_rw_in_rdg(struct graph * rdg)2005 number_of_rw_in_rdg (struct graph *rdg)
2006 {
2007   int i, res = 0;
2008 
2009   for (i = 0; i < rdg->n_vertices; i++)
2010     {
2011       if (RDG_MEM_WRITE_STMT (rdg, i))
2012           ++res;
2013 
2014       if (RDG_MEM_READS_STMT (rdg, i))
2015           ++res;
2016     }
2017 
2018   return res;
2019 }
2020 
2021 /* Returns the number of read and write operations in a PARTITION of
2022    the RDG.  */
2023 
2024 static int
number_of_rw_in_partition(struct graph * rdg,partition * partition)2025 number_of_rw_in_partition (struct graph *rdg, partition *partition)
2026 {
2027   int res = 0;
2028   unsigned i;
2029   bitmap_iterator ii;
2030 
2031   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2032     {
2033       if (RDG_MEM_WRITE_STMT (rdg, i))
2034           ++res;
2035 
2036       if (RDG_MEM_READS_STMT (rdg, i))
2037           ++res;
2038     }
2039 
2040   return res;
2041 }
2042 
2043 /* Returns true when one of the PARTITIONS contains all the read or
2044    write operations of RDG.  */
2045 
2046 static bool
partition_contains_all_rw(struct graph * rdg,const vec<partition * > & partitions)2047 partition_contains_all_rw (struct graph *rdg,
2048                                  const vec<partition *> &partitions)
2049 {
2050   int i;
2051   partition *partition;
2052   int nrw = number_of_rw_in_rdg (rdg);
2053 
2054   FOR_EACH_VEC_ELT (partitions, i, partition)
2055     if (nrw == number_of_rw_in_partition (rdg, partition))
2056       return true;
2057 
2058   return false;
2059 }
2060 
2061 int
pg_add_dependence_edges(struct graph * rdg,int dir,bitmap drs1,bitmap drs2,vec<ddr_p> * alias_ddrs)2062 loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
2063                                bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
2064 {
2065   unsigned i, j;
2066   bitmap_iterator bi, bj;
2067   data_reference_p dr1, dr2, saved_dr1;
2068 
2069   /* dependence direction - 0 is no dependence, -1 is back,
2070      1 is forth, 2 is both (we can stop then, merging will occur).  */
2071   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
2072     {
2073       dr1 = datarefs_vec[i];
2074 
2075       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
2076           {
2077             int res, this_dir = 1;
2078             ddr_p ddr;
2079 
2080             dr2 = datarefs_vec[j];
2081 
2082             /* Skip all <read, read> data dependence.  */
2083             if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
2084               continue;
2085 
2086             saved_dr1 = dr1;
2087             /* Re-shuffle data-refs to be in topological order.  */
2088             if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
2089                 > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
2090               {
2091                 std::swap (dr1, dr2);
2092                 this_dir = -this_dir;
2093               }
2094             ddr = get_data_dependence (rdg, dr1, dr2);
2095             if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2096               {
2097                 this_dir = 0;
2098                 res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
2099                                                      DR_BASE_ADDRESS (dr2));
2100                 /* Be conservative.  If data references are not well analyzed,
2101                      or the two data references have the same base address and
2102                      offset, add dependence and consider it alias to each other.
2103                      In other words, the dependence cannot be resolved by
2104                      runtime alias check.  */
2105                 if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
2106                       || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
2107                       || !DR_INIT (dr1) || !DR_INIT (dr2)
2108                       || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
2109                       || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
2110                       || res == 0)
2111                     this_dir = 2;
2112                 /* Data dependence could be resolved by runtime alias check,
2113                      record it in ALIAS_DDRS.  */
2114                 else if (alias_ddrs != NULL)
2115                     alias_ddrs->safe_push (ddr);
2116                 /* Or simply ignore it.  */
2117               }
2118             else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
2119               {
2120                 /* Known dependences can still be unordered througout the
2121                      iteration space, see gcc.dg/tree-ssa/ldist-16.c and
2122                      gcc.dg/tree-ssa/pr94969.c.  */
2123                 if (DDR_NUM_DIST_VECTS (ddr) != 1)
2124                     this_dir = 2;
2125                 /* If the overlap is exact preserve stmt order.  */
2126                 else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
2127                                                       DDR_NB_LOOPS (ddr)))
2128                     ;
2129                 /* Else as the distance vector is lexicographic positive swap
2130                      the dependence direction.  */
2131                 else
2132                     {
2133                       if (DDR_REVERSED_P (ddr))
2134                         this_dir = -this_dir;
2135                       this_dir = -this_dir;
2136 
2137                       /* When then dependence distance of the innermost common
2138                          loop of the DRs is zero we have a conflict.  */
2139                       auto l1 = gimple_bb (DR_STMT (dr1))->loop_father;
2140                       auto l2 = gimple_bb (DR_STMT (dr2))->loop_father;
2141                       int idx = index_in_loop_nest (find_common_loop (l1, l2)->num,
2142                                                             DDR_LOOP_NEST (ddr));
2143                       if (DDR_DIST_VECT (ddr, 0)[idx] == 0)
2144                         this_dir = 2;
2145                     }
2146               }
2147             else
2148               this_dir = 0;
2149             if (this_dir == 2)
2150               return 2;
2151             else if (dir == 0)
2152               dir = this_dir;
2153             else if (this_dir != 0 && dir != this_dir)
2154               return 2;
2155             /* Shuffle "back" dr1.  */
2156             dr1 = saved_dr1;
2157           }
2158     }
2159   return dir;
2160 }
2161 
2162 /* Compare postorder number of the partition graph vertices V1 and V2.  */
2163 
2164 static int
pgcmp(const void * v1_,const void * v2_)2165 pgcmp (const void *v1_, const void *v2_)
2166 {
2167   const vertex *v1 = (const vertex *)v1_;
2168   const vertex *v2 = (const vertex *)v2_;
2169   return v2->post - v1->post;
2170 }
2171 
2172 /* Data attached to vertices of partition dependence graph.  */
2173 struct pg_vdata
2174 {
2175   /* ID of the corresponding partition.  */
2176   int id;
2177   /* The partition.  */
2178   struct partition *partition;
2179 };
2180 
2181 /* Data attached to edges of partition dependence graph.  */
2182 struct pg_edata
2183 {
2184   /* If the dependence edge can be resolved by runtime alias check,
2185      this vector contains data dependence relations for runtime alias
2186      check.  On the other hand, if the dependence edge is introduced
2187      because of compilation time known data dependence, this vector
2188      contains nothing.  */
2189   vec<ddr_p> alias_ddrs;
2190 };
2191 
2192 /* Callback data for traversing edges in graph.  */
2193 struct pg_edge_callback_data
2194 {
2195   /* Bitmap contains strong connected components should be merged.  */
2196   bitmap sccs_to_merge;
2197   /* Array constains component information for all vertices.  */
2198   int *vertices_component;
2199   /* Vector to record all data dependence relations which are needed
2200      to break strong connected components by runtime alias checks.  */
2201   vec<ddr_p> *alias_ddrs;
2202 };
2203 
2204 /* Initialize vertice's data for partition dependence graph PG with
2205    PARTITIONS.  */
2206 
2207 static void
init_partition_graph_vertices(struct graph * pg,vec<struct partition * > * partitions)2208 init_partition_graph_vertices (struct graph *pg,
2209                                      vec<struct partition *> *partitions)
2210 {
2211   int i;
2212   partition *partition;
2213   struct pg_vdata *data;
2214 
2215   for (i = 0; partitions->iterate (i, &partition); ++i)
2216     {
2217       data = new pg_vdata;
2218       pg->vertices[i].data = data;
2219       data->id = i;
2220       data->partition = partition;
2221     }
2222 }
2223 
2224 /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2225    dependence relations to the EDGE if DDRS isn't NULL.  */
2226 
2227 static void
add_partition_graph_edge(struct graph * pg,int i,int j,vec<ddr_p> * ddrs)2228 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2229 {
2230   struct graph_edge *e = add_edge (pg, i, j);
2231 
2232   /* If the edge is attached with data dependence relations, it means this
2233      dependence edge can be resolved by runtime alias checks.  */
2234   if (ddrs != NULL)
2235     {
2236       struct pg_edata *data = new pg_edata;
2237 
2238       gcc_assert (ddrs->length () > 0);
2239       e->data = data;
2240       data->alias_ddrs = vNULL;
2241       data->alias_ddrs.safe_splice (*ddrs);
2242     }
2243 }
2244 
2245 /* Callback function for graph travesal algorithm.  It returns true
2246    if edge E should skipped when traversing the graph.  */
2247 
2248 static bool
pg_skip_alias_edge(struct graph_edge * e)2249 pg_skip_alias_edge (struct graph_edge *e)
2250 {
2251   struct pg_edata *data = (struct pg_edata *)e->data;
2252   return (data != NULL && data->alias_ddrs.length () > 0);
2253 }
2254 
2255 /* Callback function freeing data attached to edge E of graph.  */
2256 
2257 static void
free_partition_graph_edata_cb(struct graph *,struct graph_edge * e,void *)2258 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2259 {
2260   if (e->data != NULL)
2261     {
2262       struct pg_edata *data = (struct pg_edata *)e->data;
2263       data->alias_ddrs.release ();
2264       delete data;
2265     }
2266 }
2267 
2268 /* Free data attached to vertice of partition dependence graph PG.  */
2269 
2270 static void
free_partition_graph_vdata(struct graph * pg)2271 free_partition_graph_vdata (struct graph *pg)
2272 {
2273   int i;
2274   struct pg_vdata *data;
2275 
2276   for (i = 0; i < pg->n_vertices; ++i)
2277     {
2278       data = (struct pg_vdata *)pg->vertices[i].data;
2279       delete data;
2280     }
2281 }
2282 
2283 /* Build and return partition dependence graph for PARTITIONS.  RDG is
2284    reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
2285    is true, data dependence caused by possible alias between references
2286    is ignored, as if it doesn't exist at all; otherwise all depdendences
2287    are considered.  */
2288 
2289 struct graph *
build_partition_graph(struct graph * rdg,vec<struct partition * > * partitions,bool ignore_alias_p)2290 loop_distribution::build_partition_graph (struct graph *rdg,
2291                                                     vec<struct partition *> *partitions,
2292                                                     bool ignore_alias_p)
2293 {
2294   int i, j;
2295   struct partition *partition1, *partition2;
2296   graph *pg = new_graph (partitions->length ());
2297   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2298 
2299   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2300 
2301   init_partition_graph_vertices (pg, partitions);
2302 
2303   for (i = 0; partitions->iterate (i, &partition1); ++i)
2304     {
2305       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2306           {
2307             /* dependence direction - 0 is no dependence, -1 is back,
2308                1 is forth, 2 is both (we can stop then, merging will occur).  */
2309             int dir = 0;
2310 
2311             /* If the first partition has reduction, add back edge; if the
2312                second partition has reduction, add forth edge.  This makes
2313                sure that reduction partition will be sorted as the last one.  */
2314             if (partition_reduction_p (partition1))
2315               dir = -1;
2316             else if (partition_reduction_p (partition2))
2317               dir = 1;
2318 
2319             /* Cleanup the temporary vector.  */
2320             alias_ddrs.truncate (0);
2321 
2322             dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2323                                                    partition2->datarefs, alias_ddrs_p);
2324 
2325             /* Add edge to partition graph if there exists dependence.  There
2326                are two types of edges.  One type edge is caused by compilation
2327                time known dependence, this type cannot be resolved by runtime
2328                alias check.  The other type can be resolved by runtime alias
2329                check.  */
2330             if (dir == 1 || dir == 2
2331                 || alias_ddrs.length () > 0)
2332               {
2333                 /* Attach data dependence relations to edge that can be resolved
2334                      by runtime alias check.  */
2335                 bool alias_edge_p = (dir != 1 && dir != 2);
2336                 add_partition_graph_edge (pg, i, j,
2337                                                   (alias_edge_p) ? &alias_ddrs : NULL);
2338               }
2339             if (dir == -1 || dir == 2
2340                 || alias_ddrs.length () > 0)
2341               {
2342                 /* Attach data dependence relations to edge that can be resolved
2343                      by runtime alias check.  */
2344                 bool alias_edge_p = (dir != -1 && dir != 2);
2345                 add_partition_graph_edge (pg, j, i,
2346                                                   (alias_edge_p) ? &alias_ddrs : NULL);
2347               }
2348           }
2349     }
2350   return pg;
2351 }
2352 
2353 /* Sort partitions in PG in descending post order and store them in
2354    PARTITIONS.  */
2355 
2356 static void
sort_partitions_by_post_order(struct graph * pg,vec<struct partition * > * partitions)2357 sort_partitions_by_post_order (struct graph *pg,
2358                                      vec<struct partition *> *partitions)
2359 {
2360   int i;
2361   struct pg_vdata *data;
2362 
2363   /* Now order the remaining nodes in descending postorder.  */
2364   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2365   partitions->truncate (0);
2366   for (i = 0; i < pg->n_vertices; ++i)
2367     {
2368       data = (struct pg_vdata *)pg->vertices[i].data;
2369       if (data->partition)
2370           partitions->safe_push (data->partition);
2371     }
2372 }
2373 
2374 void
merge_dep_scc_partitions(struct graph * rdg,vec<struct partition * > * partitions,bool ignore_alias_p)2375 loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
2376                                                        vec<struct partition *> *partitions,
2377                                                        bool ignore_alias_p)
2378 {
2379   struct partition *partition1, *partition2;
2380   struct pg_vdata *data;
2381   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2382   int i, j, num_sccs = graphds_scc (pg, NULL);
2383 
2384   /* Strong connected compoenent means dependence cycle, we cannot distribute
2385      them.  So fuse them together.  */
2386   if ((unsigned) num_sccs < partitions->length ())
2387     {
2388       for (i = 0; i < num_sccs; ++i)
2389           {
2390             for (j = 0; partitions->iterate (j, &partition1); ++j)
2391               if (pg->vertices[j].component == i)
2392                 break;
2393             for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2394               if (pg->vertices[j].component == i)
2395                 {
2396                     partition_merge_into (NULL, partition1,
2397                                               partition2, FUSE_SAME_SCC);
2398                     partition1->type = PTYPE_SEQUENTIAL;
2399                     (*partitions)[j] = NULL;
2400                     partition_free (partition2);
2401                     data = (struct pg_vdata *)pg->vertices[j].data;
2402                     data->partition = NULL;
2403                 }
2404           }
2405     }
2406 
2407   sort_partitions_by_post_order (pg, partitions);
2408   gcc_assert (partitions->length () == (unsigned)num_sccs);
2409   free_partition_graph_vdata (pg);
2410   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2411   free_graph (pg);
2412 }
2413 
2414 /* Callback function for traversing edge E in graph G.  DATA is private
2415    callback data.  */
2416 
2417 static void
pg_collect_alias_ddrs(struct graph * g,struct graph_edge * e,void * data)2418 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2419 {
2420   int i, j, component;
2421   struct pg_edge_callback_data *cbdata;
2422   struct pg_edata *edata = (struct pg_edata *) e->data;
2423 
2424   /* If the edge doesn't have attached data dependence, it represents
2425      compilation time known dependences.  This type dependence cannot
2426      be resolved by runtime alias check.  */
2427   if (edata == NULL || edata->alias_ddrs.length () == 0)
2428     return;
2429 
2430   cbdata = (struct pg_edge_callback_data *) data;
2431   i = e->src;
2432   j = e->dest;
2433   component = cbdata->vertices_component[i];
2434   /* Vertices are topologically sorted according to compilation time
2435      known dependences, so we can break strong connected components
2436      by removing edges of the opposite direction, i.e, edges pointing
2437      from vertice with smaller post number to vertice with bigger post
2438      number.  */
2439   if (g->vertices[i].post < g->vertices[j].post
2440       /* We only need to remove edges connecting vertices in the same
2441            strong connected component to break it.  */
2442       && component == cbdata->vertices_component[j]
2443       /* Check if we want to break the strong connected component or not.  */
2444       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2445     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2446 }
2447 
2448 /* Callback function for traversing edge E.  DATA is private
2449    callback data.  */
2450 
2451 static void
pg_unmark_merged_alias_ddrs(struct graph *,struct graph_edge * e,void * data)2452 pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
2453 {
2454   int i, j, component;
2455   struct pg_edge_callback_data *cbdata;
2456   struct pg_edata *edata = (struct pg_edata *) e->data;
2457 
2458   if (edata == NULL || edata->alias_ddrs.length () == 0)
2459     return;
2460 
2461   cbdata = (struct pg_edge_callback_data *) data;
2462   i = e->src;
2463   j = e->dest;
2464   component = cbdata->vertices_component[i];
2465   /* Make sure to not skip vertices inside SCCs we are going to merge.  */
2466   if (component == cbdata->vertices_component[j]
2467       && bitmap_bit_p (cbdata->sccs_to_merge, component))
2468     {
2469       edata->alias_ddrs.release ();
2470       delete edata;
2471       e->data = NULL;
2472     }
2473 }
2474 
2475 /* This is the main function breaking strong conected components in
2476    PARTITIONS giving reduced depdendence graph RDG.  Store data dependence
2477    relations for runtime alias check in ALIAS_DDRS.  */
2478 void
break_alias_scc_partitions(struct graph * rdg,vec<struct partition * > * partitions,vec<ddr_p> * alias_ddrs)2479 loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2480                                                          vec<struct partition *> *partitions,
2481                                                          vec<ddr_p> *alias_ddrs)
2482 {
2483   int i, j, k, num_sccs, num_sccs_no_alias = 0;
2484   /* Build partition dependence graph.  */
2485   graph *pg = build_partition_graph (rdg, partitions, false);
2486 
2487   alias_ddrs->truncate (0);
2488   /* Find strong connected components in the graph, with all dependence edges
2489      considered.  */
2490   num_sccs = graphds_scc (pg, NULL);
2491   /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2492      compilation time known dependences are merged before this function.  */
2493   if ((unsigned) num_sccs < partitions->length ())
2494     {
2495       struct pg_edge_callback_data cbdata;
2496       auto_bitmap sccs_to_merge;
2497       auto_vec<enum partition_type> scc_types;
2498       struct partition *partition, *first;
2499 
2500       /* If all partitions in a SCC have the same type, we can simply merge the
2501            SCC.  This loop finds out such SCCS and record them in bitmap.  */
2502       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2503       for (i = 0; i < num_sccs; ++i)
2504           {
2505             for (j = 0; partitions->iterate (j, &first); ++j)
2506               if (pg->vertices[j].component == i)
2507                 break;
2508 
2509             bool same_type = true, all_builtins = partition_builtin_p (first);
2510             for (++j; partitions->iterate (j, &partition); ++j)
2511               {
2512                 if (pg->vertices[j].component != i)
2513                     continue;
2514 
2515                 if (first->type != partition->type)
2516                     {
2517                       same_type = false;
2518                       break;
2519                     }
2520                 all_builtins &= partition_builtin_p (partition);
2521               }
2522             /* Merge SCC if all partitions in SCC have the same type, though the
2523                result partition is sequential, because vectorizer can do better
2524                runtime alias check.  One expecption is all partitions in SCC are
2525                builtins.  */
2526             if (!same_type || all_builtins)
2527               bitmap_clear_bit (sccs_to_merge, i);
2528           }
2529 
2530       /* Initialize callback data for traversing.  */
2531       cbdata.sccs_to_merge = sccs_to_merge;
2532       cbdata.alias_ddrs = alias_ddrs;
2533       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2534       /* Record the component information which will be corrupted by next
2535            graph scc finding call.  */
2536       for (i = 0; i < pg->n_vertices; ++i)
2537           cbdata.vertices_component[i] = pg->vertices[i].component;
2538 
2539       /* Collect data dependences for runtime alias checks to break SCCs.  */
2540       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2541           {
2542             /* For SCCs we want to merge clear all alias_ddrs for edges
2543                inside the component.  */
2544             for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
2545 
2546             /* Run SCC finding algorithm again, with alias dependence edges
2547                skipped.  This is to topologically sort partitions according to
2548                compilation time known dependence.  Note the topological order
2549                is stored in the form of pg's post order number.  */
2550             num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2551             /* We cannot assert partitions->length () == num_sccs_no_alias
2552                since we are not ignoring alias edges in cycles we are
2553                going to merge.  That's required to compute correct postorder.  */
2554             /* With topological order, we can construct two subgraphs L and R.
2555                L contains edge <x, y> where x < y in terms of post order, while
2556                R contains edge <x, y> where x > y.  Edges for compilation time
2557                known dependence all fall in R, so we break SCCs by removing all
2558                (alias) edges of in subgraph L.  */
2559             for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2560           }
2561 
2562       /* For SCC that doesn't need to be broken, merge it.  */
2563       for (i = 0; i < num_sccs; ++i)
2564           {
2565             if (!bitmap_bit_p (sccs_to_merge, i))
2566               continue;
2567 
2568             for (j = 0; partitions->iterate (j, &first); ++j)
2569               if (cbdata.vertices_component[j] == i)
2570                 break;
2571             for (k = j + 1; partitions->iterate (k, &partition); ++k)
2572               {
2573                 struct pg_vdata *data;
2574 
2575                 if (cbdata.vertices_component[k] != i)
2576                     continue;
2577 
2578                 partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2579                 (*partitions)[k] = NULL;
2580                 partition_free (partition);
2581                 data = (struct pg_vdata *)pg->vertices[k].data;
2582                 gcc_assert (data->id == k);
2583                 data->partition = NULL;
2584                 /* The result partition of merged SCC must be sequential.  */
2585                 first->type = PTYPE_SEQUENTIAL;
2586               }
2587           }
2588       /* If reduction partition's SCC is broken by runtime alias checks,
2589            we force a negative post order to it making sure it will be scheduled
2590            in the last.  */
2591       if (num_sccs_no_alias > 0)
2592           {
2593             j = -1;
2594             for (i = 0; i < pg->n_vertices; ++i)
2595               {
2596                 struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
2597                 if (data->partition && partition_reduction_p (data->partition))
2598                     {
2599                       gcc_assert (j == -1);
2600                       j = i;
2601                     }
2602               }
2603             if (j >= 0)
2604               pg->vertices[j].post = -1;
2605           }
2606 
2607       free (cbdata.vertices_component);
2608     }
2609 
2610   sort_partitions_by_post_order (pg, partitions);
2611   free_partition_graph_vdata (pg);
2612   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2613   free_graph (pg);
2614 
2615   if (dump_file && (dump_flags & TDF_DETAILS))
2616     {
2617       fprintf (dump_file, "Possible alias data dependence to break:\n");
2618       dump_data_dependence_relations (dump_file, *alias_ddrs);
2619     }
2620 }
2621 
2622 /* Compute and return an expression whose value is the segment length which
2623    will be accessed by DR in NITERS iterations.  */
2624 
2625 static tree
data_ref_segment_size(struct data_reference * dr,tree niters)2626 data_ref_segment_size (struct data_reference *dr, tree niters)
2627 {
2628   niters = size_binop (MINUS_EXPR,
2629                            fold_convert (sizetype, niters),
2630                            size_one_node);
2631   return size_binop (MULT_EXPR,
2632                          fold_convert (sizetype, DR_STEP (dr)),
2633                          fold_convert (sizetype, niters));
2634 }
2635 
2636 /* Return true if LOOP's latch is dominated by statement for data reference
2637    DR.  */
2638 
2639 static inline bool
latch_dominated_by_data_ref(class loop * loop,data_reference * dr)2640 latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2641 {
2642   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2643                                gimple_bb (DR_STMT (dr)));
2644 }
2645 
2646 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2647    data dependence relations ALIAS_DDRS.  */
2648 
2649 static void
compute_alias_check_pairs(class loop * loop,vec<ddr_p> * alias_ddrs,vec<dr_with_seg_len_pair_t> * comp_alias_pairs)2650 compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2651                                  vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2652 {
2653   unsigned int i;
2654   unsigned HOST_WIDE_INT factor = 1;
2655   tree niters_plus_one, niters = number_of_latch_executions (loop);
2656 
2657   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2658   niters = fold_convert (sizetype, niters);
2659   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2660 
2661   if (dump_file && (dump_flags & TDF_DETAILS))
2662     fprintf (dump_file, "Creating alias check pairs:\n");
2663 
2664   /* Iterate all data dependence relations and compute alias check pairs.  */
2665   for (i = 0; i < alias_ddrs->length (); i++)
2666     {
2667       ddr_p ddr = (*alias_ddrs)[i];
2668       struct data_reference *dr_a = DDR_A (ddr);
2669       struct data_reference *dr_b = DDR_B (ddr);
2670       tree seg_length_a, seg_length_b;
2671 
2672       if (latch_dominated_by_data_ref (loop, dr_a))
2673           seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2674       else
2675           seg_length_a = data_ref_segment_size (dr_a, niters);
2676 
2677       if (latch_dominated_by_data_ref (loop, dr_b))
2678           seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2679       else
2680           seg_length_b = data_ref_segment_size (dr_b, niters);
2681 
2682       unsigned HOST_WIDE_INT access_size_a
2683           = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2684       unsigned HOST_WIDE_INT access_size_b
2685           = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2686       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2687       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2688 
2689       dr_with_seg_len_pair_t dr_with_seg_len_pair
2690           (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2691            dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2692            /* ??? Would WELL_ORDERED be safe?  */
2693            dr_with_seg_len_pair_t::REORDERED);
2694 
2695       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2696     }
2697 
2698   if (tree_fits_uhwi_p (niters))
2699     factor = tree_to_uhwi (niters);
2700 
2701   /* Prune alias check pairs.  */
2702   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2703   if (dump_file && (dump_flags & TDF_DETAILS))
2704     fprintf (dump_file,
2705                "Improved number of alias checks from %d to %d\n",
2706                alias_ddrs->length (), comp_alias_pairs->length ());
2707 }
2708 
2709 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2710    checks and version LOOP under condition of these runtime alias checks.  */
2711 
2712 static void
version_loop_by_alias_check(vec<struct partition * > * partitions,class loop * loop,vec<ddr_p> * alias_ddrs)2713 version_loop_by_alias_check (vec<struct partition *> *partitions,
2714                                    class loop *loop, vec<ddr_p> *alias_ddrs)
2715 {
2716   profile_probability prob;
2717   basic_block cond_bb;
2718   class loop *nloop;
2719   tree lhs, arg0, cond_expr = NULL_TREE;
2720   gimple_seq cond_stmts = NULL;
2721   gimple *call_stmt = NULL;
2722   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2723 
2724   /* Generate code for runtime alias checks if necessary.  */
2725   gcc_assert (alias_ddrs->length () > 0);
2726 
2727   if (dump_file && (dump_flags & TDF_DETAILS))
2728     fprintf (dump_file,
2729                "Version loop <%d> with runtime alias check\n", loop->num);
2730 
2731   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2732   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2733   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2734                                               is_gimple_val, NULL_TREE);
2735 
2736   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2737   bool cancelable_p = flag_tree_loop_vectorize;
2738   if (cancelable_p)
2739     {
2740       unsigned i = 0;
2741       struct partition *partition;
2742       for (; partitions->iterate (i, &partition); ++i)
2743           if (!partition_builtin_p (partition))
2744             break;
2745 
2746      /* If all partitions are builtins, distributing it would be profitable and
2747           we don't want to cancel the runtime alias checks.  */
2748       if (i == partitions->length ())
2749           cancelable_p = false;
2750     }
2751 
2752   /* Generate internal function call for loop distribution alias check if the
2753      runtime alias check should be cancelable.  */
2754   if (cancelable_p)
2755     {
2756       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2757                                                         2, NULL_TREE, cond_expr);
2758       lhs = make_ssa_name (boolean_type_node);
2759       gimple_call_set_lhs (call_stmt, lhs);
2760     }
2761   else
2762     lhs = cond_expr;
2763 
2764   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2765   initialize_original_copy_tables ();
2766   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2767                               prob, prob.invert (), true);
2768   free_original_copy_tables ();
2769   /* Record the original loop number in newly generated loops.  In case of
2770      distribution, the original loop will be distributed and the new loop
2771      is kept.  */
2772   loop->orig_loop_num = nloop->num;
2773   nloop->orig_loop_num = nloop->num;
2774   nloop->dont_vectorize = true;
2775   nloop->force_vectorize = false;
2776 
2777   if (call_stmt)
2778     {
2779       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2780            loop could be destroyed.  */
2781       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2782       gimple_call_set_arg (call_stmt, 0, arg0);
2783       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2784     }
2785 
2786   if (cond_stmts)
2787     {
2788       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2789       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2790     }
2791   update_ssa (TODO_update_ssa);
2792 }
2793 
2794 /* Return true if loop versioning is needed to distrubute PARTITIONS.
2795    ALIAS_DDRS are data dependence relations for runtime alias check.  */
2796 
2797 static inline bool
version_for_distribution_p(vec<struct partition * > * partitions,vec<ddr_p> * alias_ddrs)2798 version_for_distribution_p (vec<struct partition *> *partitions,
2799                                   vec<ddr_p> *alias_ddrs)
2800 {
2801   /* No need to version loop if we have only one partition.  */
2802   if (partitions->length () == 1)
2803     return false;
2804 
2805   /* Need to version loop if runtime alias check is necessary.  */
2806   return (alias_ddrs->length () > 0);
2807 }
2808 
2809 /* Compare base offset of builtin mem* partitions P1 and P2.  */
2810 
2811 static int
offset_cmp(const void * vp1,const void * vp2)2812 offset_cmp (const void *vp1, const void *vp2)
2813 {
2814   struct partition *p1 = *(struct partition *const *) vp1;
2815   struct partition *p2 = *(struct partition *const *) vp2;
2816   unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2817   unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2818   return (o2 < o1) - (o1 < o2);
2819 }
2820 
2821 /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2822    case optimization transforming below code:
2823 
2824      __builtin_memset (&obj, 0, 100);
2825      _1 = &obj + 100;
2826      __builtin_memset (_1, 0, 200);
2827      _2 = &obj + 300;
2828      __builtin_memset (_2, 0, 100);
2829 
2830    into:
2831 
2832      __builtin_memset (&obj, 0, 400);
2833 
2834    Note we don't have dependence information between different partitions
2835    at this point, as a result, we can't handle nonadjacent memset builtin
2836    partitions since dependence might be broken.  */
2837 
2838 static void
fuse_memset_builtins(vec<struct partition * > * partitions)2839 fuse_memset_builtins (vec<struct partition *> *partitions)
2840 {
2841   unsigned i, j;
2842   struct partition *part1, *part2;
2843   tree rhs1, rhs2;
2844 
2845   for (i = 0; partitions->iterate (i, &part1);)
2846     {
2847       if (part1->kind != PKIND_MEMSET)
2848           {
2849             i++;
2850             continue;
2851           }
2852 
2853       /* Find sub-array of memset builtins of the same base.  Index range
2854            of the sub-array is [i, j) with "j > i".  */
2855       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2856           {
2857             if (part2->kind != PKIND_MEMSET
2858                 || !operand_equal_p (part1->builtin->dst_base_base,
2859                                            part2->builtin->dst_base_base, 0))
2860               break;
2861 
2862             /* Memset calls setting different values can't be merged.  */
2863             rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2864             rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2865             if (!operand_equal_p (rhs1, rhs2, 0))
2866               break;
2867           }
2868 
2869       /* Stable sort is required in order to avoid breaking dependence.  */
2870       gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2871                           offset_cmp);
2872       /* Continue with next partition.  */
2873       i = j;
2874     }
2875 
2876   /* Merge all consecutive memset builtin partitions.  */
2877   for (i = 0; i < partitions->length () - 1;)
2878     {
2879       part1 = (*partitions)[i];
2880       if (part1->kind != PKIND_MEMSET)
2881           {
2882             i++;
2883             continue;
2884           }
2885 
2886       part2 = (*partitions)[i + 1];
2887       /* Only merge memset partitions of the same base and with constant
2888            access sizes.  */
2889       if (part2->kind != PKIND_MEMSET
2890             || TREE_CODE (part1->builtin->size) != INTEGER_CST
2891             || TREE_CODE (part2->builtin->size) != INTEGER_CST
2892             || !operand_equal_p (part1->builtin->dst_base_base,
2893                                      part2->builtin->dst_base_base, 0))
2894           {
2895             i++;
2896             continue;
2897           }
2898       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2899       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2900       int bytev1 = const_with_all_bytes_same (rhs1);
2901       int bytev2 = const_with_all_bytes_same (rhs2);
2902       /* Only merge memset partitions of the same value.  */
2903       if (bytev1 != bytev2 || bytev1 == -1)
2904           {
2905             i++;
2906             continue;
2907           }
2908       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2909                                      wi::to_wide (part1->builtin->size));
2910       /* Only merge adjacent memset partitions.  */
2911       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2912           {
2913             i++;
2914             continue;
2915           }
2916       /* Merge partitions[i] and partitions[i+1].  */
2917       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2918                                                     part1->builtin->size,
2919                                                     part2->builtin->size);
2920       partition_free (part2);
2921       partitions->ordered_remove (i + 1);
2922     }
2923 }
2924 
2925 void
finalize_partitions(class loop * loop,vec<struct partition * > * partitions,vec<ddr_p> * alias_ddrs)2926 loop_distribution::finalize_partitions (class loop *loop,
2927                                                   vec<struct partition *> *partitions,
2928                                                   vec<ddr_p> *alias_ddrs)
2929 {
2930   unsigned i;
2931   struct partition *partition, *a;
2932 
2933   if (partitions->length () == 1
2934       || alias_ddrs->length () > 0)
2935     return;
2936 
2937   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2938   bool same_type_p = true;
2939   enum partition_type type = ((*partitions)[0])->type;
2940   for (i = 0; partitions->iterate (i, &partition); ++i)
2941     {
2942       same_type_p &= (type == partition->type);
2943       if (partition_builtin_p (partition))
2944           {
2945             num_builtin++;
2946             continue;
2947           }
2948       num_normal++;
2949       if (partition->kind == PKIND_PARTIAL_MEMSET)
2950           num_partial_memset++;
2951     }
2952 
2953   /* Don't distribute current loop into too many loops given we don't have
2954      memory stream cost model.  Be even more conservative in case of loop
2955      nest distribution.  */
2956   if ((same_type_p && num_builtin == 0
2957        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2958       || (loop->inner != NULL
2959             && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2960       || (loop->inner == NULL
2961             && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2962     {
2963       a = (*partitions)[0];
2964       for (i = 1; partitions->iterate (i, &partition); ++i)
2965           {
2966             partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2967             partition_free (partition);
2968           }
2969       partitions->truncate (1);
2970     }
2971 
2972   /* Fuse memset builtins if possible.  */
2973   if (partitions->length () > 1)
2974     fuse_memset_builtins (partitions);
2975 }
2976 
2977 /* Distributes the code from LOOP in such a way that producer statements
2978    are placed before consumer statements.  Tries to separate only the
2979    statements from STMTS into separate loops.  Returns the number of
2980    distributed loops.  Set NB_CALLS to number of generated builtin calls.
2981    Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2982 
2983 int
distribute_loop(class loop * loop,const vec<gimple * > & stmts,control_dependences * cd,int * nb_calls,bool * destroy_p,bool only_patterns_p)2984 loop_distribution::distribute_loop (class loop *loop,
2985                      const vec<gimple *> &stmts,
2986                      control_dependences *cd, int *nb_calls, bool *destroy_p,
2987                      bool only_patterns_p)
2988 {
2989   ddrs_table = new hash_table<ddr_hasher> (389);
2990   struct graph *rdg;
2991   partition *partition;
2992   int i, nbp;
2993 
2994   *destroy_p = false;
2995   *nb_calls = 0;
2996   loop_nest.create (0);
2997   if (!find_loop_nest (loop, &loop_nest))
2998     {
2999       loop_nest.release ();
3000       delete ddrs_table;
3001       return 0;
3002     }
3003 
3004   datarefs_vec.create (20);
3005   has_nonaddressable_dataref_p = false;
3006   rdg = build_rdg (loop, cd);
3007   if (!rdg)
3008     {
3009       if (dump_file && (dump_flags & TDF_DETAILS))
3010           fprintf (dump_file,
3011                      "Loop %d not distributed: failed to build the RDG.\n",
3012                      loop->num);
3013 
3014       loop_nest.release ();
3015       free_data_refs (datarefs_vec);
3016       delete ddrs_table;
3017       return 0;
3018     }
3019 
3020   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
3021     {
3022       if (dump_file && (dump_flags & TDF_DETAILS))
3023           fprintf (dump_file,
3024                      "Loop %d not distributed: too many memory references.\n",
3025                      loop->num);
3026 
3027       free_rdg (rdg);
3028       loop_nest.release ();
3029       free_data_refs (datarefs_vec);
3030       delete ddrs_table;
3031       return 0;
3032     }
3033 
3034   data_reference_p dref;
3035   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
3036     dref->aux = (void *) (uintptr_t) i;
3037 
3038   if (dump_file && (dump_flags & TDF_DETAILS))
3039     dump_rdg (dump_file, rdg);
3040 
3041   auto_vec<struct partition *, 3> partitions;
3042   rdg_build_partitions (rdg, stmts, &partitions);
3043 
3044   auto_vec<ddr_p> alias_ddrs;
3045 
3046   auto_bitmap stmt_in_all_partitions;
3047   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
3048   for (i = 1; partitions.iterate (i, &partition); ++i)
3049     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
3050 
3051   bool any_builtin = false;
3052   bool reduction_in_all = false;
3053   FOR_EACH_VEC_ELT (partitions, i, partition)
3054     {
3055       reduction_in_all
3056           |= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
3057       any_builtin |= partition_builtin_p (partition);
3058     }
3059 
3060   /* If we are only distributing patterns but did not detect any,
3061      simply bail out.  */
3062   if (only_patterns_p
3063       && !any_builtin)
3064     {
3065       nbp = 0;
3066       goto ldist_done;
3067     }
3068 
3069   /* If we are only distributing patterns fuse all partitions that
3070      were not classified as builtins.  This also avoids chopping
3071      a loop into pieces, separated by builtin calls.  That is, we
3072      only want no or a single loop body remaining.  */
3073   struct partition *into;
3074   if (only_patterns_p)
3075     {
3076       for (i = 0; partitions.iterate (i, &into); ++i)
3077           if (!partition_builtin_p (into))
3078             break;
3079       for (++i; partitions.iterate (i, &partition); ++i)
3080           if (!partition_builtin_p (partition))
3081             {
3082               partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
3083               partitions.unordered_remove (i);
3084               partition_free (partition);
3085               i--;
3086             }
3087     }
3088 
3089   /* Due to limitations in the transform phase we have to fuse all
3090      reduction partitions into the last partition so the existing
3091      loop will contain all loop-closed PHI nodes.  */
3092   for (i = 0; partitions.iterate (i, &into); ++i)
3093     if (partition_reduction_p (into))
3094       break;
3095   for (i = i + 1; partitions.iterate (i, &partition); ++i)
3096     if (partition_reduction_p (partition))
3097       {
3098           partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
3099           partitions.unordered_remove (i);
3100           partition_free (partition);
3101           i--;
3102       }
3103 
3104   /* Apply our simple cost model - fuse partitions with similar
3105      memory accesses.  */
3106   for (i = 0; partitions.iterate (i, &into); ++i)
3107     {
3108       bool changed = false;
3109       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
3110           continue;
3111       for (int j = i + 1;
3112              partitions.iterate (j, &partition); ++j)
3113           {
3114             if (share_memory_accesses (rdg, into, partition))
3115               {
3116                 partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
3117                 partitions.unordered_remove (j);
3118                 partition_free (partition);
3119                 j--;
3120                 changed = true;
3121               }
3122           }
3123       /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
3124          accesses when 1 and 2 have similar accesses but not 0 and 1
3125            then in the next iteration we will fail to consider merging
3126            1 into 0,2.  So try again if we did any merging into 0.  */
3127       if (changed)
3128           i--;
3129     }
3130 
3131   /* Put a non-builtin partition last if we need to preserve a reduction.
3132      ???  This is a workaround that makes sort_partitions_by_post_order do
3133      the correct thing while in reality it should sort each component
3134      separately and then put the component with a reduction or a non-builtin
3135      last.  */
3136   if (reduction_in_all
3137       && partition_builtin_p (partitions.last()))
3138     FOR_EACH_VEC_ELT (partitions, i, partition)
3139       if (!partition_builtin_p (partition))
3140           {
3141             partitions.unordered_remove (i);
3142             partitions.quick_push (partition);
3143             break;
3144           }
3145 
3146   /* Build the partition dependency graph and fuse partitions in strong
3147      connected component.  */
3148   if (partitions.length () > 1)
3149     {
3150       /* Don't support loop nest distribution under runtime alias check
3151            since it's not likely to enable many vectorization opportunities.
3152            Also if loop has any data reference which may be not addressable
3153            since alias check needs to take, compare address of the object.  */
3154       if (loop->inner || has_nonaddressable_dataref_p)
3155           merge_dep_scc_partitions (rdg, &partitions, false);
3156       else
3157           {
3158             merge_dep_scc_partitions (rdg, &partitions, true);
3159             if (partitions.length () > 1)
3160               break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3161           }
3162     }
3163 
3164   finalize_partitions (loop, &partitions, &alias_ddrs);
3165 
3166   /* If there is a reduction in all partitions make sure the last one
3167      is not classified for builtin code generation.  */
3168   if (reduction_in_all)
3169     {
3170       partition = partitions.last ();
3171       if (only_patterns_p
3172             && partition_builtin_p (partition)
3173             && !partition_builtin_p (partitions[0]))
3174           {
3175             nbp = 0;
3176             goto ldist_done;
3177           }
3178       partition->kind = PKIND_NORMAL;
3179     }
3180 
3181   nbp = partitions.length ();
3182   if (nbp == 0
3183       || (nbp == 1 && !partition_builtin_p (partitions[0]))
3184       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
3185     {
3186       nbp = 0;
3187       goto ldist_done;
3188     }
3189 
3190   if (version_for_distribution_p (&partitions, &alias_ddrs))
3191     version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
3192 
3193   if (dump_file && (dump_flags & TDF_DETAILS))
3194     {
3195       fprintf (dump_file,
3196                  "distribute loop <%d> into partitions:\n", loop->num);
3197       dump_rdg_partitions (dump_file, partitions);
3198     }
3199 
3200   FOR_EACH_VEC_ELT (partitions, i, partition)
3201     {
3202       if (partition_builtin_p (partition))
3203           (*nb_calls)++;
3204       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
3205     }
3206 
3207  ldist_done:
3208   loop_nest.release ();
3209   free_data_refs (datarefs_vec);
3210   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
3211        iter != ddrs_table->end (); ++iter)
3212     {
3213       free_dependence_relation (*iter);
3214       *iter = NULL;
3215     }
3216   delete ddrs_table;
3217 
3218   FOR_EACH_VEC_ELT (partitions, i, partition)
3219     partition_free (partition);
3220 
3221   free_rdg (rdg);
3222   return nbp - *nb_calls;
3223 }
3224 
3225 
bb_top_order_init(void)3226 void loop_distribution::bb_top_order_init (void)
3227 {
3228   int rpo_num;
3229   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
3230   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3231   bitmap exit_bbs = BITMAP_ALLOC (NULL);
3232 
3233   bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3234   bb_top_order_index_size = last_basic_block_for_fn (cfun);
3235 
3236   entry->flags &= ~EDGE_DFS_BACK;
3237   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
3238   rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
3239                                                                rpo, NULL);
3240   BITMAP_FREE (exit_bbs);
3241 
3242   for (int i = 0; i < rpo_num; i++)
3243     bb_top_order_index[rpo[i]] = i;
3244 
3245   free (rpo);
3246 }
3247 
bb_top_order_destroy()3248 void loop_distribution::bb_top_order_destroy ()
3249 {
3250   free (bb_top_order_index);
3251   bb_top_order_index = NULL;
3252   bb_top_order_index_size = 0;
3253 }
3254 
3255 
3256 /* Given LOOP, this function records seed statements for distribution in
3257    WORK_LIST.  Return false if there is nothing for distribution.  */
3258 
3259 static bool
find_seed_stmts_for_distribution(class loop * loop,vec<gimple * > * work_list)3260 find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
3261 {
3262   basic_block *bbs = get_loop_body_in_dom_order (loop);
3263 
3264   /* Initialize the worklist with stmts we seed the partitions with.  */
3265   for (unsigned i = 0; i < loop->num_nodes; ++i)
3266     {
3267       /* In irreducible sub-regions we don't know how to redirect
3268            conditions, so fail.  See PR100492.  */
3269       if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
3270           {
3271             if (dump_file && (dump_flags & TDF_DETAILS))
3272               fprintf (dump_file, "loop %d contains an irreducible region.\n",
3273                          loop->num);
3274             work_list->truncate (0);
3275             break;
3276           }
3277       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3278              !gsi_end_p (gsi); gsi_next (&gsi))
3279           {
3280             gphi *phi = gsi.phi ();
3281             if (virtual_operand_p (gimple_phi_result (phi)))
3282               continue;
3283             /* Distribute stmts which have defs that are used outside of
3284                the loop.  */
3285             if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3286               continue;
3287             work_list->safe_push (phi);
3288           }
3289       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3290              !gsi_end_p (gsi); gsi_next (&gsi))
3291           {
3292             gimple *stmt = gsi_stmt (gsi);
3293 
3294             /* Ignore clobbers, they do not have true side effects.  */
3295             if (gimple_clobber_p (stmt))
3296               continue;
3297 
3298             /* If there is a stmt with side-effects bail out - we
3299                cannot and should not distribute this loop.  */
3300             if (gimple_has_side_effects (stmt))
3301               {
3302                 free (bbs);
3303                 return false;
3304               }
3305 
3306             /* Distribute stmts which have defs that are used outside of
3307                the loop.  */
3308             if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3309               ;
3310             /* Otherwise only distribute stores for now.  */
3311             else if (!gimple_vdef (stmt))
3312               continue;
3313 
3314             work_list->safe_push (stmt);
3315           }
3316     }
3317   bool res = work_list->length () > 0;
3318   if (res && !can_copy_bbs_p (bbs, loop->num_nodes))
3319     {
3320       if (dump_file && (dump_flags & TDF_DETAILS))
3321           fprintf (dump_file, "cannot copy loop %d.\n", loop->num);
3322       res = false;
3323     }
3324   free (bbs);
3325   return res;
3326 }
3327 
3328 /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
3329    to place new statements SEQ before LOOP and replace the old reduction
3330    variable with the new one.  */
3331 
3332 static void
generate_reduction_builtin_1(loop_p loop,gimple_seq & seq,tree reduction_var_old,tree reduction_var_new,const char * info,machine_mode load_mode)3333 generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
3334                                     tree reduction_var_old, tree reduction_var_new,
3335                                     const char *info, machine_mode load_mode)
3336 {
3337   gcc_assert (flag_tree_loop_distribute_patterns);
3338 
3339   /* Place new statements before LOOP.  */
3340   gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
3341   gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
3342 
3343   /* Replace old reduction variable with new one.  */
3344   imm_use_iterator iter;
3345   gimple *stmt;
3346   use_operand_p use_p;
3347   FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
3348     {
3349       FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
3350           SET_USE (use_p, reduction_var_new);
3351 
3352       update_stmt (stmt);
3353     }
3354 
3355   if (dump_file && (dump_flags & TDF_DETAILS))
3356     fprintf (dump_file, info, GET_MODE_NAME (load_mode));
3357 }
3358 
3359 /* Generate a call to rawmemchr and place it before LOOP.  REDUCTION_VAR is
3360    replaced with a fresh SSA name representing the result of the call.  */
3361 
3362 static void
generate_rawmemchr_builtin(loop_p loop,tree reduction_var,data_reference_p store_dr,tree base,tree pattern,location_t loc)3363 generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
3364                                   data_reference_p store_dr, tree base, tree pattern,
3365                                   location_t loc)
3366 {
3367   gimple_seq seq = NULL;
3368 
3369   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3370   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
3371   tree reduction_var_new = copy_ssa_name (reduction_var);
3372   gimple_call_set_lhs (fn_call, reduction_var_new);
3373   gimple_set_location (fn_call, loc);
3374   gimple_seq_add_stmt (&seq, fn_call);
3375 
3376   if (store_dr)
3377     {
3378       gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
3379       gimple_seq_add_stmt (&seq, g);
3380     }
3381 
3382   generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3383                                         "generated rawmemchr%s\n",
3384                                         TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
3385 }
3386 
3387 /* Helper function for generate_strlen_builtin(,_using_rawmemchr)  */
3388 
3389 static void
generate_strlen_builtin_1(loop_p loop,gimple_seq & seq,tree reduction_var_old,tree reduction_var_new,machine_mode mode,tree start_len)3390 generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
3391                                  tree reduction_var_old, tree reduction_var_new,
3392                                  machine_mode mode, tree start_len)
3393 {
3394   /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
3395      converted if types of old and new reduction variable are not compatible. */
3396   reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
3397                                               reduction_var_new);
3398 
3399   /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
3400      length.  */
3401   if (!integer_zerop (start_len))
3402     {
3403       tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
3404       gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
3405                                                start_len);
3406       gimple_seq_add_stmt (&seq, g);
3407       reduction_var_new = lhs;
3408     }
3409 
3410   generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
3411                                         "generated strlen%s\n", mode);
3412 }
3413 
3414 /* Generate a call to strlen and place it before LOOP.  REDUCTION_VAR is
3415    replaced with a fresh SSA name representing the result of the call.  */
3416 
3417 static void
generate_strlen_builtin(loop_p loop,tree reduction_var,tree base,tree start_len,location_t loc)3418 generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
3419                                tree start_len, location_t loc)
3420 {
3421   gimple_seq seq = NULL;
3422 
3423   tree reduction_var_new = make_ssa_name (size_type_node);
3424 
3425   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3426   tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
3427   gimple *fn_call = gimple_build_call (fn, 1, mem);
3428   gimple_call_set_lhs (fn_call, reduction_var_new);
3429   gimple_set_location (fn_call, loc);
3430   gimple_seq_add_stmt (&seq, fn_call);
3431 
3432   generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3433                                    QImode, start_len);
3434 }
3435 
3436 /* Generate code in order to mimic the behaviour of strlen but this time over
3437    an array of elements with mode different than QI.  REDUCTION_VAR is replaced
3438    with a fresh SSA name representing the result, i.e., the length.  */
3439 
3440 static void
generate_strlen_builtin_using_rawmemchr(loop_p loop,tree reduction_var,tree base,tree load_type,tree start_len,location_t loc)3441 generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
3442                                                    tree base, tree load_type,
3443                                                    tree start_len, location_t loc)
3444 {
3445   gimple_seq seq = NULL;
3446 
3447   tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
3448   tree zero = build_zero_cst (load_type);
3449   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
3450   tree end = make_ssa_name (TREE_TYPE (base));
3451   gimple_call_set_lhs (fn_call, end);
3452   gimple_set_location (fn_call, loc);
3453   gimple_seq_add_stmt (&seq, fn_call);
3454 
3455   /* Determine the number of elements between START and END by
3456      evaluating (END - START) / sizeof (*START).  */
3457   tree diff = make_ssa_name (ptrdiff_type_node);
3458   gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
3459   gimple_seq_add_stmt (&seq, diff_stmt);
3460   /* Let SIZE be the size of each character.  */
3461   tree size = gimple_convert (&seq, ptrdiff_type_node,
3462                                     TYPE_SIZE_UNIT (load_type));
3463   tree count = make_ssa_name (ptrdiff_type_node);
3464   gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
3465   gimple_seq_add_stmt (&seq, count_stmt);
3466 
3467   generate_strlen_builtin_1 (loop, seq, reduction_var, count,
3468                                    TYPE_MODE (load_type),
3469                                    start_len);
3470 }
3471 
3472 /* Return true if we can count at least as many characters by taking pointer
3473    difference as we can count via reduction_var without an overflow.  Thus
3474    compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
3475    m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character.  */
3476 static bool
reduction_var_overflows_first(tree reduction_var_type,tree load_type)3477 reduction_var_overflows_first (tree reduction_var_type, tree load_type)
3478 {
3479   widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));;
3480   widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
3481   widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
3482   return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
3483 }
3484 
3485 static gimple *
determine_reduction_stmt_1(const loop_p loop,const basic_block * bbs)3486 determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
3487 {
3488   gimple *reduction_stmt = NULL;
3489 
3490   for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
3491     {
3492       basic_block bb = bbs[i];
3493 
3494       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
3495              gsi_next_nondebug (&bsi))
3496           {
3497             gphi *phi = bsi.phi ();
3498             if (virtual_operand_p (gimple_phi_result (phi)))
3499               continue;
3500             if (stmt_has_scalar_dependences_outside_loop (loop, phi))
3501               {
3502                 if (reduction_stmt)
3503                     return NULL;
3504                 reduction_stmt = phi;
3505               }
3506           }
3507 
3508       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
3509              gsi_next_nondebug (&bsi), ++ninsns)
3510           {
3511             /* Bail out early for loops which are unlikely to match.  */
3512             if (ninsns > 16)
3513               return NULL;
3514             gimple *stmt = gsi_stmt (bsi);
3515             if (gimple_clobber_p (stmt))
3516               continue;
3517             if (gimple_code (stmt) == GIMPLE_LABEL)
3518               continue;
3519             if (gimple_has_volatile_ops (stmt))
3520               return NULL;
3521             if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3522               {
3523                 if (reduction_stmt)
3524                     return NULL;
3525                 reduction_stmt = stmt;
3526               }
3527           }
3528     }
3529 
3530   return reduction_stmt;
3531 }
3532 
3533 /* If LOOP has a single non-volatile reduction statement, then return a pointer
3534    to it.  Otherwise return NULL.  */
3535 static gimple *
determine_reduction_stmt(const loop_p loop)3536 determine_reduction_stmt (const loop_p loop)
3537 {
3538   basic_block *bbs = get_loop_body (loop);
3539   gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
3540   XDELETEVEC (bbs);
3541   return reduction_stmt;
3542 }
3543 
3544 /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
3545    replace them accordingly.  For example, a loop of the form
3546 
3547      for (; *p != 42; ++p);
3548 
3549    is replaced by
3550 
3551      p = rawmemchr<MODE> (p, 42);
3552 
3553    under the assumption that rawmemchr is available for a particular MODE.
3554    Another example is
3555 
3556      int i;
3557      for (i = 42; s[i]; ++i);
3558 
3559    which is replaced by
3560 
3561      i = (int)strlen (&s[42]) + 42;
3562 
3563    for some character array S.  In case array S is not of type character array
3564    we end up with
3565 
3566      i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
3567 
3568    assuming that rawmemchr is available for a particular MODE.  */
3569 
3570 bool
transform_reduction_loop(loop_p loop)3571 loop_distribution::transform_reduction_loop (loop_p loop)
3572 {
3573   gimple *reduction_stmt;
3574   data_reference_p load_dr = NULL, store_dr = NULL;
3575 
3576   edge e = single_exit (loop);
3577   gcond *cond = safe_dyn_cast <gcond *> (last_stmt (e->src));
3578   if (!cond)
3579     return false;
3580   /* Ensure loop condition is an (in)equality test and loop is exited either if
3581      the inequality test fails or the equality test succeeds.  */
3582   if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
3583       && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
3584     return false;
3585   /* A limitation of the current implementation is that we only support
3586      constant patterns in (in)equality tests.  */
3587   tree pattern = gimple_cond_rhs (cond);
3588   if (TREE_CODE (pattern) != INTEGER_CST)
3589     return false;
3590 
3591   reduction_stmt = determine_reduction_stmt (loop);
3592 
3593   /* A limitation of the current implementation is that we require a reduction
3594      statement.  Therefore, loops without a reduction statement as in the
3595      following are not recognized:
3596      int *p;
3597      void foo (void) { for (; *p; ++p); } */
3598   if (reduction_stmt == NULL)
3599     return false;
3600 
3601   /* Reduction variables are guaranteed to be SSA names.  */
3602   tree reduction_var;
3603   switch (gimple_code (reduction_stmt))
3604     {
3605     case GIMPLE_ASSIGN:
3606     case GIMPLE_PHI:
3607       reduction_var = gimple_get_lhs (reduction_stmt);
3608       break;
3609     default:
3610       /* Bail out e.g. for GIMPLE_CALL.  */
3611       return false;
3612     }
3613 
3614   struct graph *rdg = build_rdg (loop, NULL);
3615   if (rdg == NULL)
3616     {
3617       if (dump_file && (dump_flags & TDF_DETAILS))
3618           fprintf (dump_file,
3619                      "Loop %d not transformed: failed to build the RDG.\n",
3620                      loop->num);
3621 
3622       return false;
3623     }
3624   auto_bitmap partition_stmts;
3625   bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
3626   find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
3627   free_rdg (rdg);
3628 
3629   /* Bail out if there is no single load.  */
3630   if (load_dr == NULL)
3631     return false;
3632 
3633   /* Reaching this point we have a loop with a single reduction variable,
3634      a single load, and an optional single store.  */
3635 
3636   tree load_ref = DR_REF (load_dr);
3637   tree load_type = TREE_TYPE (load_ref);
3638   tree load_access_base = build_fold_addr_expr (load_ref);
3639   tree load_access_size = TYPE_SIZE_UNIT (load_type);
3640   affine_iv load_iv, reduction_iv;
3641 
3642   if (!INTEGRAL_TYPE_P (load_type)
3643       || !type_has_mode_precision_p (load_type))
3644     return false;
3645 
3646   /* We already ensured that the loop condition tests for (in)equality where the
3647      rhs is a constant pattern. Now ensure that the lhs is the result of the
3648      load.  */
3649   if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
3650     return false;
3651 
3652   /* Bail out if no affine induction variable with constant step can be
3653      determined.  */
3654   if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
3655     return false;
3656 
3657   /* Bail out if memory accesses are not consecutive or not growing.  */
3658   if (!operand_equal_p (load_iv.step, load_access_size, 0))
3659     return false;
3660 
3661   if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
3662     return false;
3663 
3664   /* Handle rawmemchr like loops.  */
3665   if (operand_equal_p (load_iv.base, reduction_iv.base)
3666       && operand_equal_p (load_iv.step, reduction_iv.step))
3667     {
3668       if (store_dr)
3669           {
3670             /* Ensure that we store to X and load from X+I where I>0.  */
3671             if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
3672                 || !integer_onep (TREE_OPERAND (load_iv.base, 1)))
3673               return false;
3674             tree ptr_base = TREE_OPERAND (load_iv.base, 0);
3675             if (TREE_CODE (ptr_base) != SSA_NAME)
3676               return false;
3677             gimple *def = SSA_NAME_DEF_STMT (ptr_base);
3678             if (!gimple_assign_single_p (def)
3679                 || gimple_assign_rhs1 (def) != DR_REF (store_dr))
3680               return false;
3681             /* Ensure that the reduction value is stored.  */
3682             if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
3683               return false;
3684           }
3685       /* Bail out if target does not provide rawmemchr for a certain mode.  */
3686       machine_mode mode = TYPE_MODE (load_type);
3687       if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
3688           return false;
3689       location_t loc = gimple_location (DR_STMT (load_dr));
3690       generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
3691                                           pattern, loc);
3692       return true;
3693     }
3694 
3695   /* Handle strlen like loops.  */
3696   if (store_dr == NULL
3697       && integer_zerop (pattern)
3698       && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var))
3699       && TREE_CODE (reduction_iv.base) == INTEGER_CST
3700       && TREE_CODE (reduction_iv.step) == INTEGER_CST
3701       && integer_onep (reduction_iv.step))
3702     {
3703       location_t loc = gimple_location (DR_STMT (load_dr));
3704       tree reduction_var_type = TREE_TYPE (reduction_var);
3705       /* While determining the length of a string an overflow might occur.
3706            If an overflow only occurs in the loop implementation and not in the
3707            strlen implementation, then either the overflow is undefined or the
3708            truncated result of strlen equals the one of the loop.  Otherwise if
3709            an overflow may also occur in the strlen implementation, then
3710            replacing a loop by a call to strlen is sound whenever we ensure that
3711            if an overflow occurs in the strlen implementation, then also an
3712            overflow occurs in the loop implementation which is undefined.  It
3713            seems reasonable to relax this and assume that the strlen
3714            implementation cannot overflow in case sizetype is big enough in the
3715            sense that an overflow can only happen for string objects which are
3716            bigger than half of the address space; at least for 32-bit targets and
3717            up.
3718 
3719            For strlen which makes use of rawmemchr the maximal length of a string
3720            which can be determined without an overflow is PTRDIFF_MAX / S where
3721            each character has size S.  Since an overflow for ptrdiff type is
3722            undefined we have to make sure that if an overflow occurs, then an
3723            overflow occurs in the loop implementation, too, and this is
3724            undefined, too.  Similar as before we relax this and assume that no
3725            string object is larger than half of the address space; at least for
3726            32-bit targets and up.  */
3727       if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
3728             && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
3729             && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
3730                  && TYPE_PRECISION (ptr_type_node) >= 32)
3731                 || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
3732                       && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype)))
3733             && builtin_decl_implicit (BUILT_IN_STRLEN))
3734           generate_strlen_builtin (loop, reduction_var, load_iv.base,
3735                                          reduction_iv.base, loc);
3736       else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
3737                  != CODE_FOR_nothing
3738                  && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node)
3739                         && TYPE_PRECISION (ptrdiff_type_node) >= 32)
3740                        || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
3741                            && reduction_var_overflows_first (reduction_var_type, load_type))))
3742           generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
3743                                                              load_iv.base,
3744                                                              load_type,
3745                                                              reduction_iv.base, loc);
3746       else
3747           return false;
3748       return true;
3749     }
3750 
3751   return false;
3752 }
3753 
3754 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3755    perfect loop nest.  */
3756 
3757 static class loop *
prepare_perfect_loop_nest(class loop * loop)3758 prepare_perfect_loop_nest (class loop *loop)
3759 {
3760   class loop *outer = loop_outer (loop);
3761   tree niters = number_of_latch_executions (loop);
3762 
3763   /* TODO: We only support the innermost 3-level loop nest distribution
3764      because of compilation time issue for now.  This should be relaxed
3765      in the future.  Note we only allow 3-level loop nest distribution
3766      when parallelizing loops.  */
3767   while ((loop->inner == NULL
3768             || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3769            && loop_outer (outer)
3770            && outer->inner == loop && loop->next == NULL
3771            && single_exit (outer)
3772            && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3773            && (niters = number_of_latch_executions (outer)) != NULL_TREE
3774            && niters != chrec_dont_know)
3775     {
3776       loop = outer;
3777       outer = loop_outer (loop);
3778     }
3779 
3780   return loop;
3781 }
3782 
3783 
3784 unsigned int
execute(function * fun)3785 loop_distribution::execute (function *fun)
3786 {
3787   bool changed = false;
3788   basic_block bb;
3789   control_dependences *cd = NULL;
3790   auto_vec<loop_p> loops_to_be_destroyed;
3791 
3792   if (number_of_loops (fun) <= 1)
3793     return 0;
3794 
3795   bb_top_order_init ();
3796 
3797   FOR_ALL_BB_FN (bb, fun)
3798     {
3799       gimple_stmt_iterator gsi;
3800       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3801           gimple_set_uid (gsi_stmt (gsi), -1);
3802       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3803           gimple_set_uid (gsi_stmt (gsi), -1);
3804     }
3805 
3806   /* We can at the moment only distribute non-nested loops, thus restrict
3807      walking to innermost loops.  */
3808   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
3809     {
3810       /* Don't distribute multiple exit edges loop, or cold loop when
3811          not doing pattern detection.  */
3812       if (!single_exit (loop)
3813             || (!flag_tree_loop_distribute_patterns
3814                 && !optimize_loop_for_speed_p (loop)))
3815           continue;
3816 
3817       /* If niters is unknown don't distribute loop but rather try to transform
3818            it to a call to a builtin.  */
3819       tree niters = number_of_latch_executions (loop);
3820       if (niters == NULL_TREE || niters == chrec_dont_know)
3821           {
3822             datarefs_vec.create (20);
3823             if (flag_tree_loop_distribute_patterns
3824                 && transform_reduction_loop (loop))
3825               {
3826                 changed = true;
3827                 loops_to_be_destroyed.safe_push (loop);
3828                 if (dump_enabled_p ())
3829                     {
3830                       dump_user_location_t loc = find_loop_location (loop);
3831                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3832                                            loc, "Loop %d transformed into a builtin.\n",
3833                                            loop->num);
3834                     }
3835               }
3836             free_data_refs (datarefs_vec);
3837             continue;
3838           }
3839 
3840       /* Get the perfect loop nest for distribution.  */
3841       loop = prepare_perfect_loop_nest (loop);
3842       for (; loop; loop = loop->inner)
3843           {
3844             auto_vec<gimple *> work_list;
3845             if (!find_seed_stmts_for_distribution (loop, &work_list))
3846               break;
3847 
3848             const char *str = loop->inner ? " nest" : "";
3849             dump_user_location_t loc = find_loop_location (loop);
3850             if (!cd)
3851               {
3852                 calculate_dominance_info (CDI_DOMINATORS);
3853                 calculate_dominance_info (CDI_POST_DOMINATORS);
3854                 cd = new control_dependences ();
3855                 free_dominance_info (CDI_POST_DOMINATORS);
3856               }
3857 
3858             bool destroy_p;
3859             int nb_generated_loops, nb_generated_calls;
3860             nb_generated_loops
3861               = distribute_loop (loop, work_list, cd, &nb_generated_calls,
3862                                      &destroy_p, (!optimize_loop_for_speed_p (loop)
3863                                                       || !flag_tree_loop_distribution));
3864             if (destroy_p)
3865               loops_to_be_destroyed.safe_push (loop);
3866 
3867             if (nb_generated_loops + nb_generated_calls > 0)
3868               {
3869                 changed = true;
3870                 if (dump_enabled_p ())
3871                     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3872                                          loc, "Loop%s %d distributed: split to %d loops "
3873                                          "and %d library calls.\n", str, loop->num,
3874                                          nb_generated_loops, nb_generated_calls);
3875 
3876                 break;
3877               }
3878 
3879             if (dump_file && (dump_flags & TDF_DETAILS))
3880               fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3881           }
3882     }
3883 
3884   if (cd)
3885     delete cd;
3886 
3887   if (bb_top_order_index != NULL)
3888     bb_top_order_destroy ();
3889 
3890   if (changed)
3891     {
3892       /* Destroy loop bodies that could not be reused.  Do this late as we
3893            otherwise can end up refering to stale data in control dependences.  */
3894       unsigned i;
3895       class loop *loop;
3896       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3897           destroy_loop (loop);
3898 
3899       /* Cached scalar evolutions now may refer to wrong or non-existing
3900            loops.  */
3901       scev_reset ();
3902       mark_virtual_operands_for_renaming (fun);
3903       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3904     }
3905 
3906   checking_verify_loop_structure ();
3907 
3908   return changed ? TODO_cleanup_cfg : 0;
3909 }
3910 
3911 
3912 /* Distribute all loops in the current function.  */
3913 
3914 namespace {
3915 
3916 const pass_data pass_data_loop_distribution =
3917 {
3918   GIMPLE_PASS, /* type */
3919   "ldist", /* name */
3920   OPTGROUP_LOOP, /* optinfo_flags */
3921   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
3922   ( PROP_cfg | PROP_ssa ), /* properties_required */
3923   0, /* properties_provided */
3924   0, /* properties_destroyed */
3925   0, /* todo_flags_start */
3926   0, /* todo_flags_finish */
3927 };
3928 
3929 class pass_loop_distribution : public gimple_opt_pass
3930 {
3931 public:
pass_loop_distribution(gcc::context * ctxt)3932   pass_loop_distribution (gcc::context *ctxt)
3933     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
3934   {}
3935 
3936   /* opt_pass methods: */
gate(function *)3937   virtual bool gate (function *)
3938     {
3939       return flag_tree_loop_distribution
3940           || flag_tree_loop_distribute_patterns;
3941     }
3942 
3943   virtual unsigned int execute (function *);
3944 
3945 }; // class pass_loop_distribution
3946 
3947 unsigned int
execute(function * fun)3948 pass_loop_distribution::execute (function *fun)
3949 {
3950   return loop_distribution ().execute (fun);
3951 }
3952 
3953 } // anon namespace
3954 
3955 gimple_opt_pass *
make_pass_loop_distribution(gcc::context * ctxt)3956 make_pass_loop_distribution (gcc::context *ctxt)
3957 {
3958   return new pass_loop_distribution (ctxt);
3959 }
3960