1 /* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9 
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3.  If not see
17 <http://www.gnu.org/licenses/>.  */
18 
19 #define IN_TARGET_CODE 1
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-builtins.h"
93 #include "i386-features.h"
94 
95 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
96   "savms64",
97   "resms64",
98   "resms64x",
99   "savms64f",
100   "resms64f",
101   "resms64fx"
102 };
103 
104 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
105 /* The below offset values are where each register is stored for the layout
106    relative to incoming stack pointer.  The value of each m_regs[].offset will
107    be relative to the incoming base pointer (rax or rsi) used by the stub.
108 
109     s_instances:   0                    1                   2                   3
110     Offset:                                                 realigned or        aligned + 8
111     Register           aligned          aligned + 8         aligned w/HFP       w/HFP     */
112     XMM15_REG,      /* 0x10             0x18                0x10                0x18      */
113     XMM14_REG,      /* 0x20             0x28                0x20                0x28      */
114     XMM13_REG,      /* 0x30             0x38                0x30                0x38      */
115     XMM12_REG,      /* 0x40             0x48                0x40                0x48      */
116     XMM11_REG,      /* 0x50             0x58                0x50                0x58      */
117     XMM10_REG,      /* 0x60             0x68                0x60                0x68      */
118     XMM9_REG,       /* 0x70             0x78                0x70                0x78      */
119     XMM8_REG,       /* 0x80             0x88                0x80                0x88      */
120     XMM7_REG,       /* 0x90             0x98                0x90                0x98      */
121     XMM6_REG,       /* 0xa0             0xa8                0xa0                0xa8      */
122     SI_REG,         /* 0xa8             0xb0                0xa8                0xb0      */
123     DI_REG,         /* 0xb0             0xb8                0xb0                0xb8      */
124     BX_REG,         /* 0xb8             0xc0                0xb8                0xc0      */
125     BP_REG,         /* 0xc0             0xc8                N/A                 N/A       */
126     R12_REG,        /* 0xc8             0xd0                0xc0                0xc8      */
127     R13_REG,        /* 0xd0             0xd8                0xc8                0xd0      */
128     R14_REG,        /* 0xd8             0xe0                0xd0                0xd8      */
129     R15_REG,        /* 0xe0             0xe8                0xd8                0xe0      */
130 };
131 
132 /* Instantiate static const values.  */
133 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
134 const unsigned xlogue_layout::MIN_REGS;
135 const unsigned xlogue_layout::MAX_REGS;
136 const unsigned xlogue_layout::MAX_EXTRA_REGS;
137 const unsigned xlogue_layout::VARIANT_COUNT;
138 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
139 
140 /* Initialize xlogue_layout::s_stub_names to zero.  */
141 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
142                                         [STUB_NAME_MAX_LEN];
143 
144 /* Instantiates all xlogue_layout instances.  */
145 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
146   xlogue_layout (0, false),
147   xlogue_layout (8, false),
148   xlogue_layout (0, true),
149   xlogue_layout (8, true)
150 };
151 
152 /* Return an appropriate const instance of xlogue_layout based upon values
153    in cfun->machine and crtl.  */
154 const class xlogue_layout &
get_instance()155 xlogue_layout::get_instance ()
156 {
157   enum xlogue_stub_sets stub_set;
158   bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
159 
160   if (stack_realign_fp)
161     stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
162   else if (frame_pointer_needed)
163     stub_set = aligned_plus_8
164                 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
165                 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
166   else
167     stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
168 
169   return s_instances[stub_set];
170 }
171 
172 /* Determine how many clobbered registers can be saved by the stub.
173    Returns the count of registers the stub will save and restore.  */
174 unsigned
count_stub_managed_regs()175 xlogue_layout::count_stub_managed_regs ()
176 {
177   bool hfp = frame_pointer_needed || stack_realign_fp;
178   unsigned i, count;
179   unsigned regno;
180 
181   for (count = i = MIN_REGS; i < MAX_REGS; ++i)
182     {
183       regno = REG_ORDER[i];
184       if (regno == BP_REG && hfp)
185           continue;
186       if (!ix86_save_reg (regno, false, false))
187           break;
188       ++count;
189     }
190   return count;
191 }
192 
193 /* Determine if register REGNO is a stub managed register given the
194    total COUNT of stub managed registers.  */
195 bool
is_stub_managed_reg(unsigned regno,unsigned count)196 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
197 {
198   bool hfp = frame_pointer_needed || stack_realign_fp;
199   unsigned i;
200 
201   for (i = 0; i < count; ++i)
202     {
203       gcc_assert (i < MAX_REGS);
204       if (REG_ORDER[i] == BP_REG && hfp)
205           ++count;
206       else if (REG_ORDER[i] == regno)
207           return true;
208     }
209   return false;
210 }
211 
212 /* Constructor for xlogue_layout.  */
xlogue_layout(HOST_WIDE_INT stack_align_off_in,bool hfp)213 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
214   : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
215     m_stack_align_off_in (stack_align_off_in)
216 {
217   HOST_WIDE_INT offset = stack_align_off_in;
218   unsigned i, j;
219 
220   for (i = j = 0; i < MAX_REGS; ++i)
221     {
222       unsigned regno = REG_ORDER[i];
223 
224       if (regno == BP_REG && hfp)
225           continue;
226       if (SSE_REGNO_P (regno))
227           {
228             offset += 16;
229             /* Verify that SSE regs are always aligned.  */
230             gcc_assert (!((stack_align_off_in + offset) & 15));
231           }
232       else
233           offset += 8;
234 
235       m_regs[j].regno    = regno;
236       m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
237     }
238   gcc_assert (j == m_nregs);
239 }
240 
241 const char *
get_stub_name(enum xlogue_stub stub,unsigned n_extra_regs)242 xlogue_layout::get_stub_name (enum xlogue_stub stub,
243                                     unsigned n_extra_regs)
244 {
245   const int have_avx = TARGET_AVX;
246   char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
247 
248   /* Lazy init */
249   if (!*name)
250     {
251       int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
252                                 (have_avx ? "avx" : "sse"),
253                                 STUB_BASE_NAMES[stub],
254                                 MIN_REGS + n_extra_regs);
255       gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
256     }
257 
258   return name;
259 }
260 
261 /* Return rtx of a symbol ref for the entry point (based upon
262    cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
263 rtx
get_stub_rtx(enum xlogue_stub stub)264 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
265 {
266   const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
267   gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
268   gcc_assert (stub < XLOGUE_STUB_COUNT);
269   gcc_assert (crtl->stack_realign_finalized);
270 
271   return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
272 }
273 
274 unsigned scalar_chain::max_id = 0;
275 
276 namespace {
277 
278 /* Initialize new chain.  */
279 
scalar_chain(enum machine_mode smode_,enum machine_mode vmode_)280 scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
281 {
282   smode = smode_;
283   vmode = vmode_;
284 
285   chain_id = ++max_id;
286 
287    if (dump_file)
288     fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
289 
290   bitmap_obstack_initialize (NULL);
291   insns = BITMAP_ALLOC (NULL);
292   defs = BITMAP_ALLOC (NULL);
293   defs_conv = BITMAP_ALLOC (NULL);
294   queue = NULL;
295 }
296 
297 /* Free chain's data.  */
298 
~scalar_chain()299 scalar_chain::~scalar_chain ()
300 {
301   BITMAP_FREE (insns);
302   BITMAP_FREE (defs);
303   BITMAP_FREE (defs_conv);
304   bitmap_obstack_release (NULL);
305 }
306 
307 /* Add instruction into chains' queue.  */
308 
309 void
add_to_queue(unsigned insn_uid)310 scalar_chain::add_to_queue (unsigned insn_uid)
311 {
312   if (bitmap_bit_p (insns, insn_uid)
313       || bitmap_bit_p (queue, insn_uid))
314     return;
315 
316   if (dump_file)
317     fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
318                insn_uid, chain_id);
319   bitmap_set_bit (queue, insn_uid);
320 }
321 
general_scalar_chain(enum machine_mode smode_,enum machine_mode vmode_)322 general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
323                                                       enum machine_mode vmode_)
324      : scalar_chain (smode_, vmode_)
325 {
326   insns_conv = BITMAP_ALLOC (NULL);
327   n_sse_to_integer = 0;
328   n_integer_to_sse = 0;
329 }
330 
~general_scalar_chain()331 general_scalar_chain::~general_scalar_chain ()
332 {
333   BITMAP_FREE (insns_conv);
334 }
335 
336 /* For DImode conversion, mark register defined by DEF as requiring
337    conversion.  */
338 
339 void
mark_dual_mode_def(df_ref def)340 general_scalar_chain::mark_dual_mode_def (df_ref def)
341 {
342   gcc_assert (DF_REF_REG_DEF_P (def));
343 
344   /* Record the def/insn pair so we can later efficiently iterate over
345      the defs to convert on insns not in the chain.  */
346   bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
347   if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
348     {
349       if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
350             && !reg_new)
351           return;
352       n_integer_to_sse++;
353     }
354   else
355     {
356       if (!reg_new)
357           return;
358       n_sse_to_integer++;
359     }
360 
361   if (dump_file)
362     fprintf (dump_file,
363                "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
364                DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
365 }
366 
367 /* For TImode conversion, it is unused.  */
368 
369 void
mark_dual_mode_def(df_ref)370 timode_scalar_chain::mark_dual_mode_def (df_ref)
371 {
372   gcc_unreachable ();
373 }
374 
375 /* Check REF's chain to add new insns into a queue
376    and find registers requiring conversion.  */
377 
378 void
analyze_register_chain(bitmap candidates,df_ref ref)379 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
380 {
381   df_link *chain;
382 
383   gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
384                 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
385   add_to_queue (DF_REF_INSN_UID (ref));
386 
387   for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
388     {
389       unsigned uid = DF_REF_INSN_UID (chain->ref);
390 
391       if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
392           continue;
393 
394       if (!DF_REF_REG_MEM_P (chain->ref))
395           {
396             if (bitmap_bit_p (insns, uid))
397               continue;
398 
399             if (bitmap_bit_p (candidates, uid))
400               {
401                 add_to_queue (uid);
402                 continue;
403               }
404           }
405 
406       if (DF_REF_REG_DEF_P (chain->ref))
407           {
408             if (dump_file)
409               fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
410                          DF_REF_REGNO (chain->ref), uid);
411             mark_dual_mode_def (chain->ref);
412           }
413       else
414           {
415             if (dump_file)
416               fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
417                          DF_REF_REGNO (chain->ref), uid);
418             mark_dual_mode_def (ref);
419           }
420     }
421 }
422 
423 /* Add instruction into a chain.  */
424 
425 void
add_insn(bitmap candidates,unsigned int insn_uid)426 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
427 {
428   if (bitmap_bit_p (insns, insn_uid))
429     return;
430 
431   if (dump_file)
432     fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);
433 
434   bitmap_set_bit (insns, insn_uid);
435 
436   rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
437   rtx def_set = single_set (insn);
438   if (def_set && REG_P (SET_DEST (def_set))
439       && !HARD_REGISTER_P (SET_DEST (def_set)))
440     bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
441 
442   /* ???  The following is quadratic since analyze_register_chain
443      iterates over all refs to look for dual-mode regs.  Instead this
444      should be done separately for all regs mentioned in the chain once.  */
445   df_ref ref;
446   for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
447     if (!HARD_REGISTER_P (DF_REF_REG (ref)))
448       analyze_register_chain (candidates, ref);
449   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
450     if (!DF_REF_REG_MEM_P (ref))
451       analyze_register_chain (candidates, ref);
452 }
453 
454 /* Build new chain starting from insn INSN_UID recursively
455    adding all dependent uses and definitions.  */
456 
457 void
build(bitmap candidates,unsigned insn_uid)458 scalar_chain::build (bitmap candidates, unsigned insn_uid)
459 {
460   queue = BITMAP_ALLOC (NULL);
461   bitmap_set_bit (queue, insn_uid);
462 
463   if (dump_file)
464     fprintf (dump_file, "Building chain #%d...\n", chain_id);
465 
466   while (!bitmap_empty_p (queue))
467     {
468       insn_uid = bitmap_first_set_bit (queue);
469       bitmap_clear_bit (queue, insn_uid);
470       bitmap_clear_bit (candidates, insn_uid);
471       add_insn (candidates, insn_uid);
472     }
473 
474   if (dump_file)
475     {
476       fprintf (dump_file, "Collected chain #%d...\n", chain_id);
477       fprintf (dump_file, "  insns: ");
478       dump_bitmap (dump_file, insns);
479       if (!bitmap_empty_p (defs_conv))
480           {
481             bitmap_iterator bi;
482             unsigned id;
483             const char *comma = "";
484             fprintf (dump_file, "  defs to convert: ");
485             EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
486               {
487                 fprintf (dump_file, "%sr%d", comma, id);
488                 comma = ", ";
489               }
490             fprintf (dump_file, "\n");
491           }
492     }
493 
494   BITMAP_FREE (queue);
495 }
496 
497 /* Return a cost of building a vector costant
498    instead of using a scalar one.  */
499 
500 int
vector_const_cost(rtx exp)501 general_scalar_chain::vector_const_cost (rtx exp)
502 {
503   gcc_assert (CONST_INT_P (exp));
504 
505   if (standard_sse_constant_p (exp, vmode))
506     return ix86_cost->sse_op;
507   /* We have separate costs for SImode and DImode, use SImode costs
508      for smaller modes.  */
509   return ix86_cost->sse_load[smode == DImode ? 1 : 0];
510 }
511 
512 /* Compute a gain for chain conversion.  */
513 
514 int
compute_convert_gain()515 general_scalar_chain::compute_convert_gain ()
516 {
517   bitmap_iterator bi;
518   unsigned insn_uid;
519   int gain = 0;
520   int cost = 0;
521 
522   if (dump_file)
523     fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
524 
525   /* SSE costs distinguish between SImode and DImode loads/stores, for
526      int costs factor in the number of GPRs involved.  When supporting
527      smaller modes than SImode the int load/store costs need to be
528      adjusted as well.  */
529   unsigned sse_cost_idx = smode == DImode ? 1 : 0;
530   unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
531 
532   EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
533     {
534       rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
535       rtx def_set = single_set (insn);
536       rtx src = SET_SRC (def_set);
537       rtx dst = SET_DEST (def_set);
538       int igain = 0;
539 
540       if (REG_P (src) && REG_P (dst))
541           igain += 2 * m - ix86_cost->xmm_move;
542       else if (REG_P (src) && MEM_P (dst))
543           igain
544             += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
545       else if (MEM_P (src) && REG_P (dst))
546           igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
547       else
548           switch (GET_CODE (src))
549             {
550             case ASHIFT:
551             case ASHIFTRT:
552             case LSHIFTRT:
553               if (m == 2)
554                 {
555                     if (INTVAL (XEXP (src, 1)) >= 32)
556                       igain += ix86_cost->add;
557                     else
558                       igain += ix86_cost->shift_const;
559                 }
560 
561               igain += ix86_cost->shift_const - ix86_cost->sse_op;
562 
563               if (CONST_INT_P (XEXP (src, 0)))
564                 igain -= vector_const_cost (XEXP (src, 0));
565               break;
566 
567             case AND:
568             case IOR:
569             case XOR:
570             case PLUS:
571             case MINUS:
572               igain += m * ix86_cost->add - ix86_cost->sse_op;
573               /* Additional gain for andnot for targets without BMI.  */
574               if (GET_CODE (XEXP (src, 0)) == NOT
575                     && !TARGET_BMI)
576                 igain += m * ix86_cost->add;
577 
578               if (CONST_INT_P (XEXP (src, 0)))
579                 igain -= vector_const_cost (XEXP (src, 0));
580               if (CONST_INT_P (XEXP (src, 1)))
581                 igain -= vector_const_cost (XEXP (src, 1));
582               break;
583 
584             case NEG:
585             case NOT:
586               igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
587 
588               if (GET_CODE (XEXP (src, 0)) != ABS)
589                 {
590                     igain += m * ix86_cost->add;
591                     break;
592                 }
593               /* FALLTHRU */
594 
595             case ABS:
596             case SMAX:
597             case SMIN:
598             case UMAX:
599             case UMIN:
600               /* We do not have any conditional move cost, estimate it as a
601                  reg-reg move.  Comparisons are costed as adds.  */
602               igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
603               /* Integer SSE ops are all costed the same.  */
604               igain -= ix86_cost->sse_op;
605               break;
606 
607             case COMPARE:
608               /* Assume comparison cost is the same.  */
609               break;
610 
611             case CONST_INT:
612               if (REG_P (dst))
613                 {
614                     if (optimize_insn_for_size_p ())
615                       {
616                         /* xor (2 bytes) vs. xorps (3 bytes).  */
617                         if (src == const0_rtx)
618                           igain -= COSTS_N_BYTES (1);
619                         /* movdi_internal vs. movv2di_internal.  */
620                         /* => mov (5 bytes) vs. movaps (7 bytes).  */
621                         else if (x86_64_immediate_operand (src, SImode))
622                           igain -= COSTS_N_BYTES (2);
623                         else
624                           /* ??? Larger immediate constants are placed in the
625                                constant pool, where the size benefit/impact of
626                                STV conversion is affected by whether and how
627                                often each constant pool entry is shared/reused.
628                                The value below is empirically derived from the
629                                CSiBE benchmark (and the optimal value may drift
630                                over time).  */
631                           igain += COSTS_N_BYTES (0);
632                       }
633                     else
634                       {
635                         /* DImode can be immediate for TARGET_64BIT
636                            and SImode always.  */
637                         igain += m * COSTS_N_INSNS (1);
638                         igain -= vector_const_cost (src);
639                       }
640                 }
641               else if (MEM_P (dst))
642                 {
643                     igain += (m * ix86_cost->int_store[2]
644                                 - ix86_cost->sse_store[sse_cost_idx]);
645                     igain -= vector_const_cost (src);
646                 }
647               break;
648 
649             default:
650               gcc_unreachable ();
651             }
652 
653       if (igain != 0 && dump_file)
654           {
655             fprintf (dump_file, "  Instruction gain %d for ", igain);
656             dump_insn_slim (dump_file, insn);
657           }
658       gain += igain;
659     }
660 
661   if (dump_file)
662     fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
663 
664   /* Cost the integer to sse and sse to integer moves.  */
665   cost += n_sse_to_integer * ix86_cost->sse_to_integer;
666   /* ???  integer_to_sse but we only have that in the RA cost table.
667      Assume sse_to_integer/integer_to_sse are the same which they
668      are at the moment.  */
669   cost += n_integer_to_sse * ix86_cost->sse_to_integer;
670 
671   if (dump_file)
672     fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
673 
674   gain -= cost;
675 
676   if (dump_file)
677     fprintf (dump_file, "  Total gain: %d\n", gain);
678 
679   return gain;
680 }
681 
682 /* Insert generated conversion instruction sequence INSNS
683    after instruction AFTER.  New BB may be required in case
684    instruction has EH region attached.  */
685 
686 void
emit_conversion_insns(rtx insns,rtx_insn * after)687 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
688 {
689   if (!control_flow_insn_p (after))
690     {
691       emit_insn_after (insns, after);
692       return;
693     }
694 
695   basic_block bb = BLOCK_FOR_INSN (after);
696   edge e = find_fallthru_edge (bb->succs);
697   gcc_assert (e);
698 
699   basic_block new_bb = split_edge (e);
700   emit_insn_after (insns, BB_HEAD (new_bb));
701 }
702 
703 } // anon namespace
704 
705 /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
706    zeroing the upper parts.  */
707 
708 static rtx
gen_gpr_to_xmm_move_src(enum machine_mode vmode,rtx gpr)709 gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
710 {
711   switch (GET_MODE_NUNITS (vmode))
712     {
713     case 1:
714       /* We are not using this case currently.  */
715       gcc_unreachable ();
716     case 2:
717       return gen_rtx_VEC_CONCAT (vmode, gpr,
718                                          CONST0_RTX (GET_MODE_INNER (vmode)));
719     default:
720       return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
721                                         CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
722     }
723 }
724 
725 /* Make vector copies for all register REGNO definitions
726    and replace its uses in a chain.  */
727 
728 void
make_vector_copies(rtx_insn * insn,rtx reg)729 general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
730 {
731   rtx vreg = *defs_map.get (reg);
732 
733   start_sequence ();
734   if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
735     {
736       rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
737       if (smode == DImode && !TARGET_64BIT)
738           {
739             emit_move_insn (adjust_address (tmp, SImode, 0),
740                                 gen_rtx_SUBREG (SImode, reg, 0));
741             emit_move_insn (adjust_address (tmp, SImode, 4),
742                                 gen_rtx_SUBREG (SImode, reg, 4));
743           }
744       else
745           emit_move_insn (copy_rtx (tmp), reg);
746       emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
747                                     gen_gpr_to_xmm_move_src (vmode, tmp)));
748     }
749   else if (!TARGET_64BIT && smode == DImode)
750     {
751       if (TARGET_SSE4_1)
752           {
753             emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
754                                               CONST0_RTX (V4SImode),
755                                               gen_rtx_SUBREG (SImode, reg, 0)));
756             emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
757                                                   gen_rtx_SUBREG (V4SImode, vreg, 0),
758                                                   gen_rtx_SUBREG (SImode, reg, 4),
759                                                   GEN_INT (2)));
760           }
761       else
762           {
763             rtx tmp = gen_reg_rtx (DImode);
764             emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
765                                               CONST0_RTX (V4SImode),
766                                               gen_rtx_SUBREG (SImode, reg, 0)));
767             emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
768                                               CONST0_RTX (V4SImode),
769                                               gen_rtx_SUBREG (SImode, reg, 4)));
770             emit_insn (gen_vec_interleave_lowv4si
771                          (gen_rtx_SUBREG (V4SImode, vreg, 0),
772                           gen_rtx_SUBREG (V4SImode, vreg, 0),
773                           gen_rtx_SUBREG (V4SImode, tmp, 0)));
774           }
775     }
776   else
777     emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
778                                   gen_gpr_to_xmm_move_src (vmode, reg)));
779   rtx_insn *seq = get_insns ();
780   end_sequence ();
781   emit_conversion_insns (seq, insn);
782 
783   if (dump_file)
784     fprintf (dump_file,
785                "  Copied r%d to a vector register r%d for insn %d\n",
786                REGNO (reg), REGNO (vreg), INSN_UID (insn));
787 }
788 
789 /* Copy the definition SRC of INSN inside the chain to DST for
790    scalar uses outside of the chain.  */
791 
792 void
convert_reg(rtx_insn * insn,rtx dst,rtx src)793 general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
794 {
795   start_sequence ();
796   if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
797     {
798       rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
799       emit_move_insn (tmp, src);
800       if (!TARGET_64BIT && smode == DImode)
801           {
802             emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
803                                 adjust_address (tmp, SImode, 0));
804             emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
805                                 adjust_address (tmp, SImode, 4));
806           }
807       else
808           emit_move_insn (dst, copy_rtx (tmp));
809     }
810   else if (!TARGET_64BIT && smode == DImode)
811     {
812       if (TARGET_SSE4_1)
813           {
814             rtx tmp = gen_rtx_PARALLEL (VOIDmode,
815                                               gen_rtvec (1, const0_rtx));
816             emit_insn
817                 (gen_rtx_SET
818                  (gen_rtx_SUBREG (SImode, dst, 0),
819                     gen_rtx_VEC_SELECT (SImode,
820                                             gen_rtx_SUBREG (V4SImode, src, 0),
821                                             tmp)));
822 
823             tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
824             emit_insn
825                 (gen_rtx_SET
826                  (gen_rtx_SUBREG (SImode, dst, 4),
827                     gen_rtx_VEC_SELECT (SImode,
828                                             gen_rtx_SUBREG (V4SImode, src, 0),
829                                             tmp)));
830           }
831       else
832           {
833             rtx vcopy = gen_reg_rtx (V2DImode);
834             emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
835             emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
836                                 gen_rtx_SUBREG (SImode, vcopy, 0));
837             emit_move_insn (vcopy,
838                                 gen_rtx_LSHIFTRT (V2DImode,
839                                                       vcopy, GEN_INT (32)));
840             emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
841                                 gen_rtx_SUBREG (SImode, vcopy, 0));
842           }
843     }
844   else
845     emit_move_insn (dst, src);
846 
847   rtx_insn *seq = get_insns ();
848   end_sequence ();
849   emit_conversion_insns (seq, insn);
850 
851   if (dump_file)
852     fprintf (dump_file,
853                "  Copied r%d to a scalar register r%d for insn %d\n",
854                REGNO (src), REGNO (dst), INSN_UID (insn));
855 }
856 
857 /* Convert operand OP in INSN.  We should handle
858    memory operands and uninitialized registers.
859    All other register uses are converted during
860    registers conversion.  */
861 
862 void
convert_op(rtx * op,rtx_insn * insn)863 general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
864 {
865   *op = copy_rtx_if_shared (*op);
866 
867   if (GET_CODE (*op) == NOT)
868     {
869       convert_op (&XEXP (*op, 0), insn);
870       PUT_MODE (*op, vmode);
871     }
872   else if (MEM_P (*op))
873     {
874       rtx_insn* eh_insn, *movabs = NULL;
875       rtx tmp = gen_reg_rtx (GET_MODE (*op));
876 
877       /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
878       if (!memory_operand (*op, GET_MODE (*op)))
879           {
880             rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
881             movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
882 
883             *op = tmp2;
884           }
885 
886       eh_insn
887           = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
888                                                    gen_gpr_to_xmm_move_src (vmode, *op)),
889                                   insn);
890 
891       if (cfun->can_throw_non_call_exceptions)
892           {
893             /* Handle REG_EH_REGION note.  */
894             rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
895             if (note)
896               {
897                 if (movabs)
898                     eh_insn = movabs;
899                 control_flow_insns.safe_push (eh_insn);
900                 add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
901               }
902           }
903 
904       *op = gen_rtx_SUBREG (vmode, tmp, 0);
905 
906       if (dump_file)
907           fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
908                      INSN_UID (insn), REGNO (tmp));
909     }
910   else if (REG_P (*op))
911     {
912       *op = gen_rtx_SUBREG (vmode, *op, 0);
913     }
914   else if (CONST_INT_P (*op))
915     {
916       rtx vec_cst;
917       rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
918 
919       /* Prefer all ones vector in case of -1.  */
920       if (constm1_operand (*op, GET_MODE (*op)))
921           vec_cst = CONSTM1_RTX (vmode);
922       else
923           {
924             unsigned n = GET_MODE_NUNITS (vmode);
925             rtx *v = XALLOCAVEC (rtx, n);
926             v[0] = *op;
927             for (unsigned i = 1; i < n; ++i)
928               v[i] = const0_rtx;
929             vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
930           }
931 
932       if (!standard_sse_constant_p (vec_cst, vmode))
933           {
934             start_sequence ();
935             vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
936             rtx_insn *seq = get_insns ();
937             end_sequence ();
938             emit_insn_before (seq, insn);
939           }
940 
941       emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
942       *op = tmp;
943     }
944   else
945     {
946       gcc_assert (SUBREG_P (*op));
947       gcc_assert (GET_MODE (*op) == vmode);
948     }
949 }
950 
/* Convert INSN to vector mode.

   The work is done in four steps:

   1. For every def of INSN whose register is in DEFS_CONV, either call
      convert_reg (when some non-debug use is a memory-address use or
      belongs to an insn outside INSNS) or, when debug bind insns may
      exist, redirect or reset the debug insns using that def.
   2. Replace register uses that have an entry in DEFS_MAP with the
      mapped register, keeping a matching REG_DEAD note in sync.
   3. Rewrite the destination and source of the single SET of INSN
      into VMODE form, emitting helper instructions where required.
   4. Install the rewritten SET as the whole pattern of INSN and
      re-recognize it; an unrecognizable result is a fatal error.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
        /* Look for the first non-debug use that is either a
           memory-address use or sits in an insn that is not part of
           this chain.  USE is non-null afterwards iff one exists.  */
        df_link *use;
        for (use = DF_REF_CHAIN (ref); use; use = use->next)
          if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
              && (DF_REF_REG_MEM_P (use->ref)
                  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
            break;
        if (use)
          convert_reg (insn, DF_REF_REG (ref),
                       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
        else if (MAY_HAVE_DEBUG_BIND_INSNS)
          {
            /* If we generated a scalar copy we can leave debug-insns
               as-is, if not, we have to adjust them.  */
            auto_vec<rtx_insn *, 5> to_reset_debug_insns;
            for (use = DF_REF_CHAIN (ref); use; use = use->next)
              if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
                {
                  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
                  /* If there's a reaching definition outside of the
                     chain we have to reset.  */
                  df_link *def;
                  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
                    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
                      break;
                  if (def)
                    to_reset_debug_insns.safe_push (debug_insn);
                  else
                    {
                      /* All reaching defs are inside the chain: point the
                         debug use at the register from DEFS_MAP.  */
                      *DF_REF_REAL_LOC (use->ref)
                        = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
                      df_insn_rescan (debug_insn);
                    }
                }
            /* Have to do the reset outside of the DF_CHAIN walk to not
               disrupt it.  */
            while (!to_reset_debug_insns.is_empty ())
              {
                rtx_insn *debug_insn = to_reset_debug_insns.pop ();
                INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
                df_insn_rescan_debug_internal (debug_insn);
              }
          }
      }

  /* Replace uses in this insn with the defs we use in the chain.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
        {
          /* Also update a corresponding REG_DEAD note.  */
          rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
          if (note)
            XEXP (note, 0) = *vreg;
          *DF_REF_REAL_LOC (ref) = *vreg;
        }

  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  /* Scratch rtx reused by several cases of the switch below.  */
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
         temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      /* The store of TMP to the original memory destination is handed
         to emit_conversion_insns; INSN itself now computes into TMP.  */
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst))
    {
      /* Replace the definition with a SUBREG to the definition we
         use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
        dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
         is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
        remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    /* Binary operations: convert the second operand here, then fall
       through to convert the first operand and retag SRC.  */
    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 1), insn);
      /* FALLTHRU */

    /* ABS and the shifts: only operand 0 is converted; for the shifts
       operand 1 is left untouched.  */
    case ABS:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case NEG:
      src = XEXP (src, 0);

      if (GET_CODE (src) == ABS)
        {
          /* (neg (abs x)): compute the ABS into a fresh VMODE register
             first, then negate that register below.  */
          src = XEXP (src, 0);
          convert_op (&src, insn);
          subreg = gen_reg_rtx (vmode);
          emit_insn_before (gen_rtx_SET (subreg,
                                         gen_rtx_ABS (vmode, src)), insn);
          src = subreg;
        }
      else
        convert_op (&src, insn);

      /* Negation is expressed as 0 - SRC, with the zero vector loaded
         into a fresh register before INSN.  */
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      /* NOT is expressed as SRC ^ all-ones, with the all-ones vector
         loaded into a fresh register before INSN.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      /* DST was replaced above; a memory source is converted only when
         the rewritten DST is not a plain REG.  */
      if (!REG_P (dst))
        convert_op (&src, insn);
      break;

    case REG:
      /* A register source is left as is when the destination is still
         a MEM.  */
      if (!MEM_P (dst))
        convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      /* Take the register underneath the first operand of the compared
         expression; it is asserted to be a DImode REG.  */
      src = SUBREG_REG (XEXP (XEXP (src, 0), 0));

      gcc_assert (REG_P (src) && GET_MODE (src) == DImode);
      subreg = gen_rtx_SUBREG (V2DImode, src, 0);
      /* Emit an interleave-low of the V2DI view with itself before
         INSN, then set the flags register from a PTEST of that view
         against itself.  */
      emit_insn_before (gen_vec_interleave_lowv2di
                        (copy_rtx_if_shared (subreg),
                         copy_rtx_if_shared (subreg),
                         copy_rtx_if_shared (subreg)),
                        insn);
      dst = gen_rtx_REG (CCmode, FLAGS_REG);
      src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg),
                                               copy_rtx_if_shared (subreg)),
                            UNSPEC_PTEST);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  /* Force re-recognition of the rewritten pattern; failing to match
     any instruction pattern is fatal.  */
  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if  (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}
1144 
1145 /* Fix uses of converted REG in debug insns.  */
1146 
1147 void
fix_debug_reg_uses(rtx reg)1148 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1149 {
1150   if (!flag_var_tracking)
1151     return;
1152 
1153   df_ref ref, next;
1154   for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1155     {
1156       rtx_insn *insn = DF_REF_INSN (ref);
1157       /* Make sure the next ref is for a different instruction,
1158          so that we're not affected by the rescan.  */
1159       next = DF_REF_NEXT_REG (ref);
1160       while (next && DF_REF_INSN (next) == insn)
1161           next = DF_REF_NEXT_REG (next);
1162 
1163       if (DEBUG_INSN_P (insn))
1164           {
1165             /* It may be a debug insn with a TImode variable in
1166                register.  */
1167             bool changed = false;
1168             for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1169               {
1170                 rtx *loc = DF_REF_LOC (ref);
1171                 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1172                     {
1173                       *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1174                       changed = true;
1175                     }
1176               }
1177             if (changed)
1178               df_insn_rescan (insn);
1179           }
1180     }
1181 }
1182 
1183 /* Convert INSN from TImode to V1T1mode.  */
1184 
1185 void
convert_insn(rtx_insn * insn)1186 timode_scalar_chain::convert_insn (rtx_insn *insn)
1187 {
1188   rtx def_set = single_set (insn);
1189   rtx src = SET_SRC (def_set);
1190   rtx dst = SET_DEST (def_set);
1191 
1192   switch (GET_CODE (dst))
1193     {
1194     case REG:
1195       {
1196           rtx tmp = find_reg_equal_equiv_note (insn);
1197           if (tmp)
1198             PUT_MODE (XEXP (tmp, 0), V1TImode);
1199           PUT_MODE (dst, V1TImode);
1200           fix_debug_reg_uses (dst);
1201       }
1202       break;
1203     case MEM:
1204       PUT_MODE (dst, V1TImode);
1205       break;
1206 
1207     default:
1208       gcc_unreachable ();
1209     }
1210 
1211   switch (GET_CODE (src))
1212     {
1213     case REG:
1214       PUT_MODE (src, V1TImode);
1215       /* Call fix_debug_reg_uses only if SRC is never defined.  */
1216       if (!DF_REG_DEF_CHAIN (REGNO (src)))
1217           fix_debug_reg_uses (src);
1218       break;
1219 
1220     case MEM:
1221       PUT_MODE (src, V1TImode);
1222       break;
1223 
1224     case CONST_WIDE_INT:
1225       if (NONDEBUG_INSN_P (insn))
1226           {
1227             /* Since there are no instructions to store 128-bit constant,
1228                temporary register usage is required.  */
1229             rtx tmp = gen_reg_rtx (V1TImode);
1230             start_sequence ();
1231             src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
1232             src = validize_mem (force_const_mem (V1TImode, src));
1233             rtx_insn *seq = get_insns ();
1234             end_sequence ();
1235             if (seq)
1236               emit_insn_before (seq, insn);
1237             emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
1238             dst = tmp;
1239           }
1240       break;
1241 
1242     case CONST_INT:
1243       switch (standard_sse_constant_p (src, TImode))
1244           {
1245           case 1:
1246             src = CONST0_RTX (GET_MODE (dst));
1247             break;
1248           case 2:
1249             src = CONSTM1_RTX (GET_MODE (dst));
1250             break;
1251           default:
1252             gcc_unreachable ();
1253           }
1254       if (NONDEBUG_INSN_P (insn))
1255           {
1256             rtx tmp = gen_reg_rtx (V1TImode);
1257             /* Since there are no instructions to store standard SSE
1258                constant, temporary register usage is required.  */
1259             emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
1260             dst = tmp;
1261           }
1262       break;
1263 
1264     default:
1265       gcc_unreachable ();
1266     }
1267 
1268   SET_SRC (def_set) = src;
1269   SET_DEST (def_set) = dst;
1270 
1271   /* Drop possible dead definitions.  */
1272   PATTERN (insn) = def_set;
1273 
1274   INSN_CODE (insn) = -1;
1275   recog_memoized (insn);
1276   df_insn_rescan (insn);
1277 }
1278 
1279 /* Generate copies from defs used by the chain but not defined therein.
1280    Also populates defs_map which is used later by convert_insn.  */
1281 
1282 void
convert_registers()1283 general_scalar_chain::convert_registers ()
1284 {
1285   bitmap_iterator bi;
1286   unsigned id;
1287   EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1288     {
1289       rtx chain_reg = gen_reg_rtx (smode);
1290       defs_map.put (regno_reg_rtx[id], chain_reg);
1291     }
1292   EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
1293     for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
1294       if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1295           make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
1296 }
1297 
1298 /* Convert whole chain creating required register
1299    conversions and copies.  */
1300 
1301 int
convert()1302 scalar_chain::convert ()
1303 {
1304   bitmap_iterator bi;
1305   unsigned id;
1306   int converted_insns = 0;
1307 
1308   if (!dbg_cnt (stv_conversion))
1309     return 0;
1310 
1311   if (dump_file)
1312     fprintf (dump_file, "Converting chain #%d...\n", chain_id);
1313 
1314   convert_registers ();
1315 
1316   EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
1317     {
1318       convert_insn (DF_INSN_UID_GET (id)->insn);
1319       converted_insns++;
1320     }
1321 
1322   return converted_insns;
1323 }
1324 
1325 /* Return the SET expression if INSN doesn't reference hard register.
1326    Return NULL if INSN uses or defines a hard register, excluding
1327    pseudo register pushes, hard register uses in a memory address,
1328    clobbers and flags definitions.  */
1329 
1330 static rtx
pseudo_reg_set(rtx_insn * insn)1331 pseudo_reg_set (rtx_insn *insn)
1332 {
1333   rtx set = single_set (insn);
1334   if (!set)
1335     return NULL;
1336 
1337   /* Check pseudo register push first. */
1338   machine_mode mode = TARGET_64BIT ? TImode : DImode;
1339   if (REG_P (SET_SRC (set))
1340       && !HARD_REGISTER_P (SET_SRC (set))
1341       && push_operand (SET_DEST (set), mode))
1342     return set;
1343 
1344   df_ref ref;
1345   FOR_EACH_INSN_DEF (ref, insn)
1346     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
1347           && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
1348           && DF_REF_REGNO (ref) != FLAGS_REG)
1349       return NULL;
1350 
1351   FOR_EACH_INSN_USE (ref, insn)
1352     if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
1353       return NULL;
1354 
1355   return set;
1356 }
1357 
1358 /* Check if comparison INSN may be transformed
1359    into vector comparison.  Currently we transform
1360    zero checks only which look like:
1361 
1362    (set (reg:CCZ 17 flags)
1363         (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
1364                              (subreg:SI (reg:DI x) 0))
1365                          (const_int 0 [0])))  */
1366 
1367 static bool
convertible_comparison_p(rtx_insn * insn,enum machine_mode mode)1368 convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
1369 {
1370   /* ??? Currently convertible for double-word DImode chain only.  */
1371   if (TARGET_64BIT || mode != DImode)
1372     return false;
1373 
1374   if (!TARGET_SSE4_1)
1375     return false;
1376 
1377   rtx def_set = single_set (insn);
1378 
1379   gcc_assert (def_set);
1380 
1381   rtx src = SET_SRC (def_set);
1382   rtx dst = SET_DEST (def_set);
1383 
1384   gcc_assert (GET_CODE (src) == COMPARE);
1385 
1386   if (GET_CODE (dst) != REG
1387       || REGNO (dst) != FLAGS_REG
1388       || GET_MODE (dst) != CCZmode)
1389     return false;
1390 
1391   rtx op1 = XEXP (src, 0);
1392   rtx op2 = XEXP (src, 1);
1393 
1394   if (op2 != CONST0_RTX (GET_MODE (op2)))
1395     return false;
1396 
1397   if (GET_CODE (op1) != IOR)
1398     return false;
1399 
1400   op2 = XEXP (op1, 1);
1401   op1 = XEXP (op1, 0);
1402 
1403   if (!SUBREG_P (op1)
1404       || !SUBREG_P (op2)
1405       || GET_MODE (op1) != SImode
1406       || GET_MODE (op2) != SImode
1407       || ((SUBREG_BYTE (op1) != 0
1408              || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
1409             && (SUBREG_BYTE (op2) != 0
1410                 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
1411     return false;
1412 
1413   op1 = SUBREG_REG (op1);
1414   op2 = SUBREG_REG (op2);
1415 
1416   if (op1 != op2
1417       || !REG_P (op1)
1418       || GET_MODE (op1) != DImode)
1419     return false;
1420 
1421   return true;
1422 }
1423 
/* The general version of scalar_to_vector_candidate_p.

   Return true if INSN is a candidate for conversion in a chain of
   mode MODE.  INSN must pass pseudo_reg_set.  A COMPARE source is
   delegated to convertible_comparison_p.  Otherwise the destination
   must be a MODE register or memory, the source must be MODE (or a
   CONST_INT), and the source must be one of the operations accepted
   by the switch below, subject to the target feature checks made
   there.  */

static bool
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, mode);

  /* We are interested in "mode" only.  */
  if ((GET_MODE (src) != mode
       && !CONST_INT_P (src))
      || GET_MODE (dst) != mode)
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  /* Cases that leave the switch with `break' go on to the check of
     operand 0 of SRC after the switch; note that some cases repoint
     SRC at an inner expression first.  */
  switch (GET_CODE (src))
    {
    case ASHIFTRT:
      /* Arithmetic right shifts are accepted only with AVX512VL.  */
      if (!TARGET_AVX512VL)
        return false;
      /* FALLTHRU */

    case ASHIFT:
    case LSHIFTRT:
      /* The shift count must be a constant in [0, bitsize of MODE).  */
      if (!CONST_INT_P (XEXP (src, 1))
          || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
        return false;
      break;

    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      /* Min/max need AVX512VL for DImode and SSE4.1 for SImode.  */
      if ((mode == DImode && !TARGET_AVX512VL)
          || (mode == SImode && !TARGET_SSE4_1))
        return false;
      /* Fallthru.  */

    case AND:
    case IOR:
    case XOR:
    case PLUS:
    case MINUS:
      /* The second operand must be a register, memory or CONST_INT,
         and must be of MODE unless it is a CONST_INT.  */
      if (!REG_P (XEXP (src, 1))
          && !MEM_P (XEXP (src, 1))
          && !CONST_INT_P (XEXP (src, 1)))
        return false;

      if (GET_MODE (XEXP (src, 1)) != mode
          && !CONST_INT_P (XEXP (src, 1)))
        return false;

      /* Check for andnot case.  */
      if (GET_CODE (src) != AND
          || GET_CODE (XEXP (src, 0)) != NOT)
        break;

      /* (and (not x) y): from here on SRC is the NOT, so the check
         after the switch looks at its operand.  */
      src = XEXP (src, 0);
      /* FALLTHRU */

    case NOT:
      break;

    case NEG:
      /* Check for nabs case.  */
      if (GET_CODE (XEXP (src, 0)) != ABS)
        break;

      /* (neg (abs x)): from here on SRC is the ABS.  */
      src = XEXP (src, 0);
      /* FALLTHRU */

    case ABS:
      /* ABS needs AVX512VL for DImode and SSSE3 for SImode.  */
      if ((mode == DImode && !TARGET_AVX512VL)
          || (mode == SImode && !TARGET_SSSE3))
        return false;
      break;

    case REG:
      /* A register source is always accepted.  */
      return true;

    case MEM:
    case CONST_INT:
      /* Loads and constant sets are accepted only into a register.  */
      return REG_P (dst);

    default:
      return false;
    }

  /* Operand 0 of (the possibly repointed) SRC must be a register,
     memory or CONST_INT, and of MODE unless it is a CONST_INT.  */
  if (!REG_P (XEXP (src, 0))
      && !MEM_P (XEXP (src, 0))
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  if (GET_MODE (XEXP (src, 0)) != mode
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  return true;
}
1533 
1534 /* The TImode version of scalar_to_vector_candidate_p.  */
1535 
1536 static bool
timode_scalar_to_vector_candidate_p(rtx_insn * insn)1537 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1538 {
1539   rtx def_set = pseudo_reg_set (insn);
1540 
1541   if (!def_set)
1542     return false;
1543 
1544   rtx src = SET_SRC (def_set);
1545   rtx dst = SET_DEST (def_set);
1546 
1547   /* Only TImode load and store are allowed.  */
1548   if (GET_MODE (dst) != TImode)
1549     return false;
1550 
1551   if (MEM_P (dst))
1552     {
1553       /* Check for store.  Memory must be aligned or unaligned store
1554            is optimal.  Only support store from register, standard SSE
1555            constant or CONST_WIDE_INT generated from piecewise store.
1556 
1557            ??? Verify performance impact before enabling CONST_INT for
1558            __int128 store.  */
1559       if (misaligned_operand (dst, TImode)
1560             && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1561           return false;
1562 
1563       switch (GET_CODE (src))
1564           {
1565           default:
1566             return false;
1567 
1568           case REG:
1569           case CONST_WIDE_INT:
1570             return true;
1571 
1572           case CONST_INT:
1573             return standard_sse_constant_p (src, TImode);
1574           }
1575     }
1576   else if (MEM_P (src))
1577     {
1578       /* Check for load.  Memory must be aligned or unaligned load is
1579            optimal.  */
1580       return (REG_P (dst)
1581                 && (!misaligned_operand (src, TImode)
1582                       || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1583     }
1584 
1585   return false;
1586 }
1587 
1588 /* For a register REGNO, scan instructions for its defs and uses.
1589    Put REGNO in REGS if a def or use isn't in CANDIDATES.  */
1590 
1591 static void
timode_check_non_convertible_regs(bitmap candidates,bitmap regs,unsigned int regno)1592 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1593                                            unsigned int regno)
1594 {
1595   for (df_ref def = DF_REG_DEF_CHAIN (regno);
1596        def;
1597        def = DF_REF_NEXT_REG (def))
1598     {
1599       if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1600           {
1601             if (dump_file)
1602               fprintf (dump_file,
1603                          "r%d has non convertible def in insn %d\n",
1604                          regno, DF_REF_INSN_UID (def));
1605 
1606             bitmap_set_bit (regs, regno);
1607             break;
1608           }
1609     }
1610 
1611   for (df_ref ref = DF_REG_USE_CHAIN (regno);
1612        ref;
1613        ref = DF_REF_NEXT_REG (ref))
1614     {
1615       /* Debug instructions are skipped.  */
1616       if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1617             && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1618           {
1619             if (dump_file)
1620               fprintf (dump_file,
1621                          "r%d has non convertible use in insn %d\n",
1622                          regno, DF_REF_INSN_UID (ref));
1623 
1624             bitmap_set_bit (regs, regno);
1625             break;
1626           }
1627     }
1628 }
1629 
1630 /* The TImode version of remove_non_convertible_regs.  */
1631 
1632 static void
timode_remove_non_convertible_regs(bitmap candidates)1633 timode_remove_non_convertible_regs (bitmap candidates)
1634 {
1635   bitmap_iterator bi;
1636   unsigned id;
1637   bitmap regs = BITMAP_ALLOC (NULL);
1638 
1639   EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1640     {
1641       rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1642       rtx dest = SET_DEST (def_set);
1643       rtx src = SET_SRC (def_set);
1644 
1645       if ((!REG_P (dest)
1646              || bitmap_bit_p (regs, REGNO (dest))
1647              || HARD_REGISTER_P (dest))
1648             && (!REG_P (src)
1649                 || bitmap_bit_p (regs, REGNO (src))
1650                 || HARD_REGISTER_P (src)))
1651           continue;
1652 
1653       if (REG_P (dest))
1654           timode_check_non_convertible_regs (candidates, regs,
1655                                                      REGNO (dest));
1656 
1657       if (REG_P (src))
1658           timode_check_non_convertible_regs (candidates, regs,
1659                                                      REGNO (src));
1660     }
1661 
1662   EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1663     {
1664       for (df_ref def = DF_REG_DEF_CHAIN (id);
1665              def;
1666              def = DF_REF_NEXT_REG (def))
1667           if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1668             {
1669               if (dump_file)
1670                 fprintf (dump_file, "Removing insn %d from candidates list\n",
1671                            DF_REF_INSN_UID (def));
1672 
1673               bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1674             }
1675 
1676       for (df_ref ref = DF_REG_USE_CHAIN (id);
1677              ref;
1678              ref = DF_REF_NEXT_REG (ref))
1679           if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1680             {
1681               if (dump_file)
1682                 fprintf (dump_file, "Removing insn %d from candidates list\n",
1683                            DF_REF_INSN_UID (ref));
1684 
1685               bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1686             }
1687     }
1688 
1689   BITMAP_FREE (regs);
1690 }
1691 
1692 /* Main STV pass function.  Find and convert scalar
1693    instructions into vector mode when profitable.  */
1694 
1695 static unsigned int
convert_scalars_to_vector(bool timode_p)1696 convert_scalars_to_vector (bool timode_p)
1697 {
1698   basic_block bb;
1699   int converted_insns = 0;
1700   auto_vec<rtx_insn *> control_flow_insns;
1701 
1702   bitmap_obstack_initialize (NULL);
1703   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
1704   const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
1705   bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
1706   for (unsigned i = 0; i < 3; ++i)
1707     bitmap_initialize (&candidates[i], &bitmap_default_obstack);
1708 
1709   calculate_dominance_info (CDI_DOMINATORS);
1710   df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
1711   df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
1712   df_analyze ();
1713 
1714   /* Find all instructions we want to convert into vector mode.  */
1715   if (dump_file)
1716     fprintf (dump_file, "Searching for mode conversion candidates...\n");
1717 
1718   FOR_EACH_BB_FN (bb, cfun)
1719     {
1720       rtx_insn *insn;
1721       FOR_BB_INSNS (bb, insn)
1722           if (timode_p
1723               && timode_scalar_to_vector_candidate_p (insn))
1724             {
1725               if (dump_file)
1726                 fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
1727                            INSN_UID (insn));
1728 
1729               bitmap_set_bit (&candidates[2], INSN_UID (insn));
1730             }
1731           else if (!timode_p)
1732             {
1733               /* Check {SI,DI}mode.  */
1734               for (unsigned i = 0; i <= 1; ++i)
1735                 if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
1736                     {
1737                       if (dump_file)
1738                         fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
1739                                    INSN_UID (insn), i == 0 ? "SImode" : "DImode");
1740 
1741                       bitmap_set_bit (&candidates[i], INSN_UID (insn));
1742                       break;
1743                     }
1744             }
1745     }
1746 
1747   if (timode_p)
1748     timode_remove_non_convertible_regs (&candidates[2]);
1749 
1750   for (unsigned i = 0; i <= 2; ++i)
1751     if (!bitmap_empty_p (&candidates[i]))
1752       break;
1753     else if (i == 2 && dump_file)
1754       fprintf (dump_file, "There are no candidates for optimization.\n");
1755 
1756   for (unsigned i = 0; i <= 2; ++i)
1757     while (!bitmap_empty_p (&candidates[i]))
1758       {
1759           unsigned uid = bitmap_first_set_bit (&candidates[i]);
1760           scalar_chain *chain;
1761 
1762           if (cand_mode[i] == TImode)
1763             chain = new timode_scalar_chain;
1764           else
1765             chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
1766 
1767           /* Find instructions chain we want to convert to vector mode.
1768              Check all uses and definitions to estimate all required
1769              conversions.  */
1770           chain->build (&candidates[i], uid);
1771 
1772           if (chain->compute_convert_gain () > 0)
1773             converted_insns += chain->convert ();
1774           else
1775             if (dump_file)
1776               fprintf (dump_file, "Chain #%d conversion is not profitable\n",
1777                          chain->chain_id);
1778 
1779           rtx_insn* iter_insn;
1780           unsigned int ii;
1781           FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
1782             control_flow_insns.safe_push (iter_insn);
1783 
1784           delete chain;
1785       }
1786 
1787   if (dump_file)
1788     fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
1789 
1790   for (unsigned i = 0; i <= 2; ++i)
1791     bitmap_release (&candidates[i]);
1792   bitmap_obstack_release (NULL);
1793   df_process_deferred_rescans ();
1794 
1795   /* Conversion means we may have 128bit register spills/fills
1796      which require aligned stack.  */
1797   if (converted_insns)
1798     {
1799       if (crtl->stack_alignment_needed < 128)
1800           crtl->stack_alignment_needed = 128;
1801       if (crtl->stack_alignment_estimated < 128)
1802           crtl->stack_alignment_estimated = 128;
1803 
1804       crtl->stack_realign_needed
1805           = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
1806       crtl->stack_realign_tried = crtl->stack_realign_needed;
1807 
1808       crtl->stack_realign_processed = true;
1809 
1810       if (!crtl->drap_reg)
1811           {
1812             rtx drap_rtx = targetm.calls.get_drap_rtx ();
1813 
1814             /* stack_realign_drap and drap_rtx must match.  */
1815             gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
1816 
1817             /* Do nothing if NULL is returned,
1818                which means DRAP is not needed.  */
1819             if (drap_rtx != NULL)
1820               {
1821                 crtl->args.internal_arg_pointer = drap_rtx;
1822 
1823                 /* Call fixup_tail_calls to clean up
1824                      REG_EQUIV note if DRAP is needed. */
1825                 fixup_tail_calls ();
1826               }
1827           }
1828 
1829       /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
1830       if (TARGET_64BIT)
1831           for (tree parm = DECL_ARGUMENTS (current_function_decl);
1832                parm; parm = DECL_CHAIN (parm))
1833             {
1834               if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
1835                 continue;
1836               if (DECL_RTL_SET_P (parm)
1837                     && GET_MODE (DECL_RTL (parm)) == V1TImode)
1838                 {
1839                     rtx r = DECL_RTL (parm);
1840                     if (REG_P (r))
1841                       SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
1842                 }
1843               if (DECL_INCOMING_RTL (parm)
1844                     && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
1845                 {
1846                     rtx r = DECL_INCOMING_RTL (parm);
1847                     if (REG_P (r))
1848                       DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
1849                 }
1850             }
1851 
1852       if (!control_flow_insns.is_empty ())
1853           {
1854             free_dominance_info (CDI_DOMINATORS);
1855 
1856             unsigned int i;
1857             rtx_insn* insn;
1858             FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
1859               if (control_flow_insn_p (insn))
1860                 {
1861                     /* Split the block after insn.  There will be a fallthru
1862                        edge, which is OK so we keep it.  We have to create
1863                        the exception edges ourselves.  */
1864                     bb = BLOCK_FOR_INSN (insn);
1865                     split_block (bb, insn);
1866                     rtl_make_eh_edge (NULL, bb, BB_END (bb));
1867                 }
1868           }
1869     }
1870 
1871   return 0;
1872 }
1873 
1874 static unsigned int
rest_of_handle_insert_vzeroupper(void)1875 rest_of_handle_insert_vzeroupper (void)
1876 {
1877   /* vzeroupper instructions are inserted immediately after reload to
1878      account for possible spills from 256bit or 512bit registers.  The pass
1879      reuses mode switching infrastructure by re-running mode insertion
1880      pass, so disable entities that have already been processed.  */
1881   for (int i = 0; i < MAX_386_ENTITIES; i++)
1882     ix86_optimize_mode_switching[i] = 0;
1883 
1884   ix86_optimize_mode_switching[AVX_U128] = 1;
1885 
1886   /* Call optimize_mode_switching.  */
1887   g->get_passes ()->execute_pass_mode_switching ();
1888 
1889   df_analyze ();
1890   return 0;
1891 }
1892 
1893 namespace {
1894 
1895 const pass_data pass_data_insert_vzeroupper =
1896 {
1897   RTL_PASS, /* type */
1898   "vzeroupper", /* name */
1899   OPTGROUP_NONE, /* optinfo_flags */
1900   TV_MACH_DEP, /* tv_id */
1901   0, /* properties_required */
1902   0, /* properties_provided */
1903   0, /* properties_destroyed */
1904   0, /* todo_flags_start */
1905   TODO_df_finish, /* todo_flags_finish */
1906 };
1907 
1908 class pass_insert_vzeroupper : public rtl_opt_pass
1909 {
1910 public:
pass_insert_vzeroupper(gcc::context * ctxt)1911   pass_insert_vzeroupper(gcc::context *ctxt)
1912     : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
1913   {}
1914 
1915   /* opt_pass methods: */
gate(function *)1916   virtual bool gate (function *)
1917     {
1918       return TARGET_AVX && TARGET_VZEROUPPER;
1919     }
1920 
execute(function *)1921   virtual unsigned int execute (function *)
1922     {
1923       return rest_of_handle_insert_vzeroupper ();
1924     }
1925 
1926 }; // class pass_insert_vzeroupper
1927 
1928 const pass_data pass_data_stv =
1929 {
1930   RTL_PASS, /* type */
1931   "stv", /* name */
1932   OPTGROUP_NONE, /* optinfo_flags */
1933   TV_MACH_DEP, /* tv_id */
1934   0, /* properties_required */
1935   0, /* properties_provided */
1936   0, /* properties_destroyed */
1937   0, /* todo_flags_start */
1938   TODO_df_finish, /* todo_flags_finish */
1939 };
1940 
1941 class pass_stv : public rtl_opt_pass
1942 {
1943 public:
pass_stv(gcc::context * ctxt)1944   pass_stv (gcc::context *ctxt)
1945     : rtl_opt_pass (pass_data_stv, ctxt),
1946       timode_p (false)
1947   {}
1948 
1949   /* opt_pass methods: */
gate(function *)1950   virtual bool gate (function *)
1951     {
1952       return ((!timode_p || TARGET_64BIT)
1953                 && TARGET_STV && TARGET_SSE2 && optimize > 1);
1954     }
1955 
execute(function *)1956   virtual unsigned int execute (function *)
1957     {
1958       return convert_scalars_to_vector (timode_p);
1959     }
1960 
clone()1961   opt_pass *clone ()
1962     {
1963       return new pass_stv (m_ctxt);
1964     }
1965 
set_pass_param(unsigned int n,bool param)1966   void set_pass_param (unsigned int n, bool param)
1967     {
1968       gcc_assert (n == 0);
1969       timode_p = param;
1970     }
1971 
1972 private:
1973   bool timode_p;
1974 }; // class pass_stv
1975 
1976 } // anon namespace
1977 
1978 rtl_opt_pass *
make_pass_insert_vzeroupper(gcc::context * ctxt)1979 make_pass_insert_vzeroupper (gcc::context *ctxt)
1980 {
1981   return new pass_insert_vzeroupper (ctxt);
1982 }
1983 
1984 rtl_opt_pass *
make_pass_stv(gcc::context * ctxt)1985 make_pass_stv (gcc::context *ctxt)
1986 {
1987   return new pass_stv (ctxt);
1988 }
1989 
/* Insert ENDBR and pseudo patchable-area instructions into the current
   function.

   NEED_ENDBR is true when ENDBR instructions should be considered.
   PATCHABLE_AREA_SIZE, when nonzero, is the size operand given to the
   patchable_area pattern.

   At function entry the ENDBR (if one is needed) comes first and the
   patchable area is placed right after it.  When crtl->profile and
   flag_fentry are both set, nothing is emitted at the entry here;
   the request is recorded in cfun->machine->insn_queued_at_entrance
   instead.

   If NEED_ENDBR, the function body is then scanned and an ENDBR is
   emitted after calls carrying a REG_SETJMP note, after non-sibling
   calls whose function type (when one is found) has the
   "indirect_return" attribute, after the head label of every
   successor block of a jump-table jump (only with flag_cet_switch),
   and after labels for which LABEL_PRESERVE_P holds.  */

static void
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
                                                   unsigned int patchable_area_size)
{
  rtx endbr;
  rtx_insn *insn;
  /* The ENDBR emitted at function entry, if any; the patchable area is
     placed after it.  */
  rtx_insn *endbr_insn = NULL;
  basic_block bb;

  if (need_endbr)
    {
      /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
           is absent among function attributes.  Later an optimization will
           be introduced to make analysis if an address of a static function
           is taken.  A static function whose address is not taken will get
           a nocf_check attribute.  This will allow to reduce the number of
           EB.  */
      if (!lookup_attribute ("nocf_check",
                                   TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
            && (!flag_manual_endbr
                || lookup_attribute ("cf_check",
                                           DECL_ATTRIBUTES (cfun->decl)))
            && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
                || ix86_cmodel == CM_LARGE
                || ix86_cmodel == CM_LARGE_PIC
                || flag_force_indirect_call
                || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
                      && DECL_DLLIMPORT_P (cfun->decl))))
          {
            if (crtl->profile && flag_fentry)
              {
                /* Queue ENDBR insertion to x86_function_profiler.
                     NB: Any patchable-area insn will be inserted after
                     ENDBR.  */
                cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
              }
            else
              {
                /* Emit the ENDBR before the first insn of the block that
                   follows the entry block.  */
                endbr = gen_nop_endbr ();
                bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
                /* NB: this declaration shadows the function-level INSN.  */
                rtx_insn *insn = BB_HEAD (bb);
                endbr_insn = emit_insn_before (endbr, insn);
              }
          }
    }

  /* Place the pseudo patchable-area insn, if one was requested.  */
  if (patchable_area_size)
    {
      if (crtl->profile && flag_fentry)
          {
            /* Queue patchable-area insertion to x86_function_profiler.
               NB: If there is a queued ENDBR, x86_function_profiler
               will also handle patchable-area.  */
            if (!cfun->machine->insn_queued_at_entrance)
              cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
          }
      else
          {
            /* The second operand is 1 when crtl->patch_area_entry is
               zero and 0 otherwise.  */
            rtx patchable_area
              = gen_patchable_area (GEN_INT (patchable_area_size),
                                          GEN_INT (crtl->patch_area_entry == 0));
            /* Keep the patchable area behind the entry ENDBR when one
               was emitted above; otherwise it goes first.  */
            if (endbr_insn)
              emit_insn_after (patchable_area, endbr_insn);
            else
              {
                bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
                insn = BB_HEAD (bb);
                emit_insn_before (patchable_area, insn);
              }
          }
    }

  /* Everything below only inserts ENDBR inside the function body.  */
  if (!need_endbr)
    return;

  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      /* Walk every insn of BB.  Insns emitted after INSN by the code
         below are visited by later iterations of this loop.  */
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
             insn = NEXT_INSN (insn))
          {
            if (CALL_P (insn))
              {
                /* From here on NEED_ENDBR is reused as a per-call flag.  */
                need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
                if (!need_endbr && !SIBLING_CALL_P (insn))
                    {
                      rtx call = get_call_rtx_from (insn);
                      rtx fnaddr = XEXP (call, 0);
                      tree fndecl = NULL_TREE;

                      /* Also generate ENDBRANCH for non-tail call which
                         may return via indirect branch.  */
                      if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
                        fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
                      /* No decl from a SYMBOL_REF: fall back to the MEM
                         expression of the call address.  */
                      if (fndecl == NULL_TREE)
                        fndecl = MEM_EXPR (fnaddr);
                      /* Ignore anything whose type is neither a function
                         nor a method type.  */
                      if (fndecl
                          && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
                          && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
                        fndecl = NULL_TREE;
                      if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
                        {
                          tree fntype = TREE_TYPE (fndecl);
                          if (lookup_attribute ("indirect_return",
                                                      TYPE_ATTRIBUTES (fntype)))
                              need_endbr = true;
                        }
                    }
                if (!need_endbr)
                    continue;
                /* Generate ENDBRANCH after CALL, which can return more than
                     twice, setjmp-like functions.  */

                endbr = gen_nop_endbr ();
                emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
                continue;
              }

            if (JUMP_P (insn) && flag_cet_switch)
              {
                rtx target = JUMP_LABEL (insn);
                if (target == NULL_RTX || ANY_RETURN_P (target))
                    continue;

                /* Check the jump is a switch table.  */
                rtx_insn *label = as_a<rtx_insn *> (target);
                rtx_insn *table = next_insn (label);
                if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
                    continue;

                /* For the indirect jump find out all places it jumps and insert
                     ENDBRANCH there.  It should be done under a special flag to
                     control ENDBRANCH generation for switch stmts.  */
                edge_iterator ei;
                edge e;
                basic_block dest_blk;

                FOR_EACH_EDGE (e, ei, bb->succs)
                    {
                      /* NB: this declaration shadows the loop's INSN, so
                         the outer walk is not disturbed.  */
                      rtx_insn *insn;

                      dest_blk = e->dest;
                      insn = BB_HEAD (dest_blk);
                      /* Every successor is expected to start with a
                         label; the ENDBR goes right after it.  */
                      gcc_assert (LABEL_P (insn));
                      endbr = gen_nop_endbr ();
                      emit_insn_after (endbr, insn);
                    }
                continue;
              }

            /* Labels marked with LABEL_PRESERVE_P also get an ENDBR
               right after them.  */
            if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
              {
                endbr = gen_nop_endbr ();
                emit_insn_after (endbr, insn);
                continue;
              }
          }
    }

  return;
}
2153 
2154 namespace {
2155 
2156 const pass_data pass_data_insert_endbr_and_patchable_area =
2157 {
2158   RTL_PASS, /* type.  */
2159   "endbr_and_patchable_area", /* name.  */
2160   OPTGROUP_NONE, /* optinfo_flags.  */
2161   TV_MACH_DEP, /* tv_id.  */
2162   0, /* properties_required.  */
2163   0, /* properties_provided.  */
2164   0, /* properties_destroyed.  */
2165   0, /* todo_flags_start.  */
2166   0, /* todo_flags_finish.  */
2167 };
2168 
2169 class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
2170 {
2171 public:
pass_insert_endbr_and_patchable_area(gcc::context * ctxt)2172   pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2173     : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
2174   {}
2175 
2176   /* opt_pass methods: */
gate(function *)2177   virtual bool gate (function *)
2178     {
2179       need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
2180       patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
2181       return need_endbr || patchable_area_size;
2182     }
2183 
execute(function *)2184   virtual unsigned int execute (function *)
2185     {
2186       timevar_push (TV_MACH_DEP);
2187       rest_of_insert_endbr_and_patchable_area (need_endbr,
2188                                                          patchable_area_size);
2189       timevar_pop (TV_MACH_DEP);
2190       return 0;
2191     }
2192 
2193 private:
2194   bool need_endbr;
2195   unsigned int patchable_area_size;
2196 }; // class pass_insert_endbr_and_patchable_area
2197 
2198 } // anon namespace
2199 
2200 rtl_opt_pass *
make_pass_insert_endbr_and_patchable_area(gcc::context * ctxt)2201 make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2202 {
2203   return new pass_insert_endbr_and_patchable_area (ctxt);
2204 }
2205 
2206 /* At entry of the nearest common dominator for basic blocks with
2207    conversions/rcp/sqrt/rsqrt/round, generate a single
2208           vxorps %xmmN, %xmmN, %xmmN
2209    for all
2210           vcvtss2sd  op, %xmmN, %xmmX
2211           vcvtsd2ss  op, %xmmN, %xmmX
2212           vcvtsi2ss  op, %xmmN, %xmmX
2213           vcvtsi2sd  op, %xmmN, %xmmX
2214 
2215    NB: We want to generate only a single vxorps to cover the whole
2216    function.  The LCM algorithm isn't appropriate here since it may
2217    place a vxorps inside the loop.  */
2218 
2219 static unsigned int
remove_partial_avx_dependency(void)2220 remove_partial_avx_dependency (void)
2221 {
2222   timevar_push (TV_MACH_DEP);
2223 
2224   bitmap_obstack_initialize (NULL);
2225   bitmap convert_bbs = BITMAP_ALLOC (NULL);
2226 
2227   basic_block bb;
2228   rtx_insn *insn, *set_insn;
2229   rtx set;
2230   rtx v4sf_const0 = NULL_RTX;
2231 
2232   auto_vec<rtx_insn *> control_flow_insns;
2233 
2234   /* We create invalid RTL initially so defer rescans.  */
2235   df_set_flags (DF_DEFER_INSN_RESCAN);
2236 
2237   FOR_EACH_BB_FN (bb, cfun)
2238     {
2239       FOR_BB_INSNS (bb, insn)
2240           {
2241             if (!NONDEBUG_INSN_P (insn))
2242               continue;
2243 
2244             set = single_set (insn);
2245             if (!set)
2246               continue;
2247 
2248             if (get_attr_avx_partial_xmm_update (insn)
2249                 != AVX_PARTIAL_XMM_UPDATE_TRUE)
2250               continue;
2251 
2252             /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
2253                SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
2254                round, to vec_dup and vec_merge with subreg.  */
2255             rtx src = SET_SRC (set);
2256             rtx dest = SET_DEST (set);
2257             machine_mode dest_mode = GET_MODE (dest);
2258             bool convert_p = false;
2259             switch (GET_CODE (src))
2260               {
2261               case FLOAT:
2262               case FLOAT_EXTEND:
2263               case FLOAT_TRUNCATE:
2264               case UNSIGNED_FLOAT:
2265                 convert_p = true;
2266                 break;
2267               default:
2268                 break;
2269               }
2270 
2271             /* Only hanlde conversion here.  */
2272             machine_mode src_mode
2273               = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
2274             switch (src_mode)
2275               {
2276               case E_SFmode:
2277               case E_DFmode:
2278                 if (TARGET_USE_VECTOR_FP_CONVERTS
2279                       || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
2280                     continue;
2281                 break;
2282               case E_SImode:
2283               case E_DImode:
2284                 if (TARGET_USE_VECTOR_CONVERTS
2285                       || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
2286                     continue;
2287                 break;
2288               case E_VOIDmode:
2289                 gcc_assert (!convert_p);
2290                 break;
2291               default:
2292                 gcc_unreachable ();
2293               }
2294 
2295             if (!v4sf_const0)
2296               v4sf_const0 = gen_reg_rtx (V4SFmode);
2297 
2298             rtx zero;
2299             machine_mode dest_vecmode;
2300             switch (dest_mode)
2301               {
2302               case E_HFmode:
2303                 dest_vecmode = V8HFmode;
2304                 zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
2305                 break;
2306               case E_SFmode:
2307                 dest_vecmode = V4SFmode;
2308                 zero = v4sf_const0;
2309                 break;
2310               case E_DFmode:
2311                 dest_vecmode = V2DFmode;
2312                 zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
2313                 break;
2314               default:
2315                 gcc_unreachable ();
2316               }
2317 
2318             /* Change source to vector mode.  */
2319             src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
2320             src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
2321                                            GEN_INT (HOST_WIDE_INT_1U));
2322             /* Change destination to vector mode.  */
2323             rtx vec = gen_reg_rtx (dest_vecmode);
2324             /* Generate an XMM vector SET.  */
2325             set = gen_rtx_SET (vec, src);
2326             set_insn = emit_insn_before (set, insn);
2327             df_insn_rescan (set_insn);
2328 
2329             if (cfun->can_throw_non_call_exceptions)
2330               {
2331                 /* Handle REG_EH_REGION note.  */
2332                 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
2333                 if (note)
2334                     {
2335                       control_flow_insns.safe_push (set_insn);
2336                       add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
2337                     }
2338               }
2339 
2340             src = gen_rtx_SUBREG (dest_mode, vec, 0);
2341             set = gen_rtx_SET (dest, src);
2342 
2343             /* Drop possible dead definitions.  */
2344             PATTERN (insn) = set;
2345 
2346             INSN_CODE (insn) = -1;
2347             recog_memoized (insn);
2348             df_insn_rescan (insn);
2349             bitmap_set_bit (convert_bbs, bb->index);
2350           }
2351     }
2352 
2353   if (v4sf_const0)
2354     {
2355       /* (Re-)discover loops so that bb->loop_father can be used in the
2356            analysis below.  */
2357       calculate_dominance_info (CDI_DOMINATORS);
2358       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
2359 
2360       /* Generate a vxorps at entry of the nearest dominator for basic
2361            blocks with conversions, which is in the fake loop that
2362            contains the whole function, so that there is only a single
2363            vxorps in the whole function.   */
2364       bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
2365                                                        convert_bbs);
2366       while (bb->loop_father->latch
2367                != EXIT_BLOCK_PTR_FOR_FN (cfun))
2368           bb = get_immediate_dominator (CDI_DOMINATORS,
2369                                               bb->loop_father->header);
2370 
2371       set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
2372 
2373       insn = BB_HEAD (bb);
2374       while (insn && !NONDEBUG_INSN_P (insn))
2375           {
2376             if (insn == BB_END (bb))
2377               {
2378                 insn = NULL;
2379                 break;
2380               }
2381             insn = NEXT_INSN (insn);
2382           }
2383       if (insn == BB_HEAD (bb))
2384         set_insn = emit_insn_before (set, insn);
2385       else
2386           set_insn = emit_insn_after (set,
2387                                             insn ? PREV_INSN (insn) : BB_END (bb));
2388       df_insn_rescan (set_insn);
2389       loop_optimizer_finalize ();
2390 
2391       if (!control_flow_insns.is_empty ())
2392           {
2393             free_dominance_info (CDI_DOMINATORS);
2394 
2395             unsigned int i;
2396             FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2397               if (control_flow_insn_p (insn))
2398                 {
2399                     /* Split the block after insn.  There will be a fallthru
2400                        edge, which is OK so we keep it.  We have to create
2401                        the exception edges ourselves.  */
2402                     bb = BLOCK_FOR_INSN (insn);
2403                     split_block (bb, insn);
2404                     rtl_make_eh_edge (NULL, bb, BB_END (bb));
2405                 }
2406           }
2407     }
2408 
2409   df_process_deferred_rescans ();
2410   df_clear_flags (DF_DEFER_INSN_RESCAN);
2411   bitmap_obstack_release (NULL);
2412   BITMAP_FREE (convert_bbs);
2413 
2414   timevar_pop (TV_MACH_DEP);
2415   return 0;
2416 }
2417 
2418 namespace {
2419 
2420 const pass_data pass_data_remove_partial_avx_dependency =
2421 {
2422   RTL_PASS, /* type */
2423   "rpad", /* name */
2424   OPTGROUP_NONE, /* optinfo_flags */
2425   TV_MACH_DEP, /* tv_id */
2426   0, /* properties_required */
2427   0, /* properties_provided */
2428   0, /* properties_destroyed */
2429   0, /* todo_flags_start */
2430   0, /* todo_flags_finish */
2431 };
2432 
2433 class pass_remove_partial_avx_dependency : public rtl_opt_pass
2434 {
2435 public:
pass_remove_partial_avx_dependency(gcc::context * ctxt)2436   pass_remove_partial_avx_dependency (gcc::context *ctxt)
2437     : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
2438   {}
2439 
2440   /* opt_pass methods: */
gate(function *)2441   virtual bool gate (function *)
2442     {
2443       return (TARGET_AVX
2444                 && TARGET_SSE_PARTIAL_REG_DEPENDENCY
2445                 && TARGET_SSE_MATH
2446                 && optimize
2447                 && optimize_function_for_speed_p (cfun));
2448     }
2449 
execute(function *)2450   virtual unsigned int execute (function *)
2451     {
2452       return remove_partial_avx_dependency ();
2453     }
2454 }; // class pass_rpad
2455 
2456 } // anon namespace
2457 
2458 rtl_opt_pass *
make_pass_remove_partial_avx_dependency(gcc::context * ctxt)2459 make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
2460 {
2461   return new pass_remove_partial_avx_dependency (ctxt);
2462 }
2463 
2464 /* This compares the priority of target features in function DECL1
2465    and DECL2.  It returns positive value if DECL1 is higher priority,
2466    negative value if DECL2 is higher priority and 0 if they are the
2467    same.  */
2468 
2469 int
ix86_compare_version_priority(tree decl1,tree decl2)2470 ix86_compare_version_priority (tree decl1, tree decl2)
2471 {
2472   unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
2473   unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
2474 
2475   return (int)priority1 - (int)priority2;
2476 }
2477 
2478 /* V1 and V2 point to function versions with different priorities
2479    based on the target ISA.  This function compares their priorities.  */
2480 
2481 static int
feature_compare(const void * v1,const void * v2)2482 feature_compare (const void *v1, const void *v2)
2483 {
2484   typedef struct _function_version_info
2485     {
2486       tree version_decl;
2487       tree predicate_chain;
2488       unsigned int dispatch_priority;
2489     } function_version_info;
2490 
2491   const function_version_info c1 = *(const function_version_info *)v1;
2492   const function_version_info c2 = *(const function_version_info *)v2;
2493   return (c2.dispatch_priority - c1.dispatch_priority);
2494 }
2495 
2496 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
2497    to return a pointer to VERSION_DECL if the outcome of the expression
2498    formed by PREDICATE_CHAIN is true.  This function will be called during
2499    version dispatch to decide which function version to execute.  It returns
2500    the basic block at the end, to which more conditions can be added.  */
2501 
2502 static basic_block
add_condition_to_bb(tree function_decl,tree version_decl,tree predicate_chain,basic_block new_bb)2503 add_condition_to_bb (tree function_decl, tree version_decl,
2504                          tree predicate_chain, basic_block new_bb)
2505 {
2506   gimple *return_stmt;
2507   tree convert_expr, result_var;
2508   gimple *convert_stmt;
2509   gimple *call_cond_stmt;
2510   gimple *if_else_stmt;
2511 
2512   basic_block bb1, bb2, bb3;
2513   edge e12, e23;
2514 
2515   tree cond_var, and_expr_var = NULL_TREE;
2516   gimple_seq gseq;
2517 
2518   tree predicate_decl, predicate_arg;
2519 
2520   push_cfun (DECL_STRUCT_FUNCTION (function_decl));
2521 
2522   gcc_assert (new_bb != NULL);
2523   gseq = bb_seq (new_bb);
2524 
2525 
2526   convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
2527                                build_fold_addr_expr (version_decl));
2528   result_var = create_tmp_var (ptr_type_node);
2529   convert_stmt = gimple_build_assign (result_var, convert_expr);
2530   return_stmt = gimple_build_return (result_var);
2531 
2532   if (predicate_chain == NULL_TREE)
2533     {
2534       gimple_seq_add_stmt (&gseq, convert_stmt);
2535       gimple_seq_add_stmt (&gseq, return_stmt);
2536       set_bb_seq (new_bb, gseq);
2537       gimple_set_bb (convert_stmt, new_bb);
2538       gimple_set_bb (return_stmt, new_bb);
2539       pop_cfun ();
2540       return new_bb;
2541     }
2542 
2543   while (predicate_chain != NULL)
2544     {
2545       cond_var = create_tmp_var (integer_type_node);
2546       predicate_decl = TREE_PURPOSE (predicate_chain);
2547       predicate_arg = TREE_VALUE (predicate_chain);
2548       call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
2549       gimple_call_set_lhs (call_cond_stmt, cond_var);
2550 
2551       gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
2552       gimple_set_bb (call_cond_stmt, new_bb);
2553       gimple_seq_add_stmt (&gseq, call_cond_stmt);
2554 
2555       predicate_chain = TREE_CHAIN (predicate_chain);
2556 
2557       if (and_expr_var == NULL)
2558         and_expr_var = cond_var;
2559       else
2560           {
2561             gimple *assign_stmt;
2562             /* Use MIN_EXPR to check if any integer is zero?.
2563                and_expr_var = min_expr <cond_var, and_expr_var>  */
2564             assign_stmt = gimple_build_assign (and_expr_var,
2565                                 build2 (MIN_EXPR, integer_type_node,
2566                                           cond_var, and_expr_var));
2567 
2568             gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
2569             gimple_set_bb (assign_stmt, new_bb);
2570             gimple_seq_add_stmt (&gseq, assign_stmt);
2571           }
2572     }
2573 
2574   if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
2575                                           integer_zero_node,
2576                                             NULL_TREE, NULL_TREE);
2577   gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
2578   gimple_set_bb (if_else_stmt, new_bb);
2579   gimple_seq_add_stmt (&gseq, if_else_stmt);
2580 
2581   gimple_seq_add_stmt (&gseq, convert_stmt);
2582   gimple_seq_add_stmt (&gseq, return_stmt);
2583   set_bb_seq (new_bb, gseq);
2584 
2585   bb1 = new_bb;
2586   e12 = split_block (bb1, if_else_stmt);
2587   bb2 = e12->dest;
2588   e12->flags &= ~EDGE_FALLTHRU;
2589   e12->flags |= EDGE_TRUE_VALUE;
2590 
2591   e23 = split_block (bb2, return_stmt);
2592 
2593   gimple_set_bb (convert_stmt, bb2);
2594   gimple_set_bb (return_stmt, bb2);
2595 
2596   bb3 = e23->dest;
2597   make_edge (bb1, bb3, EDGE_FALSE_VALUE);
2598 
2599   remove_edge (e23);
2600   make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
2601 
2602   pop_cfun ();
2603 
2604   return bb3;
2605 }
2606 
2607 /* This function generates the dispatch function for
2608    multi-versioned functions.  DISPATCH_DECL is the function which will
2609    contain the dispatch logic.  FNDECLS are the function choices for
2610    dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
2611    in DISPATCH_DECL in which the dispatch code is generated.  */
2612 
2613 static int
dispatch_function_versions(tree dispatch_decl,void * fndecls_p,basic_block * empty_bb)2614 dispatch_function_versions (tree dispatch_decl,
2615                                   void *fndecls_p,
2616                                   basic_block *empty_bb)
2617 {
2618   tree default_decl;
2619   gimple *ifunc_cpu_init_stmt;
2620   gimple_seq gseq;
2621   int ix;
2622   tree ele;
2623   vec<tree> *fndecls;
2624   unsigned int num_versions = 0;
2625   unsigned int actual_versions = 0;
2626   unsigned int i;
2627 
2628   struct _function_version_info
2629     {
2630       tree version_decl;
2631       tree predicate_chain;
2632       unsigned int dispatch_priority;
2633     }*function_version_info;
2634 
2635   gcc_assert (dispatch_decl != NULL
2636                 && fndecls_p != NULL
2637                 && empty_bb != NULL);
2638 
  /* fndecls_p is actually a vector.  */
2640   fndecls = static_cast<vec<tree> *> (fndecls_p);
2641 
2642   /* At least one more version other than the default.  */
2643   num_versions = fndecls->length ();
2644   gcc_assert (num_versions >= 2);
2645 
2646   function_version_info = (struct _function_version_info *)
2647     XNEWVEC (struct _function_version_info, (num_versions - 1));
2648 
2649   /* The first version in the vector is the default decl.  */
2650   default_decl = (*fndecls)[0];
2651 
2652   push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
2653 
2654   gseq = bb_seq (*empty_bb);
2655   /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
2656      constructors, so explicity call __builtin_cpu_init here.  */
2657   ifunc_cpu_init_stmt
2658     = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
2659   gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
2660   gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
2661   set_bb_seq (*empty_bb, gseq);
2662 
2663   pop_cfun ();
2664 
2665 
2666   for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
2667     {
2668       tree version_decl = ele;
2669       tree predicate_chain = NULL_TREE;
2670       unsigned int priority;
2671       /* Get attribute string, parse it and find the right predicate decl.
2672          The predicate function could be a lengthy combination of many
2673            features, like arch-type and various isa-variants.  */
2674       priority = get_builtin_code_for_version (version_decl,
2675                                                        &predicate_chain);
2676 
2677       if (predicate_chain == NULL_TREE)
2678           continue;
2679 
2680       function_version_info [actual_versions].version_decl = version_decl;
2681       function_version_info [actual_versions].predicate_chain
2682            = predicate_chain;
2683       function_version_info [actual_versions].dispatch_priority = priority;
2684       actual_versions++;
2685     }
2686 
2687   /* Sort the versions according to descending order of dispatch priority.  The
2688      priority is based on the ISA.  This is not a perfect solution.  There
2689      could still be ambiguity.  If more than one function version is suitable
2690      to execute,  which one should be dispatched?  In future, allow the user
2691      to specify a dispatch  priority next to the version.  */
2692   qsort (function_version_info, actual_versions,
2693          sizeof (struct _function_version_info), feature_compare);
2694 
2695   for  (i = 0; i < actual_versions; ++i)
2696     *empty_bb = add_condition_to_bb (dispatch_decl,
2697                                              function_version_info[i].version_decl,
2698                                              function_version_info[i].predicate_chain,
2699                                              *empty_bb);
2700 
2701   /* dispatch default version at the end.  */
2702   *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
2703                                            NULL, *empty_bb);
2704 
2705   free (function_version_info);
2706   return 0;
2707 }
2708 
2709 /* This function changes the assembler name for functions that are
2710    versions.  If DECL is a function version and has a "target"
2711    attribute, it appends the attribute string to its assembler name.  */
2712 
2713 static tree
ix86_mangle_function_version_assembler_name(tree decl,tree id)2714 ix86_mangle_function_version_assembler_name (tree decl, tree id)
2715 {
2716   tree version_attr;
2717   const char *orig_name, *version_string;
2718   char *attr_str, *assembler_name;
2719 
2720   if (DECL_DECLARED_INLINE_P (decl)
2721       && lookup_attribute ("gnu_inline",
2722                                  DECL_ATTRIBUTES (decl)))
2723     error_at (DECL_SOURCE_LOCATION (decl),
2724                 "function versions cannot be marked as %<gnu_inline%>,"
2725                 " bodies have to be generated");
2726 
2727   if (DECL_VIRTUAL_P (decl)
2728       || DECL_VINDEX (decl))
2729     sorry ("virtual function multiversioning not supported");
2730 
2731   version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
2732 
2733   /* target attribute string cannot be NULL.  */
2734   gcc_assert (version_attr != NULL_TREE);
2735 
2736   orig_name = IDENTIFIER_POINTER (id);
2737   version_string
2738     = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
2739 
2740   if (strcmp (version_string, "default") == 0)
2741     return id;
2742 
2743   attr_str = sorted_attr_string (TREE_VALUE (version_attr));
2744   assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
2745 
2746   sprintf (assembler_name, "%s.%s", orig_name, attr_str);
2747 
2748   /* Allow assembler name to be modified if already set.  */
2749   if (DECL_ASSEMBLER_NAME_SET_P (decl))
2750     SET_DECL_RTL (decl, NULL);
2751 
2752   tree ret = get_identifier (assembler_name);
2753   XDELETEVEC (attr_str);
2754   XDELETEVEC (assembler_name);
2755   return ret;
2756 }
2757 
2758 tree
ix86_mangle_decl_assembler_name(tree decl,tree id)2759 ix86_mangle_decl_assembler_name (tree decl, tree id)
2760 {
2761   /* For function version, add the target suffix to the assembler name.  */
2762   if (TREE_CODE (decl) == FUNCTION_DECL
2763       && DECL_FUNCTION_VERSIONED (decl))
2764     id = ix86_mangle_function_version_assembler_name (decl, id);
2765 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
2766   id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
2767 #endif
2768 
2769   return id;
2770 }
2771 
2772 /* Make a dispatcher declaration for the multi-versioned function DECL.
2773    Calls to DECL function will be replaced with calls to the dispatcher
2774    by the front-end.  Returns the decl of the dispatcher function.  */
2775 
2776 tree
ix86_get_function_versions_dispatcher(void * decl)2777 ix86_get_function_versions_dispatcher (void *decl)
2778 {
2779   tree fn = (tree) decl;
2780   struct cgraph_node *node = NULL;
2781   struct cgraph_node *default_node = NULL;
2782   struct cgraph_function_version_info *node_v = NULL;
2783   struct cgraph_function_version_info *first_v = NULL;
2784 
2785   tree dispatch_decl = NULL;
2786 
2787   struct cgraph_function_version_info *default_version_info = NULL;
2788 
2789   gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
2790 
2791   node = cgraph_node::get (fn);
2792   gcc_assert (node != NULL);
2793 
2794   node_v = node->function_version ();
2795   gcc_assert (node_v != NULL);
2796 
2797   if (node_v->dispatcher_resolver != NULL)
2798     return node_v->dispatcher_resolver;
2799 
2800   /* Find the default version and make it the first node.  */
2801   first_v = node_v;
2802   /* Go to the beginning of the chain.  */
2803   while (first_v->prev != NULL)
2804     first_v = first_v->prev;
2805   default_version_info = first_v;
2806   while (default_version_info != NULL)
2807     {
2808       if (is_function_default_version
2809               (default_version_info->this_node->decl))
2810         break;
2811       default_version_info = default_version_info->next;
2812     }
2813 
2814   /* If there is no default node, just return NULL.  */
2815   if (default_version_info == NULL)
2816     return NULL;
2817 
2818   /* Make default info the first node.  */
2819   if (first_v != default_version_info)
2820     {
2821       default_version_info->prev->next = default_version_info->next;
2822       if (default_version_info->next)
2823         default_version_info->next->prev = default_version_info->prev;
2824       first_v->prev = default_version_info;
2825       default_version_info->next = first_v;
2826       default_version_info->prev = NULL;
2827     }
2828 
2829   default_node = default_version_info->this_node;
2830 
2831 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
2832   if (targetm.has_ifunc_p ())
2833     {
2834       struct cgraph_function_version_info *it_v = NULL;
2835       struct cgraph_node *dispatcher_node = NULL;
2836       struct cgraph_function_version_info *dispatcher_version_info = NULL;
2837 
2838       /* Right now, the dispatching is done via ifunc.  */
2839       dispatch_decl = make_dispatcher_decl (default_node->decl);
2840 
2841       dispatcher_node = cgraph_node::get_create (dispatch_decl);
2842       gcc_assert (dispatcher_node != NULL);
2843       dispatcher_node->dispatcher_function = 1;
2844       dispatcher_version_info
2845           = dispatcher_node->insert_new_function_version ();
2846       dispatcher_version_info->next = default_version_info;
2847       dispatcher_node->definition = 1;
2848 
2849       /* Set the dispatcher for all the versions.  */
2850       it_v = default_version_info;
2851       while (it_v != NULL)
2852           {
2853             it_v->dispatcher_resolver = dispatch_decl;
2854             it_v = it_v->next;
2855           }
2856     }
2857   else
2858 #endif
2859     {
2860       error_at (DECL_SOURCE_LOCATION (default_node->decl),
2861                     "multiversioning needs %<ifunc%> which is not supported "
2862                     "on this target");
2863     }
2864 
2865   return dispatch_decl;
2866 }
2867 
2868 /* Make the resolver function decl to dispatch the versions of
2869    a multi-versioned function,  DEFAULT_DECL.  IFUNC_ALIAS_DECL is
2870    ifunc alias that will point to the created resolver.  Create an
2871    empty basic block in the resolver and store the pointer in
2872    EMPTY_BB.  Return the decl of the resolver function.  */
2873 
2874 static tree
make_resolver_func(const tree default_decl,const tree ifunc_alias_decl,basic_block * empty_bb)2875 make_resolver_func (const tree default_decl,
2876                         const tree ifunc_alias_decl,
2877                         basic_block *empty_bb)
2878 {
2879   tree decl, type, t;
2880 
2881   /* Create resolver function name based on default_decl.  */
2882   tree decl_name = clone_function_name (default_decl, "resolver");
2883   const char *resolver_name = IDENTIFIER_POINTER (decl_name);
2884 
2885   /* The resolver function should return a (void *). */
2886   type = build_function_type_list (ptr_type_node, NULL_TREE);
2887 
2888   decl = build_fn_decl (resolver_name, type);
2889   SET_DECL_ASSEMBLER_NAME (decl, decl_name);
2890 
2891   DECL_NAME (decl) = decl_name;
2892   TREE_USED (decl) = 1;
2893   DECL_ARTIFICIAL (decl) = 1;
2894   DECL_IGNORED_P (decl) = 1;
2895   TREE_PUBLIC (decl) = 0;
2896   DECL_UNINLINABLE (decl) = 1;
2897 
2898   /* Resolver is not external, body is generated.  */
2899   DECL_EXTERNAL (decl) = 0;
2900   DECL_EXTERNAL (ifunc_alias_decl) = 0;
2901 
2902   DECL_CONTEXT (decl) = NULL_TREE;
2903   DECL_INITIAL (decl) = make_node (BLOCK);
2904   DECL_STATIC_CONSTRUCTOR (decl) = 0;
2905 
2906   if (DECL_COMDAT_GROUP (default_decl)
2907       || TREE_PUBLIC (default_decl))
2908     {
2909       /* In this case, each translation unit with a call to this
2910            versioned function will put out a resolver.  Ensure it
2911            is comdat to keep just one copy.  */
2912       DECL_COMDAT (decl) = 1;
2913       make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
2914     }
2915   else
2916     TREE_PUBLIC (ifunc_alias_decl) = 0;
2917 
2918   /* Build result decl and add to function_decl. */
2919   t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
2920   DECL_CONTEXT (t) = decl;
2921   DECL_ARTIFICIAL (t) = 1;
2922   DECL_IGNORED_P (t) = 1;
2923   DECL_RESULT (decl) = t;
2924 
2925   gimplify_function_tree (decl);
2926   push_cfun (DECL_STRUCT_FUNCTION (decl));
2927   *empty_bb = init_lowered_empty_function (decl, false,
2928                                                      profile_count::uninitialized ());
2929 
2930   cgraph_node::add_new_function (decl, true);
2931   symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
2932 
2933   pop_cfun ();
2934 
2935   gcc_assert (ifunc_alias_decl != NULL);
2936   /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
2937   DECL_ATTRIBUTES (ifunc_alias_decl)
2938     = make_attribute ("ifunc", resolver_name,
2939                           DECL_ATTRIBUTES (ifunc_alias_decl));
2940 
2941   /* Create the alias for dispatch to resolver here.  */
2942   cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
2943   return decl;
2944 }
2945 
2946 /* Generate the dispatching code body to dispatch multi-versioned function
2947    DECL.  The target hook is called to process the "target" attributes and
2948    provide the code to dispatch the right function at run-time.  NODE points
2949    to the dispatcher decl whose body will be created.  */
2950 
2951 tree
ix86_generate_version_dispatcher_body(void * node_p)2952 ix86_generate_version_dispatcher_body (void *node_p)
2953 {
2954   tree resolver_decl;
2955   basic_block empty_bb;
2956   tree default_ver_decl;
2957   struct cgraph_node *versn;
2958   struct cgraph_node *node;
2959 
2960   struct cgraph_function_version_info *node_version_info = NULL;
2961   struct cgraph_function_version_info *versn_info = NULL;
2962 
2963   node = (cgraph_node *)node_p;
2964 
2965   node_version_info = node->function_version ();
2966   gcc_assert (node->dispatcher_function
2967                 && node_version_info != NULL);
2968 
2969   if (node_version_info->dispatcher_resolver)
2970     return node_version_info->dispatcher_resolver;
2971 
2972   /* The first version in the chain corresponds to the default version.  */
2973   default_ver_decl = node_version_info->next->this_node->decl;
2974 
2975   /* node is going to be an alias, so remove the finalized bit.  */
2976   node->definition = false;
2977 
2978   resolver_decl = make_resolver_func (default_ver_decl,
2979                                               node->decl, &empty_bb);
2980 
2981   node_version_info->dispatcher_resolver = resolver_decl;
2982 
2983   push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
2984 
2985   auto_vec<tree, 2> fn_ver_vec;
2986 
2987   for (versn_info = node_version_info->next; versn_info;
2988        versn_info = versn_info->next)
2989     {
2990       versn = versn_info->this_node;
2991       /* Check for virtual functions here again, as by this time it should
2992            have been determined if this function needs a vtable index or
2993            not.  This happens for methods in derived classes that override
2994            virtual methods in base classes but are not explicitly marked as
2995            virtual.  */
2996       if (DECL_VINDEX (versn->decl))
2997           sorry ("virtual function multiversioning not supported");
2998 
2999       fn_ver_vec.safe_push (versn->decl);
3000     }
3001 
3002   dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
3003   cgraph_edge::rebuild_edges ();
3004   pop_cfun ();
3005   return resolver_decl;
3006 }
3007 
3008 
3009