memcpy-avx2.patch
4315 lines (4182 loc) · 147 KB
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 73c12e3bb8c..faf66634b54 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -128,7 +128,6 @@ static rtx expand_builtin_va_copy (tree);
static rtx inline_expand_builtin_bytecmp (tree, rtx);
static rtx expand_builtin_strcmp (tree, rtx);
static rtx expand_builtin_strncmp (tree, rtx, machine_mode);
-static rtx builtin_memcpy_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
static rtx expand_builtin_memchr (tree, rtx);
static rtx expand_builtin_memcpy (tree, rtx);
static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
@@ -145,7 +144,6 @@ static rtx expand_builtin_stpcpy (tree, rtx, machine_mode);
static rtx expand_builtin_stpncpy (tree, rtx);
static rtx expand_builtin_strncat (tree, rtx);
static rtx expand_builtin_strncpy (tree, rtx);
-static rtx builtin_memset_gen_str (void *, HOST_WIDE_INT, scalar_int_mode);
static rtx expand_builtin_memset (tree, rtx, machine_mode);
static rtx expand_builtin_memset_args (tree, tree, tree, rtx, machine_mode, tree);
static rtx expand_builtin_bzero (tree);
@@ -3860,7 +3858,7 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
a target constant. */
static rtx
-builtin_memcpy_read_str (void *data, HOST_WIDE_INT offset,
+builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
scalar_int_mode mode)
{
/* The REPresentation pointed to by DATA need not be a nul-terminated
@@ -6412,7 +6410,7 @@ expand_builtin_stpncpy (tree exp, rtx)
constant. */
rtx
-builtin_strncpy_read_str (void *data, HOST_WIDE_INT offset,
+builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
scalar_int_mode mode)
{
const char *str = (const char *) data;
@@ -6621,16 +6619,111 @@ expand_builtin_strncpy (tree exp, rtx target)
return NULL_RTX;
}
+/* Return the RTL of a register in MODE generated from PREV in the
+ previous iteration. */
+
+static rtx
+gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
+{
+ rtx target = nullptr;
+ by_pieces_prev *prev = (by_pieces_prev *) prevp;
+ if (prev != nullptr && prev->data != nullptr)
+ {
+ /* Use the previous data in the same mode. */
+ if (prev->mode == mode)
+ return prev->data;
+
+ rtx prev_rtx = prev->data;
+ machine_mode prev_mode = prev->mode;
+ unsigned int word_size = GET_MODE_SIZE (word_mode);
+ if (word_size < GET_MODE_SIZE (prev->mode)
+ && word_size > GET_MODE_SIZE (mode))
+ {
+ /* First generate subreg of word mode if the previous mode is
+ wider than word mode and word mode is wider than MODE. */
+ prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
+ prev_mode, 0);
+ prev_mode = word_mode;
+ }
+ if (prev_rtx != nullptr)
+ target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
+ }
+ return target;
+}
+
+/* Return the RTL of a register in MODE broadcasted from DATA. */
+
+static rtx
+gen_memset_broadcast (rtx data, scalar_int_mode mode)
+{
+ /* Skip if regno_reg_rtx isn't initialized. */
+ if (!regno_reg_rtx)
+ return nullptr;
+
+ rtx target = nullptr;
+
+ unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
+ machine_mode vector_mode;
+ if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
+ gcc_unreachable ();
+
+ enum insn_code icode = optab_handler (vec_duplicate_optab,
+ vector_mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
+ if (CONST_INT_P (data))
+ {
+ /* Use the move expander with CONST_VECTOR. */
+ rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
+ emit_move_insn (reg, const_vec);
+ }
+ else
+ {
+
+ class expand_operand ops[2];
+ create_output_operand (&ops[0], reg, vector_mode);
+ create_input_operand (&ops[1], data, QImode);
+ expand_insn (icode, 2, ops);
+ if (!rtx_equal_p (reg, ops[0].value))
+ emit_move_insn (reg, ops[0].value);
+ }
+ target = lowpart_subreg (mode, reg, vector_mode);
+ }
+
+ return target;
+}
+
/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
bytes from constant string DATA + OFFSET and return it as target
- constant. */
+ constant. If PREV isn't nullptr, it has the RTL info from the
+ previous iteration. */
rtx
-builtin_memset_read_str (void *data, HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+builtin_memset_read_str (void *data, void *prev,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
{
+ rtx target;
const char *c = (const char *) data;
- char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+ char *p;
+
+ /* Don't use the previous value if size is 1. */
+ if (GET_MODE_SIZE (mode) != 1)
+ {
+ target = gen_memset_value_from_prev (prev, mode);
+ if (target != nullptr)
+ return target;
+
+ p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
+ memset (p, *c, GET_MODE_SIZE (QImode));
+ rtx src = c_readstr (p, QImode);
+ target = gen_memset_broadcast (src, mode);
+ if (target != nullptr)
+ return target;
+ }
+
+ p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
memset (p, *c, GET_MODE_SIZE (mode));
@@ -6640,10 +6733,12 @@ builtin_memset_read_str (void *data, HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
char value given in the RTL register data. For example, if mode is
- 4 bytes wide, return the RTL for 0x01010101*data. */
+ 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
+ nullptr, it has the RTL info from the previous iteration. */
static rtx
-builtin_memset_gen_str (void *data, HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+builtin_memset_gen_str (void *data, void *prev,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
{
rtx target, coeff;
@@ -6654,6 +6749,14 @@ builtin_memset_gen_str (void *data, HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
if (size == 1)
return (rtx) data;
+ target = gen_memset_value_from_prev (prev, mode);
+ if (target != nullptr)
+ return target;
+
+ target = gen_memset_broadcast ((rtx) data, mode);
+ if (target != nullptr)
+ return target;
+
p = XALLOCAVEC (char, size);
memset (p, 1, size);
coeff = c_readstr (p, mode);
diff --git a/gcc/builtins.h b/gcc/builtins.h
index 307a20fbadb..e71f40c300a 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -110,8 +110,10 @@ extern void expand_builtin_update_setjmp_buf (rtx);
extern tree mathfn_built_in (tree, enum built_in_function fn);
extern tree mathfn_built_in (tree, combined_fn);
extern tree mathfn_built_in_type (combined_fn);
-extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
-extern rtx builtin_memset_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
+extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
+ scalar_int_mode);
+extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
+ scalar_int_mode);
extern rtx expand_builtin_saveregs (void);
extern tree std_build_builtin_va_list (void);
extern tree std_fn_abi_va_list (tree);
diff --git a/gcc/calls.c b/gcc/calls.c
index 883d08ba5f2..814891896a8 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -3727,7 +3727,7 @@ expand_call (tree exp, rtx target, int ignore)
So the entire argument block must then be preallocated (i.e., we
ignore PUSH_ROUNDING in that case). */
- int must_preallocate = !PUSH_ARGS;
+ int must_preallocate = !targetm.calls.push_argument (0);
/* Size of the stack reserved for parameter registers. */
int reg_parm_stack_space = 0;
@@ -3835,7 +3835,7 @@ expand_call (tree exp, rtx target, int ignore)
#endif
if (! OUTGOING_REG_PARM_STACK_SPACE ((!fndecl ? fntype : TREE_TYPE (fndecl)))
- && reg_parm_stack_space > 0 && PUSH_ARGS)
+ && reg_parm_stack_space > 0 && targetm.calls.push_argument (0))
must_preallocate = 1;
/* Set up a place to return a structure. */
@@ -5476,7 +5476,7 @@ emit_library_call_value_1 (int retval, rtx orgfun, rtx value,
}
else
{
- if (!PUSH_ARGS)
+ if (!targetm.calls.push_argument (0))
argblock = push_block (gen_int_mode (args_size.constant, Pmode), 0, 0);
}
diff --git a/gcc/config/bpf/bpf.h b/gcc/config/bpf/bpf.h
index 4c5b19e262b..80195cea5b2 100644
--- a/gcc/config/bpf/bpf.h
+++ b/gcc/config/bpf/bpf.h
@@ -288,9 +288,6 @@ enum reg_class
never used when passing arguments. However, we still have to
define the constants below. */
-/* If nonzero, push insns will be used to pass outgoing arguments. */
-#define PUSH_ARGS 0
-
/* If nonzero, function arguments will be evaluated from last to
first, rather than from first to last. */
#define PUSH_ARGS_REVERSED 1
diff --git a/gcc/config/cr16/cr16.c b/gcc/config/cr16/cr16.c
index 079706f7a91..75040fb2fa7 100644
--- a/gcc/config/cr16/cr16.c
+++ b/gcc/config/cr16/cr16.c
@@ -158,6 +158,8 @@ static void cr16_print_operand_address (FILE *, machine_mode, rtx);
#define TARGET_CLASS_LIKELY_SPILLED_P cr16_class_likely_spilled_p
/* Passing function arguments. */
+#undef TARGET_PUSH_ARGUMENT
+#define TARGET_PUSH_ARGUMENT hook_bool_uint_true
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG cr16_function_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
diff --git a/gcc/config/cr16/cr16.h b/gcc/config/cr16/cr16.h
index ae90610ad80..a60d9a79b0b 100644
--- a/gcc/config/cr16/cr16.h
+++ b/gcc/config/cr16/cr16.h
@@ -379,8 +379,6 @@ enum reg_class
#define ACCUMULATE_OUTGOING_ARGS 0
-#define PUSH_ARGS 1
-
#define PUSH_ROUNDING(BYTES) cr16_push_rounding (BYTES)
#ifndef CUMULATIVE_ARGS
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index d87f0fa1b1c..17206d0b0bd 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -190,6 +190,82 @@ ix86_expand_clear (rtx dest)
emit_insn (tmp);
}
+/* Return true if V can be broadcasted from an integer of WIDTH bits
+ which is returned in VAL_BROADCAST. Otherwise, return false. */
+
+static bool
+ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
+ HOST_WIDE_INT &val_broadcast)
+{
+ wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
+ val_broadcast = wi::extract_uhwi (val, 0, width);
+ for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
+ {
+ HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
+ if (val_broadcast != each)
+ return false;
+ }
+ val_broadcast = sext_hwi (val_broadcast, width);
+ return true;
+}
+
+/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
+
+static rtx
+ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
+{
+ /* Don't use integer vector broadcast if we can't move from GPR to SSE
+ register directly. */
+ if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+ return nullptr;
+
+ /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
+ broadcast only if vector broadcast is available. */
+ if (!TARGET_AVX
+ || !CONST_WIDE_INT_P (op)
+ || standard_sse_constant_p (op, mode))
+ return nullptr;
+
+ HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
+ HOST_WIDE_INT val_broadcast;
+ scalar_int_mode broadcast_mode;
+ if (TARGET_AVX2
+ && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
+ val_broadcast))
+ broadcast_mode = QImode;
+ else if (TARGET_AVX2
+ && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
+ val_broadcast))
+ broadcast_mode = HImode;
+ else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
+ val_broadcast))
+ broadcast_mode = SImode;
+ else if (TARGET_64BIT
+ && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
+ val_broadcast))
+ broadcast_mode = DImode;
+ else
+ return nullptr;
+
+ /* Check if OP can be broadcasted from VAL. */
+ for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
+ if (val != CONST_WIDE_INT_ELT (op, i))
+ return nullptr;
+
+ unsigned int nunits = (GET_MODE_SIZE (mode)
+ / GET_MODE_SIZE (broadcast_mode));
+ machine_mode vector_mode;
+ if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
+ gcc_unreachable ();
+ rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
+ bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
+ target,
+ GEN_INT (val_broadcast));
+ gcc_assert (ok);
+ target = lowpart_subreg (mode, target, vector_mode);
+ return target;
+}
+
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
@@ -347,20 +423,29 @@ ix86_expand_move (machine_mode mode, rtx operands[])
&& optimize)
op1 = copy_to_mode_reg (mode, op1);
- if (can_create_pseudo_p ()
- && CONST_DOUBLE_P (op1))
+ if (can_create_pseudo_p ())
{
- /* If we are loading a floating point constant to a register,
- force the value to memory now, since we'll get better code
- out the back end. */
+ if (CONST_DOUBLE_P (op1))
+ {
+ /* If we are loading a floating point constant to a
+ register, force the value to memory now, since we'll
+ get better code out the back end. */
- op1 = validize_mem (force_const_mem (mode, op1));
- if (!register_operand (op0, mode))
+ op1 = validize_mem (force_const_mem (mode, op1));
+ if (!register_operand (op0, mode))
+ {
+ rtx temp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (temp, op1));
+ emit_move_insn (op0, temp);
+ return;
+ }
+ }
+ else if (GET_MODE_SIZE (mode) >= 16)
{
- rtx temp = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (temp, op1));
- emit_move_insn (op0, temp);
- return;
+ rtx tmp = ix86_convert_const_wide_int_to_broadcast
+ (GET_MODE (op0), op1);
+ if (tmp != nullptr)
+ op1 = tmp;
}
}
}
@@ -368,6 +453,62 @@ ix86_expand_move (machine_mode mode, rtx operands[])
emit_insn (gen_rtx_SET (op0, op1));
}
+static rtx
+ix86_broadcast_from_integer_constant (machine_mode mode, rtx op)
+{
+ int nunits = GET_MODE_NUNITS (mode);
+ if (nunits < 2)
+ return nullptr;
+
+ /* Don't use integer vector broadcast if we can't move from GPR to SSE
+ register directly. */
+ if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+ return nullptr;
+
+ /* Convert CONST_VECTOR to a non-standard SSE constant integer
+ broadcast only if vector broadcast is available. */
+ if (!(TARGET_AVX2
+ || (TARGET_AVX
+ && (GET_MODE_INNER (mode) == SImode
+ || GET_MODE_INNER (mode) == DImode)))
+ || standard_sse_constant_p (op, mode))
+ return nullptr;
+
+ /* Don't broadcast from a 64-bit integer constant in 32-bit mode. */
+ if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT)
+ return nullptr;
+
+ if (GET_MODE_INNER (mode) == TImode)
+ return nullptr;
+
+ rtx constant = get_pool_constant (XEXP (op, 0));
+ if (GET_CODE (constant) != CONST_VECTOR)
+ return nullptr;
+
+ /* There could be some rtx like
+ (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+ but with "*.LC1" refer to V2DI constant vector. */
+ if (GET_MODE (constant) != mode)
+ {
+ constant = simplify_subreg (mode, constant, GET_MODE (constant),
+ 0);
+ if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+ return nullptr;
+ }
+
+ rtx first = XVECEXP (constant, 0, 0);
+
+ for (int i = 1; i < nunits; ++i)
+ {
+ rtx tmp = XVECEXP (constant, 0, i);
+ /* Vector duplicate value. */
+ if (!rtx_equal_p (tmp, first))
+ return nullptr;
+ }
+
+ return first;
+}
+
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
@@ -407,7 +548,36 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
}
else
- op1 = validize_mem (force_const_mem (mode, op1));
+ {
+ machine_mode mode = GET_MODE (op0);
+ rtx tmp = ix86_convert_const_wide_int_to_broadcast
+ (mode, op1);
+ if (tmp == nullptr)
+ op1 = validize_mem (force_const_mem (mode, op1));
+ else
+ op1 = tmp;
+ }
+ }
+
+ if (can_create_pseudo_p ()
+ && GET_MODE_SIZE (mode) >= 16
+ && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && (MEM_P (op1)
+ && SYMBOL_REF_P (XEXP (op1, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
+ {
+ rtx first = ix86_broadcast_from_integer_constant (mode, op1);
+ if (first != nullptr)
+ {
+ /* Broadcast to XMM/YMM/ZMM register from an integer
+ constant. */
+ op1 = ix86_gen_scratch_sse_rtx (mode);
+ bool ok = ix86_expand_vector_init_duplicate (false, mode,
+ op1, first);
+ gcc_assert (ok);
+ emit_move_insn (op0, op1);
+ return;
+ }
}
/* We need to check memory alignment for SSE mode since attribute
@@ -435,7 +605,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
&& !register_operand (op0, mode)
&& !register_operand (op1, mode))
{
- emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
+ rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
+ emit_move_insn (tmp, op1);
+ emit_move_insn (op0, tmp);
return;
}
@@ -13584,7 +13756,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
-static bool
+bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
rtx target, rtx val)
{
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index dbddfd8e48f..4e7014be034 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -107,19 +107,10 @@ INT_MODE (XI, 64);
PARTIAL_INT_MODE (HI, 16, P2QI);
PARTIAL_INT_MODE (SI, 32, P2HI);
-/* Mode used for signed overflow checking of TImode. As
- MAX_BITSIZE_MODE_ANY_INT is only 160, wide-int.h reserves only that
- rounded up to multiple of HOST_BITS_PER_WIDE_INT bits in wide_int etc.,
- so OImode is too large. For the overflow checking we actually need
- just 1 or 2 bits beyond TImode precision. Use 160 bits to have
- a multiple of 32. */
+/* Mode used for signed overflow checking of TImode. For the overflow
+ checking we actually need just 1 or 2 bits beyond TImode precision.
+ Use 160 bits to have a multiple of 32. */
PARTIAL_INT_MODE (OI, 160, POI);
-/* Keep the OI and XI modes from confusing the compiler into thinking
- that these modes could actually be used for computation. They are
- only holders for vectors during data movement. Include POImode precision
- though. */
-#define MAX_BITSIZE_MODE_ANY_INT (160)
-
/* The symbol Pmode stands for one of the above machine modes (usually SImode).
The tm.h file specifies which one. It is not a distinct mode. */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 7782cf1163f..bb79b7815fb 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -50,6 +50,8 @@ extern void ix86_reset_previous_fndecl (void);
extern bool ix86_using_red_zone (void);
+extern rtx ix86_gen_scratch_sse_rtx (machine_mode);
+
extern unsigned int ix86_regmode_natural_size (machine_mode);
#ifdef RTX_CODE
extern int standard_80387_constant_p (rtx);
@@ -257,6 +259,8 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
+ rtx);
/* In i386-c.c */
extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 36684c7857c..f03b81ed9cf 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -4191,6 +4191,18 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
}
}
+/* Implement TARGET_PUSH_ARGUMENT. */
+
+static bool
+ix86_push_argument (unsigned int npush)
+{
+ /* If SSE2 is available, use vector move to put large argument onto
+ stack. NB: In 32-bit mode, use 8-byte vector move. */
+ return ((!TARGET_SSE2 || npush < (TARGET_64BIT ? 16 : 8))
+ && TARGET_PUSH_ARGS
+ && !ACCUMULATE_OUTGOING_ARGS);
+}
+
/* Create the va_list data type. */
@@ -7941,8 +7953,17 @@ ix86_finalize_stack_frame_flags (void)
assumed stack realignment might be needed or -fno-omit-frame-pointer
is used, but in the end nothing that needed the stack alignment had
been spilled nor stack access, clear frame_pointer_needed and say we
- don't need stack realignment. */
- if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+ don't need stack realignment.
+
+ When vector register is used for piecewise move and store, we don't
+ increase stack_alignment_needed as there is no register spill for
+ piecewise move and store. Since stack_realign_needed is set to true
+ by checking stack_alignment_estimated which is updated by pseudo
+ vector register usage, we also need to check stack_realign_needed to
+ eliminate frame pointer. */
+ if ((stack_realign
+ || (!flag_omit_frame_pointer && optimize)
+ || crtl->stack_realign_needed)
&& frame_pointer_needed
&& crtl->is_leaf
&& crtl->sp_is_unchanging
@@ -10401,7 +10422,13 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
/* FALLTHRU */
case E_OImode:
case E_XImode:
- if (!standard_sse_constant_p (x, mode))
+ if (!standard_sse_constant_p (x, mode)
+ && GET_MODE_SIZE (TARGET_AVX512F
+ ? XImode
+ : (TARGET_AVX
+ ? OImode
+ : (TARGET_SSE2
+ ? TImode : DImode))) < GET_MODE_SIZE (mode))
return false;
default:
break;
@@ -22984,6 +23011,20 @@ ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
}
}
+/* Implement the TARGET_GEN_MEMSET_SCRATCH_RTX hook. Return a scratch
+ register in MODE for vector load and store. */
+
+rtx
+ix86_gen_scratch_sse_rtx (machine_mode mode)
+{
+ if (TARGET_SSE)
+ return gen_rtx_REG (mode, (TARGET_64BIT
+ ? LAST_REX_SSE_REG
+ : LAST_SSE_REG));
+ else
+ return gen_reg_rtx (mode);
+}
+
/* Address space support.
This is not "far pointers" in the 16-bit sense, but an easy way
@@ -23514,6 +23555,9 @@ ix86_run_selftests (void)
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
+#undef TARGET_OVERLAP_OP_BY_PIECES_P
+#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
+
#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
@@ -23563,6 +23607,8 @@ ix86_run_selftests (void)
#define TARGET_C_EXCESS_PRECISION ix86_get_excess_precision
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
+#undef TARGET_PUSH_ARGUMENT
+#define TARGET_PUSH_ARGUMENT ix86_push_argument
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
@@ -23882,6 +23928,9 @@ static bool ix86_libc_has_fast_function (int fcode ATTRIBUTE_UNUSED)
#undef TARGET_LIBC_HAS_FAST_FUNCTION
#define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
+#undef TARGET_GEN_MEMSET_SCRATCH_RTX
+#define TARGET_GEN_MEMSET_SCRATCH_RTX ix86_gen_scratch_sse_rtx
+
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 97700d797a7..4649bccc056 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1679,13 +1679,8 @@ enum reg_class
|| TARGET_64BIT_MS_ABI \
|| (TARGET_MACHO && crtl->profile))
-/* If defined, a C expression whose value is nonzero when we want to use PUSH
- instructions to pass outgoing arguments. */
-
-#define PUSH_ARGS (TARGET_PUSH_ARGS && !ACCUMULATE_OUTGOING_ARGS)
-
/* We want the stack and args grow in opposite directions, even if
- PUSH_ARGS is 0. */
+ targetm.calls.push_argument returns false. */
#define PUSH_ARGS_REVERSED 1
/* Offset of first parameter from the argument pointer register value. */
@@ -1970,9 +1965,10 @@ typedef struct ix86_args {
/* Define this as 1 if `char' should by default be signed; else as 0. */
#define DEFAULT_SIGNED_CHAR 1
-/* Max number of bytes we can move from memory to memory
- in one reasonably fast instruction. */
-#define MOVE_MAX 16
+/* The constant maximum number of bytes that a single instruction can
+ move quickly between memory and registers or between two memory
+ locations. */
+#define MAX_MOVE_MAX 64
/* MOVE_MAX_PIECES is the number of bytes at a time which we can
move efficiently, as opposed to MOVE_MAX which is the maximum
@@ -1983,11 +1979,34 @@ typedef struct ix86_args {
widest mode with MAX_FIXED_MODE_SIZE, we can only use TImode in
64-bit mode. */
#define MOVE_MAX_PIECES \
- ((TARGET_64BIT \
- && TARGET_SSE2 \
- && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
- && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
- ? GET_MODE_SIZE (TImode) : UNITS_PER_WORD)
+ ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? 64 \
+ : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD)))
+
+/* Max number of bytes we can move from memory to memory in one
+ reasonably fast instruction. */
+#define MOVE_MAX MOVE_MAX_PIECES
+
+/* STORE_MAX_PIECES is the number of bytes at a time that we can
+ store efficiently. */
+#define STORE_MAX_PIECES \
+ ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? 64 \
+ : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD)))
/* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3f81abc7804..749323384a6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -24319,3 +24319,34 @@
"TARGET_WIDEKL"
"aes<aeswideklvariant>\t{%0}"
[(set_attr "type" "other")])
+
+;; Modes handled by broadcast patterns. NB: Allow V64QI and V32HI with
+;; TARGET_AVX512F since ix86_expand_vector_init_duplicate can expand
+;; without TARGET_AVX512BW which is used by memset vector broadcast
+;; expander to XI with:
+;; vmovd %edi, %xmm15
+;; vpbroadcastb %xmm15, %ymm15
+;; vinserti64x4 $0x1, %ymm15, %zmm15, %zmm15
+
+(define_mode_iterator INT_BROADCAST_MODE
+ [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
+ (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+ (V8DI "TARGET_AVX512F && TARGET_64BIT")
+ (V4DI "TARGET_AVX && TARGET_64BIT") (V2DI "TARGET_64BIT")])
+
+;; Broadcast from an integer. NB: Enable broadcast only if we can move
+;; from GPR to SSE register directly.
+(define_expand "vec_duplicate<mode>"
+ [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand")
+ (vec_duplicate:INT_BROADCAST_MODE
+ (match_operand:<ssescalarmode> 1 "nonimmediate_operand")))]
+ "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
+{
+ if (!ix86_expand_vector_init_duplicate (false,
+ GET_MODE (operands[0]),
+ operands[0],
+ operands[1]))
+ gcc_unreachable ();
+ DONE;
+})
diff --git a/gcc/config/m32c/m32c.c b/gcc/config/m32c/m32c.c
index b1cb3591da6..d22bdd79c71 100644
--- a/gcc/config/m32c/m32c.c
+++ b/gcc/config/m32c/m32c.c
@@ -1296,6 +1296,9 @@ m32c_push_rounding (poly_int64 n)
return (n + 1) & ~1;
}
+#undef TARGET_PUSH_ARGUMENT
+#define TARGET_PUSH_ARGUMENT hook_bool_uint_true
+
/* Passing Arguments in Registers */
/* Implements TARGET_FUNCTION_ARG. Arguments are passed partly in
diff --git a/gcc/config/m32c/m32c.h b/gcc/config/m32c/m32c.h
index 635f5924c20..228a73d1c42 100644
--- a/gcc/config/m32c/m32c.h
+++ b/gcc/config/m32c/m32c.h
@@ -472,7 +472,6 @@ enum reg_class
/* Passing Function Arguments on the Stack */
-#define PUSH_ARGS 1
#define PUSH_ROUNDING(N) m32c_push_rounding (N)
#define CALL_POPS_ARGS(C) 0
diff --git a/gcc/config/nios2/nios2.h b/gcc/config/nios2/nios2.h
index 1840a466f96..dfca12cc525 100644
--- a/gcc/config/nios2/nios2.h
+++ b/gcc/config/nios2/nios2.h
@@ -297,7 +297,6 @@ typedef struct nios2_args
((REGNO) >= FIRST_ARG_REGNO && (REGNO) <= LAST_ARG_REGNO)
/* Passing function arguments on stack. */
-#define PUSH_ARGS 0
#define ACCUMULATE_OUTGOING_ARGS 1
/* We define TARGET_RETURN_IN_MEMORY, so set to zero. */
diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h
index 4c35a7d7ee3..9b6be323e6d 100644
--- a/gcc/config/pru/pru.h
+++ b/gcc/config/pru/pru.h
@@ -339,7 +339,6 @@ typedef struct pru_args
((REGNO) >= FIRST_ARG_REGNUM && (REGNO) <= LAST_ARG_REGNUM)
/* Passing function arguments on stack. */
-#define PUSH_ARGS 0
#define ACCUMULATE_OUTGOING_ARGS 1
/* We define TARGET_RETURN_IN_MEMORY, so set to zero. */
diff --git a/gcc/defaults.h b/gcc/defaults.h
index 91216593e75..ba79a8e48ed 100644
--- a/gcc/defaults.h
+++ b/gcc/defaults.h
@@ -801,15 +801,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define NEXT_OBJC_RUNTIME 0
#endif
-/* Supply a default definition for PUSH_ARGS. */
-#ifndef PUSH_ARGS
-#ifdef PUSH_ROUNDING
-#define PUSH_ARGS !ACCUMULATE_OUTGOING_ARGS
-#else
-#define PUSH_ARGS 0
-#endif
-#endif
-
/* Decide whether a function's arguments should be processed
from first to last or from last to first.
@@ -820,7 +811,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#ifndef PUSH_ARGS_REVERSED
#if defined (STACK_GROWS_DOWNWARD) != defined (ARGS_GROW_DOWNWARD)
-#define PUSH_ARGS_REVERSED PUSH_ARGS
+#define PUSH_ARGS_REVERSED targetm.calls.push_argument (0)
#endif
#endif
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b370bc76b25..3b714234abc 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -3807,14 +3807,17 @@ cases of mismatch, it also makes for better code on certain machines.
The default is to not promote prototypes.
@end deftypefn
-@defmac PUSH_ARGS
-A C expression. If nonzero, push insns will be used to pass
-outgoing arguments.
-If the target machine does not have a push instruction, set it to zero.
-That directs GCC to use an alternate strategy: to
-allocate the entire argument block and then store the arguments into
-it. When @code{PUSH_ARGS} is nonzero, @code{PUSH_ROUNDING} must be defined too.
-@end defmac
+@deftypefn {Target Hook} bool TARGET_PUSH_ARGUMENT (unsigned int @var{npush})
+This target hook returns @code{true} if push instructions will be
+used to pass outgoing arguments. When the push instruction usage is
+optional, @var{npush} is nonzero to indicate the number of bytes to
+push. Otherwise, @var{npush} is zero. If the target machine does not
+have a push instruction or push instruction should be avoided,
+@code{false} should be returned. That directs GCC to use an alternate
+strategy: to allocate the entire argument block and then store the
+arguments into it. If this target hook may return @code{true},
+@code{PUSH_ROUNDING} must be defined.
+@end deftypefn
@defmac PUSH_ARGS_REVERSED
A C expression. If nonzero, function arguments will be evaluated from
@@ -6767,6 +6770,13 @@ in code size, for example where the number of insns emitted to perform a
move would be greater than that of a library call.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_OVERLAP_OP_BY_PIECES_P (void)
+This target hook should return true if when the @code{by_pieces}
+infrastructure is used, an offset adjusted unaligned memory operation
+in the smallest integer mode for the last piece operation of a memory
+region can be generated to avoid doing more than one smaller operations.
+@end deftypefn
+
@deftypefn {Target Hook} int TARGET_COMPARE_BY_PIECES_BRANCH_RATIO (machine_mode @var{mode})
When expanding a block comparison in MODE, gcc can try to reduce the
number of branches at the expense of more memory operations. This hook
@@ -11937,6 +11947,11 @@ This function prepares to emit a conditional comparison within a sequence
@var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
@end deftypefn
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for scratch register in @var{mode} to
+be used by memset broadcast. The default is @code{gen_reg_rtx}.
+@end deftypefn
+
@deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
This target hook returns a new value for the number of times @var{loop}
should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 2974dae2701..e60af7c3ec6 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -3100,14 +3100,7 @@ control passing certain arguments in registers.
@hook TARGET_PROMOTE_PROTOTYPES
-@defmac PUSH_ARGS
-A C expression. If nonzero, push insns will be used to pass
-outgoing arguments.
-If the target machine does not have a push instruction, set it to zero.
-That directs GCC to use an alternate strategy: to
-allocate the entire argument block and then store the arguments into
-it. When @code{PUSH_ARGS} is nonzero, @code{PUSH_ROUNDING} must be defined too.
-@end defmac
+@hook TARGET_PUSH_ARGUMENT
@defmac PUSH_ARGS_REVERSED
A C expression. If nonzero, function arguments will be evaluated from
@@ -4588,6 +4581,8 @@ If you don't define this, a reasonable default is used.
@hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+@hook TARGET_OVERLAP_OP_BY_PIECES_P
+
@hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO
@defmac MOVE_MAX_PIECES
@@ -8030,6 +8025,8 @@ lists.
@hook TARGET_GEN_CCMP_NEXT
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
@hook TARGET_LOOP_UNROLL_ADJUST
@defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 14a25c25450..3af9ea89305 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -746,7 +746,7 @@ static unsigned int
alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
{
scalar_int_mode tmode
- = int_mode_for_size (max_pieces * BITS_PER_UNIT, 1).require ();
+ = int_mode_for_size (max_pieces * BITS_PER_UNIT, 0).require ();
if (align >= GET_MODE_ALIGNMENT (tmode))
align = GET_MODE_ALIGNMENT (tmode);
@@ -815,12 +815,27 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
unsigned int max_size, by_pieces_operation op)
{
unsigned HOST_WIDE_INT n_insns = 0;
+ scalar_int_mode mode;
+
+ if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
+ {
+ /* NB: Round up L and ALIGN to the widest integer mode for
+ MAX_SIZE. */
+ mode = widest_int_mode_for_size (max_size);
+ if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
+ {
+ unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
+ if (up > l)
+ l = up;
+ align = GET_MODE_ALIGNMENT (mode);
+ }
+ }
align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
while (max_size > 1 && l > 0)
{
- scalar_int_mode mode = widest_int_mode_for_size (max_size);