macro-fuse-cmp.ll
7.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG
; RUN: opt < %s -loop-reduce -mcpu=bdver2 -S | FileCheck %s --check-prefix=BUL
; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW
; RUN: llc < %s | FileCheck %s --check-prefix=BASE
; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE
; RUN: llc < %s -mattr=branchfusion | FileCheck %s --check-prefix=FUSE
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681
; FIXME: If a CPU can macro-fuse a compare and branch, then we discount that
; cost in LSR and avoid generating large offsets in each memory access.
; This reduces code size and may improve decode throughput.
define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) {
; JAG-LABEL: @maxArray(
; JAG-NEXT: entry:
; JAG-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to i8*
; JAG-NEXT: [[X3:%.*]] = bitcast double* [[X:%.*]] to i8*
; JAG-NEXT: br label [[VECTOR_BODY:%.*]]
; JAG: vector.body:
; JAG-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
; JAG-NEXT: [[UGLYGEP7:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
; JAG-NEXT: [[UGLYGEP78:%.*]] = bitcast i8* [[UGLYGEP7]] to <2 x double>*
; JAG-NEXT: [[SCEVGEP9:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP78]], i64 32768
; JAG-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y1]], i64 [[LSR_IV]]
; JAG-NEXT: [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
; JAG-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP2]], i64 32768
; JAG-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP9]], align 8
; JAG-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
; JAG-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; JAG-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; JAG-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
; JAG-NEXT: [[UGLYGEP45:%.*]] = bitcast i8* [[UGLYGEP4]] to <2 x double>*
; JAG-NEXT: [[SCEVGEP6:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP45]], i64 32768
; JAG-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP6]], align 8
; JAG-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
; JAG-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; JAG-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; JAG: exit:
; JAG-NEXT: ret void
;
; BUL-LABEL: @maxArray(
; BUL-NEXT: entry:
; BUL-NEXT: br label [[VECTOR_BODY:%.*]]
; BUL: vector.body:
; BUL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; BUL-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
; BUL-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
; BUL-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
; BUL-NEXT: [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
; BUL-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
; BUL-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
; BUL-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; BUL-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; BUL-NEXT: [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
; BUL-NEXT: [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
; BUL-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
; BUL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; BUL-NEXT: [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; BUL-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; BUL: exit:
; BUL-NEXT: ret void
;
; HSW-LABEL: @maxArray(
; HSW-NEXT: entry:
; HSW-NEXT: br label [[VECTOR_BODY:%.*]]
; HSW: vector.body:
; HSW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; HSW-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
; HSW-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
; HSW-NEXT: [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
; HSW-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; HSW-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; HSW-NEXT: [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
; HSW-NEXT: [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
; HSW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; HSW-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; HSW: exit:
; HSW-NEXT: ret void
;
; BASE-LABEL: maxArray:
; BASE: # %bb.0: # %entry
; BASE-NEXT: movq $-524288, %rax # imm = 0xFFF80000
; BASE-NEXT: .p2align 4, 0x90
; BASE-NEXT: .LBB0_1: # %vector.body
; BASE-NEXT: # =>This Inner Loop Header: Depth=1
; BASE-NEXT: movupd 524288(%rdi,%rax), %xmm0
; BASE-NEXT: movupd 524288(%rsi,%rax), %xmm1
; BASE-NEXT: maxpd %xmm0, %xmm1
; BASE-NEXT: movupd %xmm1, 524288(%rdi,%rax)
; BASE-NEXT: addq $16, %rax
; BASE-NEXT: jne .LBB0_1
; BASE-NEXT: # %bb.2: # %exit
; BASE-NEXT: retq
; FUSE-LABEL: maxArray:
; FUSE: # %bb.0: # %entry
; FUSE-NEXT: xorl %eax, %eax
; FUSE-NEXT: .p2align 4, 0x90
; FUSE-NEXT: .LBB0_1: # %vector.body
; FUSE-NEXT: # =>This Inner Loop Header: Depth=1
; FUSE-NEXT: movupd (%rdi,%rax,8), %xmm0
; FUSE-NEXT: movupd (%rsi,%rax,8), %xmm1
; FUSE-NEXT: maxpd %xmm0, %xmm1
; FUSE-NEXT: movupd %xmm1, (%rdi,%rax,8)
; FUSE-NEXT: addq $2, %rax
; FUSE-NEXT: cmpq $65536, %rax # imm = 0x10000
; FUSE-NEXT: jne .LBB0_1
; FUSE-NEXT: # %bb.2: # %exit
; FUSE-NEXT: retq
entry:
br label %vector.body
vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%gepx = getelementptr inbounds double, double* %x, i64 %index
%gepy = getelementptr inbounds double, double* %y, i64 %index
%xptr = bitcast double* %gepx to <2 x double>*
%yptr = bitcast double* %gepy to <2 x double>*
%xval = load <2 x double>, <2 x double>* %xptr, align 8
%yval = load <2 x double>, <2 x double>* %yptr, align 8
%cmp = fcmp ogt <2 x double> %yval, %xval
%max = select <2 x i1> %cmp, <2 x double> %yval, <2 x double> %xval
%xptr_again = bitcast double* %gepx to <2 x double>*
store <2 x double> %max, <2 x double>* %xptr_again, align 8
%index.next = add i64 %index, 2
%done = icmp eq i64 %index.next, 65536
br i1 %done, label %exit, label %vector.body
exit:
ret void
}