llvm
diff --git a/‎include/llvm/Target/TargetLowering.h‎
Lines changed: 3 additions & 0 deletions b/‎include/llvm/Target/TargetLowering.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp‎
Lines changed: 2 additions & 1 deletion b/‎lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎lib/Target/X86/X86ISelLowering.cpp‎
Lines changed: 6 additions & 0 deletions b/‎lib/Target/X86/X86ISelLowering.cpp‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎lib/Target/X86/X86ISelLowering.h‎
Lines changed: 4 additions & 0 deletions b/‎lib/Target/X86/X86ISelLowering.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎test/CodeGen/X86/MergeConsecutiveStores.ll‎
Lines changed: 24 additions & 15 deletions b/‎test/CodeGen/X86/MergeConsecutiveStores.ll‎
Lines changed: 24 additions & 15 deletions
diff --git a/‎test/CodeGen/X86/bigstructret.ll‎
Lines changed: 3 additions & 4 deletions b/‎test/CodeGen/X86/bigstructret.ll‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎test/CodeGen/X86/bitcast-i256.ll‎
Lines changed: 2 additions & 5 deletions b/‎test/CodeGen/X86/bitcast-i256.ll‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎test/CodeGen/X86/constant-combines.ll‎
Lines changed: 1 addition & 2 deletions b/‎test/CodeGen/X86/constant-combines.ll‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎test/CodeGen/X86/extract-store.ll‎
Lines changed: 46 additions & 32 deletions b/‎test/CodeGen/X86/extract-store.ll‎
Lines changed: 46 additions & 32 deletions
diff --git a/‎test/CodeGen/X86/fold-vector-sext-crash2.ll‎
Lines changed: 8 additions & 4 deletions b/‎test/CodeGen/X86/fold-vector-sext-crash2.ll‎
Lines changed: 8 additions & 4 deletions
@@ -2723,6 +2723,9 @@ class TargetLowering : public TargetLoweringBase {
  bool foldBooleans, DAGCombinerInfo &DCI,
  const SDLoc &dl) const;
 
+ /// For targets that wrap addresses, returns the unwrapped address for use in
+ /// address analysis. The default implementation returns \p N unchanged.
+ virtual SDValue unwrapAddress(SDValue N) const { return N; }
+
  /// Returns true (and the GlobalValue and the offset) if the node is a
  /// GlobalAddress + offset.
  virtual bool
 
@@ -14,6 +14,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
 
@@ -55,7 +56,7 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
 /// Parses tree in Ptr for base, index, offset addresses.
 BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
  // (((B + I*M) + c)) + c ...
- SDValue Base = Ptr;
+ SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
  SDValue Index = SDValue();
  int64_t Offset = 0;
  bool IsIndexSignExt = false;
 
@@ -27034,6 +27034,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
  return 1;
 }
 
+// Strips an X86ISD::Wrapper or X86ISD::WrapperRIP node, yielding its first
+// operand; any other node is handed back untouched.
+SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
+ const unsigned Opc = N->getOpcode();
+ if (Opc != X86ISD::Wrapper && Opc != X86ISD::WrapperRIP)
+ return N;
+ return N->getOperand(0);
+}
+
 /// Returns true (and the GlobalValue and the offset) if the node is a
 /// GlobalAddress + offset.
 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
 
@@ -812,6 +812,8 @@ namespace llvm {
  /// This method returns the name of a target specific DAG node.
  const char *getTargetNodeName(unsigned Opcode) const override;
 
+ /// Always returns true for X86. NOTE(review): presumably enables merging of
+ /// consecutive stores after legalization -- confirm against the base hook.
+ bool mergeStoresAfterLegalization() const override { return true; }
+
  bool isCheapToSpeculateCttz() const override;
 
  bool isCheapToSpeculateCtlz() const override;
@@ -867,6 +869,8 @@ namespace llvm {
  const SelectionDAG &DAG,
  unsigned Depth) const override;
 
+ /// Returns operand 0 of an X86ISD::Wrapper or X86ISD::WrapperRIP node, or
+ /// \p N itself for any other node.
+ SDValue unwrapAddress(SDValue N) const override;
+
  bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
  int64_t &Offset) const override;
 
 
@@ -492,10 +492,15 @@ define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
  store float %vecext7, float* %arrayidx7, align 4
  ret void
 
-; CHECK-LABEL: merge_vec_element_store
-; CHECK: vmovups
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK: vextractf128	$1, %ymm0, %xmm1
+; CHECK: vinsertf128	$1, %xmm1, %ymm0, %ymm0
+; CHECK: retq
+
+; This is what should be generated:
+; FIXME-LABEL: merge_vec_element_store
+; FIXME: vmovups
+; FIXME-NEXT: vzeroupper
+; FIXME-NEXT: retq
 }
 
 ; PR21711 - Merge vector stores into wider vector stores.
@@ -515,11 +520,18 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x flo
  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
  ret void
 
-; CHECK-LABEL: merge_vec_extract_stores
-; CHECK: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; These vblendpd are obviously redundant.
+; CHECK: vblendpd	$12, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3]
+; CHECK: vmovupd	%ymm0, 48(%rdi)
+; CHECK: vblendpd	$12, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3]
+; CHECK: vmovupd	%ymm0, 80(%rdi)
+
+; This is what should be generated:
+; FIXME-LABEL: merge_vec_extract_stores
+; FIXME: vmovups %ymm0, 48(%rdi)
+; FIXME-NEXT: vmovups %ymm1, 80(%rdi)
+; FIXME-NEXT: vzeroupper
+; FIXME-NEXT: retq
 }
 
 ; Merging vector stores when sourced from vector loads.
@@ -557,8 +569,7 @@ define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
 }
 
 ; This is a minimized test based on real code that was failing.
-; We could merge stores (and loads) like this...
-
+; This should now be merged.
 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
  %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
  %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
@@ -575,10 +586,8 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
  ret void
 
 ; CHECK-LABEL: merge_vec_element_and_scalar_load
-; CHECK: movq	(%rdi), %rax
-; CHECK-NEXT: movq	8(%rdi), %rcx
-; CHECK-NEXT: movq	%rax, 32(%rdi)
-; CHECK-NEXT: movq	%rcx, 40(%rdi)
+; CHECK: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
 ; CHECK-NEXT: retq
 }
 
 
@@ -31,22 +31,21 @@ entry:
  ret %0 %3
 }
 
+
 define fastcc %1 @ReturnBigStruct2() nounwind readnone {
 ; X86-LABEL: ReturnBigStruct2:
 ; X86: # BB#0: # %entry
 ; X86-NEXT: movl $48, 4(%ecx)
 ; X86-NEXT: movb $1, 2(%ecx)
-; X86-NEXT: movb $1, 1(%ecx)
-; X86-NEXT: movb $0, (%ecx)
+; X86-NEXT: movw $256, (%ecx) # imm = 0x100
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: ReturnBigStruct2:
 ; X64: # BB#0: # %entry
 ; X64-NEXT: movl $48, 4(%rdi)
 ; X64-NEXT: movb $1, 2(%rdi)
-; X64-NEXT: movb $1, 1(%rdi)
-; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: movw $256, (%rdi) # imm = 0x100
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
 entry:
 
@@ -12,11 +12,8 @@ define i256 @foo(<8 x i32> %a) {
 ;
 ; SLOW-LABEL: foo:
 ; SLOW: # BB#0:
-; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; SLOW-NEXT: vpextrq $1, %xmm1, 24(%rdi)
-; SLOW-NEXT: vmovq %xmm1, 16(%rdi)
-; SLOW-NEXT: vpextrq $1, %xmm0, 8(%rdi)
-; SLOW-NEXT: vmovq %xmm0, (%rdi)
+; SLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
+; SLOW-NEXT: vmovups %xmm0, (%rdi)
 ; SLOW-NEXT: movq %rdi, %rax
 ; SLOW-NEXT: vzeroupper
 ; SLOW-NEXT: retq
 
@@ -15,12 +15,11 @@ define void @PR22524({ float, float }* %arg) {
 ;
 ; CHECK-LABEL: PR22524:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $0, 4(%rdi)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
 ; CHECK-NEXT: movss %xmm1, 4(%rdi)
 ; CHECK-NEXT: retq
 entry:
 
@@ -510,22 +510,22 @@ define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind {
 }
 
 define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_0:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_0:
+; SSE-X32: # BB#0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
 ;
 ; SSE2-X64-LABEL: extract_f128_0:
 ; SSE2-X64: # BB#0:
@@ -539,6 +539,13 @@ define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
 ; SSE41-X64-NEXT: movq %rsi, (%rdi)
 ; SSE41-X64-NEXT: retq
 ;
+; AVX-X32-LABEL: extract_f128_0:
+; AVX-X32: # BB#0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
 ; AVX-X64-LABEL: extract_f128_0:
 ; AVX-X64: # BB#0:
 ; AVX-X64-NEXT: movq %rdx, 8(%rdi)
@@ -555,22 +562,22 @@ define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
 }
 
 define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_1:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_1:
+; SSE-X32: # BB#0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
 ;
 ; SSE2-X64-LABEL: extract_f128_1:
 ; SSE2-X64: # BB#0:
@@ -584,6 +591,13 @@ define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
 ; SSE41-X64-NEXT: movq %rcx, (%rdi)
 ; SSE41-X64-NEXT: retq
 ;
+; AVX-X32-LABEL: extract_f128_1:
+; AVX-X32: # BB#0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
 ; AVX-X64-LABEL: extract_f128_1:
 ; AVX-X64: # BB#0:
 ; AVX-X64-NEXT: movq %r8, 8(%rdi)
 
@@ -53,8 +53,10 @@ define <2 x i256> @test_zext1() {
  ret <2 x i256> %Shuff
 
  ; X64-LABEL: test_zext1
- ; X64: movq $0
- ; X64-NEXT: movq $0
+ ; X64: xorps %xmm0, %xmm0
+ ; X64: movaps %xmm0
+ ; X64: movaps %xmm0
+ ; X64: movaps %xmm0
  ; X64-NEXT: movq $0
  ; X64-NEXT: movq $254
 
@@ -75,8 +77,10 @@ define <2 x i256> @test_zext2() {
  ret <2 x i256> %Shuff
 
  ; X64-LABEL: test_zext2
- ; X64: movq $0
- ; X64-NEXT: movq $0
+ ; X64: xorps %xmm0, %xmm0
+ ; X64-NEXT: movaps %xmm0
+ ; X64-NEXT: movaps %xmm0
+ ; X64-NEXT: movaps %xmm0 
  ; X64-NEXT: movq $-1
  ; X64-NEXT: movq $-2