[optimizing] Improve x86 parallel moves/swaps
Add a new constructor to ScratchRegisterScope that will supply a
register if there is a free one, but not spill to force one. Use this
to generated alternate code that doesn't use a temporary, as the
spill/restore of a register generates extra instructions that aren't
necessary on x86.
Here is the benefit for a 32 bit memory-to-memory exchange with no
free registers:
< 50 push eax
< 53 push ebx
< 8B44244C mov eax, [esp + 76]
< 8B5C246C mov ebx, [esp + 108]
< 8944246C mov [esp + 108], eax
< 895C244C mov [esp + 76], ebx
< 5B pop ebx
< 58 pop eax
---
> FF742444 push [esp + 68]
> FF742468 push [esp + 104]
> 8F44244C pop [esp + 72]
> 8F442468 pop [esp + 100]
Avoid using xchg instruction, as it is slow on smaller processors.
Change-Id: Id29ee3abd998577baaee552d55d23e60ae0c7871
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5710ec5..f714214 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -3838,15 +3838,27 @@
void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
ScratchRegisterScope ensure_scratch(
- this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+ this, TMP, codegen_->GetNumberOfCoreRegisters());
- int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
- __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
- __ movq(CpuRegister(ensure_scratch.GetRegister()),
- Address(CpuRegister(RSP), mem2 + stack_offset));
- __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
- __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
- CpuRegister(ensure_scratch.GetRegister()));
+ int temp_reg = ensure_scratch.GetRegister();
+ if (temp_reg == kNoRegister) {
+ // Use the stack as a temporary.
+ // Save mem1 on the stack.
+ __ pushq(Address(CpuRegister(RSP), mem1));
+
+ // Copy mem2 into mem1.
+ __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize));
+ __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP));
+
+ // Now pop mem1 into mem2.
+ __ popq(Address(CpuRegister(RSP), mem2));
+ } else {
+ CpuRegister temp = CpuRegister(temp_reg);
+ __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1));
+ __ movq(temp, Address(CpuRegister(RSP), mem2));
+ __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP));
+ __ movq(Address(CpuRegister(RSP), mem1), temp);
+ }
}
void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
@@ -3855,6 +3867,13 @@
__ movd(reg, CpuRegister(TMP));
}
+void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
+ // Prefer to avoid xchg as it isn't speedy on smaller processors.
+ __ movq(CpuRegister(TMP), reg1);
+ __ movq(reg1, reg2);
+ __ movq(reg2, CpuRegister(TMP));
+}
+
void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
__ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
__ movsd(Address(CpuRegister(RSP), mem), reg);
@@ -3867,7 +3886,7 @@
Location destination = move->GetDestination();
if (source.IsRegister() && destination.IsRegister()) {
- __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
+ Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
} else if (source.IsRegister() && destination.IsStackSlot()) {
Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
} else if (source.IsStackSlot() && destination.IsRegister()) {