[optimizing] Improve x86 parallel moves/swaps

Add a new constructor to ScratchRegisterScope that will supply a
register if there is a free one, but not spill to force one.  Use this
to generated alternate code that doesn't use a temporary, as the
spill/restore of a register generates extra instructions that aren't
necessary on x86.

Here is the benefit for a 32 bit memory-to-memory exchange with no
free registers:
<        50    	       push eax
<        53    	       push ebx
<  8B44244C    	       mov eax, [esp + 76]
<  8B5C246C    	       mov ebx, [esp + 108]
<  8944246C    	       mov [esp + 108], eax
<  895C244C    	       mov [esp + 76], ebx
<        5B    	       pop ebx
<        58    	       pop eax
---
>  FF742444    	       push [esp + 68]
>  FF742468    	       push [esp + 104]
>  8F44244C    	       pop [esp + 72]
>  8F442468    	       pop [esp + 100]

Avoid using xchg instruction, as it is slow on smaller processors.

Change-Id: Id29ee3abd998577baaee552d55d23e60ae0c7871
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5710ec5..f714214 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -3838,15 +3838,27 @@
 
 void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch(
-      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+      this, TMP, codegen_->GetNumberOfCoreRegisters());
 
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
-  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
-  __ movq(CpuRegister(ensure_scratch.GetRegister()),
-          Address(CpuRegister(RSP), mem2 + stack_offset));
-  __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
-  __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
-          CpuRegister(ensure_scratch.GetRegister()));
+  int temp_reg = ensure_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    // Use the stack as a temporary.
+    // Save mem1 on the stack.
+    __ pushq(Address(CpuRegister(RSP), mem1));
+
+    // Copy mem2 into mem1.
+    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize));
+    __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP));
+
+    // Now pop mem1 into mem2.
+    __ popq(Address(CpuRegister(RSP), mem2));
+  } else {
+    CpuRegister temp = CpuRegister(temp_reg);
+    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1));
+    __ movq(temp, Address(CpuRegister(RSP), mem2));
+    __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP));
+    __ movq(Address(CpuRegister(RSP), mem1), temp);
+  }
 }
 
 void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
@@ -3855,6 +3867,13 @@
   __ movd(reg, CpuRegister(TMP));
 }
 
+void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
+  // Prefer to avoid xchg as it isn't speedy on smaller processors.
+  __ movq(CpuRegister(TMP), reg1);
+  __ movq(reg1, reg2);
+  __ movq(reg2, CpuRegister(TMP));
+}
+
 void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
   __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
   __ movsd(Address(CpuRegister(RSP), mem), reg);
@@ -3867,7 +3886,7 @@
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
+    Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {