x86_64: Implement missing variants of XCHG. Add tests for them.

Also, clean up the handling of a special case when at least one of the
operands for XCHG is RAX/EAX/AX. Add a helper function that deduplicates
the code for different operand sizes.

This patch also extends `EmitOptionalByteRegNormalizingRex32` function
to handle the case when an instruction has both operands in byte
registers, and so it needs REX prefix if either of them is in a special
register (an example of such instruction is `xchg bpl, al`: if only the
source register is checked, no REX would be emitted). Previously
`EmitOptionalByteRegNormalizingRex32` handled only the case when the
source register is special (an example of such instruction is
`movzxb rax, bpl`).

Bug: 65872996
Test: m test-art-host-gtest
Change-Id: I20c5e9375bbd15d799e5748b127d154ddcc0fc11
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 35c68b8..52cedf0 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -3812,19 +3812,50 @@
 }
 
 
-void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
+bool X86_64Assembler::try_xchg_rax(CpuRegister dst,
+                                   CpuRegister src,
+                                   void (X86_64Assembler::*prefix_fn)(CpuRegister)) {
+  Register src_reg = src.AsRegister();
+  Register dst_reg = dst.AsRegister();
+  if (src_reg != RAX && dst_reg != RAX) {
+    return false;
+  }
+  if (dst_reg == RAX) {
+    std::swap(src_reg, dst_reg);
+  }
+  if (dst_reg != RAX) {
+    // Prefix is needed only if one of the registers is not RAX, otherwise it's a pure NOP.
+    (this->*prefix_fn)(CpuRegister(dst_reg));
+  }
+  EmitUint8(0x90 + CpuRegister(dst_reg).LowBits());
+  return true;
+}
+
+
+void X86_64Assembler::xchgb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  // There is a short version for rax.
-  // It's a bit awkward, as CpuRegister has a const field, so assignment and thus swapping doesn't
-  // work.
-  const bool src_rax = src.AsRegister() == RAX;
-  const bool dst_rax = dst.AsRegister() == RAX;
-  if (src_rax || dst_rax) {
-    EmitOptionalRex32(src_rax ? dst : src);
-    EmitUint8(0x90 + (src_rax ? dst.LowBits() : src.LowBits()));
+  // There is no short version for AL.
+  EmitOptionalByteRegNormalizingRex32(dst, src, /*normalize_both=*/ true);
+  EmitUint8(0x86);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+
+void X86_64Assembler::xchgb(CpuRegister reg, const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(reg, address);
+  EmitUint8(0x86);
+  EmitOperand(reg.LowBits(), address);
+}
+
+
+void X86_64Assembler::xchgw(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOperandSizeOverride();
+  if (try_xchg_rax(dst, src, &X86_64Assembler::EmitOptionalRex32)) {
+    // A short version for AX.
     return;
   }
-
   // General case.
   EmitOptionalRex32(dst, src);
   EmitUint8(0x87);
@@ -3832,26 +3863,23 @@
 }
 
 
-void X86_64Assembler::xchgq(CpuRegister dst, CpuRegister src) {
+void X86_64Assembler::xchgw(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  // There is a short version for rax.
-  // It's a bit awkward, as CpuRegister has a const field, so assignment and thus swapping doesn't
-  // work.
-  const bool src_rax = src.AsRegister() == RAX;
-  const bool dst_rax = dst.AsRegister() == RAX;
-  if (src_rax || dst_rax) {
-    // If src == target, emit a nop instead.
-    if (src_rax && dst_rax) {
-      EmitUint8(0x90);
-    } else {
-      EmitRex64(src_rax ? dst : src);
-      EmitUint8(0x90 + (src_rax ? dst.LowBits() : src.LowBits()));
-    }
+  EmitOperandSizeOverride();
+  EmitOptionalRex32(reg, address);
+  EmitUint8(0x87);
+  EmitOperand(reg.LowBits(), address);
+}
+
+
+void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (try_xchg_rax(dst, src, &X86_64Assembler::EmitOptionalRex32)) {
+    // A short version for EAX.
     return;
   }
-
   // General case.
-  EmitRex64(dst, src);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x87);
   EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
@@ -3865,6 +3893,27 @@
 }
 
 
+void X86_64Assembler::xchgq(CpuRegister dst, CpuRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  if (try_xchg_rax(dst, src, &X86_64Assembler::EmitRex64)) {
+    // A short version for RAX.
+    return;
+  }
+  // General case.
+  EmitRex64(dst, src);
+  EmitUint8(0x87);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+
+void X86_64Assembler::xchgq(CpuRegister reg, const Address& address) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitRex64(reg, address);
+  EmitUint8(0x87);
+  EmitOperand(reg.LowBits(), address);
+}
+
+
 void X86_64Assembler::cmpb(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   CHECK(imm.is_int32());
@@ -5394,9 +5443,19 @@
   EmitUint8(rex);
 }
 
-void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src) {
-  // For src, SPL, BPL, SIL, DIL need the rex prefix.
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst,
+                                                          CpuRegister src,
+                                                          bool normalize_both) {
+  // SPL, BPL, SIL, DIL need the REX prefix.
   bool force = src.AsRegister() > 3;
+  if (normalize_both) {
+    // Some instructions take two byte registers, such as `xchg bpl, al`, so they need the REX
+    // prefix if either `src` or `dst` needs it.
+    force |= dst.AsRegister() > 3;
+  } else {
+    // Other instructions take one byte register and one full register, such as `movzxb rax, bpl`.
+    // They need REX prefix only if `src` needs it, but not `dst`.
+  }
   EmitOptionalRex(force, false, dst.NeedsRex(), false, src.NeedsRex());
 }
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index f385f22..8ef8931 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -775,10 +775,18 @@
   void fptan();
   void fprem();
 
+  void xchgb(CpuRegister dst, CpuRegister src);
+  void xchgb(CpuRegister reg, const Address& address);
+
+  void xchgw(CpuRegister dst, CpuRegister src);
+  void xchgw(CpuRegister reg, const Address& address);
+
   void xchgl(CpuRegister dst, CpuRegister src);
-  void xchgq(CpuRegister dst, CpuRegister src);
   void xchgl(CpuRegister reg, const Address& address);
 
+  void xchgq(CpuRegister dst, CpuRegister src);
+  void xchgq(CpuRegister reg, const Address& address);
+
   void cmpb(const Address& address, const Immediate& imm);
   void cmpw(const Address& address, const Immediate& imm);
 
@@ -1102,7 +1110,13 @@
   void EmitRex64(CpuRegister dst, XmmRegister src);
 
   // Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
-  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
+  // `normalize_both` parameter controls if the REX prefix is checked only for the `src` register
+  // (which is the case for instructions like `movzxb rax, bpl`), or for both `src` and `dst`
+  // registers (which is the case of instructions like `xchg bpl, al`). By default only `src` is
+  // used to decide if REX is needed.
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst,
+                                           CpuRegister src,
+                                           bool normalize_both = false);
   void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
 
   uint8_t EmitVexPrefixByteZero(bool is_twobyte_form);
@@ -1118,6 +1132,12 @@
   uint8_t EmitVexPrefixByteTwo(bool W,
                                int SET_VEX_L,
                                int SET_VEX_PP);
+
+  // Helper function to emit a shorter variant of XCHG if at least one operand is RAX/EAX/AX.
+  bool try_xchg_rax(CpuRegister dst,
+                    CpuRegister src,
+                    void (X86_64Assembler::*prefix_fn)(CpuRegister));
+
   ConstantArea constant_area_;
   bool has_AVX_;     // x86 256bit SIMD AVX.
   bool has_AVX2_;    // x86 256bit SIMD AVX 2.0.
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 562301c..feee577 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -951,11 +951,15 @@
                      /*imm_bytes*/ 4U, "xor ${imm}, %{reg}"), "xorli");
 }
 
-TEST_F(AssemblerX86_64Test, Xchgq) {
+TEST_F(AssemblerX86_64Test, XchgqReg) {
   DriverStr(RepeatRR(&x86_64::X86_64Assembler::xchgq, "xchgq %{reg2}, %{reg1}"), "xchgq");
 }
 
-TEST_F(AssemblerX86_64Test, Xchgl) {
+TEST_F(AssemblerX86_64Test, XchgqMem) {
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::xchgq, "xchgq %{reg}, {mem}"), "xchgq");
+}
+
+TEST_F(AssemblerX86_64Test, XchglReg) {
   // Exclude `xcghl eax, eax` because the reference implementation generates 0x87 0xC0 (contrary to
   // the intel manual saying that this should be a `nop` 0x90). All other cases are the same.
   static const std::vector<std::pair<x86_64::CpuRegister, x86_64::CpuRegister>> except = {
@@ -964,6 +968,26 @@
   DriverStr(Repeatrr(&x86_64::X86_64Assembler::xchgl, "xchgl %{reg2}, %{reg1}", &except), "xchgl");
 }
 
+TEST_F(AssemblerX86_64Test, XchglMem) {
+  DriverStr(RepeatrA(&x86_64::X86_64Assembler::xchgl, "xchgl %{reg}, {mem}"), "xchgl");
+}
+
+TEST_F(AssemblerX86_64Test, XchgwReg) {
+  DriverStr(Repeatww(&x86_64::X86_64Assembler::xchgw, "xchgw %{reg2}, %{reg1}"), "xchgw");
+}
+
+TEST_F(AssemblerX86_64Test, XchgwMem) {
+  DriverStr(RepeatwA(&x86_64::X86_64Assembler::xchgw, "xchgw %{reg}, {mem}"), "xchgw");
+}
+
+TEST_F(AssemblerX86_64Test, XchgbReg) {
+  DriverStr(Repeatbb(&x86_64::X86_64Assembler::xchgb, "xchgb %{reg2}, %{reg1}"), "xchgb");
+}
+
+TEST_F(AssemblerX86_64Test, XchgbMem) {
+  DriverStr(RepeatbA(&x86_64::X86_64Assembler::xchgb, "xchgb %{reg}, {mem}"), "xchgb");
+}
+
 TEST_F(AssemblerX86_64Test, Cmpxchgb) {
   DriverStr(RepeatAb(&x86_64::X86_64Assembler::cmpxchgb, "cmpxchgb %{reg}, {mem}"), "cmpxchgb");
 }