x86_64: Implement missing variants of XCHG. Add tests for them.
Also, clean up the handling of a special case when at least one of the
operands for XCHG is RAX/EAX/AX. Add a helper function that deduplicates
the code for different operand sizes.
This patch also extends `EmitOptionalByteRegNormalizingRex32` function
to handle the case when an instruction has both operands in byte
registers, and so it needs REX prefix if either of them is in a special
register (an example of such instruction is `xchg bpl, al`: if only the
source register is checked, no REX would be emitted). Previously
`EmitOptionalByteRegNormalizingRex32` handled only the case when the
source register is special (an example of such instruction is
`movzxb rax, bpl`).
Bug: 65872996
Test: m test-art-host-gtest
Change-Id: I20c5e9375bbd15d799e5748b127d154ddcc0fc11
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 35c68b8..52cedf0 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -3812,19 +3812,50 @@
}
-void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
+bool X86_64Assembler::try_xchg_rax(CpuRegister dst,
+ CpuRegister src,
+ void (X86_64Assembler::*prefix_fn)(CpuRegister)) {
+ Register src_reg = src.AsRegister();
+ Register dst_reg = dst.AsRegister();
+ if (src_reg != RAX && dst_reg != RAX) {
+ return false;
+ }
+ if (dst_reg == RAX) {
+ std::swap(src_reg, dst_reg);
+ }
+ if (dst_reg != RAX) {
+ // Prefix is needed only if one of the registers is not RAX, otherwise it's a pure NOP.
+ (this->*prefix_fn)(CpuRegister(dst_reg));
+ }
+ EmitUint8(0x90 + CpuRegister(dst_reg).LowBits());
+ return true;
+}
+
+
+void X86_64Assembler::xchgb(CpuRegister dst, CpuRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
- // There is a short version for rax.
- // It's a bit awkward, as CpuRegister has a const field, so assignment and thus swapping doesn't
- // work.
- const bool src_rax = src.AsRegister() == RAX;
- const bool dst_rax = dst.AsRegister() == RAX;
- if (src_rax || dst_rax) {
- EmitOptionalRex32(src_rax ? dst : src);
- EmitUint8(0x90 + (src_rax ? dst.LowBits() : src.LowBits()));
+ // There is no short version for AL.
+ EmitOptionalByteRegNormalizingRex32(dst, src, /*normalize_both=*/ true);
+ EmitUint8(0x86);
+ EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+
+void X86_64Assembler::xchgb(CpuRegister reg, const Address& address) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ EmitOptionalByteRegNormalizingRex32(reg, address);
+ EmitUint8(0x86);
+ EmitOperand(reg.LowBits(), address);
+}
+
+
+void X86_64Assembler::xchgw(CpuRegister dst, CpuRegister src) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ EmitOperandSizeOverride();
+ if (try_xchg_rax(dst, src, &X86_64Assembler::EmitOptionalRex32)) {
+ // A short version for AX.
return;
}
-
// General case.
EmitOptionalRex32(dst, src);
EmitUint8(0x87);
@@ -3832,26 +3863,23 @@
}
-void X86_64Assembler::xchgq(CpuRegister dst, CpuRegister src) {
+void X86_64Assembler::xchgw(CpuRegister reg, const Address& address) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
- // There is a short version for rax.
- // It's a bit awkward, as CpuRegister has a const field, so assignment and thus swapping doesn't
- // work.
- const bool src_rax = src.AsRegister() == RAX;
- const bool dst_rax = dst.AsRegister() == RAX;
- if (src_rax || dst_rax) {
- // If src == target, emit a nop instead.
- if (src_rax && dst_rax) {
- EmitUint8(0x90);
- } else {
- EmitRex64(src_rax ? dst : src);
- EmitUint8(0x90 + (src_rax ? dst.LowBits() : src.LowBits()));
- }
+ EmitOperandSizeOverride();
+ EmitOptionalRex32(reg, address);
+ EmitUint8(0x87);
+ EmitOperand(reg.LowBits(), address);
+}
+
+
+void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ if (try_xchg_rax(dst, src, &X86_64Assembler::EmitOptionalRex32)) {
+ // A short version for EAX.
return;
}
-
// General case.
- EmitRex64(dst, src);
+ EmitOptionalRex32(dst, src);
EmitUint8(0x87);
EmitRegisterOperand(dst.LowBits(), src.LowBits());
}
@@ -3865,6 +3893,27 @@
}
+void X86_64Assembler::xchgq(CpuRegister dst, CpuRegister src) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ if (try_xchg_rax(dst, src, &X86_64Assembler::EmitRex64)) {
+ // A short version for RAX.
+ return;
+ }
+ // General case.
+ EmitRex64(dst, src);
+ EmitUint8(0x87);
+ EmitRegisterOperand(dst.LowBits(), src.LowBits());
+}
+
+
+void X86_64Assembler::xchgq(CpuRegister reg, const Address& address) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ EmitRex64(reg, address);
+ EmitUint8(0x87);
+ EmitOperand(reg.LowBits(), address);
+}
+
+
void X86_64Assembler::cmpb(const Address& address, const Immediate& imm) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
CHECK(imm.is_int32());
@@ -5394,9 +5443,19 @@
EmitUint8(rex);
}
-void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src) {
- // For src, SPL, BPL, SIL, DIL need the rex prefix.
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst,
+ CpuRegister src,
+ bool normalize_both) {
+ // SPL, BPL, SIL, DIL need the REX prefix.
bool force = src.AsRegister() > 3;
+ if (normalize_both) {
+ // Some instructions take two byte registers, such as `xchg bpl, al`, so they need the REX
+ // prefix if either `src` or `dst` needs it.
+ force |= dst.AsRegister() > 3;
+ } else {
+ // Other instructions take one byte register and one full register, such as `movzxb rax, bpl`.
+ // They need REX prefix only if `src` needs it, but not `dst`.
+ }
EmitOptionalRex(force, false, dst.NeedsRex(), false, src.NeedsRex());
}
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index f385f22..8ef8931 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -775,10 +775,18 @@
void fptan();
void fprem();
+ void xchgb(CpuRegister dst, CpuRegister src);
+ void xchgb(CpuRegister reg, const Address& address);
+
+ void xchgw(CpuRegister dst, CpuRegister src);
+ void xchgw(CpuRegister reg, const Address& address);
+
void xchgl(CpuRegister dst, CpuRegister src);
- void xchgq(CpuRegister dst, CpuRegister src);
void xchgl(CpuRegister reg, const Address& address);
+ void xchgq(CpuRegister dst, CpuRegister src);
+ void xchgq(CpuRegister reg, const Address& address);
+
void cmpb(const Address& address, const Immediate& imm);
void cmpw(const Address& address, const Immediate& imm);
@@ -1102,7 +1110,13 @@
void EmitRex64(CpuRegister dst, XmmRegister src);
// Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
- void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
+ // `normalize_both` parameter controls if the REX prefix is checked only for the `src` register
+ // (which is the case for instructions like `movzxb rax, bpl`), or for both `src` and `dst`
+ // registers (which is the case of instructions like `xchg bpl, al`). By default only `src` is
+ // used to decide if REX is needed.
+ void EmitOptionalByteRegNormalizingRex32(CpuRegister dst,
+ CpuRegister src,
+ bool normalize_both = false);
void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
uint8_t EmitVexPrefixByteZero(bool is_twobyte_form);
@@ -1118,6 +1132,12 @@
uint8_t EmitVexPrefixByteTwo(bool W,
int SET_VEX_L,
int SET_VEX_PP);
+
+ // Helper function to emit a shorter variant of XCHG if at least one operand is RAX/EAX/AX.
+ bool try_xchg_rax(CpuRegister dst,
+ CpuRegister src,
+ void (X86_64Assembler::*prefix_fn)(CpuRegister));
+
ConstantArea constant_area_;
bool has_AVX_; // x86 256bit SIMD AVX.
bool has_AVX2_; // x86 256bit SIMD AVX 2.0.
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 562301c..feee577 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -951,11 +951,15 @@
/*imm_bytes*/ 4U, "xor ${imm}, %{reg}"), "xorli");
}
-TEST_F(AssemblerX86_64Test, Xchgq) {
+TEST_F(AssemblerX86_64Test, XchgqReg) {
DriverStr(RepeatRR(&x86_64::X86_64Assembler::xchgq, "xchgq %{reg2}, %{reg1}"), "xchgq");
}
-TEST_F(AssemblerX86_64Test, Xchgl) {
+TEST_F(AssemblerX86_64Test, XchgqMem) {
+ DriverStr(RepeatRA(&x86_64::X86_64Assembler::xchgq, "xchgq %{reg}, {mem}"), "xchgq");
+}
+
+TEST_F(AssemblerX86_64Test, XchglReg) {
// Exclude `xcghl eax, eax` because the reference implementation generates 0x87 0xC0 (contrary to
// the intel manual saying that this should be a `nop` 0x90). All other cases are the same.
static const std::vector<std::pair<x86_64::CpuRegister, x86_64::CpuRegister>> except = {
@@ -964,6 +968,26 @@
DriverStr(Repeatrr(&x86_64::X86_64Assembler::xchgl, "xchgl %{reg2}, %{reg1}", &except), "xchgl");
}
+TEST_F(AssemblerX86_64Test, XchglMem) {
+ DriverStr(RepeatrA(&x86_64::X86_64Assembler::xchgl, "xchgl %{reg}, {mem}"), "xchgl");
+}
+
+TEST_F(AssemblerX86_64Test, XchgwReg) {
+ DriverStr(Repeatww(&x86_64::X86_64Assembler::xchgw, "xchgw %{reg2}, %{reg1}"), "xchgw");
+}
+
+TEST_F(AssemblerX86_64Test, XchgwMem) {
+ DriverStr(RepeatwA(&x86_64::X86_64Assembler::xchgw, "xchgw %{reg}, {mem}"), "xchgw");
+}
+
+TEST_F(AssemblerX86_64Test, XchgbReg) {
+ DriverStr(Repeatbb(&x86_64::X86_64Assembler::xchgb, "xchgb %{reg2}, %{reg1}"), "xchgb");
+}
+
+TEST_F(AssemblerX86_64Test, XchgbMem) {
+ DriverStr(RepeatbA(&x86_64::X86_64Assembler::xchgb, "xchgb %{reg}, {mem}"), "xchgb");
+}
+
TEST_F(AssemblerX86_64Test, Cmpxchgb) {
DriverStr(RepeatAb(&x86_64::X86_64Assembler::cmpxchgb, "cmpxchgb %{reg}, {mem}"), "cmpxchgb");
}