Direct calls to @CriticalNative methods.

Emit direct calls from compiled managed code to the native
code registered with the method, avoiding the JNI stub.

Golem results:
art-opt-cc                       x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +12.5% +62.5% +75.9% +41.7%
NativeDowncallStaticCritical6 +55.6% +87.5% +72.1% +35.3%
art-opt                          x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +28.6% +85.6% +76.4% +38.4%
NativeDowncallStaticCritical6 +44.6% +44.6% +74.6% +32.2%

Test: Covered by 178-app-image-native-method.
Test: m test-art-host-gtest
Test: testrunner.py --host --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --target --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use -t 178
Test: aosp_cf_x86_phone-userdebug boots.
Test: aosp_cf_x86_phone-userdebug/jitzygote boots.
Bug: 112189621
Change-Id: I8b37da51e8fe0b7bc513bb81b127fe0416068866
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index cfd9ea6..f74a938 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -32,6 +32,7 @@
 #include "code_generator_x86_64.h"
 #endif
 
+#include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
 #include "base/casts.h"
@@ -503,23 +504,69 @@
 
   if (invoke->IsInvokeStaticOrDirect()) {
     HInvokeStaticOrDirect* call = invoke->AsInvokeStaticOrDirect();
-    switch (call->GetMethodLoadKind()) {
-      case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-        locations->SetInAt(call->GetSpecialInputIndex(), visitor->GetMethodLocation());
-        break;
-      case HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall:
-        locations->AddTemp(visitor->GetMethodLocation());
-        locations->SetInAt(call->GetSpecialInputIndex(), Location::RequiresRegister());
-        break;
-      default:
-        locations->AddTemp(visitor->GetMethodLocation());
-        break;
+    HInvokeStaticOrDirect::MethodLoadKind method_load_kind = call->GetMethodLoadKind();
+    HInvokeStaticOrDirect::CodePtrLocation code_ptr_location = call->GetCodePtrLocation();
+    if (code_ptr_location == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+      locations->AddTemp(Location::RequiresRegister());  // For target method.
+    }
+    if (code_ptr_location == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative ||
+        method_load_kind == HInvokeStaticOrDirect::MethodLoadKind::kRecursive) {
+      // For `kCallCriticalNative` we need the current method as the hidden argument
+      // if we reach the dlsym lookup stub for @CriticalNative.
+      locations->SetInAt(call->GetCurrentMethodIndex(), visitor->GetMethodLocation());
+    } else {
+      locations->AddTemp(visitor->GetMethodLocation());
+      if (method_load_kind == HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall) {
+        locations->SetInAt(call->GetCurrentMethodIndex(), Location::RequiresRegister());
+      }
     }
   } else if (!invoke->IsInvokePolymorphic()) {
     locations->AddTemp(visitor->GetMethodLocation());
   }
 }
 
+void CodeGenerator::PrepareCriticalNativeArgumentMoves(
+    HInvokeStaticOrDirect* invoke,
+    /*inout*/InvokeDexCallingConventionVisitor* visitor,
+    /*out*/HParallelMove* parallel_move) {
+  LocationSummary* locations = invoke->GetLocations();
+  for (size_t i = 0, num = invoke->GetNumberOfArguments(); i != num; ++i) {
+    Location in_location = locations->InAt(i);
+    DataType::Type type = invoke->InputAt(i)->GetType();
+    DCHECK_NE(type, DataType::Type::kReference);
+    Location out_location = visitor->GetNextLocation(type);
+    if (out_location.IsStackSlot() || out_location.IsDoubleStackSlot()) {
+      // Stack arguments will need to be moved after adjusting the SP.
+      parallel_move->AddMove(in_location, out_location, type, /*instruction=*/ nullptr);
+    } else {
+      // Register arguments should have been assigned their final locations for register allocation.
+      DCHECK(out_location.Equals(in_location)) << in_location << " -> " << out_location;
+    }
+  }
+}
+
+void CodeGenerator::AdjustCriticalNativeArgumentMoves(size_t out_frame_size,
+                                                      /*inout*/HParallelMove* parallel_move) {
+  // Adjust the source stack offsets by `out_frame_size`, i.e. the additional
+  // frame size needed for outgoing stack arguments.
+  for (size_t i = 0, num = parallel_move->NumMoves(); i != num; ++i) {
+    MoveOperands* operands = parallel_move->MoveOperandsAt(i);
+    Location source = operands->GetSource();
+    if (operands->GetSource().IsStackSlot()) {
+      operands->SetSource(Location::StackSlot(source.GetStackIndex() +  out_frame_size));
+    } else if (operands->GetSource().IsDoubleStackSlot()) {
+      operands->SetSource(Location::DoubleStackSlot(source.GetStackIndex() +  out_frame_size));
+    }
+  }
+}
+
+const char* CodeGenerator::GetCriticalNativeShorty(HInvokeStaticOrDirect* invoke,
+                                                   uint32_t* shorty_len) {
+  ScopedObjectAccess soa(Thread::Current());
+  DCHECK(invoke->GetResolvedMethod()->IsCriticalNative());
+  return invoke->GetResolvedMethod()->GetShorty(shorty_len);
+}
+
 void CodeGenerator::GenerateInvokeStaticOrDirectRuntimeCall(
     HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path) {
   MoveConstant(temp, invoke->GetDexMethodIndex());