Path: src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
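// Added note (illustrative): with count == RTMTotalCountIncrRate == 64 the mask
// is 63, so the branch above is taken unless the low 6 bits of the TSC happen
// to be zero, i.e. callers fall through and sample roughly 1 time in 64.
// In pseudo-C:
//
//   uint64_t tsc = rdtsc();
//   if ((tsc & (count - 1)) != 0) goto brLabel;  // skip the (sampled) update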
// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data)
{
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}
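// Worked example (added; 50 and 64 are the usual flag defaults, not values
// fixed by this code): the comparison above sets the no_rtm bit once
//   abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio
// e.g. with RTMAbortRatio == 50 and RTMTotalCountIncrRate == 64 that is
//   abort_count * 2 >= total_count * 64
// i.e. aborts exceed half of the extrapolated transaction count.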
// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);  // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
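// Background note (added): the low header bits tested by the lock code in this
// file use the standard HotSpot markWord encoding:
//   ..01  unlocked (neutral)   markWord::unlocked_value
//   ..00  stack-locked
//   ..10  inflated monitor     markWord::monitor_value
//   .101  biased               markWord::biased_lock_pattern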
// Use RTM for inflated locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}
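// Added note (Intel TSX abort status, reported in EAX after an abort):
//   bit 0: abort caused by XABORT      bit 1: retry may succeed
//   bit 2: conflict with other thread  bit 3: buffer overflow (capacity)
// The 0x6 tested in rtm_retry_lock_on_abort() is bits 1|2: retrying only
// makes sense for these transient abort causes.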
#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.

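// Illustrative sketch (added, not the actual .ad rules): a consumer of the
// ZF protocol described above would look roughly like
//   fast_lock(obj, box, rax, scr, ...);
//   jne  slow_path_stub;   // ZF == 0 -> fast path failed, call the runtime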
// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);        // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
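  // Worked example (added): with a 4K page the 64-bit mask is 7 - 4096 ==
  // 0x...FFFFF007, so the andptr above leaves zero (ZF == 1, recursive case)
  // only when (mark - rsp) is 8-byte aligned and within one page of SP.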
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

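  // Added note: the recursive stack-lock case below works because fast_lock
  // stored the (zero) result of its sp-proximity test into the box, so a zero
  // displaced header identifies a recursive enter that needs no unlock work.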
  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation
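// Added note: the sign_mask stubs referenced below hold vectors of
// 0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF and the sign_flip stubs hold 0x80000000 /
// 0x8000000000000000, so ANDing implements abs() and XORing implements neg().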
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}
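// Illustrative note (added): the SSE4.2 T_LONG path above computes min/max by
//   xmm0 = (x > y) ? ~0 : 0    per lane (pcmpgtq)
//   dst  = xmm0 ? src : dst    (blendvpd selects on xmm0's lane sign bits)
// so for Op_MinV, lanes where dst > src take src, which is min(dst, src).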
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}
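// Added note: the blend/compare sequence above exists because bare
// vminps/vmaxps do not match Java semantics: Math.min(-0.0f, +0.0f) must be
// -0.0f and a NaN in either operand must propagate. Blending on the sign bit
// orders the operands for the -0.0 case, and the UNORD_Q compare plus final
// blend routes NaNs through to the result.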
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}
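// Usage sketch (added, register names illustrative): for Math.signum the
// caller materializes the constants and dst is both input and output:
//   signum_fp(Op_SignumF, dst, xmm_zero, xmm_one, rscratch);
// dst > 0 produces 1.0, dst < 0 produces 1.0 with the sign bit flipped
// (-1.0), and +/-0.0 or NaN return the argument unchanged via the early exits.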
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
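// Added note: the AVX2 fallback for Op_RShiftVL above emulates a variable
// arithmetic shift with logical shifts. With m == 0x8000000000000000 (the
// vector_long_sign_mask stub) and per-lane shift s:
//   ((x >>> s) ^ (m >>> s)) - (m >>> s) == x >> s
// since the xor/subtract pair sign-extends the shifted-in zeros whenever the
// original sign bit was set.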
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
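// Added note: the gathers above use dword indices scaled by element size, so
// lane i loads from base + idx[i]*4 (T_INT/T_FLOAT) or base + idx[i]*8
// (T_LONG/T_DOUBLE); the mask (XMM for vgather, k-register for evgather)
// controls which lanes are actually loaded.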
void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
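// Worked example (added): load_vector_mask widens a boolean vector of
// 0x00/0x01 bytes into full-width lanes: psubb computes 0 - b, turning each
// 0x01 byte into 0xFF, and the following pmovsxb{w,d,q} sign-extends that
// byte to an all-ones (or all-zeros) T_SHORT/T_INT/T_LONG lane.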
"wrong type");1554}1555break;1556case Op_MulReductionVL: assert(UseAVX > 2, "required");1557vpmullq(dst, dst, src, vector_len); break;1558default: assert(false, "wrong opcode");1559}1560}15611562void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {1563int vector_len = Assembler::AVX_256bit;15641565switch (opcode) {1566case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;1567case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;1568case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;1569case Op_MinReductionV:1570switch (typ) {1571case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;1572case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;1573case T_INT: vpminsd(dst, src1, src2, vector_len); break;1574case T_LONG: assert(UseAVX > 2, "required");1575vpminsq(dst, src1, src2, vector_len); break;1576default: assert(false, "wrong type");1577}1578break;1579case Op_MaxReductionV:1580switch (typ) {1581case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;1582case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;1583case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;1584case T_LONG: assert(UseAVX > 2, "required");1585vpmaxsq(dst, src1, src2, vector_len); break;1586default: assert(false, "wrong type");1587}1588break;1589case Op_AddReductionVI:1590switch (typ) {1591case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;1592case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;1593case T_INT: vpaddd(dst, src1, src2, vector_len); break;1594default: assert(false, "wrong type");1595}1596break;1597case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;1598case Op_MulReductionVI:1599switch (typ) {1600case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;1601case T_INT: vpmulld(dst, src1, src2, vector_len); break;1602default: assert(false, "wrong type");1603}1604break;1605case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;1606default: assert(false, "wrong opcode");1607}1608}16091610void C2_MacroAssembler::reduce_fp(int opcode, int vlen,1611XMMRegister dst, XMMRegister src,1612XMMRegister vtmp1, XMMRegister vtmp2) {1613switch (opcode) {1614case Op_AddReductionVF:1615case Op_MulReductionVF:1616reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);1617break;16181619case Op_AddReductionVD:1620case Op_MulReductionVD:1621reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);1622break;16231624default: assert(false, "wrong opcode");1625}1626}16271628void C2_MacroAssembler::reduceB(int opcode, int vlen,1629Register dst, Register src1, XMMRegister src2,1630XMMRegister vtmp1, XMMRegister vtmp2) {1631switch (vlen) {1632case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;1633case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1634case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1635case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;16361637default: assert(false, "wrong vector length");1638}1639}16401641void C2_MacroAssembler::mulreduceB(int opcode, int vlen,1642Register dst, Register src1, XMMRegister src2,1643XMMRegister vtmp1, XMMRegister vtmp2) {1644switch (vlen) {1645case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;1646case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1647case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;1648case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;16491650default: assert(false, "wrong vector 
length");1651}1652}16531654void C2_MacroAssembler::reduceS(int opcode, int vlen,1655Register dst, Register src1, XMMRegister src2,1656XMMRegister vtmp1, XMMRegister vtmp2) {1657switch (vlen) {1658case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;1659case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;1660case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;1661case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;16621663default: assert(false, "wrong vector length");1664}1665}16661667void C2_MacroAssembler::reduceI(int opcode, int vlen,1668Register dst, Register src1, XMMRegister src2,1669XMMRegister vtmp1, XMMRegister vtmp2) {1670switch (vlen) {1671case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;1672case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;1673case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;1674case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;16751676default: assert(false, "wrong vector length");1677}1678}16791680#ifdef _LP641681void C2_MacroAssembler::reduceL(int opcode, int vlen,1682Register dst, Register src1, XMMRegister src2,1683XMMRegister vtmp1, XMMRegister vtmp2) {1684switch (vlen) {1685case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;1686case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;1687case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;16881689default: assert(false, "wrong vector length");1690}1691}1692#endif // _LP6416931694void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1695switch (vlen) {1696case 2:1697assert(vtmp2 == xnoreg, "");1698reduce2F(opcode, dst, src, vtmp1);1699break;1700case 4:1701assert(vtmp2 == xnoreg, "");1702reduce4F(opcode, dst, src, vtmp1);1703break;1704case 8:1705reduce8F(opcode, dst, src, vtmp1, vtmp2);1706break;1707case 16:1708reduce16F(opcode, dst, src, vtmp1, vtmp2);1709break;1710default: assert(false, "wrong vector length");1711}1712}17131714void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {1715switch (vlen) {1716case 2:1717assert(vtmp2 == xnoreg, "");1718reduce2D(opcode, dst, src, vtmp1);1719break;1720case 4:1721reduce4D(opcode, dst, src, vtmp1, vtmp2);1722break;1723case 8:1724reduce8D(opcode, dst, src, vtmp1, vtmp2);1725break;1726default: assert(false, "wrong vector length");1727}1728}17291730void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1731if (opcode == Op_AddReductionVI) {1732if (vtmp1 != src2) {1733movdqu(vtmp1, src2);1734}1735phaddd(vtmp1, vtmp1);1736} else {1737pshufd(vtmp1, src2, 0x1);1738reduce_operation_128(T_INT, opcode, vtmp1, src2);1739}1740movdl(vtmp2, src1);1741reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);1742movdl(dst, vtmp1);1743}17441745void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1746if (opcode == Op_AddReductionVI) {1747if (vtmp1 != src2) {1748movdqu(vtmp1, src2);1749}1750phaddd(vtmp1, src2);1751reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);1752} else {1753pshufd(vtmp2, src2, 0xE);1754reduce_operation_128(T_INT, opcode, vtmp2, src2);1755reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);1756}1757}17581759void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {1760if 
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}

void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0x1);
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}

void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  int vector_len = Assembler::AVX_256bit;
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
  reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  assert(ArrayCopyPartialInlineSize <= 64,"");
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
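
// The float (and below, double) reductions deliberately accumulate one lane
// at a time instead of pairwise tree folding: Java requires the same answer
// as the sequential scalar loop, and FP add/mul is not associative. A 4-lane
// float add-reduce therefore evaluates, in effect:
//   dst = ((((dst + f0) + f1) + f2) + f3)
// which is what the pshufd + addss chains of reduce2F above and reduce4F
// below implement.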
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }
    vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}
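
// reduceFloatMinMax/reduceDoubleMinMax take a separate path because Java's
// Math.min/max cannot be lowered to a bare vminps/vmaxps: NaNs must propagate
// and -0.0 must order below +0.0, which vminmax_fp emulates with the extra
// tmp/atmp/btmp blend registers (an interpretation of its role; see its
// definition for details). The loop above halves the live width each step;
// e.g. for vlen == 16 floats: a 512->256 extract (i == 3), a 256->128 extract
// (i == 2), then two in-lane vpermilps shuffles (permconst[] = {1, 14}),
// applying vminmax_fp after every split.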
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }
    vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
  switch (bt) {
    case T_BYTE:  pextrb(dst, src, idx); break;
    case T_SHORT: pextrw(dst, src, idx); break;
    case T_INT:   pextrd(dst, src, idx); break;
    case T_LONG:  pextrq(dst, src, idx); break;

    default:
      assert(false,"Should not reach here.");
      break;
  }
}

XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int lane = elemindex / elem_per_lane;
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src;
  }
}

void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      if (typ == T_BYTE)
        movsbl(dst, dst);
      else if (typ == T_SHORT)
        movswl(dst, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
  }
}

void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        pshufps(dst, dst, eindex);
      } else {
        vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert((vtmp != xnoreg) && (tmp != noreg), "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
      pand(dst, vtmp);
    } else {
      assert((tmp != noreg), "required.");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
    }
  }
}
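
// Worked example: reading float element 5 of a 256-bit vector combines the
// two helpers above (a sketch of the intended use, not generated verbatim):
//   XMMRegister lane = get_lane(T_FLOAT, dst, src, 5); // vextractf128(dst, src, 1)
//   get_elem(T_FLOAT, dst, lane, 5, tmp, vtmp);        // eindex = 5 % 4 = 1 ->
//                                                      // vpshufps + 32-bit mask
// i.e. first narrow to the containing 128-bit lane, then select within it.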
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
  switch(typ) {
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
      break;
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
                               int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
  int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
  switch (typ) {
    case T_BYTE:
      vpmovzxbw(vtmp1, src1, vlen_enc);
      vpmovzxbw(vtmp2, src2, vlen_enc);
      vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
      vpacksswb(dst, dst, dst, vlen_enc);
      break;
    case T_SHORT:
      vpmovzxwd(vtmp1, src1, vlen_enc);
      vpmovzxwd(vtmp2, src2, vlen_enc);
      vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
      vpackssdw(dst, dst, dst, vlen_enc);
      break;
    case T_INT:
      vpmovzxdq(vtmp1, src1, vlen_enc);
      vpmovzxdq(vtmp2, src2, vlen_enc);
      vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
      vpermilps(dst, dst, 8, vlen_enc);
      break;
    default:
      assert(false, "Should not reach here");
  }
  if (vlen_in_bytes == 16) {
    vpermpd(dst, dst, 0x8, vlen_enc);
  }
}

void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
                                 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
  int vlen_enc = vector_length_encoding(vlen_in_bytes);
  switch (typ) {
    case T_BYTE:
      vpmovzxbw(vtmp1, src1, vlen_enc);
      vpmovzxbw(vtmp2, src2, vlen_enc);
      vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
      vextracti128(vtmp1, src1, 1);
      vextracti128(vtmp2, src2, 1);
      vpmovzxbw(vtmp1, vtmp1, vlen_enc);
      vpmovzxbw(vtmp2, vtmp2, vlen_enc);
      vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
      vpacksswb(dst, dst, vtmp3, vlen_enc);
      vpermpd(dst, dst, 0xd8, vlen_enc);
      break;
    case T_SHORT:
      vpmovzxwd(vtmp1, src1, vlen_enc);
      vpmovzxwd(vtmp2, src2, vlen_enc);
      vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
      vextracti128(vtmp1, src1, 1);
      vextracti128(vtmp2, src2, 1);
      vpmovzxwd(vtmp1, vtmp1, vlen_enc);
      vpmovzxwd(vtmp2, vtmp2, vlen_enc);
      vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
      vpackssdw(dst, dst, vtmp3, vlen_enc);
      vpermpd(dst, dst, 0xd8, vlen_enc);
      break;
    case T_INT:
      vpmovzxdq(vtmp1, src1, vlen_enc);
      vpmovzxdq(vtmp2, src2, vlen_enc);
      vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
      vpshufd(dst, dst, 8, vlen_enc);
      vpermq(dst, dst, 8, vlen_enc);
      vextracti128(vtmp1, src1, 1);
      vextracti128(vtmp2, src2, 1);
      vpmovzxdq(vtmp1, vtmp1, vlen_enc);
      vpmovzxdq(vtmp2, vtmp2, vlen_enc);
      vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
      vpshufd(vtmp3, vtmp3, 8, vlen_enc);
      vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
      vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
      break;
    default:
      assert(false, "Should not reach here");
  }
}
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch(typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpblendmq(dst, kmask, src1, src2, merge, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
  switch(vlen) {
    case 4:
      assert(vtmp1 != xnoreg, "required.");
      // Broadcast lower 32 bits to 128 bits before ptest
      pshufd(vtmp1, src1, 0x0);
      if (bt == BoolTest::overflow) {
        assert(vtmp2 != xnoreg, "required.");
        pshufd(vtmp2, src2, 0x0);
      } else {
        assert(vtmp2 == xnoreg, "required.");
        vtmp2 = src2;
      }
      ptest(vtmp1, vtmp2);
      break;
    case 8:
      assert(vtmp1 != xnoreg, "required.");
      // Broadcast lower 64 bits to 128 bits before ptest
      pshufd(vtmp1, src1, 0x4);
      if (bt == BoolTest::overflow) {
        assert(vtmp2 != xnoreg, "required.");
        pshufd(vtmp2, src2, 0x4);
      } else {
        assert(vtmp2 == xnoreg, "required.");
        vtmp2 = src2;
      }
      ptest(vtmp1, vtmp2);
      break;
    case 16:
      assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
      ptest(src1, src2);
      break;
    case 32:
      assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
      vptest(src1, src2, Assembler::AVX_256bit);
      break;
    case 64:
      {
        assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
        evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
        if (bt == BoolTest::ne) {
          ktestql(mask, mask);
        } else {
          assert(bt == BoolTest::overflow, "required");
          kortestql(mask, mask);
        }
      }
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
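
  // For reference, the pcmpestri imm8 modes used here decode as follows (a
  // worked example of the instruction's encoding, not new behavior):
  //   0x0d = 0b1101: bits[1:0] = 01 -> unsigned words (UTF-16 chars),
  //                  bits[3:2] = 11 -> "equal ordered" aggregation, i.e.
  //                  substring search; the remaining bits are zero, so ECX
  //                  reports the least-significant matching index.
  //   0x0c differs only in bits[1:0] = 00 -> unsigned bytes (Latin-1).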

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross a page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);
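
      // Page-boundary arithmetic, concretely: with a 4K page,
      // (str2 & (page_size-1)) is str2's offset within its page, and offsets
      // <= page_size-16 guarantee a 16-byte vector load from str2 cannot
      // touch the following, possibly unmapped, page. Only substrings that
      // fail this check take the stack-copy path below.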

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2);     // substr count
      push(str2);     // substr addr
      push(str1);     // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");
0");27492750addl(tmp, cnt2);2751// Found result if we matched whole substring.2752cmpl(tmp, stride);2753jcc(Assembler::lessEqual, RET_FOUND);27542755// Repeat search for small substring (<= 8 chars)2756// from new point 'str1' without reloading substring.2757cmpl(cnt2, stride);2758// Have to check that we don't read beyond string.2759jccb(Assembler::lessEqual, ADJUST_STR);27602761Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;2762// Compare the rest of substring (> 8 chars).2763movptr(str1, result);27642765cmpl(tmp, cnt2);2766// First 8 chars are already matched.2767jccb(Assembler::equal, CHECK_NEXT);27682769bind(SCAN_SUBSTR);2770pcmpestri(vec, Address(str1, 0), mode);2771// Need to reload strings pointers if not matched whole vector2772jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 027732774bind(CHECK_NEXT);2775subl(cnt2, stride);2776jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring2777addptr(str1, 16);2778if (ae == StrIntrinsicNode::UL) {2779addptr(str2, 8);2780} else {2781addptr(str2, 16);2782}2783subl(cnt1, stride);2784cmpl(cnt2, stride); // Do not read beyond substring2785jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);2786// Back-up strings to avoid reading beyond substring.27872788if (ae == StrIntrinsicNode::UL) {2789lea(str2, Address(str2, cnt2, scale2, -8));2790lea(str1, Address(str1, cnt2, scale1, -16));2791} else {2792lea(str2, Address(str2, cnt2, scale2, -16));2793lea(str1, Address(str1, cnt2, scale1, -16));2794}2795subl(cnt1, cnt2);2796movl(cnt2, stride);2797addl(cnt1, stride);2798bind(CONT_SCAN_SUBSTR);2799if (ae == StrIntrinsicNode::UL) {2800pmovzxbw(vec, Address(str2, 0));2801} else {2802movdqu(vec, Address(str2, 0));2803}2804jmp(SCAN_SUBSTR);28052806bind(RET_FOUND_LONG);2807movptr(str1, Address(rsp, wordSize));2808} // non constant28092810bind(RET_FOUND);2811// Compute substr offset2812subptr(result, str1);2813if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {2814shrl(result, 1); // index2815}2816bind(CLEANUP);2817pop(rsp); // restore SP28182819} // string_indexof28202821void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,2822XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {2823ShortBranchVerifier sbv(this);2824assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");28252826int stride = 8;28272828Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,2829SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,2830RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,2831FOUND_SEQ_CHAR, DONE_LABEL;28322833movptr(result, str1);2834if (UseAVX >= 2) {2835cmpl(cnt1, stride);2836jcc(Assembler::less, SCAN_TO_CHAR);2837cmpl(cnt1, 2*stride);2838jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);2839movdl(vec1, ch);2840vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);2841vpxor(vec2, vec2);2842movl(tmp, cnt1);2843andl(tmp, 0xFFFFFFF0); //vector count (in chars)2844andl(cnt1,0x0000000F); //tail count (in chars)28452846bind(SCAN_TO_16_CHAR_LOOP);2847vmovdqu(vec3, Address(result, 0));2848vpcmpeqw(vec3, vec3, vec1, 1);2849vptest(vec2, vec3);2850jcc(Assembler::carryClear, FOUND_CHAR);2851addptr(result, 32);2852subl(tmp, 2*stride);2853jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);2854jmp(SCAN_TO_8_CHAR);2855bind(SCAN_TO_8_CHAR_INIT);2856movdl(vec1, ch);2857pshuflw(vec1, vec1, 0x00);2858pshufd(vec1, vec1, 0);2859pxor(vec2, vec2);2860}2861bind(SCAN_TO_8_CHAR);2862cmpl(cnt1, stride);2863jcc(Assembler::less, SCAN_TO_CHAR);2864if (UseAVX < 2) {2865movdl(vec1, ch);2866pshuflw(vec1, vec1, 0x00);2867pshufd(vec1, 
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); //less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); //last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}
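
// A reference sketch of the comparison loop implemented by string_compare
// below, in the spirit of the hasNegatives comment further down (see the
// Java library sources for the authoritative contract):
//   int min = Math.min(len1, len2);
//   for (int i = 0; i < min; i++) {
//     if (s1[i] != s2[i]) return s1[i] - s2[i];
//   }
//   return len1 - len2;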

// Compare strings, used for char[] and byte[].
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));   // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //       + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
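  // Falling out of the loop above means cnt2 counted up to zero with every
  // remaining element equal, so only the length difference pushed earlier
  // distinguishes the strings.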
  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }
}

// Search for a non-ASCII character (negative byte value) in a byte array,
// return true if it has any and false otherwise.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @IntrinsicCandidate
// private static boolean hasNegatives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return true;
//     }
//   }
//   return false;
// }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(mask2, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register, so we move the
    // data required to compose 64 1's into the instruction stream.
    // We emit a 64-byte-wide series of elements from 0..63 which is later
    // used as the compare target against the tail count held in the tmp1
    // register. The result is a k register holding tmp1 consecutive 1 bits,
    // counting from the least significant bit.
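    // Worked example (illustration): with tmp1 == 5, the broadcast-and-compare
    // below sets mask2 bit i exactly when 5 > i, i.e. mask2 == 0b11111 --
    // the same value the 64-bit path computes as ~(~0 << 5).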
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f);  //   tail count (in bytes)
      andl(len, 0xffffffe0);     // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f);  //   tail count (in bytes)
      andl(len, 0xfffffff0);     // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);
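  // At most 3 bytes remain here; 'result' still holds the original tail
  // count, so its low two bits select the 2-byte and 1-byte tails below.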
  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
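
// For reference, the array branch below behaves roughly like this Java
// sketch (illustration only, not the exact intrinsic contract):
//   if (ary1 == ary2) return true;
//   if (ary1 == null || ary2 == null) return false;
//   if (ary1.length != ary2.length) return false;
//   for (int i = 0; i < ary1.length; i++) {
//     if (ary1[i] != ary2[i]) return false;
//   }
//   return true;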
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f);  //   tail count (in bytes)
    andl(limit, 0xffffffe0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the
      // non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    // char[] data is an even number of bytes, so no 1-byte tail can remain.
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail  byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
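// The vector_mask_operation overloads below reduce a vector of booleans to a
// lane bitmask in a general purpose register and then compute the result with
// popcnt (TrueCount), bsr (LastTrue) or bsf (FirstTrue). The mask lanes are
// expected to hold 0 or 1 per byte, so the (0 - x) negation turns each set
// lane into 0xFF, letting evpmovb2m/vpmovmskb collect the per-byte sign bits.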

#ifdef _LP64
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, KRegister ktmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx512vlbw(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  evpmovb2m(ktmp, xtmp, vec_enc);
  kmovql(tmp, ktmp);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  vpmovmskb(tmp, xtmp, vec_enc);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
#endif