GitHub Repository: PojavLauncherTeam/mobile
Path: blob/master/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}
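
// For example, with src == 5 the sequence above computes (1 << 5) - 1 = 0x1f,
// i.e. a k-mask with the low five lanes enabled, copies it into 'mask', and then
// restores the original element count from 'src' back into 'dst'.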

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
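
// Illustration: with a power-of-two count such as 64, 'andptr(tmp, count-1)' keeps
// only the low 6 bits of the freshly read TSC value, so the fall-through path is
// taken roughly once in every 64 calls; any non-zero remainder branches to brLabel.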

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}
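
// Illustrative figures for the ratio test above (assuming RTMAbortRatio == 50 and
// RTMTotalCountIncrRate == 1): with abort_count == 60 and total_count == 100 we get
// 60 * 100 = 6000 >= 100 * 1 * 50 = 5000, so the MDO rtm_state is switched to NoRTM;
// with abort_count == 40 the comparison stays below and RTM remains in use.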

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);               // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
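
// Roughly, the elided stack-lock fast path above amounts to:
//   xbegin();
//   if (mark word has the unlocked (001) encoding) return;   // lock elided, no store at all
//   else xend()/xabort();                                     // busy -> fall back, retry or profile
// so an uncontended lock acquisition inside a transaction never writes the mark word.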

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);           // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);
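
  // Illustration of the sp-proximity test above: when the CAS fails because the mark
  // word already points at a BasicLock in one of the current thread's frames, the
  // difference (mark - rsp) is a small positive value below one page, so the andptr
  // clears every remaining bit and leaves ZF == 1; the resulting zero is stored into
  // the box as the displaced header, marking the recursive stack-lock case. A mark
  // word owned by another thread leaves non-zero bits, ZF == 0, and the slow path runs.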

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired in the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);               // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                       // if !HLE RegularLock
    xend();                                                            // otherwise end...
    jmp(DONE_LABEL);                                                   // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                    // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                               // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                          // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath ;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0));      // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}
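
// A note on the inflated-exit path above: the uncontended case is the "1-0" exit from
// synchronizer.cpp -- _owner is released with a plain store of NULL, and only the
// lock;addl fence keeps that store ordered before the _succ re-read. The CMPXCHG
// re-acquisition is reached only in the rare race where cxq|EntryList is non-empty and
// the apparent successor vanishes after the release; if even that CAS loses, another
// thread already owns the monitor and the exit still reports success via LSuccess.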

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}
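
// Both vabsneg helpers rely on IEEE-754 bit manipulation: the *_sign_mask stub constant
// is ANDed in to clear the sign bit (absolute value), while the *_sign_flip constant is
// XORed in to toggle it (negation). For example, xorps of 1.0f (0x3f800000) with the
// 0x80000000 sign-flip pattern yields 0xbf800000, i.e. -1.0f.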

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}
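
// The T_LONG branches above work around the absence of pminsq/pmaxsq before AVX-512:
// pcmpgtq writes all-ones into xmm0 for each lane where its destination operand is the
// (signed) larger value, and the SSE4.1 blendvpd then picks src over dst in exactly
// those lanes (blendvpd consumes xmm0 implicitly as the blend mask), giving the min;
// for max the comparison operands are simply swapped.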

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}
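
// The blend/compare sequences above exist because vminps/vmaxps alone do not match
// Java's Math.min/max: the SSE/AVX instructions return the second operand whenever an
// input is NaN and treat -0.0 as equal to +0.0. The leading pair of vblendv* ops
// reorders the inputs by the sign bit of one operand so that, e.g., min(-0.0, +0.0)
// comes out as -0.0, and the trailing UNORD_Q compare plus blend forces a NaN operand
// through to the result instead of letting vmin/vmax drop it.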

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}
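
// Net effect, with the argument arriving (and the result leaving) in dst:
//   signum(+/-0.0) -> the argument itself,  signum(NaN) -> NaN,
//   signum(x > 0)  -> 1.0 (the 'one' operand is copied in and kept),
//   signum(x < 0)  -> -1.0 (the copied 1.0 gets its sign bit flipped by xorps/xorpd).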

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
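
// The AVX2 fallback for Op_RShiftVL above synthesises a 64-bit arithmetic shift from
// logical shifts using the identity sra(x, s) == (srl(x, s) ^ m) - m, where m is the
// sign-bit mask shifted right by the same amount. Worked example with s == 4 and
// x == -16 (0xffffffff_fffffff0): srl gives 0x0fffffff_ffffffff, m == 0x08000000_00000000,
// the xor gives 0x07ffffff_ffffffff, and subtracting m yields 0xffffffff_ffffffff == -1,
// which is exactly -16 >> 4.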

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
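
// x86 has no byte-granular variable shift, so the byte case is synthesised: the bytes
// (and their per-lane shift counts) are widened to dwords, shifted with the 32-bit
// variable-shift form, masked back down to their low byte via vector_int_to_byte_mask(),
// and then packed into words (hence the "word result" noted above). evarshiftb below
// plays the same game at word granularity for the wider AVX-512 shapes.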

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
1404
1405
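// Gather/scatter helpers: move vector elements between registers and memory
// addressed by base plus the 32-bit indices in idx, under control of mask.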
void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

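// Expand a vector of booleans (one byte per element) in src into a vector
// mask in dst: each element becomes 0 or -1, widened to elem_bt.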
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}

void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 16) {
    movdqu(dst, addr, scratch);
  } else if (vlen_in_bytes == 32) {
    vmovdqu(dst, addr, scratch);
  } else {
    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
  }
}
// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV: pand(dst, src); break;
    case Op_OrReductionV:  por (dst, src); break;
    case Op_XorReductionV: pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT:   pminsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT:   pmaxsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT:   paddd(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT:   pmulld(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            vpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:  vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:   vpaddd(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmulld(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceB(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

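// Reduce the two int lanes of src2 together with the scalar src1;
// the scalar result is left in dst.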
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1732
if (opcode == Op_AddReductionVI) {
1733
if (vtmp1 != src2) {
1734
movdqu(vtmp1, src2);
1735
}
1736
phaddd(vtmp1, vtmp1);
1737
} else {
1738
pshufd(vtmp1, src2, 0x1);
1739
reduce_operation_128(T_INT, opcode, vtmp1, src2);
1740
}
1741
movdl(vtmp2, src1);
1742
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1743
movdl(dst, vtmp1);
1744
}
1745
1746
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1747
if (opcode == Op_AddReductionVI) {
1748
if (vtmp1 != src2) {
1749
movdqu(vtmp1, src2);
1750
}
1751
phaddd(vtmp1, src2);
1752
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1753
} else {
1754
pshufd(vtmp2, src2, 0xE);
1755
reduce_operation_128(T_INT, opcode, vtmp2, src2);
1756
reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1757
}
1758
}
1759
1760
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1761
if (opcode == Op_AddReductionVI) {
1762
vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1763
vextracti128_high(vtmp2, vtmp1);
1764
vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1765
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1766
} else {
1767
vextracti128_high(vtmp1, src2);
1768
reduce_operation_128(T_INT, opcode, vtmp1, src2);
1769
reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1770
}
1771
}
1772
1773
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1774
vextracti64x4_high(vtmp2, src2);
1775
reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1776
reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1777
}
1778
1779
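// Reduce eight byte lanes of src2 together with the scalar src1;
// the sign-extended scalar result is left in dst.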
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1780
pshufd(vtmp2, src2, 0x1);
1781
reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1782
movdqu(vtmp1, vtmp2);
1783
psrldq(vtmp1, 2);
1784
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1785
movdqu(vtmp2, vtmp1);
1786
psrldq(vtmp2, 1);
1787
reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1788
movdl(vtmp2, src1);
1789
pmovsxbd(vtmp1, vtmp1);
1790
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1791
pextrb(dst, vtmp1, 0x0);
1792
movsbl(dst, dst);
1793
}
1794
1795
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1796
pshufd(vtmp1, src2, 0xE);
1797
reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1798
reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1799
}
1800
1801
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1802
vextracti128_high(vtmp2, src2);
1803
reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1804
reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1805
}
1806
1807
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1808
vextracti64x4_high(vtmp1, src2);
1809
reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1810
reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1811
}
1812
1813
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1814
pmovsxbw(vtmp2, src2);
1815
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1816
}
1817
1818
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1819
if (UseAVX > 1) {
1820
int vector_len = Assembler::AVX_256bit;
1821
vpmovsxbw(vtmp1, src2, vector_len);
1822
reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1823
} else {
1824
pmovsxbw(vtmp2, src2);
1825
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1826
pshufd(vtmp2, src2, 0x1);
1827
pmovsxbw(vtmp2, src2);
1828
reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1829
}
1830
}
1831
1832
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1833
if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1834
int vector_len = Assembler::AVX_512bit;
1835
vpmovsxbw(vtmp1, src2, vector_len);
1836
reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1837
} else {
1838
assert(UseAVX >= 2,"Should not reach here.");
1839
mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1840
vextracti128_high(vtmp2, src2);
1841
mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1842
}
1843
}
1844
1845
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1846
mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1847
vextracti64x4_high(vtmp2, src2);
1848
mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1849
}
1850
1851
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1852
if (opcode == Op_AddReductionVI) {
1853
if (vtmp1 != src2) {
1854
movdqu(vtmp1, src2);
1855
}
1856
phaddw(vtmp1, vtmp1);
1857
phaddw(vtmp1, vtmp1);
1858
} else {
1859
pshufd(vtmp2, src2, 0x1);
1860
reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1861
movdqu(vtmp1, vtmp2);
1862
psrldq(vtmp1, 2);
1863
reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1864
}
1865
movdl(vtmp2, src1);
1866
pmovsxwd(vtmp1, vtmp1);
1867
reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1868
pextrw(dst, vtmp1, 0x0);
1869
movswl(dst, dst);
1870
}
1871
1872
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1873
if (opcode == Op_AddReductionVI) {
1874
if (vtmp1 != src2) {
1875
movdqu(vtmp1, src2);
1876
}
1877
phaddw(vtmp1, src2);
1878
} else {
1879
pshufd(vtmp1, src2, 0xE);
1880
reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1881
}
1882
reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1883
}
1884
1885
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1886
if (opcode == Op_AddReductionVI) {
1887
int vector_len = Assembler::AVX_256bit;
1888
vphaddw(vtmp2, src2, src2, vector_len);
1889
vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1890
} else {
1891
vextracti128_high(vtmp2, src2);
1892
reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1893
}
1894
reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1895
}
1896
1897
void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1898
int vector_len = Assembler::AVX_256bit;
1899
vextracti64x4_high(vtmp1, src2);
1900
reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1901
reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1902
}
1903
1904
#ifdef _LP64
1905
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1906
pshufd(vtmp2, src2, 0xE);
1907
reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1908
movdq(vtmp1, src1);
1909
reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1910
movdq(dst, vtmp1);
1911
}
1912
1913
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1914
vextracti128_high(vtmp1, src2);
1915
reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1916
reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1917
}
1918
1919
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1920
vextracti64x4_high(vtmp2, src2);
1921
reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1922
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1923
}
1924
1925
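// Build an opmask register with the low 'len' bits set (len <= 64).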
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1926
assert(ArrayCopyPartialInlineSize <= 64,"");
1927
mov64(temp, -1L);
1928
bzhiq(temp, temp, len);
1929
kmovql(dst, temp);
1930
}
1931
#endif // _LP64
1932
1933
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1934
reduce_operation_128(T_FLOAT, opcode, dst, src);
1935
pshufd(vtmp, src, 0x1);
1936
reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1937
}
1938
1939
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1940
reduce2F(opcode, dst, src, vtmp);
1941
pshufd(vtmp, src, 0x2);
1942
reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1943
pshufd(vtmp, src, 0x3);
1944
reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1945
}
1946
1947
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1948
reduce4F(opcode, dst, src, vtmp2);
1949
vextractf128_high(vtmp2, src);
1950
reduce4F(opcode, dst, vtmp2, vtmp1);
1951
}
1952
1953
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1954
reduce8F(opcode, dst, src, vtmp1, vtmp2);
1955
vextracti64x4_high(vtmp1, src);
1956
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1957
}
1958
1959
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1960
reduce_operation_128(T_DOUBLE, opcode, dst, src);
1961
pshufd(vtmp, src, 0xE);
1962
reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1963
}
1964
1965
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1966
reduce2D(opcode, dst, src, vtmp2);
1967
vextractf128_high(vtmp2, src);
1968
reduce2D(opcode, dst, vtmp2, vtmp1);
1969
}
1970
1971
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1972
reduce4D(opcode, dst, src, vtmp1, vtmp2);
1973
vextracti64x4_high(vtmp1, src);
1974
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1975
}
1976
1977
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1978
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1979
}
1980
1981
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1982
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1983
}
1984
1985
1986
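// Min/max reduction for float vectors: repeatedly fold the upper half of the
// vector into the lower half using vminmax_fp until a single lane remains;
// when is_dst_valid the incoming value of dst is folded into the result as well.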
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
1987
XMMRegister dst, XMMRegister src,
1988
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1989
XMMRegister xmm_0, XMMRegister xmm_1) {
1990
int permconst[] = {1, 14};
1991
XMMRegister wsrc = src;
1992
XMMRegister wdst = xmm_0;
1993
XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
1994
1995
int vlen_enc = Assembler::AVX_128bit;
1996
if (vlen == 16) {
1997
vlen_enc = Assembler::AVX_256bit;
1998
}
1999
2000
for (int i = log2(vlen) - 1; i >=0; i--) {
2001
if (i == 0 && !is_dst_valid) {
2002
wdst = dst;
2003
}
2004
if (i == 3) {
2005
vextracti64x4_high(wtmp, wsrc);
2006
} else if (i == 2) {
2007
vextracti128_high(wtmp, wsrc);
2008
} else { // i = [0,1]
2009
vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2010
}
2011
vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2012
wsrc = wdst;
2013
vlen_enc = Assembler::AVX_128bit;
2014
}
2015
if (is_dst_valid) {
2016
vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2017
}
2018
}
2019
2020
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2021
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2022
XMMRegister xmm_0, XMMRegister xmm_1) {
2023
XMMRegister wsrc = src;
2024
XMMRegister wdst = xmm_0;
2025
XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2026
int vlen_enc = Assembler::AVX_128bit;
2027
if (vlen == 8) {
2028
vlen_enc = Assembler::AVX_256bit;
2029
}
2030
for (int i = log2(vlen) - 1; i >=0; i--) {
2031
if (i == 0 && !is_dst_valid) {
2032
wdst = dst;
2033
}
2034
if (i == 1) {
2035
vextracti128_high(wtmp, wsrc);
2036
} else if (i == 2) {
2037
vextracti64x4_high(wtmp, wsrc);
2038
} else {
2039
assert(i == 0, "%d", i);
2040
vpermilpd(wtmp, wsrc, 1, vlen_enc);
2041
}
2042
vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2043
wsrc = wdst;
2044
vlen_enc = Assembler::AVX_128bit;
2045
}
2046
if (is_dst_valid) {
2047
vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2048
}
2049
}
2050
2051
void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2052
switch (bt) {
2053
case T_BYTE: pextrb(dst, src, idx); break;
2054
case T_SHORT: pextrw(dst, src, idx); break;
2055
case T_INT: pextrd(dst, src, idx); break;
2056
case T_LONG: pextrq(dst, src, idx); break;
2057
2058
default:
2059
assert(false,"Should not reach here.");
2060
break;
2061
}
2062
}
2063
2064
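// Return the 128-bit lane of src containing element 'elemindex', extracting it
// into dst when it is not the lowest lane.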
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2065
int esize = type2aelembytes(typ);
2066
int elem_per_lane = 16/esize;
2067
int lane = elemindex / elem_per_lane;
2068
int eindex = elemindex % elem_per_lane;
2069
2070
if (lane >= 2) {
2071
assert(UseAVX > 2, "required");
2072
vextractf32x4(dst, src, lane & 3);
2073
return dst;
2074
} else if (lane > 0) {
2075
assert(UseAVX > 0, "required");
2076
vextractf128(dst, src, lane);
2077
return dst;
2078
} else {
2079
return src;
2080
}
2081
}
2082
2083
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2084
int esize = type2aelembytes(typ);
2085
int elem_per_lane = 16/esize;
2086
int eindex = elemindex % elem_per_lane;
2087
assert(is_integral_type(typ),"required");
2088
2089
if (eindex == 0) {
2090
if (typ == T_LONG) {
2091
movq(dst, src);
2092
} else {
2093
movdl(dst, src);
2094
if (typ == T_BYTE)
2095
movsbl(dst, dst);
2096
else if (typ == T_SHORT)
2097
movswl(dst, dst);
2098
}
2099
} else {
2100
extract(typ, dst, src, eindex);
2101
}
2102
}
2103
2104
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2105
int esize = type2aelembytes(typ);
2106
int elem_per_lane = 16/esize;
2107
int eindex = elemindex % elem_per_lane;
2108
assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2109
2110
if (eindex == 0) {
2111
movq(dst, src);
2112
} else {
2113
if (typ == T_FLOAT) {
2114
if (UseAVX == 0) {
2115
movdqu(dst, src);
2116
pshufps(dst, dst, eindex);
2117
} else {
2118
vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2119
}
2120
} else {
2121
if (UseAVX == 0) {
2122
movdqu(dst, src);
2123
psrldq(dst, eindex*esize);
2124
} else {
2125
vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2126
}
2127
movq(dst, dst);
2128
}
2129
}
2130
// Zero upper bits
2131
if (typ == T_FLOAT) {
2132
if (UseAVX == 0) {
2133
assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2134
movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2135
pand(dst, vtmp);
2136
} else {
2137
assert((tmp != noreg), "required.");
2138
vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2139
}
2140
}
2141
}
2142
2143
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2144
switch(typ) {
2145
case T_BYTE:
2146
evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2147
break;
2148
case T_SHORT:
2149
evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2150
break;
2151
case T_INT:
2152
case T_FLOAT:
2153
evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2154
break;
2155
case T_LONG:
2156
case T_DOUBLE:
2157
evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2158
break;
2159
default:
2160
assert(false,"Should not reach here.");
2161
break;
2162
}
2163
}
2164
2165
void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2166
int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2167
int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2168
switch (typ) {
2169
case T_BYTE:
2170
vpmovzxbw(vtmp1, src1, vlen_enc);
2171
vpmovzxbw(vtmp2, src2, vlen_enc);
2172
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2173
vpacksswb(dst, dst, dst, vlen_enc);
2174
break;
2175
case T_SHORT:
2176
vpmovzxwd(vtmp1, src1, vlen_enc);
2177
vpmovzxwd(vtmp2, src2, vlen_enc);
2178
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2179
vpackssdw(dst, dst, dst, vlen_enc);
2180
break;
2181
case T_INT:
2182
vpmovzxdq(vtmp1, src1, vlen_enc);
2183
vpmovzxdq(vtmp2, src2, vlen_enc);
2184
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2185
vpermilps(dst, dst, 8, vlen_enc);
2186
break;
2187
default:
2188
assert(false, "Should not reach here");
2189
}
2190
if (vlen_in_bytes == 16) {
2191
vpermpd(dst, dst, 0x8, vlen_enc);
2192
}
2193
}
2194
2195
void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2196
XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2197
int vlen_enc = vector_length_encoding(vlen_in_bytes);
2198
switch (typ) {
2199
case T_BYTE:
2200
vpmovzxbw(vtmp1, src1, vlen_enc);
2201
vpmovzxbw(vtmp2, src2, vlen_enc);
2202
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2203
vextracti128(vtmp1, src1, 1);
2204
vextracti128(vtmp2, src2, 1);
2205
vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2206
vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2207
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2208
vpacksswb(dst, dst, vtmp3, vlen_enc);
2209
vpermpd(dst, dst, 0xd8, vlen_enc);
2210
break;
2211
case T_SHORT:
2212
vpmovzxwd(vtmp1, src1, vlen_enc);
2213
vpmovzxwd(vtmp2, src2, vlen_enc);
2214
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2215
vextracti128(vtmp1, src1, 1);
2216
vextracti128(vtmp2, src2, 1);
2217
vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2218
vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2219
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2220
vpackssdw(dst, dst, vtmp3, vlen_enc);
2221
vpermpd(dst, dst, 0xd8, vlen_enc);
2222
break;
2223
case T_INT:
2224
vpmovzxdq(vtmp1, src1, vlen_enc);
2225
vpmovzxdq(vtmp2, src2, vlen_enc);
2226
vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2227
vpshufd(dst, dst, 8, vlen_enc);
2228
vpermq(dst, dst, 8, vlen_enc);
2229
vextracti128(vtmp1, src1, 1);
2230
vextracti128(vtmp2, src2, 1);
2231
vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2232
vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2233
vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2234
vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2235
vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2236
vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2237
break;
2238
default:
2239
assert(false, "Should not reach here");
2240
}
2241
}
2242
2243
void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2244
switch(typ) {
2245
case T_BYTE:
2246
evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2247
break;
2248
case T_SHORT:
2249
evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2250
break;
2251
case T_INT:
2252
case T_FLOAT:
2253
evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2254
break;
2255
case T_LONG:
2256
case T_DOUBLE:
2257
evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2258
break;
2259
default:
2260
assert(false,"Should not reach here.");
2261
break;
2262
}
2263
}
2264
2265
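// Set the condition flags for a vector test of src1 against src2: ptest/vptest
// is used for vectors up to 32 bytes, and an AVX-512 byte compare into 'mask'
// followed by ktest/kortest for 64-byte vectors.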
void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2266
XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2267
switch(vlen) {
2268
case 4:
2269
assert(vtmp1 != xnoreg, "required.");
2270
// Broadcast lower 32 bits to 128 bits before ptest
2271
pshufd(vtmp1, src1, 0x0);
2272
if (bt == BoolTest::overflow) {
2273
assert(vtmp2 != xnoreg, "required.");
2274
pshufd(vtmp2, src2, 0x0);
2275
} else {
2276
assert(vtmp2 == xnoreg, "required.");
2277
vtmp2 = src2;
2278
}
2279
ptest(vtmp1, vtmp2);
2280
break;
2281
case 8:
2282
assert(vtmp1 != xnoreg, "required.");
2283
// Broadcast lower 64 bits to 128 bits before ptest
2284
pshufd(vtmp1, src1, 0x4);
2285
if (bt == BoolTest::overflow) {
2286
assert(vtmp2 != xnoreg, "required.");
2287
pshufd(vtmp2, src2, 0x4);
2288
} else {
2289
assert(vtmp2 == xnoreg, "required.");
2290
vtmp2 = src2;
2291
}
2292
ptest(vtmp1, vtmp2);
2293
break;
2294
case 16:
2295
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2296
ptest(src1, src2);
2297
break;
2298
case 32:
2299
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2300
vptest(src1, src2, Assembler::AVX_256bit);
2301
break;
2302
case 64:
2303
{
2304
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2305
evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2306
if (bt == BoolTest::ne) {
2307
ktestql(mask, mask);
2308
} else {
2309
assert(bt == BoolTest::overflow, "required");
2310
kortestql(mask, mask);
2311
}
2312
}
2313
break;
2314
default:
2315
assert(false,"Should not reach here.");
2316
break;
2317
}
2318
}
2319
2320
//-------------------------------------------------------------------------------------------
2321
2322
// IndexOf for constant substrings with size >= 8 chars
2323
// which don't need to be loaded through stack.
2324
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2325
Register cnt1, Register cnt2,
2326
int int_cnt2, Register result,
2327
XMMRegister vec, Register tmp,
2328
int ae) {
2329
ShortBranchVerifier sbv(this);
2330
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2331
assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2332
2333
// This method uses the pcmpestri instruction with bound registers
2334
// inputs:
2335
// xmm - substring
2336
// rax - substring length (elements count)
2337
// mem - scanned string
2338
// rdx - string length (elements count)
2339
// 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2340
// 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2341
// outputs:
2342
// rcx - matched index in string
2343
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2344
int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2345
int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2346
Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2347
Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2348
2349
Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2350
RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2351
MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2352
2353
// Note, inline_string_indexOf() generates checks:
2354
// if (substr.count > string.count) return -1;
2355
// if (substr.count == 0) return 0;
2356
assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2357
2358
// Load substring.
2359
if (ae == StrIntrinsicNode::UL) {
2360
pmovzxbw(vec, Address(str2, 0));
2361
} else {
2362
movdqu(vec, Address(str2, 0));
2363
}
2364
movl(cnt2, int_cnt2);
2365
movptr(result, str1); // string addr
2366
2367
if (int_cnt2 > stride) {
2368
jmpb(SCAN_TO_SUBSTR);
2369
2370
// Reload substr for rescan, this code
2371
// is executed only for large substrings (> 8 chars)
2372
bind(RELOAD_SUBSTR);
2373
if (ae == StrIntrinsicNode::UL) {
2374
pmovzxbw(vec, Address(str2, 0));
2375
} else {
2376
movdqu(vec, Address(str2, 0));
2377
}
2378
negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2379
2380
bind(RELOAD_STR);
2381
// We came here after the beginning of the substring was
2382
// matched but the rest of it was not so we need to search
2383
// again. Start from the next element after the previous match.
2384
2385
  // cnt2 is the number of remaining substring elements and
  // cnt1 is the number of remaining string elements when the compare failed.
2387
// Restored cnt1 = cnt1 - cnt2 + int_cnt2
2388
subl(cnt1, cnt2);
2389
addl(cnt1, int_cnt2);
2390
movl(cnt2, int_cnt2); // Now restore cnt2
2391
2392
decrementl(cnt1); // Shift to next element
2393
cmpl(cnt1, cnt2);
2394
  jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2395
2396
addptr(result, (1<<scale1));
2397
2398
} // (int_cnt2 > 8)
2399
2400
// Scan string for start of substr in 16-byte vectors
2401
bind(SCAN_TO_SUBSTR);
2402
pcmpestri(vec, Address(result, 0), mode);
2403
jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2404
subl(cnt1, stride);
2405
jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2406
cmpl(cnt1, cnt2);
2407
  jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2408
addptr(result, 16);
2409
jmpb(SCAN_TO_SUBSTR);
2410
2411
// Found a potential substr
2412
bind(FOUND_CANDIDATE);
2413
// Matched whole vector if first element matched (tmp(rcx) == 0).
2414
if (int_cnt2 == stride) {
2415
jccb(Assembler::overflow, RET_FOUND); // OF == 1
2416
} else { // int_cnt2 > 8
2417
jccb(Assembler::overflow, FOUND_SUBSTR);
2418
}
2419
// After pcmpestri tmp(rcx) contains matched element index
2420
// Compute start addr of substr
2421
lea(result, Address(result, tmp, scale1));
2422
2423
// Make sure string is still long enough
2424
subl(cnt1, tmp);
2425
cmpl(cnt1, cnt2);
2426
if (int_cnt2 == stride) {
2427
jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2428
} else { // int_cnt2 > 8
2429
jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2430
}
2431
  // Left less than substring.
2432
2433
bind(RET_NOT_FOUND);
2434
movl(result, -1);
2435
jmp(EXIT);
2436
2437
if (int_cnt2 > stride) {
2438
// This code is optimized for the case when whole substring
2439
// is matched if its head is matched.
2440
bind(MATCH_SUBSTR_HEAD);
2441
pcmpestri(vec, Address(result, 0), mode);
2442
  // Reload only the string if it does not match
2443
jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2444
2445
Label CONT_SCAN_SUBSTR;
2446
// Compare the rest of substring (> 8 chars).
2447
bind(FOUND_SUBSTR);
2448
// First 8 chars are already matched.
2449
negptr(cnt2);
2450
addptr(cnt2, stride);
2451
2452
bind(SCAN_SUBSTR);
2453
subl(cnt1, stride);
2454
cmpl(cnt2, -stride); // Do not read beyond substring
2455
jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2456
// Back-up strings to avoid reading beyond substring:
2457
// cnt1 = cnt1 - cnt2 + 8
2458
addl(cnt1, cnt2); // cnt2 is negative
2459
addl(cnt1, stride);
2460
movl(cnt2, stride); negptr(cnt2);
2461
bind(CONT_SCAN_SUBSTR);
2462
if (int_cnt2 < (int)G) {
2463
int tail_off1 = int_cnt2<<scale1;
2464
int tail_off2 = int_cnt2<<scale2;
2465
if (ae == StrIntrinsicNode::UL) {
2466
pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2467
} else {
2468
movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2469
}
2470
pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2471
} else {
2472
// calculate index in register to avoid integer overflow (int_cnt2*2)
2473
movl(tmp, int_cnt2);
2474
addptr(tmp, cnt2);
2475
if (ae == StrIntrinsicNode::UL) {
2476
pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2477
} else {
2478
movdqu(vec, Address(str2, tmp, scale2, 0));
2479
}
2480
pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2481
}
2482
// Need to reload strings pointers if not matched whole vector
2483
jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2484
addptr(cnt2, stride);
2485
jcc(Assembler::negative, SCAN_SUBSTR);
2486
// Fall through if found full substring
2487
2488
} // (int_cnt2 > 8)
2489
2490
bind(RET_FOUND);
2491
// Found result if we matched full small substring.
2492
// Compute substr offset
2493
subptr(result, str1);
2494
if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2495
shrl(result, 1); // index
2496
}
2497
bind(EXIT);
2498
2499
} // string_indexofC8
2500
2501
// Small strings are loaded through the stack if they cross a page boundary.
2502
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2503
Register cnt1, Register cnt2,
2504
int int_cnt2, Register result,
2505
XMMRegister vec, Register tmp,
2506
int ae) {
2507
ShortBranchVerifier sbv(this);
2508
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2509
assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2510
2511
//
2512
// int_cnt2 is length of small (< 8 chars) constant substring
2513
// or (-1) for non constant substring in which case its length
2514
// is in cnt2 register.
2515
//
2516
// Note, inline_string_indexOf() generates checks:
2517
// if (substr.count > string.count) return -1;
2518
// if (substr.count == 0) return 0;
2519
//
2520
int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2521
assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2522
// This method uses the pcmpestri instruction with bound registers
2523
// inputs:
2524
// xmm - substring
2525
// rax - substring length (elements count)
2526
// mem - scanned string
2527
// rdx - string length (elements count)
2528
// 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2529
// 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2530
// outputs:
2531
// rcx - matched index in string
2532
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2533
int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2534
Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2535
Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2536
2537
Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2538
RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2539
FOUND_CANDIDATE;
2540
2541
{ //========================================================
2542
// We don't know where these strings are located
2543
// and we can't read beyond them. Load them through stack.
2544
Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2545
2546
movptr(tmp, rsp); // save old SP
2547
2548
if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2549
if (int_cnt2 == (1>>scale2)) { // One byte
2550
assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2551
load_unsigned_byte(result, Address(str2, 0));
2552
movdl(vec, result); // move 32 bits
2553
} else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2554
// Not enough header space in 32-bit VM: 12+3 = 15.
2555
movl(result, Address(str2, -1));
2556
shrl(result, 8);
2557
movdl(vec, result); // move 32 bits
2558
} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2559
load_unsigned_short(result, Address(str2, 0));
2560
movdl(vec, result); // move 32 bits
2561
} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2562
movdl(vec, Address(str2, 0)); // move 32 bits
2563
} else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2564
movq(vec, Address(str2, 0)); // move 64 bits
2565
} else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2566
// Array header size is 12 bytes in 32-bit VM
2567
// + 6 bytes for 3 chars == 18 bytes,
2568
// enough space to load vec and shift.
2569
assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2570
if (ae == StrIntrinsicNode::UL) {
2571
int tail_off = int_cnt2-8;
2572
pmovzxbw(vec, Address(str2, tail_off));
2573
psrldq(vec, -2*tail_off);
2574
}
2575
else {
2576
int tail_off = int_cnt2*(1<<scale2);
2577
movdqu(vec, Address(str2, tail_off-16));
2578
psrldq(vec, 16-tail_off);
2579
}
2580
}
2581
} else { // not constant substring
2582
cmpl(cnt2, stride);
2583
jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2584
2585
    // We can read beyond the string if str+16 does not cross a page boundary
    // since heaps are aligned and mapped by pages.
2587
assert(os::vm_page_size() < (int)G, "default page should be small");
2588
movl(result, str2); // We need only low 32 bits
2589
andl(result, (os::vm_page_size()-1));
2590
cmpl(result, (os::vm_page_size()-16));
2591
jccb(Assembler::belowEqual, CHECK_STR);
2592
2593
// Move small strings to stack to allow load 16 bytes into vec.
2594
subptr(rsp, 16);
2595
int stk_offset = wordSize-(1<<scale2);
2596
push(cnt2);
2597
2598
bind(COPY_SUBSTR);
2599
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2600
load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2601
movb(Address(rsp, cnt2, scale2, stk_offset), result);
2602
} else if (ae == StrIntrinsicNode::UU) {
2603
load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2604
movw(Address(rsp, cnt2, scale2, stk_offset), result);
2605
}
2606
decrement(cnt2);
2607
jccb(Assembler::notZero, COPY_SUBSTR);
2608
2609
pop(cnt2);
2610
movptr(str2, rsp); // New substring address
2611
} // non constant
2612
2613
bind(CHECK_STR);
2614
cmpl(cnt1, stride);
2615
jccb(Assembler::aboveEqual, BIG_STRINGS);
2616
2617
// Check cross page boundary.
2618
movl(result, str1); // We need only low 32 bits
2619
andl(result, (os::vm_page_size()-1));
2620
cmpl(result, (os::vm_page_size()-16));
2621
jccb(Assembler::belowEqual, BIG_STRINGS);
2622
2623
subptr(rsp, 16);
2624
int stk_offset = -(1<<scale1);
2625
if (int_cnt2 < 0) { // not constant
2626
push(cnt2);
2627
stk_offset += wordSize;
2628
}
2629
movl(cnt2, cnt1);
2630
2631
bind(COPY_STR);
2632
if (ae == StrIntrinsicNode::LL) {
2633
load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2634
movb(Address(rsp, cnt2, scale1, stk_offset), result);
2635
} else {
2636
load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2637
movw(Address(rsp, cnt2, scale1, stk_offset), result);
2638
}
2639
decrement(cnt2);
2640
jccb(Assembler::notZero, COPY_STR);
2641
2642
if (int_cnt2 < 0) { // not constant
2643
pop(cnt2);
2644
}
2645
movptr(str1, rsp); // New string address
2646
2647
bind(BIG_STRINGS);
2648
// Load substring.
2649
if (int_cnt2 < 0) { // -1
2650
if (ae == StrIntrinsicNode::UL) {
2651
pmovzxbw(vec, Address(str2, 0));
2652
} else {
2653
movdqu(vec, Address(str2, 0));
2654
}
2655
push(cnt2); // substr count
2656
push(str2); // substr addr
2657
push(str1); // string addr
2658
} else {
2659
// Small (< 8 chars) constant substrings are loaded already.
2660
movl(cnt2, int_cnt2);
2661
}
2662
push(tmp); // original SP
2663
2664
} // Finished loading
2665
2666
//========================================================
2667
// Start search
2668
//
2669
2670
movptr(result, str1); // string addr
2671
2672
if (int_cnt2 < 0) { // Only for non constant substring
2673
jmpb(SCAN_TO_SUBSTR);
2674
2675
// SP saved at sp+0
2676
// String saved at sp+1*wordSize
2677
// Substr saved at sp+2*wordSize
2678
// Substr count saved at sp+3*wordSize
2679
2680
// Reload substr for rescan, this code
2681
// is executed only for large substrings (> 8 chars)
2682
bind(RELOAD_SUBSTR);
2683
movptr(str2, Address(rsp, 2*wordSize));
2684
movl(cnt2, Address(rsp, 3*wordSize));
2685
if (ae == StrIntrinsicNode::UL) {
2686
pmovzxbw(vec, Address(str2, 0));
2687
} else {
2688
movdqu(vec, Address(str2, 0));
2689
}
2690
// We came here after the beginning of the substring was
2691
// matched but the rest of it was not so we need to search
2692
// again. Start from the next element after the previous match.
2693
subptr(str1, result); // Restore counter
2694
if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2695
shrl(str1, 1);
2696
}
2697
addl(cnt1, str1);
2698
decrementl(cnt1); // Shift to next element
2699
cmpl(cnt1, cnt2);
2700
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2701
2702
addptr(result, (1<<scale1));
2703
} // non constant
2704
2705
// Scan string for start of substr in 16-byte vectors
2706
bind(SCAN_TO_SUBSTR);
2707
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2708
pcmpestri(vec, Address(result, 0), mode);
2709
jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2710
subl(cnt1, stride);
2711
jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2712
cmpl(cnt1, cnt2);
2713
  jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2714
addptr(result, 16);
2715
2716
bind(ADJUST_STR);
2717
cmpl(cnt1, stride); // Do not read beyond string
2718
jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2719
// Back-up string to avoid reading beyond string.
2720
lea(result, Address(result, cnt1, scale1, -16));
2721
movl(cnt1, stride);
2722
jmpb(SCAN_TO_SUBSTR);
2723
2724
// Found a potential substr
2725
bind(FOUND_CANDIDATE);
2726
// After pcmpestri tmp(rcx) contains matched element index
2727
2728
// Make sure string is still long enough
2729
subl(cnt1, tmp);
2730
cmpl(cnt1, cnt2);
2731
jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2732
  // Left less than substring.
2733
2734
bind(RET_NOT_FOUND);
2735
movl(result, -1);
2736
jmp(CLEANUP);
2737
2738
bind(FOUND_SUBSTR);
2739
// Compute start addr of substr
2740
lea(result, Address(result, tmp, scale1));
2741
if (int_cnt2 > 0) { // Constant substring
2742
// Repeat search for small substring (< 8 chars)
2743
// from new point without reloading substring.
2744
// Have to check that we don't read beyond string.
2745
cmpl(tmp, stride-int_cnt2);
2746
jccb(Assembler::greater, ADJUST_STR);
2747
// Fall through if matched whole substring.
2748
} else { // non constant
2749
assert(int_cnt2 == -1, "should be != 0");
2750
2751
addl(tmp, cnt2);
2752
// Found result if we matched whole substring.
2753
cmpl(tmp, stride);
2754
jcc(Assembler::lessEqual, RET_FOUND);
2755
2756
// Repeat search for small substring (<= 8 chars)
2757
// from new point 'str1' without reloading substring.
2758
cmpl(cnt2, stride);
2759
// Have to check that we don't read beyond string.
2760
jccb(Assembler::lessEqual, ADJUST_STR);
2761
2762
Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2763
// Compare the rest of substring (> 8 chars).
2764
movptr(str1, result);
2765
2766
cmpl(tmp, cnt2);
2767
// First 8 chars are already matched.
2768
jccb(Assembler::equal, CHECK_NEXT);
2769
2770
bind(SCAN_SUBSTR);
2771
pcmpestri(vec, Address(str1, 0), mode);
2772
// Need to reload strings pointers if not matched whole vector
2773
jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2774
2775
bind(CHECK_NEXT);
2776
subl(cnt2, stride);
2777
jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2778
addptr(str1, 16);
2779
if (ae == StrIntrinsicNode::UL) {
2780
addptr(str2, 8);
2781
} else {
2782
addptr(str2, 16);
2783
}
2784
subl(cnt1, stride);
2785
cmpl(cnt2, stride); // Do not read beyond substring
2786
jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2787
// Back-up strings to avoid reading beyond substring.
2788
2789
if (ae == StrIntrinsicNode::UL) {
2790
lea(str2, Address(str2, cnt2, scale2, -8));
2791
lea(str1, Address(str1, cnt2, scale1, -16));
2792
} else {
2793
lea(str2, Address(str2, cnt2, scale2, -16));
2794
lea(str1, Address(str1, cnt2, scale1, -16));
2795
}
2796
subl(cnt1, cnt2);
2797
movl(cnt2, stride);
2798
addl(cnt1, stride);
2799
bind(CONT_SCAN_SUBSTR);
2800
if (ae == StrIntrinsicNode::UL) {
2801
pmovzxbw(vec, Address(str2, 0));
2802
} else {
2803
movdqu(vec, Address(str2, 0));
2804
}
2805
jmp(SCAN_SUBSTR);
2806
2807
bind(RET_FOUND_LONG);
2808
movptr(str1, Address(rsp, wordSize));
2809
} // non constant
2810
2811
bind(RET_FOUND);
2812
// Compute substr offset
2813
subptr(result, str1);
2814
if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2815
shrl(result, 1); // index
2816
}
2817
bind(CLEANUP);
2818
pop(rsp); // restore SP
2819
2820
} // string_indexof
2821
2822
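// Find the first occurrence of the char 'ch' in a UTF-16 string of cnt1 chars;
// 'result' receives the char index, or -1 if not found.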
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2823
XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2824
ShortBranchVerifier sbv(this);
2825
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2826
2827
int stride = 8;
2828
2829
Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2830
SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2831
RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2832
FOUND_SEQ_CHAR, DONE_LABEL;
2833
2834
movptr(result, str1);
2835
if (UseAVX >= 2) {
2836
cmpl(cnt1, stride);
2837
jcc(Assembler::less, SCAN_TO_CHAR);
2838
cmpl(cnt1, 2*stride);
2839
jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2840
movdl(vec1, ch);
2841
vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2842
vpxor(vec2, vec2);
2843
movl(tmp, cnt1);
2844
andl(tmp, 0xFFFFFFF0); //vector count (in chars)
2845
andl(cnt1,0x0000000F); //tail count (in chars)
2846
2847
bind(SCAN_TO_16_CHAR_LOOP);
2848
vmovdqu(vec3, Address(result, 0));
2849
vpcmpeqw(vec3, vec3, vec1, 1);
2850
vptest(vec2, vec3);
2851
jcc(Assembler::carryClear, FOUND_CHAR);
2852
addptr(result, 32);
2853
subl(tmp, 2*stride);
2854
jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2855
jmp(SCAN_TO_8_CHAR);
2856
bind(SCAN_TO_8_CHAR_INIT);
2857
movdl(vec1, ch);
2858
pshuflw(vec1, vec1, 0x00);
2859
pshufd(vec1, vec1, 0);
2860
pxor(vec2, vec2);
2861
}
2862
bind(SCAN_TO_8_CHAR);
2863
cmpl(cnt1, stride);
2864
jcc(Assembler::less, SCAN_TO_CHAR);
2865
if (UseAVX < 2) {
2866
movdl(vec1, ch);
2867
pshuflw(vec1, vec1, 0x00);
2868
pshufd(vec1, vec1, 0);
2869
pxor(vec2, vec2);
2870
}
2871
movl(tmp, cnt1);
2872
andl(tmp, 0xFFFFFFF8); //vector count (in chars)
2873
andl(cnt1,0x00000007); //tail count (in chars)
2874
2875
bind(SCAN_TO_8_CHAR_LOOP);
2876
movdqu(vec3, Address(result, 0));
2877
pcmpeqw(vec3, vec1);
2878
ptest(vec2, vec3);
2879
jcc(Assembler::carryClear, FOUND_CHAR);
2880
addptr(result, 16);
2881
subl(tmp, stride);
2882
jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2883
bind(SCAN_TO_CHAR);
2884
testl(cnt1, cnt1);
2885
jcc(Assembler::zero, RET_NOT_FOUND);
2886
bind(SCAN_TO_CHAR_LOOP);
2887
load_unsigned_short(tmp, Address(result, 0));
2888
cmpl(ch, tmp);
2889
jccb(Assembler::equal, FOUND_SEQ_CHAR);
2890
addptr(result, 2);
2891
subl(cnt1, 1);
2892
jccb(Assembler::zero, RET_NOT_FOUND);
2893
jmp(SCAN_TO_CHAR_LOOP);
2894
2895
bind(RET_NOT_FOUND);
2896
movl(result, -1);
2897
jmpb(DONE_LABEL);
2898
2899
bind(FOUND_CHAR);
2900
if (UseAVX >= 2) {
2901
vpmovmskb(tmp, vec3);
2902
} else {
2903
pmovmskb(tmp, vec3);
2904
}
2905
bsfl(ch, tmp);
2906
addptr(result, ch);
2907
2908
bind(FOUND_SEQ_CHAR);
2909
subptr(result, str1);
2910
shrl(result, 1);
2911
2912
bind(DONE_LABEL);
2913
} // string_indexof_char
2914
2915
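// Find the first occurrence of the byte 'ch' in a Latin-1 string of cnt1 bytes;
// 'result' receives the byte index, or -1 if not found.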
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2916
XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2917
ShortBranchVerifier sbv(this);
2918
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2919
2920
int stride = 16;
2921
2922
Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2923
SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2924
RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2925
FOUND_SEQ_CHAR, DONE_LABEL;
2926
2927
movptr(result, str1);
2928
if (UseAVX >= 2) {
2929
cmpl(cnt1, stride);
2930
jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2931
cmpl(cnt1, stride*2);
2932
jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2933
movdl(vec1, ch);
2934
vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2935
vpxor(vec2, vec2);
2936
movl(tmp, cnt1);
2937
andl(tmp, 0xFFFFFFE0); //vector count (in chars)
2938
andl(cnt1,0x0000001F); //tail count (in chars)
2939
2940
bind(SCAN_TO_32_CHAR_LOOP);
2941
vmovdqu(vec3, Address(result, 0));
2942
vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2943
vptest(vec2, vec3);
2944
jcc(Assembler::carryClear, FOUND_CHAR);
2945
addptr(result, 32);
2946
subl(tmp, stride*2);
2947
jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2948
jmp(SCAN_TO_16_CHAR);
2949
2950
bind(SCAN_TO_16_CHAR_INIT);
2951
movdl(vec1, ch);
2952
pxor(vec2, vec2);
2953
pshufb(vec1, vec2);
2954
}
2955
2956
bind(SCAN_TO_16_CHAR);
2957
cmpl(cnt1, stride);
2958
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
2959
if (UseAVX < 2) {
2960
movdl(vec1, ch);
2961
pxor(vec2, vec2);
2962
pshufb(vec1, vec2);
2963
}
2964
movl(tmp, cnt1);
2965
andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
2966
andl(cnt1,0x0000000F); //tail count (in bytes)
2967
2968
bind(SCAN_TO_16_CHAR_LOOP);
2969
movdqu(vec3, Address(result, 0));
2970
pcmpeqb(vec3, vec1);
2971
ptest(vec2, vec3);
2972
jcc(Assembler::carryClear, FOUND_CHAR);
2973
addptr(result, 16);
2974
subl(tmp, stride);
2975
jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
2976
2977
bind(SCAN_TO_CHAR_INIT);
2978
testl(cnt1, cnt1);
2979
jcc(Assembler::zero, RET_NOT_FOUND);
2980
bind(SCAN_TO_CHAR_LOOP);
2981
load_unsigned_byte(tmp, Address(result, 0));
2982
cmpl(ch, tmp);
2983
jccb(Assembler::equal, FOUND_SEQ_CHAR);
2984
addptr(result, 1);
2985
subl(cnt1, 1);
2986
jccb(Assembler::zero, RET_NOT_FOUND);
2987
jmp(SCAN_TO_CHAR_LOOP);
2988
2989
bind(RET_NOT_FOUND);
2990
movl(result, -1);
2991
jmpb(DONE_LABEL);
2992
2993
bind(FOUND_CHAR);
2994
if (UseAVX >= 2) {
2995
vpmovmskb(tmp, vec3);
2996
} else {
2997
pmovmskb(tmp, vec3);
2998
}
2999
bsfl(ch, tmp);
3000
addptr(result, ch);
3001
3002
bind(FOUND_SEQ_CHAR);
3003
subptr(result, str1);
3004
3005
bind(DONE_LABEL);
3006
} // stringL_indexof_char
3007
3008
// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

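// The 'ae' argument encoding (StrIntrinsicNode): the first letter describes
// str1, the second str2; L = Latin-1 (one byte per element), U = UTF-16 (two
// bytes per element). The mixed cases load a byte from str1 and a char from
// str2; the UL variant reuses that path and its result is negated at the end
// of string_compare.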
// Compare strings, used for char[] and byte[].
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));   // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

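    // imm8 0x19 == 0b0011001 decodes (per the Intel SDM) as: bits 1:0 = 01
    // unsigned words (cleared to 00, unsigned bytes, for the LL case above),
    // bits 3:2 = 10 "equal each" aggregation (element-wise compare),
    // bits 5:4 = 01 negative polarity so a set result bit marks a mismatch,
    // and bit 6 = 0 so rcx receives the least significant, i.e. first,
    // mismatch index. CF is set when any valid element mismatches, hence the
    // 'below'/'aboveEqual' branches after each pcmpestri.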
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

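    // 'mask' has a 1 bit for every byte position that compared equal, so
    // inverting it and doing a bit-scan-forward yields the byte offset of the
    // first mismatch within the 64-byte chunk; for the char cases this is
    // halved below to get an element index.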
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }
}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return true if it has any and false otherwise.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   private static boolean hasNegatives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return true;
//       }
//     }
//     return false;
//   }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether these 64 bytes contain any negative values
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
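    // e.g. tmp1 == 3 gives ~(~0 << 3) == 0b111, a mask selecting just the
    // three tail bytes that still need to be checked.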
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(mask2, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot load 64 bits into a general purpose register on 32-bit, so the
    // data needed to compose the 64 1's is emitted into the instruction stream:
    // a 64-byte series of the values 0..63, which is later compared against the
    // tail count held in the tmp1 register. The result is a k register with
    // tmp1 consecutive 1's counting from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); //   tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); //   tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

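// arrays_equals: with is_array_equ the two operands are whole arrays (null
// and length checks, header-relative addressing); otherwise the caller passes
// raw element addresses and an element count in 'limit'. is_char selects
// char[] (limit counted in chars) versus byte[]. result is set to 1 when the
// contents are equal and to 0 otherwise.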
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); //   tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); //   tail count (in bytes)
    andl(limit, 0xfffffff0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1);   // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

#ifdef _LP64
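// Reduce a vector mask (one byte-sized boolean lane per element, expected to
// hold 0 or 1) to a scalar in dst:
//   Op_VectorMaskTrueCount - number of set lanes (population count)
//   Op_VectorMaskFirstTrue - index of the lowest set lane, or masklen if none
//   Op_VectorMaskLastTrue  - index of the highest set lane, or -1 if none
// Subtracting the lanes from zero turns 1 into 0xFF so the sign bits can be
// collected into a general purpose register bitmask.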
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, KRegister ktmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx512vlbw(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  evpmovb2m(ktmp, xtmp, vec_enc);
  kmovql(tmp, ktmp);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  vpmovmskb(tmp, xtmp, vec_enc);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
#endif
