Path: Core/MIPS/LoongArch64/LoongArch64CompVec.cpp
// Copyright (c) 2023- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <algorithm>
#include "Core/MemMap.h"
#include "Core/MIPS/LoongArch64/LoongArch64Jit.h"
#include "Core/MIPS/LoongArch64/LoongArch64RegCache.h"

// This file contains compilation for vector instructions.
//
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE. No flags because that's in IR already.

// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }
#define CONDITIONAL_DISABLE {}
#define DISABLE { CompIR_Generic(inst); return; }
#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }

namespace MIPSComp {

using namespace LoongArch64Gen;
using namespace LoongArch64JitConstants;

static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
	return r1 < r2 + l2 && r1 + l1 > r2;
}

void LoongArch64JitBackend::CompIR_VecAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Init:
		regs_.Map(inst);

		switch ((Vec4Init)inst.src1) {
		case Vec4Init::AllZERO:
			if (cpu_info.LOONGARCH_LSX)
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
			else
				for (int i = 0; i < 4; ++i)
					MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
			break;

		case Vec4Init::AllONE:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_W(regs_.V(inst.dest), SCRATCH1);
			} else {
				MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
				for (int i = 1; i < 4; ++i)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.dest));
			}
			break;

		case Vec4Init::AllMinusONE:
			LI(SCRATCH1, -1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_W(regs_.V(inst.dest), SCRATCH1);
			} else {
				MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
				for (int i = 1; i < 4; ++i)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.dest));
			}
			break;

		case Vec4Init::Set_1000:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 0);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 0) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;

		case Vec4Init::Set_0100:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 1);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 1) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;

		case Vec4Init::Set_0010:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 2);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 2) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;

		case Vec4Init::Set_0001:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 3);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 3) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;
		}
		break;

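	// Vec4Shuffle: with LSX a single VSHUF4I_W applies the whole permutation. Without it we
	// shuffle lane by lane with FMOVs, searching for short swap chains when dest == src1 so
	// lanes aren't clobbered before they're read.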
	case IROp::Vec4Shuffle:
		if (cpu_info.LOONGARCH_LSX) {
			regs_.Map(inst);
			if (regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0) {
				// This is a broadcast. If dest == src1, this won't clear it.
				regs_.SpillLockFPR(inst.src1);
				regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
			} else {
				regs_.Map(inst);
			}

			VSHUF4I_W(regs_.V(inst.dest), regs_.V(inst.src1), inst.src2);
		} else {
			if (inst.dest == inst.src1) {
				regs_.Map(inst);
				// Try to find the least swaps needed to move in place, never worse than 6 FMOVs.
				// Would be better with a vmerge and vector regs.
				int state[4]{ 0, 1, 2, 3 };
				int goal[4]{ (inst.src2 >> 0) & 3, (inst.src2 >> 2) & 3, (inst.src2 >> 4) & 3, (inst.src2 >> 6) & 3 };

				static constexpr int NOT_FOUND = 4;
				auto findIndex = [](int *arr, int val, int start = 0) {
					return (int)(std::find(arr + start, arr + 4, val) - arr);
				};
				auto moveChained = [&](const std::vector<int> &lanes, bool rotate) {
					int firstState = state[lanes.front()];
					if (rotate)
						FMOV_S(SCRATCHF1, regs_.F(inst.dest + lanes.front()));
					for (size_t i = 1; i < lanes.size(); ++i) {
						FMOV_S(regs_.F(inst.dest + lanes[i - 1]), regs_.F(inst.dest + lanes[i]));
						state[lanes[i - 1]] = state[lanes[i]];
					}
					if (rotate) {
						FMOV_S(regs_.F(inst.dest + lanes.back()), SCRATCHF1);
						state[lanes.back()] = firstState;
					}
				};

				for (int i = 0; i < 4; ++i) {
					// Overlap, so if they match, nothing to do.
					if (goal[i] == state[i])
						continue;

					int neededBy = findIndex(goal, state[i], i + 1);
					int foundIn = findIndex(state, goal[i], 0);
					_assert_(foundIn != NOT_FOUND);

					if (neededBy == NOT_FOUND || neededBy == foundIn) {
						moveChained({ i, foundIn }, neededBy == foundIn);
						continue;
					}

					// Maybe we can avoid a swap and move the next thing into place.
					int neededByDepth2 = findIndex(goal, state[neededBy], i + 1);
					if (neededByDepth2 == NOT_FOUND || neededByDepth2 == foundIn) {
						moveChained({ neededBy, i, foundIn }, neededByDepth2 == foundIn);
						continue;
					}

					// Since we only have 4 items, this is as deep as the chain could go.
					int neededByDepth3 = findIndex(goal, state[neededByDepth2], i + 1);
					moveChained({ neededByDepth2, neededBy, i, foundIn }, neededByDepth3 == foundIn);
				}
			} else {
				regs_.Map(inst);
				for (int i = 0; i < 4; ++i) {
					int lane = (inst.src2 >> (i * 2)) & 3;
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + lane));
				}
			}
		}
		break;

	case IROp::Vec4Blend:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			IRReg src = inst.src1;
			uint8_t imm = inst.constant;
			if (inst.dest == inst.src1) {
				src = inst.src2;
			} else if (inst.dest == inst.src2) {
				imm = ~imm;
			} else {
				VOR_V(regs_.V(inst.dest), regs_.V(src), regs_.V(src));
				src = inst.src2;
			}

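			// For every lane selected by imm, copy that 32-bit lane from src into dest
			// (VEXTRINS_W inserts a single element, so only the chosen lanes change).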
			for (int i = 0; i < 4; ++i)
				if (imm & (1 << i))
					VEXTRINS_W(regs_.V(inst.dest), regs_.V(src), (i << 4) | i);
		} else {
			for (int i = 0; i < 4; ++i) {
				int which = (inst.constant >> i) & 1;
				IRReg srcReg = which ? inst.src2 : inst.src1;
				if (inst.dest != srcReg)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(srcReg + i));
			}
		}
		break;

	case IROp::Vec4Mov:
		if (inst.dest != inst.src1) {
			regs_.Map(inst);
			if (cpu_info.LOONGARCH_LSX)
				VOR_V(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src1));
			else
				for (int i = 0; i < 4; ++i)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecArith(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Add:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFADD_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FADD_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Sub:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFSUB_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FSUB_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Mul:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFMUL_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Div:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFDIV_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FDIV_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Scale:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1))
				DISABLE;

			VSHUF4I_W(regs_.V(inst.src2), regs_.V(inst.src2), 0);
			VFMUL_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		} else {
			if (Overlap(inst.src2, 1, inst.dest, 3)) {
				// We have to handle overlap, doing dest == src2 last.
				for (int i = 0; i < 4; ++i) {
					if (inst.src2 != inst.dest + i)
						FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));
				}
				for (int i = 0; i < 4; ++i) {
					if (inst.src2 == inst.dest + i)
						FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));
				}
			} else {
				for (int i = 0; i < 4; ++i)
					FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));
			}
		}
		break;

	case IROp::Vec4Neg:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VBITREVI_W(regs_.V(inst.dest), regs_.V(inst.src1), 31);
		else
			for (int i = 0; i < 4; ++i)
				FNEG_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		break;

	case IROp::Vec4Abs:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VBITCLRI_W(regs_.V(inst.dest), regs_.V(inst.src1), 31);
		else
			for (int i = 0; i < 4; ++i)
				FABS_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecHoriz(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Dot:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4))
				DISABLE;

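			// Horizontal reduction: multiply lane-wise, add each lane to its pair-swapped
			// neighbor (giving x+y, x+y, z+w, z+w), then fold the upper 64 bits down and add
			// again so lane 0 holds the full dot product.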
			VFMUL_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
			VOR_V(EncodeRegToV(SCRATCHF1), regs_.V(inst.dest), regs_.V(inst.dest));
			VSHUF4I_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.dest), VFPU_SWIZZLE(1, 0, 3, 2));
			VFADD_S(regs_.V(inst.dest), regs_.V(inst.dest), EncodeRegToV(SCRATCHF1));
			VEXTRINS_D(EncodeRegToV(SCRATCHF1), regs_.V(inst.dest), 1);
			// Do we need to care about the upper 96 bits?
			VFADD_S(regs_.V(inst.dest), regs_.V(inst.dest), EncodeRegToV(SCRATCHF1));
		} else {
			if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {
				// This means inst.dest overlaps one of src1 or src2. We have to do that one first.
				// Technically this may impact -0.0 and such, but dots accurately need to be aligned anyway.
				for (int i = 0; i < 4; ++i) {
					if (inst.dest == inst.src1 + i || inst.dest == inst.src2 + i)
						FMUL_S(regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
				}
				for (int i = 0; i < 4; ++i) {
					if (inst.dest != inst.src1 + i && inst.dest != inst.src2 + i)
						FMADD_S(regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));
				}
			} else {
				FMUL_S(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));
				for (int i = 1; i < 4; ++i)
					FMADD_S(regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));
			}
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecPack(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec2Unpack16To31:
	case IROp::Vec2Pack31To16:
		CompIR_Generic(inst);
		break;

	case IROp::Vec4Pack32To8:
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4))
				DISABLE;

			regs_.Map(inst);
			VSRLI_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.src1), 24);
			VPICKEV_B(EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
			VPICKEV_B(regs_.V(inst.dest), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
		} else {
			CompIR_Generic(inst);
		}
		break;

	case IROp::Vec4Unpack8To32:
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4))
				DISABLE;

			regs_.Map(inst);
			VSLLWIL_HU_BU(regs_.V(inst.dest), regs_.V(inst.src1), 0);
			VSLLWIL_WU_HU(regs_.V(inst.dest), regs_.V(inst.dest), 0);
			VSLLI_W(regs_.V(inst.dest), regs_.V(inst.dest), 24);
		} else {
			regs_.Map(inst);
			MOVFR2GR_S(SCRATCH2, regs_.F(inst.src1));
			for (int i = 0; i < 4; ++i) {
				// Mask using walls.
				if (i != 0) {
					SRLI_D(SCRATCH1, SCRATCH2, i * 8);
					SLLI_D(SCRATCH1, SCRATCH1, 24);
				} else {
					SLLI_D(SCRATCH1, SCRATCH2, 24);
				}
				MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
			}
		}
		break;

	case IROp::Vec2Unpack16To32:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		if (cpu_info.LOONGARCH_LSX) {
			CompIR_Generic(inst);
			break;
		}
		regs_.Map(inst);
		MOVFR2GR_S(SCRATCH2, regs_.F(inst.src1));
		SLLI_D(SCRATCH1, SCRATCH2, 16);
		MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
		SRLI_D(SCRATCH1, SCRATCH2, 16);
		SLLI_D(SCRATCH1, SCRATCH1, 16);
		MOVGR2FR_W(regs_.F(inst.dest + 1), SCRATCH1);
		break;

	case IROp::Vec4DuplicateUpperBitsAndShift1:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			VSRLI_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.src1), 16);
			VOR_V(EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1), regs_.V(inst.src1));
			VSRLI_W(regs_.V(inst.dest), EncodeRegToV(SCRATCHF1), 8);
			VOR_V(regs_.V(inst.dest), regs_.V(inst.dest), EncodeRegToV(SCRATCHF1));
			VSRLI_W(regs_.V(inst.dest), regs_.V(inst.dest), 1);
		} else {
			for (int i = 0; i < 4; i++) {
				MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1 + i));
				SRLI_W(SCRATCH2, SCRATCH1, 8);
				OR(SCRATCH1, SCRATCH1, SCRATCH2);
				SRLI_W(SCRATCH2, SCRATCH1, 16);
				OR(SCRATCH1, SCRATCH1, SCRATCH2);
				SRLI_W(SCRATCH1, SCRATCH1, 1);
				MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
			}
		}
		break;

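	// Vec4Pack31To8 keeps the top 8 bits of each 31-bit lane: shift right by 23, then gather
	// the four resulting bytes into the low 32 bits of dest (VPICKEV_B twice with LSX, or
	// shift/AND/OR assembly in a GPR without it).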
	case IROp::Vec4Pack31To8:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4))
				DISABLE;

			regs_.Map(inst);
			VSRLI_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.src1), 23);
			VPICKEV_B(EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
			VPICKEV_B(regs_.V(inst.dest), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
		} else {
			regs_.Map(inst);
			for (int i = 0; i < 4; ++i) {
				MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1 + i));
				SRLI_D(SCRATCH1, SCRATCH1, 23);
				if (i == 0) {
					ANDI(SCRATCH2, SCRATCH1, 0xFF);
				} else {
					ANDI(SCRATCH1, SCRATCH1, 0xFF);
					SLLI_D(SCRATCH1, SCRATCH1, 8 * i);
					OR(SCRATCH2, SCRATCH2, SCRATCH1);
				}
			}
			MOVGR2FR_W(regs_.F(inst.dest), SCRATCH2);
		}
		break;

	case IROp::Vec2Pack32To16:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		if (cpu_info.LOONGARCH_LSX) {
			CompIR_Generic(inst);
			break;
		}
		regs_.Map(inst);
		MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1));
		MOVFR2GR_S(SCRATCH2, regs_.F(inst.src1 + 1));
		// Keep in mind, this was sign-extended, so we have to zero the upper.
		SLLI_D(SCRATCH1, SCRATCH1, 32);
		// Now we just set (SCRATCH2 & 0xFFFF0000) | SCRATCH1.
		SRLI_D(SCRATCH1, SCRATCH1, 48);
		// Use a wall to mask. We can ignore the upper 32 here.
		SRLI_D(SCRATCH2, SCRATCH2, 16);
		SLLI_D(SCRATCH2, SCRATCH2, 16);
		OR(SCRATCH1, SCRATCH1, SCRATCH2);
		// Okay, to the floating point register.
		MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecClamp(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4ClampToZero:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			VREPLGR2VR_D(EncodeRegToV(SCRATCHF1), R_ZERO);
			VMAX_W(regs_.V(inst.dest), regs_.V(inst.src1), EncodeRegToV(SCRATCHF1));
		} else {
			// Branchless clamp: SRAI_W spreads the sign bit, ORN turns it into a keep mask,
			// and AND zeroes the negative lanes.
			for (int i = 0; i < 4; i++) {
				MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1 + i));
				SRAI_W(SCRATCH2, SCRATCH1, 31);
				ORN(SCRATCH2, R_ZERO, SCRATCH2);
				AND(SCRATCH1, SCRATCH1, SCRATCH2);
				MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
			}
		}
		break;

	case IROp::Vec2ClampToZero:
		CompIR_Generic(inst);
		break;

	default:
		INVALIDOP;
		break;
	}
}

} // namespace MIPSComp