Path: Core/MIPS/LoongArch64/LoongArch64CompVec.cpp
// Copyright (c) 2023- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <algorithm>
#include "Core/MemMap.h"
#include "Core/MIPS/LoongArch64/LoongArch64Jit.h"
#include "Core/MIPS/LoongArch64/LoongArch64RegCache.h"

// This file contains compilation for vector instructions.
//
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE. No flags because that's in IR already.

// #define CONDITIONAL_DISABLE { CompIR_Generic(inst); return; }
#define CONDITIONAL_DISABLE {}
#define DISABLE { CompIR_Generic(inst); return; }
#define INVALIDOP { _assert_msg_(false, "Invalid IR inst %d", (int)inst.op); CompIR_Generic(inst); return; }

namespace MIPSComp {

using namespace LoongArch64Gen;
using namespace LoongArch64JitConstants;

static bool Overlap(IRReg r1, int l1, IRReg r2, int l2) {
	return r1 < r2 + l2 && r1 + l1 > r2;
}

void LoongArch64JitBackend::CompIR_VecAssign(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Init:
		regs_.Map(inst);

		switch ((Vec4Init)inst.src1) {
		case Vec4Init::AllZERO:
			if (cpu_info.LOONGARCH_LSX)
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
			else
				for (int i = 0; i < 4; ++i)
					MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
			break;

		case Vec4Init::AllONE:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_W(regs_.V(inst.dest), SCRATCH1);
			} else {
				MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
				for (int i = 1; i < 4; ++i)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.dest));
			}
			break;

		case Vec4Init::AllMinusONE:
			LI(SCRATCH1, -1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_W(regs_.V(inst.dest), SCRATCH1);
			} else {
				MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
				for (int i = 1; i < 4; ++i)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.dest));
			}
			break;

		case Vec4Init::Set_1000:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 0);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 0) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;

		case Vec4Init::Set_0100:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 1);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 1) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;

		case Vec4Init::Set_0010:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 2);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 2) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;

		case Vec4Init::Set_0001:
			LI(SCRATCH1, 1.0f);
			if (cpu_info.LOONGARCH_LSX) {
				VREPLGR2VR_D(regs_.V(inst.dest), R_ZERO);
				VINSGR2VR_W(regs_.V(inst.dest), SCRATCH1, 3);
			} else {
				for (int i = 0; i < 4; ++i) {
					if (i == 3) {
						MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
					} else {
						MOVGR2FR_W(regs_.F(inst.dest + i), R_ZERO);
					}
				}
			}
			break;
		}
		break;

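	// Vec4Shuffle: with LSX a single VSHUF4I_W applies the whole permutation. Without it we
	// shuffle lane by lane with FMOVs, searching for short swap chains when dest == src1 so
	// lanes aren't clobbered before they're read.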
	case IROp::Vec4Shuffle:
		if (cpu_info.LOONGARCH_LSX) {
			regs_.Map(inst);
			if (regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0) {
				// This is a broadcast. If dest == src1, this won't clear it.
				regs_.SpillLockFPR(inst.src1);
				regs_.MapVec4(inst.dest, MIPSMap::NOINIT);
			} else {
				regs_.Map(inst);
			}

			VSHUF4I_W(regs_.V(inst.dest), regs_.V(inst.src1), inst.src2);
		} else {
			if (inst.dest == inst.src1) {
				regs_.Map(inst);
				// Try to find the least swaps needed to move in place, never worse than 6 FMOVs.
				// Would be better with a vmerge and vector regs.
				int state[4]{ 0, 1, 2, 3 };
				int goal[4]{ (inst.src2 >> 0) & 3, (inst.src2 >> 2) & 3, (inst.src2 >> 4) & 3, (inst.src2 >> 6) & 3 };

				static constexpr int NOT_FOUND = 4;
				auto findIndex = [](int *arr, int val, int start = 0) {
					return (int)(std::find(arr + start, arr + 4, val) - arr);
				};
				auto moveChained = [&](const std::vector<int> &lanes, bool rotate) {
					int firstState = state[lanes.front()];
					if (rotate)
						FMOV_S(SCRATCHF1, regs_.F(inst.dest + lanes.front()));
					for (size_t i = 1; i < lanes.size(); ++i) {
						FMOV_S(regs_.F(inst.dest + lanes[i - 1]), regs_.F(inst.dest + lanes[i]));
						state[lanes[i - 1]] = state[lanes[i]];
					}
					if (rotate) {
						FMOV_S(regs_.F(inst.dest + lanes.back()), SCRATCHF1);
						state[lanes.back()] = firstState;
					}
				};

				for (int i = 0; i < 4; ++i) {
					// Overlap, so if they match, nothing to do.
					if (goal[i] == state[i])
						continue;

					int neededBy = findIndex(goal, state[i], i + 1);
					int foundIn = findIndex(state, goal[i], 0);
					_assert_(foundIn != NOT_FOUND);

					if (neededBy == NOT_FOUND || neededBy == foundIn) {
						moveChained({ i, foundIn }, neededBy == foundIn);
						continue;
					}

					// Maybe we can avoid a swap and move the next thing into place.
					int neededByDepth2 = findIndex(goal, state[neededBy], i + 1);
					if (neededByDepth2 == NOT_FOUND || neededByDepth2 == foundIn) {
						moveChained({ neededBy, i, foundIn }, neededByDepth2 == foundIn);
						continue;
					}

					// Since we only have 4 items, this is as deep as the chain could go.
					int neededByDepth3 = findIndex(goal, state[neededByDepth2], i + 1);
					moveChained({ neededByDepth2, neededBy, i, foundIn }, neededByDepth3 == foundIn);
				}
			} else {
				regs_.Map(inst);
				for (int i = 0; i < 4; ++i) {
					int lane = (inst.src2 >> (i * 2)) & 3;
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + lane));
				}
			}
		}
		break;

	case IROp::Vec4Blend:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			IRReg src = inst.src1;
			uint8_t imm = inst.constant;
			if (inst.dest == inst.src1) {
				src = inst.src2;
			} else if (inst.dest == inst.src2) {
				imm = ~imm;
			} else {
				VOR_V(regs_.V(inst.dest), regs_.V(src), regs_.V(src));
				src = inst.src2;
			}

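			// For every lane selected by imm, copy that 32-bit lane from src into dest
			// (VEXTRINS_W inserts a single element, so only the chosen lanes change).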
			for (int i = 0; i < 4; ++i)
				if (imm & (1 << i))
					VEXTRINS_W(regs_.V(inst.dest), regs_.V(src), (i << 4) | i);
		} else {
			for (int i = 0; i < 4; ++i) {
				int which = (inst.constant >> i) & 1;
				IRReg srcReg = which ? inst.src2 : inst.src1;
				if (inst.dest != srcReg)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(srcReg + i));
			}
		}
		break;

	case IROp::Vec4Mov:
		if (inst.dest != inst.src1) {
			regs_.Map(inst);
			if (cpu_info.LOONGARCH_LSX)
				VOR_V(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src1));
			else
				for (int i = 0; i < 4; ++i)
					FMOV_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecArith(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Add:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFADD_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FADD_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Sub:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFSUB_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FSUB_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Mul:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFMUL_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Div:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VFDIV_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		else
			for (int i = 0; i < 4; ++i)
				FDIV_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
		break;

	case IROp::Vec4Scale:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 4, inst.src2, 1) || Overlap(inst.src1, 4, inst.src2, 1))
				DISABLE;

			VSHUF4I_W(regs_.V(inst.src2), regs_.V(inst.src2), 0);
			VFMUL_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
		} else {
			if (Overlap(inst.src2, 1, inst.dest, 3)) {
				// We have to handle overlap, doing dest == src2 last.
				for (int i = 0; i < 4; ++i) {
					if (inst.src2 != inst.dest + i)
						FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));
				}
				for (int i = 0; i < 4; ++i) {
					if (inst.src2 == inst.dest + i)
						FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));
				}
			} else {
				for (int i = 0; i < 4; ++i)
					FMUL_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i), regs_.F(inst.src2));
			}
		}
		break;

	case IROp::Vec4Neg:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VBITREVI_W(regs_.V(inst.dest), regs_.V(inst.src1), 31);
		else
			for (int i = 0; i < 4; ++i)
				FNEG_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		break;

	case IROp::Vec4Abs:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX)
			VBITCLRI_W(regs_.V(inst.dest), regs_.V(inst.src1), 31);
		else
			for (int i = 0; i < 4; ++i)
				FABS_S(regs_.F(inst.dest + i), regs_.F(inst.src1 + i));
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecHoriz(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4Dot:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4))
				DISABLE;

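			// Horizontal reduction: multiply lane-wise, add each lane to its pair-swapped
			// neighbor (giving x+y, x+y, z+w, z+w), then fold the upper 64 bits down and add
			// again so lane 0 holds the full dot product.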
			VFMUL_S(regs_.V(inst.dest), regs_.V(inst.src1), regs_.V(inst.src2));
			VOR_V(EncodeRegToV(SCRATCHF1), regs_.V(inst.dest), regs_.V(inst.dest));
			VSHUF4I_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.dest), VFPU_SWIZZLE(1, 0, 3, 2));
			VFADD_S(regs_.V(inst.dest), regs_.V(inst.dest), EncodeRegToV(SCRATCHF1));
			VEXTRINS_D(EncodeRegToV(SCRATCHF1), regs_.V(inst.dest), 1);
			// Do we need to care about the upper 96 bits?
			VFADD_S(regs_.V(inst.dest), regs_.V(inst.dest), EncodeRegToV(SCRATCHF1));
		} else {
			if (Overlap(inst.dest, 1, inst.src1, 4) || Overlap(inst.dest, 1, inst.src2, 4)) {
				// This means inst.dest overlaps one of src1 or src2. We have to do that one first.
				// Technically this may impact -0.0 and such, but dots accurately need to be aligned anyway.
				for (int i = 0; i < 4; ++i) {
					if (inst.dest == inst.src1 + i || inst.dest == inst.src2 + i)
						FMUL_S(regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i));
				}
				for (int i = 0; i < 4; ++i) {
					if (inst.dest != inst.src1 + i && inst.dest != inst.src2 + i)
						FMADD_S(regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));
				}
			} else {
				FMUL_S(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));
				for (int i = 1; i < 4; ++i)
					FMADD_S(regs_.F(inst.dest), regs_.F(inst.src1 + i), regs_.F(inst.src2 + i), regs_.F(inst.dest));
			}
		}
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecPack(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec2Unpack16To31:
	case IROp::Vec2Pack31To16:
		CompIR_Generic(inst);
		break;

	case IROp::Vec4Pack32To8:
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4))
				DISABLE;

			regs_.Map(inst);
			VSRLI_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.src1), 24);
			VPICKEV_B(EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
			VPICKEV_B(regs_.V(inst.dest), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
		} else {
			CompIR_Generic(inst);
		}
		break;

	case IROp::Vec4Unpack8To32:
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4))
				DISABLE;

			regs_.Map(inst);
			VSLLWIL_HU_BU(regs_.V(inst.dest), regs_.V(inst.src1), 0);
			VSLLWIL_WU_HU(regs_.V(inst.dest), regs_.V(inst.dest), 0);
			VSLLI_W(regs_.V(inst.dest), regs_.V(inst.dest), 24);
		} else {
			regs_.Map(inst);
			MOVFR2GR_S(SCRATCH2, regs_.F(inst.src1));
			for (int i = 0; i < 4; ++i) {
				// Mask using walls.
				if (i != 0) {
					SRLI_D(SCRATCH1, SCRATCH2, i * 8);
					SLLI_D(SCRATCH1, SCRATCH1, 24);
				} else {
					SLLI_D(SCRATCH1, SCRATCH2, 24);
				}
				MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
			}
		}
		break;

	case IROp::Vec2Unpack16To32:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		if (cpu_info.LOONGARCH_LSX) {
			CompIR_Generic(inst);
			break;
		}
		regs_.Map(inst);
		MOVFR2GR_S(SCRATCH2, regs_.F(inst.src1));
		SLLI_D(SCRATCH1, SCRATCH2, 16);
		MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
		SRLI_D(SCRATCH1, SCRATCH2, 16);
		SLLI_D(SCRATCH1, SCRATCH1, 16);
		MOVGR2FR_W(regs_.F(inst.dest + 1), SCRATCH1);
		break;

	case IROp::Vec4DuplicateUpperBitsAndShift1:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			VSRLI_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.src1), 16);
			VOR_V(EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1), regs_.V(inst.src1));
			VSRLI_W(regs_.V(inst.dest), EncodeRegToV(SCRATCHF1), 8);
			VOR_V(regs_.V(inst.dest), regs_.V(inst.dest), EncodeRegToV(SCRATCHF1));
			VSRLI_W(regs_.V(inst.dest), regs_.V(inst.dest), 1);
		} else {
			for (int i = 0; i < 4; i++) {
				MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1 + i));
				SRLI_W(SCRATCH2, SCRATCH1, 8);
				OR(SCRATCH1, SCRATCH1, SCRATCH2);
				SRLI_W(SCRATCH2, SCRATCH1, 16);
				OR(SCRATCH1, SCRATCH1, SCRATCH2);
				SRLI_W(SCRATCH1, SCRATCH1, 1);
				MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
			}
		}
		break;

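	// Vec4Pack31To8 keeps the top 8 bits of each 31-bit lane: shift right by 23, then gather
	// the four resulting bytes into the low 32 bits of dest (VPICKEV_B twice with LSX, or
	// shift/AND/OR assembly in a GPR without it).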
	case IROp::Vec4Pack31To8:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		if (cpu_info.LOONGARCH_LSX) {
			if (Overlap(inst.dest, 1, inst.src1, 4))
				DISABLE;

			regs_.Map(inst);
			VSRLI_W(EncodeRegToV(SCRATCHF1), regs_.V(inst.src1), 23);
			VPICKEV_B(EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
			VPICKEV_B(regs_.V(inst.dest), EncodeRegToV(SCRATCHF1), EncodeRegToV(SCRATCHF1));
		} else {
			regs_.Map(inst);
			for (int i = 0; i < 4; ++i) {
				MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1 + i));
				SRLI_D(SCRATCH1, SCRATCH1, 23);
				if (i == 0) {
					ANDI(SCRATCH2, SCRATCH1, 0xFF);
				} else {
					ANDI(SCRATCH1, SCRATCH1, 0xFF);
					SLLI_D(SCRATCH1, SCRATCH1, 8 * i);
					OR(SCRATCH2, SCRATCH2, SCRATCH1);
				}
			}
			MOVGR2FR_W(regs_.F(inst.dest), SCRATCH2);
		}
		break;

	case IROp::Vec2Pack32To16:
		// TODO: This works for now, but may need to handle aliasing for vectors.
		if (cpu_info.LOONGARCH_LSX) {
			CompIR_Generic(inst);
			break;
		}
		regs_.Map(inst);
		MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1));
		MOVFR2GR_S(SCRATCH2, regs_.F(inst.src1 + 1));
		// Keep in mind, this was sign-extended, so we have to zero the upper.
		SLLI_D(SCRATCH1, SCRATCH1, 32);
		// Now we just set (SCRATCH2 & 0xFFFF0000) | SCRATCH1.
		SRLI_D(SCRATCH1, SCRATCH1, 48);
		// Use a wall to mask. We can ignore the upper 32 here.
		SRLI_D(SCRATCH2, SCRATCH2, 16);
		SLLI_D(SCRATCH2, SCRATCH2, 16);
		OR(SCRATCH1, SCRATCH1, SCRATCH2);
		// Okay, to the floating point register.
		MOVGR2FR_W(regs_.F(inst.dest), SCRATCH1);
		break;

	default:
		INVALIDOP;
		break;
	}
}

void LoongArch64JitBackend::CompIR_VecClamp(IRInst inst) {
	CONDITIONAL_DISABLE;

	switch (inst.op) {
	case IROp::Vec4ClampToZero:
		regs_.Map(inst);
		if (cpu_info.LOONGARCH_LSX) {
			VREPLGR2VR_D(EncodeRegToV(SCRATCHF1), R_ZERO);
			VMAX_W(regs_.V(inst.dest), regs_.V(inst.src1), EncodeRegToV(SCRATCHF1));
		} else {
			// Branchless clamp: SRAI_W spreads the sign bit, ORN turns it into a keep mask,
			// and AND zeroes the negative lanes.
			for (int i = 0; i < 4; i++) {
				MOVFR2GR_S(SCRATCH1, regs_.F(inst.src1 + i));
				SRAI_W(SCRATCH2, SCRATCH1, 31);
				ORN(SCRATCH2, R_ZERO, SCRATCH2);
				AND(SCRATCH1, SCRATCH1, SCRATCH2);
				MOVGR2FR_W(regs_.F(inst.dest + i), SCRATCH1);
			}
		}
		break;

	case IROp::Vec2ClampToZero:
		CompIR_Generic(inst);
		break;

	default:
		INVALIDOP;
		break;
	}
}

} // namespace MIPSComp