#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include "Common/Math/SIMDHeaders.h"
#include "Common/x64Emitter.h"
#include "Common/BitScan.h"
#include "Common/CPUDetect.h"
#include "GPU/GPUState.h"
#include "GPU/Software/Sampler.h"
#include "GPU/ge_constants.h"
using namespace Gen;
using namespace Rasterizer;
namespace Sampler {
FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
_assert_msg_(id.fetch && !id.linear, "Only fetch should be set on sampler id");
regCache_.SetupABI({
RegCache::GEN_ARG_U,
RegCache::GEN_ARG_V,
RegCache::GEN_ARG_TEXPTR,
RegCache::GEN_ARG_BUFW,
RegCache::GEN_ARG_LEVEL,
RegCache::GEN_ARG_ID,
});
regCache_.ChangeReg(RAX, RegCache::GEN_RESULT);
regCache_.ForceRetain(RegCache::GEN_RESULT);
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
BeginWrite(2048);
Describe("Init");
const u8 *start = AlignCode16();
#if PPSSPP_PLATFORM(WINDOWS)
stackArgPos_ = 8 + 32;
stackIDOffset_ = 8;
stackLevelOffset_ = 0;
#else
stackArgPos_ = 0;
stackIDOffset_ = -1;
stackLevelOffset_ = -1;
#endif
FixupBranch zeroSrc;
if (id.hasInvalidPtr) {
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
CMP(PTRBITS, R(srcReg), Imm8(0));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
FixupBranch nonZeroSrc = J_CC(CC_NZ);
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
PXOR(vecResultReg, R(vecResultReg));
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
}
if (!Jit_ReadTextureFormat(id)) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(start));
ERROR_LOG(Log::G3D, "Failed to compile fetch %s", DescribeSamplerID(id).c_str());
return nullptr;
}
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (regCache_.Has(RegCache::GEN_ARG_ID))
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
MOVD_xmm(vecResultReg, R(resultReg));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
regCache_.ForceRelease(RegCache::GEN_RESULT);
if (cpu_info.bSSE4_1) {
PMOVZXBD(vecResultReg, R(vecResultReg));
} else {
X64Reg vecTempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PXOR(vecTempReg, R(vecTempReg));
PUNPCKLBW(vecResultReg, R(vecTempReg));
PUNPCKLWD(vecResultReg, R(vecTempReg));
regCache_.Release(vecTempReg, RegCache::VEC_TEMP0);
}
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
Describe("Init");
if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);
}
RET();
regCache_.Reset(true);
EndWrite();
return (FetchFunc)start;
}
NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
_assert_msg_(!id.fetch && !id.linear, "Fetch and linear should be cleared on sampler id");
BeginWrite(2048);
Describe("Init");
WriteConstantPool(id);
const u8 *start = AlignCode16();
regCache_.SetupABI({
RegCache::VEC_ARG_S,
RegCache::VEC_ARG_T,
RegCache::VEC_ARG_COLOR,
RegCache::GEN_ARG_TEXPTR_PTR,
RegCache::GEN_ARG_BUFW_PTR,
RegCache::GEN_ARG_LEVEL,
RegCache::GEN_ARG_LEVELFRAC,
RegCache::GEN_ARG_ID,
});
#if PPSSPP_PLATFORM(WINDOWS)
stackArgPos_ = 8 + 32;
stackIDOffset_ = 24;
stackLevelOffset_ = 8;
#else
stackArgPos_ = 0;
stackIDOffset_ = -1;
stackLevelOffset_ = -1;
#endif
PUSH(R15);
PUSH(R14);
PUSH(R13);
PUSH(R12);
regCache_.Add(R15, RegCache::GEN_INVALID);
regCache_.Add(R14, RegCache::GEN_INVALID);
regCache_.Add(R13, RegCache::GEN_INVALID);
regCache_.Add(R12, RegCache::GEN_INVALID);
stackArgPos_ += 32;
#if PPSSPP_PLATFORM(WINDOWS)
stackUV1Offset_ = -8;
#else
stackUV1Offset_ = -stackArgPos_ - 8;
#endif
if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVEL) && id.useSharedClut)
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (!id.hasAnyMips && regCache_.Has(RegCache::GEN_ARG_LEVELFRAC))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
if (!cpu_info.bBMI2) {
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
MOV(64, R(R15), R(levelFracReg));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
regCache_.ChangeReg(R15, RegCache::GEN_ARG_LEVELFRAC);
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
}
} else if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
MOV(64, R(RDX), MDisp(RSP, stackArgPos_ + 0));
regCache_.ChangeReg(RDX, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
}
if (!cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
regCache_.ForceRetain(RegCache::GEN_SHIFTVAL);
}
bool success = true;
success = success && Jit_GetTexelCoords(id);
success = success && regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
X64Reg resultReg = regCache_.Alloc(RegCache::GEN_RESULT);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
regCache_.ForceRetain(RegCache::GEN_RESULT);
FixupBranch zeroSrc;
if (id.hasInvalidPtr) {
Describe("NullCheck");
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
if (id.hasAnyMips) {
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOV(64, R(tempReg), MDisp(srcReg, 0));
AND(64, R(tempReg), MDisp(srcReg, 8));
CMP(PTRBITS, R(tempReg), Imm8(0));
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
} else {
CMP(PTRBITS, MatR(srcReg), Imm8(0));
}
FixupBranch nonZeroSrc = J_CC(CC_NZ);
PXOR(XMM0, R(XMM0));
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
}
auto loadPtrs = [&](bool level1) {
X64Reg bufwReg = regCache_.Alloc(RegCache::GEN_ARG_BUFW);
X64Reg bufwPtrReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
MOVZX(32, 16, bufwReg, MDisp(bufwPtrReg, level1 ? 2 : 0));
regCache_.Unlock(bufwPtrReg, RegCache::GEN_ARG_BUFW_PTR);
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW);
X64Reg srcReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
X64Reg srcPtrReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
MOV(64, R(srcReg), MDisp(srcPtrReg, level1 ? 8 : 0));
regCache_.Unlock(srcPtrReg, RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR);
};
loadPtrs(false);
success = success && Jit_ReadTextureFormat(id);
X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
resultReg = regCache_.Find(RegCache::GEN_RESULT);
MOVD_xmm(vecResultReg, R(resultReg));
if (cpu_info.bSSE4_1) {
PMOVZXBW(vecResultReg, R(vecResultReg));
} else {
X64Reg zeroReg = GetZeroVec();
PUNPCKLBW(vecResultReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
if (id.hasAnyMips) {
X64Reg vecResultReg = regCache_.Alloc(RegCache::VEC_RESULT1);
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
CMP(8, R(levelFracReg), Imm8(0));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
} else {
CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
}
FixupBranch skip = J_CC(CC_Z, true);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
ADD(32, R(levelReg), Imm8(1));
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
} else {
ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
}
loadPtrs(true);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
MOV(32, R(uReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0));
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRetain(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
MOV(32, R(vReg), MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4));
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRetain(RegCache::GEN_ARG_V);
bool hadId = regCache_.Has(RegCache::GEN_ID);
bool hadZero = regCache_.Has(RegCache::VEC_ZERO);
success = success && Jit_ReadTextureFormat(id);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
MOVD_xmm(vecResultReg, R(resultReg));
if (cpu_info.bSSE4_1) {
PMOVZXBW(vecResultReg, R(vecResultReg));
} else {
X64Reg zeroReg = GetZeroVec();
PUNPCKLBW(vecResultReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
if (!hadId && regCache_.Has(RegCache::GEN_ID))
regCache_.ForceRelease(RegCache::GEN_ID);
if (!hadZero && regCache_.Has(RegCache::VEC_ZERO))
regCache_.ForceRelease(RegCache::VEC_ZERO);
SetJumpTarget(skip);
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT1);
} else {
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
}
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR))
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
if (regCache_.Has(RegCache::GEN_ARG_BUFW_PTR))
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
if (regCache_.Has(RegCache::GEN_SHIFTVAL))
regCache_.ForceRelease(RegCache::GEN_SHIFTVAL);
regCache_.ForceRelease(RegCache::GEN_RESULT);
if (id.hasAnyMips) {
Describe("BlendMips");
if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
}
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
CMP(8, R(levelFracReg), Imm8(0));
FixupBranch skip = J_CC(CC_Z, true);
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVD_xmm(fracReg, R(levelFracReg));
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
PMULLW(color1Reg, R(fracReg));
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(invFracReg, M(const10All16_));
PSUBW(invFracReg, R(fracReg));
PMULLW(XMM0, R(invFracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
regCache_.Release(invFracReg, RegCache::VEC_TEMP1);
PADDW(XMM0, R(color1Reg));
PSRLW(XMM0, 4);
regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
regCache_.ForceRelease(RegCache::VEC_RESULT1);
SetJumpTarget(skip);
}
success = success && Jit_ApplyTextureFunc(id);
Describe("Init");
if (cpu_info.bSSE4_1) {
PMOVZXWD(XMM0, R(XMM0));
} else {
X64Reg zeroReg = GetZeroVec();
PUNPCKLWD(XMM0, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
regCache_.ForceRelease(RegCache::VEC_RESULT);
if (regCache_.Has(RegCache::GEN_ARG_ID))
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
if (!success) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(start));
ERROR_LOG(Log::G3D, "Failed to compile nearest %s", DescribeSamplerID(id).c_str());
return nullptr;
}
if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);
}
POP(R12);
POP(R13);
POP(R14);
POP(R15);
RET();
regCache_.Reset(true);
EndWrite();
return (NearestFunc)start;
}
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
_assert_msg_(id.linear && !id.fetch, "Only linear should be set on sampler id");
BeginWrite(2048);
Describe("Init");
stackArgPos_ = -1;
WriteConstantPool(id);
const u8 *nearest = nullptr;
if (id.TexFmt() >= GE_TFMT_DXT1) {
regCache_.SetupABI({
RegCache::GEN_ARG_U,
RegCache::GEN_ARG_V,
RegCache::GEN_ARG_TEXPTR,
RegCache::GEN_ARG_BUFW,
RegCache::GEN_ARG_LEVEL,
RegCache::GEN_ARG_LEVELFRAC,
});
auto lockReg = [&](X64Reg r, RegCache::Purpose p) {
regCache_.ChangeReg(r, p);
regCache_.ForceRetain(p);
};
lockReg(RAX, RegCache::GEN_RESULT);
lockReg(XMM0, RegCache::VEC_ARG_U);
lockReg(XMM1, RegCache::VEC_ARG_V);
lockReg(XMM5, RegCache::VEC_RESULT);
#if !PPSSPP_PLATFORM(WINDOWS)
if (id.hasAnyMips) {
lockReg(XMM6, RegCache::VEC_U1);
lockReg(XMM7, RegCache::VEC_V1);
lockReg(XMM8, RegCache::VEC_RESULT1);
lockReg(XMM12, RegCache::VEC_INDEX1);
}
lockReg(XMM9, RegCache::VEC_ARG_COLOR);
lockReg(XMM10, RegCache::VEC_FRAC);
lockReg(XMM11, RegCache::VEC_INDEX);
#endif
nearest = AlignCode16();
if (!Jit_ReadTextureFormat(id)) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(nearest));
ERROR_LOG(Log::G3D, "Failed to compile linear nearest %s", DescribeSamplerID(id).c_str());
return nullptr;
}
Describe("Init");
RET();
regCache_.ForceRelease(RegCache::GEN_RESULT);
regCache_.ForceRelease(RegCache::VEC_ARG_U);
regCache_.ForceRelease(RegCache::VEC_ARG_V);
regCache_.ForceRelease(RegCache::VEC_RESULT);
auto unlockOptReg = [&](RegCache::Purpose p) {
if (regCache_.Has(p))
regCache_.ForceRelease(p);
};
unlockOptReg(RegCache::GEN_ARG_LEVEL);
unlockOptReg(RegCache::GEN_ARG_LEVELFRAC);
unlockOptReg(RegCache::VEC_U1);
unlockOptReg(RegCache::VEC_V1);
unlockOptReg(RegCache::VEC_RESULT1);
unlockOptReg(RegCache::VEC_ARG_COLOR);
unlockOptReg(RegCache::VEC_FRAC);
unlockOptReg(RegCache::VEC_INDEX);
unlockOptReg(RegCache::VEC_INDEX1);
regCache_.Reset(true);
}
EndWrite();
const u8 *linearResetPos = GetCodePointer();
Describe("Init");
regCache_.SetupABI({
RegCache::VEC_ARG_S,
RegCache::VEC_ARG_T,
RegCache::VEC_ARG_COLOR,
RegCache::GEN_ARG_TEXPTR_PTR,
RegCache::GEN_ARG_BUFW_PTR,
RegCache::GEN_ARG_LEVEL,
RegCache::GEN_ARG_LEVELFRAC,
RegCache::GEN_ARG_ID,
});
#if PPSSPP_PLATFORM(WINDOWS)
stackArgPos_ = 8 + 32;
stackArgPos_ += WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12 }, { R15, R14, R13, R12 });
stackIDOffset_ = 24;
stackLevelOffset_ = 8;
stackUV1Offset_ = -8;
#else
stackArgPos_ = 0;
stackArgPos_ += WriteProlog(0, {}, { R15, R14, R13, R12 });
stackIDOffset_ = -1;
stackLevelOffset_ = -1;
stackUV1Offset_ = -stackArgPos_ - 8;
#endif
if (nearest != nullptr) {
regCache_.ChangeReg(XMM10, RegCache::VEC_FRAC);
regCache_.ForceRetain(RegCache::VEC_FRAC);
regCache_.ChangeReg(XMM11, RegCache::VEC_INDEX);
regCache_.ForceRetain(RegCache::VEC_INDEX);
if (id.hasAnyMips) {
regCache_.ChangeReg(XMM12, RegCache::VEC_INDEX1);
regCache_.ForceRetain(RegCache::VEC_INDEX1);
}
}
if (id.hasAnyMips) {
regCache_.ChangeReg(XMM6, RegCache::VEC_U1);
regCache_.ChangeReg(XMM7, RegCache::VEC_V1);
regCache_.ForceRetain(RegCache::VEC_U1);
regCache_.ForceRetain(RegCache::VEC_V1);
} else if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
}
if (nearest != nullptr) {
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
MOVDQA(XMM9, R(primColorReg));
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
regCache_.ChangeReg(XMM9, RegCache::VEC_ARG_COLOR);
regCache_.ForceRetain(RegCache::VEC_ARG_COLOR);
}
if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR) && regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
MOV(64, R(R14), R(srcReg));
MOV(64, R(R15), R(bufwReg));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
} else if (regCache_.Has(RegCache::GEN_ARG_TEXPTR_PTR)) {
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
MOV(64, R(R14), R(srcReg));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 0));
} else {
MOV(64, R(R14), MDisp(RSP, stackArgPos_ + 0));
MOV(64, R(R15), MDisp(RSP, stackArgPos_ + 8));
}
regCache_.ChangeReg(R14, RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.ForceRetain(RegCache::GEN_ARG_TEXPTR_PTR);
if (!regCache_.Has(RegCache::GEN_ARG_BUFW_PTR)) {
regCache_.ChangeReg(R15, RegCache::GEN_ARG_BUFW_PTR);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW_PTR);
}
bool success = true;
success = success && Jit_GetTexelCoordsQuad(id);
FixupBranch zeroSrc;
if (id.hasInvalidPtr) {
Describe("NullCheck");
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
if (id.hasAnyMips) {
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOV(64, R(tempReg), MDisp(srcReg, 0));
AND(64, R(tempReg), MDisp(srcReg, 8));
CMP(PTRBITS, R(tempReg), Imm8(0));
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
} else {
CMP(PTRBITS, MatR(srcReg), Imm8(0));
}
FixupBranch nonZeroSrc = J_CC(CC_NZ);
PXOR(XMM0, R(XMM0));
zeroSrc = J(true);
SetJumpTarget(nonZeroSrc);
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
}
auto prepareDataOffsets = [&](RegCache::Purpose uPurpose, RegCache::Purpose vPurpose, bool level1) {
X64Reg uReg = regCache_.Find(uPurpose);
X64Reg vReg = regCache_.Find(vPurpose);
success = success && Jit_PrepareDataOffsets(id, uReg, vReg, level1);
regCache_.Unlock(uReg, uPurpose);
regCache_.Unlock(vReg, vPurpose);
};
Describe("DataOffsets");
prepareDataOffsets(RegCache::VEC_ARG_U, RegCache::VEC_ARG_V, false);
if (id.hasAnyMips)
prepareDataOffsets(RegCache::VEC_U1, RegCache::VEC_V1, true);
if (nearest == nullptr && id.TexFmt() != GE_TFMT_CLUT4)
regCache_.ForceRelease(RegCache::VEC_ARG_U);
if (nearest != nullptr) {
regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT);
regCache_.ForceRetain(RegCache::VEC_RESULT);
if (id.hasAnyMips) {
regCache_.ChangeReg(XMM8, RegCache::VEC_RESULT1);
regCache_.ForceRetain(RegCache::VEC_RESULT1);
}
}
auto doNearestCall = [&](int off, bool level1) {
#if PPSSPP_PLATFORM(WINDOWS)
static const X64Reg uArgReg = RCX;
static const X64Reg vArgReg = RDX;
static const X64Reg srcArgReg = R8;
static const X64Reg bufwArgReg = R9;
#else
static const X64Reg uArgReg = RDI;
static const X64Reg vArgReg = RSI;
static const X64Reg srcArgReg = RDX;
static const X64Reg bufwArgReg = RCX;
#endif
static const X64Reg resultReg = RAX;
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
X64Reg vReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
_assert_(level1 || (uReg == XMM0 && vReg == XMM1));
if (cpu_info.bSSE4_1) {
PEXTRD(R(uArgReg), uReg, off / 4);
PEXTRD(R(vArgReg), vReg, off / 4);
} else {
MOVD_xmm(R(uArgReg), uReg);
MOVD_xmm(R(vArgReg), vReg);
PSRLDQ(uReg, 4);
PSRLDQ(vReg, 4);
}
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
regCache_.Unlock(vReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
X64Reg indexReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
if (cpu_info.bSSE4_1) {
PEXTRD(R(srcArgReg), indexReg, off / 4);
} else {
MOVD_xmm(R(srcArgReg), indexReg);
PSRLDQ(indexReg, 4);
}
regCache_.Unlock(indexReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
ADD(64, R(srcArgReg), MDisp(srcReg, level1 ? 8 : 0));
MOVZX(32, 16, bufwArgReg, MDisp(bufwReg, level1 ? 2 : 0));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
CALL(nearest);
X64Reg vecResultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
if (cpu_info.bSSE4_1) {
PINSRD(vecResultReg, R(resultReg), off / 4);
} else if (off == 0) {
MOVD_xmm(vecResultReg, R(resultReg));
} else {
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVD_xmm(tempReg, R(resultReg));
PSLLDQ(tempReg, off);
POR(vecResultReg, R(tempReg));
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
}
regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
};
if (nearest != nullptr) {
Describe("Calls");
doNearestCall(0, false);
doNearestCall(4, false);
doNearestCall(8, false);
doNearestCall(12, false);
if (regCache_.Has(RegCache::GEN_ID))
regCache_.ForceRelease(RegCache::GEN_ID);
if (regCache_.Has(RegCache::VEC_ZERO))
regCache_.ForceRelease(RegCache::VEC_ZERO);
} else {
success = success && Jit_FetchQuad(id, false);
}
if (id.hasAnyMips) {
Describe("MipsCalls");
if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
CMP(8, R(levelFracReg), Imm8(0));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
} else {
CMP(8, MDisp(RSP, stackArgPos_ + 16), Imm8(0));
}
FixupBranch skip = J_CC(CC_Z, true);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
ADD(32, R(levelReg), Imm8(1));
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
} else {
ADD(32, MDisp(RSP, stackArgPos_ + stackLevelOffset_), Imm8(1));
}
if (nearest != nullptr) {
Describe("MipsCalls");
doNearestCall(0, true);
doNearestCall(4, true);
doNearestCall(8, true);
doNearestCall(12, true);
} else {
success = success && Jit_FetchQuad(id, true);
}
SetJumpTarget(skip);
}
if (nearest != nullptr) {
regCache_.ForceRelease(RegCache::VEC_ARG_U);
regCache_.ForceRelease(RegCache::VEC_ARG_V);
regCache_.ForceRelease(RegCache::VEC_INDEX);
}
if (regCache_.Has(RegCache::VEC_INDEX1))
regCache_.ForceRelease(RegCache::VEC_INDEX1);
if (regCache_.Has(RegCache::VEC_U1))
regCache_.ForceRelease(RegCache::VEC_U1);
if (regCache_.Has(RegCache::VEC_V1))
regCache_.ForceRelease(RegCache::VEC_V1);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW_PTR);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
success = success && Jit_DecodeQuad(id, false);
success = success && Jit_BlendQuad(id, false);
if (id.hasAnyMips) {
Describe("BlendMips");
if (!regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
X64Reg levelFracReg = regCache_.Alloc(RegCache::GEN_ARG_LEVELFRAC);
MOVZX(32, 8, levelFracReg, MDisp(RSP, stackArgPos_ + 16));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
regCache_.ForceRetain(RegCache::GEN_ARG_LEVELFRAC);
}
X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
CMP(8, R(levelFracReg), Imm8(0));
FixupBranch skip = J_CC(CC_Z, true);
success = success && Jit_DecodeQuad(id, true);
success = success && Jit_BlendQuad(id, true);
Describe("BlendMips");
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVD_xmm(fracReg, R(levelFracReg));
PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
X64Reg color1Reg = regCache_.Find(RegCache::VEC_RESULT1);
PMULLW(color1Reg, R(fracReg));
X64Reg invFracReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(invFracReg, M(const10All16_));
PSUBW(invFracReg, R(fracReg));
PMULLW(XMM0, R(invFracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
regCache_.Release(invFracReg, RegCache::VEC_TEMP1);
PADDW(XMM0, R(color1Reg));
PSRLW(XMM0, 4);
regCache_.Unlock(color1Reg, RegCache::VEC_RESULT1);
regCache_.ForceRelease(RegCache::VEC_RESULT1);
SetJumpTarget(skip);
}
if (regCache_.Has(RegCache::VEC_FRAC))
regCache_.ForceRelease(RegCache::VEC_FRAC);
success = success && Jit_ApplyTextureFunc(id);
Describe("Init");
if (cpu_info.bSSE4_1) {
PMOVZXWD(XMM0, R(XMM0));
} else {
X64Reg zeroReg = GetZeroVec();
PUNPCKLWD(XMM0, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
regCache_.ForceRelease(RegCache::VEC_RESULT);
if (regCache_.Has(RegCache::GEN_ARG_ID))
regCache_.ForceRelease(RegCache::GEN_ARG_ID);
if (!success) {
regCache_.Reset(false);
EndWrite();
ResetCodePtr(GetOffset(nearest ? nearest : linearResetPos));
ERROR_LOG(Log::G3D, "Failed to compile linear %s", DescribeSamplerID(id).c_str());
return nullptr;
}
if (id.hasInvalidPtr) {
SetJumpTarget(zeroSrc);
}
const u8 *start = WriteFinalizedEpilog();
regCache_.Reset(true);
return (LinearFunc)start;
}
void SamplerJitCache::WriteConstantPool(const SamplerID &id) {
WriteSimpleConst8x16(const10All16_, 0x10);
WriteSimpleConst16x8(const10All8_, 0x10);
if (const10Low_ == nullptr) {
const10Low_ = AlignCode16();
for (int i = 0; i < 4; ++i)
Write16(0x10);
for (int i = 0; i < 4; ++i)
Write16(0);
}
WriteSimpleConst4x32(constOnes32_, 1);
WriteSimpleConst8x16(constOnes16_, 1);
WriteSimpleConst4x32(constMaxTexel32_, 511);
if (constUNext_ == nullptr) {
constUNext_ = AlignCode16();
Write32(0); Write32(1); Write32(0); Write32(1);
}
if (constVNext_ == nullptr) {
constVNext_ = AlignCode16();
Write32(0); Write32(0); Write32(1); Write32(1);
}
WriteSimpleConst4x32(const5551Swizzle_, 0x00070707);
WriteSimpleConst4x32(const5650Swizzle_, 0x00070307);
if (!id.hasAnyMips) {
float w256f = (1 << id.width0Shift) * 256;
float h256f = (1 << id.height0Shift) * 256;
constWidthHeight256f_ = AlignCode16();
Write32(*(uint32_t *)&w256f);
Write32(*(uint32_t *)&h256f);
Write32(*(uint32_t *)&w256f);
Write32(*(uint32_t *)&h256f);
WriteDynamicConst4x32(constWidthMinus1i_, id.width0Shift > 9 ? 511 : (1 << id.width0Shift) - 1);
WriteDynamicConst4x32(constHeightMinus1i_, id.height0Shift > 9 ? 511 : (1 << id.height0Shift) - 1);
} else {
constWidthHeight256f_ = nullptr;
constWidthMinus1i_ = nullptr;
constHeightMinus1i_ = nullptr;
}
}
RegCache::Reg SamplerJitCache::GetSamplerID() {
if (regCache_.Has(RegCache::GEN_ARG_ID))
return regCache_.Find(RegCache::GEN_ARG_ID);
if (!regCache_.Has(RegCache::GEN_ID)) {
X64Reg r = regCache_.Alloc(RegCache::GEN_ID);
_assert_(stackIDOffset_ != -1);
MOV(PTRBITS, R(r), MDisp(RSP, stackArgPos_ + stackIDOffset_));
return r;
}
return regCache_.Find(RegCache::GEN_ID);
}
void SamplerJitCache::UnlockSamplerID(RegCache::Reg &r) {
if (regCache_.Has(RegCache::GEN_ARG_ID))
regCache_.Unlock(r, RegCache::GEN_ARG_ID);
else
regCache_.Unlock(r, RegCache::GEN_ID);
}
bool SamplerJitCache::Jit_FetchQuad(const SamplerID &id, bool level1) {
bool success = true;
switch (id.TexFmt()) {
case GE_TFMT_5650:
case GE_TFMT_5551:
case GE_TFMT_4444:
success = Jit_GetDataQuad(id, level1, 16);
if (cpu_info.bAVX2) {
X64Reg destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
PSLLD(destReg, 16);
PSRLD(destReg, 16);
regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
}
break;
case GE_TFMT_8888:
success = Jit_GetDataQuad(id, level1, 32);
break;
case GE_TFMT_CLUT32:
success = Jit_GetDataQuad(id, level1, 32);
if (success)
success = Jit_TransformClutIndexQuad(id, 32);
if (success)
success = Jit_ReadClutQuad(id, level1);
break;
case GE_TFMT_CLUT16:
success = Jit_GetDataQuad(id, level1, 16);
if (success)
success = Jit_TransformClutIndexQuad(id, 16);
if (success)
success = Jit_ReadClutQuad(id, level1);
break;
case GE_TFMT_CLUT8:
success = Jit_GetDataQuad(id, level1, 8);
if (success)
success = Jit_TransformClutIndexQuad(id, 8);
if (success)
success = Jit_ReadClutQuad(id, level1);
break;
case GE_TFMT_CLUT4:
success = Jit_GetDataQuad(id, level1, 4);
if (success)
success = Jit_TransformClutIndexQuad(id, 4);
if (success)
success = Jit_ReadClutQuad(id, level1);
break;
case GE_TFMT_DXT1:
case GE_TFMT_DXT3:
case GE_TFMT_DXT5:
success = false;
break;
default:
success = false;
}
return success;
}
bool SamplerJitCache::Jit_GetDataQuad(const SamplerID &id, bool level1, int bitsPerTexel) {
Describe("DataQuad");
bool success = true;
X64Reg baseReg = regCache_.Alloc(RegCache::GEN_ARG_TEXPTR);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
MOV(64, R(baseReg), MDisp(srcReg, level1 ? 8 : 0));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
X64Reg destReg = INVALID_REG;
if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32)
destReg = regCache_.Alloc(RegCache::VEC_INDEX);
else if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
destReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
else
destReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg byteOffsetReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
if (cpu_info.bAVX2 && id.overReadSafe) {
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PCMPEQD(maskReg, R(maskReg));
VPGATHERDD(128, destReg, MComplex(baseReg, byteOffsetReg, SCALE_1, 0), maskReg);
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
} else {
if (bitsPerTexel != 32)
PXOR(destReg, R(destReg));
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bSSE4_1) {
for (int i = 0; i < 4; ++i) {
PEXTRD(R(temp2Reg), byteOffsetReg, i);
if (bitsPerTexel <= 8)
PINSRB(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 4);
else if (bitsPerTexel == 16)
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
else if (bitsPerTexel == 32)
PINSRD(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i);
}
} else {
for (int i = 0; i < 4; ++i) {
MOVD_xmm(R(temp2Reg), byteOffsetReg);
if (i != 3)
PSRLDQ(byteOffsetReg, 4);
if (bitsPerTexel <= 8) {
MOVZX(32, 8, temp2Reg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
PINSRW(destReg, R(temp2Reg), i * 2);
} else if (bitsPerTexel == 16) {
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
} else if (bitsPerTexel == 32) {
if (i == 0) {
MOVD_xmm(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
} else {
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 2), i * 2 + 1);
}
}
}
}
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
regCache_.Unlock(byteOffsetReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
regCache_.ForceRelease(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
regCache_.Release(baseReg, RegCache::GEN_ARG_TEXPTR);
if (bitsPerTexel == 4) {
X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
if (cpu_info.bAVX2) {
PSLLD(uReg, 31);
PSRLD(uReg, 29);
VPSRLVD(128, destReg, destReg, R(uReg));
} else {
PSLLD(uReg, 31);
PSRAD(uReg, 31);
X64Reg unshiftedReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVDQA(unshiftedReg, R(destReg));
PSRLD(destReg, 4);
PAND(destReg, R(uReg));
PANDN(uReg, R(unshiftedReg));
POR(destReg, R(uReg));
regCache_.Release(unshiftedReg, RegCache::VEC_TEMP0);
}
regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
regCache_.ForceRelease(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
}
if (id.TexFmt() >= GE_TFMT_CLUT4 && id.TexFmt() <= GE_TFMT_CLUT32) {
regCache_.Unlock(destReg, RegCache::VEC_INDEX);
} else {
regCache_.Unlock(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
}
return success;
}
bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPerIndex) {
Describe("TrCLUTQuad");
GEPaletteFormat fmt = id.ClutFmt();
if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
PSLLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
PSRLD(indexReg, bitsPerIndex >= 8 ? 24 : 28);
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
return true;
}
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
bool maskedIndex = false;
X64Reg formatReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg idReg = GetSamplerID();
if (cpu_info.bAVX2 && !id.hasClutShift)
VPBROADCASTD(128, formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
else
MOVD_xmm(formatReg, MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
UnlockSamplerID(idReg);
if (id.hasClutShift) {
if (bitsPerIndex < 32) {
PSLLD(indexReg, 32 - bitsPerIndex);
PSRLD(indexReg, 32 - bitsPerIndex);
maskedIndex = true;
}
X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
PSLLD(shiftReg, formatReg, 32 - 7);
PSRLD(shiftReg, 32 - 5);
PSRLD(indexReg, R(shiftReg));
regCache_.Release(shiftReg, RegCache::VEC_TEMP1);
}
if (!cpu_info.bAVX2 || id.hasClutShift)
PSHUFD(formatReg, R(formatReg), _MM_SHUFFLE(0, 0, 0, 0));
if (id.hasClutMask) {
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
PSLLD(maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
PAND(indexReg, R(maskReg));
regCache_.Release(maskReg, RegCache::VEC_TEMP1);
} else if (!maskedIndex || bitsPerIndex > 8) {
PSLLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
PSRLD(indexReg, maskedIndex || bitsPerIndex >= 8 ? 24 : 28);
}
if (id.hasClutOffset) {
int offsetBits = fmt == GE_CMODE_32BIT_ABGR8888 ? 4 : 5;
PSRLD(formatReg, 16);
PSLLD(formatReg, 32 - offsetBits);
PSRLD(formatReg, 32 - offsetBits - 4);
POR(indexReg, R(formatReg));
}
regCache_.Release(formatReg, RegCache::VEC_TEMP0);
regCache_.Unlock(indexReg, RegCache::VEC_INDEX);
return true;
}
bool SamplerJitCache::Jit_ReadClutQuad(const SamplerID &id, bool level1) {
Describe("ReadCLUTQuad");
X64Reg indexReg = regCache_.Find(RegCache::VEC_INDEX);
if (!id.useSharedClut) {
X64Reg vecLevelReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
MOVD_xmm(vecLevelReg, R(levelReg));
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
} else {
#if PPSSPP_PLATFORM(WINDOWS)
if (cpu_info.bAVX2) {
VPBROADCASTD(128, vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));
} else {
MOVD_xmm(vecLevelReg, MDisp(RSP, stackArgPos_ + stackLevelOffset_));
PSHUFD(vecLevelReg, R(vecLevelReg), _MM_SHUFFLE(0, 0, 0, 0));
}
#else
_assert_(false);
#endif
}
PSLLD(vecLevelReg, 4);
PADDD(indexReg, R(vecLevelReg));
regCache_.Release(vecLevelReg, RegCache::VEC_TEMP0);
}
X64Reg idReg = GetSamplerID();
X64Reg clutBaseReg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(clutBaseReg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
UnlockSamplerID(idReg);
X64Reg resultReg = INVALID_REG;
if (regCache_.Has(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT))
resultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
else
resultReg = regCache_.Alloc(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (cpu_info.bAVX2 && id.overReadSafe)
PCMPEQD(maskReg, R(maskReg));
switch (id.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650:
case GE_CMODE_16BIT_ABGR5551:
case GE_CMODE_16BIT_ABGR4444:
if (cpu_info.bAVX2 && id.overReadSafe) {
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_2, 0), maskReg);
PCMPEQD(maskReg, R(maskReg));
PSRLD(maskReg, 16);
PAND(resultReg, R(maskReg));
} else {
PXOR(resultReg, R(resultReg));
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bSSE4_1) {
for (int i = 0; i < 4; ++i) {
PEXTRD(R(temp2Reg), indexReg, i);
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
}
} else {
for (int i = 0; i < 4; ++i) {
MOVD_xmm(R(temp2Reg), indexReg);
if (i != 3)
PSRLDQ(indexReg, 4);
PINSRW(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_2, 0), i * 2);
}
}
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
break;
case GE_CMODE_32BIT_ABGR8888:
if (cpu_info.bAVX2 && id.overReadSafe) {
VPGATHERDD(128, resultReg, MComplex(clutBaseReg, indexReg, SCALE_4, 0), maskReg);
} else {
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bSSE4_1) {
for (int i = 0; i < 4; ++i) {
PEXTRD(R(temp2Reg), indexReg, i);
PINSRD(resultReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0), i);
}
} else {
for (int i = 0; i < 4; ++i) {
MOVD_xmm(R(temp2Reg), indexReg);
if (i != 3)
PSRLDQ(indexReg, 4);
if (i == 0) {
MOVD_xmm(resultReg , MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
} else {
MOVD_xmm(maskReg, MComplex(clutBaseReg, temp2Reg, SCALE_4, 0));
PSLLDQ(maskReg, 4 * i);
POR(resultReg, R(maskReg));
}
}
}
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
break;
}
regCache_.Release(maskReg, RegCache::VEC_TEMP0);
regCache_.Unlock(resultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
regCache_.ForceRetain(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
regCache_.Release(clutBaseReg, RegCache::GEN_TEMP1);
regCache_.Release(indexReg, RegCache::VEC_INDEX);
return true;
}
bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
Describe(level1 ? "BlendQuadMips" : "BlendQuad");
if (cpu_info.bSSE4_1 && cpu_info.bSSSE3) {
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg tempArrangeReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
PUNPCKLBW(quadReg, R(tempArrangeReg));
PSHUFD(tempArrangeReg, R(quadReg), _MM_SHUFFLE(3, 2, 3, 2));
PUNPCKLWD(quadReg, R(tempArrangeReg));
regCache_.Release(tempArrangeReg, RegCache::VEC_TEMP0);
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);
X64Reg zeroReg = GetZeroVec();
if (level1) {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
} else {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
}
PSHUFB(fracReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
X64Reg multTBReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(multTBReg, M(const10All8_));
PSUBB(multTBReg, R(fracReg));
PUNPCKLBW(multTBReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
PMADDUBSW(quadReg, R(multTBReg));
regCache_.Release(multTBReg, RegCache::VEC_TEMP1);
fracReg = regCache_.Alloc(RegCache::VEC_TEMP0);
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
if (level1) {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
} else {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));
}
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
X64Reg multLRReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(multLRReg, M(const10All16_));
PSUBW(multLRReg, R(fracReg));
PUNPCKLWD(multLRReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP0);
PMADDWD(quadReg, R(multLRReg));
PSRLD(quadReg, 8);
regCache_.Release(multLRReg, RegCache::VEC_TEMP1);
if (level1) {
PACKSSDW(quadReg, R(quadReg));
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
} else {
if (cpu_info.bAVX) {
VPACKSSDW(128, XMM0, quadReg, R(quadReg));
} else {
PACKSSDW(quadReg, R(quadReg));
MOVDQA(XMM0, R(quadReg));
}
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
regCache_.ForceRelease(RegCache::VEC_RESULT);
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
_assert_msg_(changeSuccess, "Unexpected reg locked as destReg");
}
} else {
X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);
X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
X64Reg zeroReg = GetZeroVec();
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PUNPCKLBW(topReg, R(zeroReg));
PUNPCKLBW(bottomReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
if (!level1) {
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
regCache_.ForceRelease(RegCache::VEC_RESULT);
}
X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg allFracReg = regCache_.Find(RegCache::VEC_FRAC);
X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);
if (level1) {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
} else {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(0, 0, 0, 0));
}
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
MOVDQA(fracMulReg, M(const10Low_));
PSUBW(fracMulReg, R(fracReg));
PUNPCKLQDQ(fracMulReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
PMULLW(topReg, R(fracMulReg));
PMULLW(bottomReg, R(fracMulReg));
regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);
fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);
if (level1) {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
} else {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
}
PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
MOVDQA(fracTopReg, M(const10All16_));
PSUBW(fracTopReg, R(fracReg));
PMULLW(bottomReg, R(fracReg));
PMULLW(topReg, R(fracTopReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);
regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);
PADDUSW(bottomReg, R(topReg));
regCache_.Release(topReg, RegCache::VEC_TEMP0);
if (level1) {
PSHUFD(quadReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(quadReg, R(bottomReg));
PSRLW(quadReg, 8);
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
regCache_.Unlock(quadReg, RegCache::VEC_RESULT1);
} else {
bool changeSuccess = regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
if (!changeSuccess) {
_assert_msg_(XMM0 == bottomReg, "Unexpected other reg locked as destReg");
X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(bottomReg, R(otherReg));
regCache_.Release(otherReg, RegCache::VEC_TEMP0);
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
regCache_.ChangeReg(XMM0, RegCache::VEC_RESULT);
} else {
PSHUFD(XMM0, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
PADDUSW(XMM0, R(bottomReg));
regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
}
PSRLW(XMM0, 8);
}
}
return true;
}
bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT);
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
auto useAlphaFrom = [&](X64Reg alphaColorReg) {
if (cpu_info.bSSE4_1) {
PBLENDW(resultReg, R(alphaColorReg), 0x08);
} else {
PSRLDQ(alphaColorReg, 6);
PSLLDQ(alphaColorReg, 6);
PSLLDQ(resultReg, 10);
PSRLDQ(resultReg, 10);
POR(resultReg, R(alphaColorReg));
}
};
switch (id.TexFunc()) {
case GE_TEXFUNC_MODULATE:
Describe("Modulate");
PACKSSDW(primColorReg, R(primColorReg));
if (cpu_info.bAVX) {
VPADDW(128, tempReg, primColorReg, M(constOnes16_));
VPMULLW(128, resultReg, tempReg, R(resultReg));
} else {
MOVDQA(tempReg, M(constOnes16_));
PADDW(tempReg, R(primColorReg));
PMULLW(resultReg, R(tempReg));
}
if (id.useColorDoubling)
PSRLW(resultReg, 7);
else
PSRLW(resultReg, 8);
if (!id.useTextureAlpha) {
useAlphaFrom(primColorReg);
} else if (id.useColorDoubling) {
PSRLW(primColorReg, resultReg, 1);
useAlphaFrom(primColorReg);
}
break;
case GE_TEXFUNC_DECAL:
Describe("Decal");
PACKSSDW(primColorReg, R(primColorReg));
if (id.useTextureAlpha) {
PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3));
PADDW(resultReg, M(constOnes16_));
PMULLW(resultReg, R(tempReg));
X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1);
PCMPEQD(invAlphaReg, R(invAlphaReg));
PSRLW(invAlphaReg, 8);
PSUBW(invAlphaReg, R(tempReg));
MOVDQA(tempReg, R(primColorReg));
PADDW(tempReg, M(constOnes16_));
PMULLW(tempReg, R(invAlphaReg));
regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1);
PADDW(resultReg, R(tempReg));
if (id.useColorDoubling)
PSRLW(resultReg, 7);
else
PSRLW(resultReg, 8);
} else if (id.useColorDoubling) {
PSLLW(resultReg, 1);
}
useAlphaFrom(primColorReg);
break;
case GE_TEXFUNC_BLEND:
{
Describe("EnvBlend");
PACKSSDW(primColorReg, R(primColorReg));
X64Reg idReg = GetSamplerID();
X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1);
if (cpu_info.bSSE4_1) {
PMOVZXBW(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));
} else {
MOVD_xmm(texEnvReg, MDisp(idReg, offsetof(SamplerID, cached.texBlendColor)));
X64Reg zeroReg = GetZeroVec();
PUNPCKLBW(texEnvReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
UnlockSamplerID(idReg);
PUNPCKLWD(texEnvReg, R(primColorReg));
PCMPEQD(tempReg, R(tempReg));
PSRLW(tempReg, 8);
if (id.useTextureAlpha)
PSRLDQ(tempReg, 10);
X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP2);
MOVDQA(roundValueReg, R(tempReg));
PSUBUSW(tempReg, R(resultReg));
PUNPCKLWD(resultReg, R(tempReg));
if (id.useTextureAlpha) {
PADDW(primColorReg, M(constOnes16_));
PSRLDQ(primColorReg, 6);
PSLLDQ(primColorReg, 12);
POR(texEnvReg, R(primColorReg));
}
PMADDWD(resultReg, R(texEnvReg));
regCache_.Release(texEnvReg, RegCache::VEC_TEMP1);
if (cpu_info.bSSE4_1) {
PACKUSDW(resultReg, R(resultReg));
} else {
PSLLD(resultReg, 16);
PSRAD(resultReg, 16);
PACKSSDW(resultReg, R(resultReg));
}
PADDW(resultReg, R(roundValueReg));
regCache_.Release(roundValueReg, RegCache::VEC_TEMP2);
if (id.useColorDoubling && id.useTextureAlpha) {
PSRLW(resultReg, 7);
PSRLW(primColorReg, resultReg, 1);
useAlphaFrom(primColorReg);
} else if (id.useColorDoubling) {
PSRLW(resultReg, 7);
} else {
PSRLW(resultReg, 8);
}
if (!id.useTextureAlpha)
useAlphaFrom(primColorReg);
break;
}
case GE_TEXFUNC_REPLACE:
Describe("Replace");
if (id.useColorDoubling && id.useTextureAlpha) {
MOVDQA(primColorReg, R(resultReg));
PSLLDQ(resultReg, 10);
PSRLDQ(resultReg, 10);
PADDW(resultReg, R(primColorReg));
} else if (!id.useTextureAlpha) {
if (id.useColorDoubling) {
PSLLW(resultReg, 1);
}
PACKSSDW(primColorReg, R(primColorReg));
useAlphaFrom(primColorReg);
}
break;
case GE_TEXFUNC_ADD:
case GE_TEXFUNC_UNKNOWN1:
case GE_TEXFUNC_UNKNOWN2:
case GE_TEXFUNC_UNKNOWN3:
Describe("Add");
PACKSSDW(primColorReg, R(primColorReg));
if (id.useTextureAlpha) {
MOVDQA(tempReg, M(constOnes16_));
PADDW(tempReg, R(primColorReg));
PMULLW(tempReg, R(resultReg));
PADDW(resultReg, R(primColorReg));
if (id.useColorDoubling)
PSLLW(resultReg, 1);
PSRLW(tempReg, 8);
useAlphaFrom(tempReg);
} else {
PADDW(resultReg, R(primColorReg));
if (id.useColorDoubling)
PSLLW(resultReg, 1);
useAlphaFrom(primColorReg);
}
break;
}
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
regCache_.Unlock(resultReg, RegCache::VEC_RESULT);
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
return true;
}
bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
GETextureFormat fmt = id.TexFmt();
bool success = true;
switch (fmt) {
case GE_TFMT_5650:
success = Jit_GetTexData(id, 16);
if (success)
success = Jit_Decode5650(id);
break;
case GE_TFMT_5551:
success = Jit_GetTexData(id, 16);
if (success)
success = Jit_Decode5551(id);
break;
case GE_TFMT_4444:
success = Jit_GetTexData(id, 16);
if (success)
success = Jit_Decode4444(id);
break;
case GE_TFMT_8888:
success = Jit_GetTexData(id, 32);
break;
case GE_TFMT_CLUT32:
success = Jit_GetTexData(id, 32);
if (success)
success = Jit_TransformClutIndex(id, 32);
if (success)
success = Jit_ReadClutColor(id);
break;
case GE_TFMT_CLUT16:
success = Jit_GetTexData(id, 16);
if (success)
success = Jit_TransformClutIndex(id, 16);
if (success)
success = Jit_ReadClutColor(id);
break;
case GE_TFMT_CLUT8:
success = Jit_GetTexData(id, 8);
if (success)
success = Jit_TransformClutIndex(id, 8);
if (success)
success = Jit_ReadClutColor(id);
break;
case GE_TFMT_CLUT4:
success = Jit_GetTexData(id, 4);
if (success)
success = Jit_TransformClutIndex(id, 4);
if (success)
success = Jit_ReadClutColor(id);
break;
case GE_TFMT_DXT1:
success = Jit_GetDXT1Color(id, 8, 255);
break;
case GE_TFMT_DXT3:
success = Jit_GetDXT1Color(id, 16, 0);
if (success)
success = Jit_ApplyDXTAlpha(id);
break;
case GE_TFMT_DXT5:
success = Jit_GetDXT1Color(id, 16, 0);
if (success)
success = Jit_ApplyDXTAlpha(id);
break;
default:
success = false;
}
return success;
}
bool SamplerJitCache::Jit_GetDXT1Color(const SamplerID &id, int blockSize, int alpha) {
Describe("DXT1");
_assert_msg_(blockSize == 8 || blockSize == 16, "Invalid DXT block size");
X64Reg colorIndexReg = INVALID_REG;
if (!id.linear) {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg srcBaseReg = regCache_.Alloc(RegCache::GEN_TEMP0);
LEA(32, srcBaseReg, MScaled(uReg, blockSize / 4, 0));
AND(32, R(srcBaseReg), Imm32(blockSize == 8 ? ~7 : ~15));
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
ADD(64, R(srcBaseReg), R(srcReg));
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
X64Reg srcOffsetReg = regCache_.Alloc(RegCache::GEN_TEMP1);
LEA(32, srcOffsetReg, MScaled(vReg, blockSize / 4, 0));
AND(32, R(srcOffsetReg), Imm32(blockSize == 8 ? ~7 : ~15));
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
SHR(32, R(bufwReg), Imm8(2));
IMUL(32, srcOffsetReg, R(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
AND(32, R(uReg), Imm32(3));
AND(32, R(vReg), Imm32(3));
LEA(64, srcReg, MRegSum(srcBaseReg, srcOffsetReg));
regCache_.Release(srcBaseReg, RegCache::GEN_TEMP0);
regCache_.Release(srcOffsetReg, RegCache::GEN_TEMP1);
if (uReg != ECX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOVZX(32, 8, colorIndexReg, MRegSum(srcReg, vReg));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
if (id.TexFmt() == GE_TFMT_DXT1)
regCache_.ForceRelease(RegCache::GEN_ARG_V);
if (uReg == ECX) {
SHR(32, R(colorIndexReg), R(CL));
SHR(32, R(colorIndexReg), R(CL));
} else if (cpu_info.bBMI2) {
SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
SHRX(32, colorIndexReg, R(colorIndexReg), uReg);
} else {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
LEA(32, ECX, MScaled(uReg, SCALE_2, 0));
SHR(32, R(colorIndexReg), R(CL));
}
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
if (id.TexFmt() == GE_TFMT_DXT1)
regCache_.ForceRelease(RegCache::GEN_ARG_U);
} else {
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
if (id.TexFmt() == GE_TFMT_DXT1)
regCache_.ForceRelease(RegCache::GEN_ARG_V);
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
if (uReg != ECX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
colorIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
if (cpu_info.bBMI2) {
SHRX(32, colorIndexReg, MatR(srcReg), uReg);
} else {
MOV(32, R(colorIndexReg), MatR(srcReg));
if (uReg != RCX) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
MOV(32, R(RCX), R(uReg));
}
SHR(32, R(colorIndexReg), R(CL));
}
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRelease(RegCache::GEN_ARG_U);
}
AND(32, R(colorIndexReg), Imm32(3));
X64Reg color1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg color2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
CMP(32, R(colorIndexReg), Imm32(1));
FixupBranch handleSimple565 = J_CC(CC_BE);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
MOVZX(32, 16, color1Reg, MDisp(srcReg, 4));
MOVZX(32, 16, color2Reg, MDisp(srcReg, 6));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
CMP(32, R(color1Reg), R(color2Reg));
FixupBranch handleMix23 = J_CC(CC_A, true);
XOR(32, R(resultReg), R(resultReg));
CMP(32, R(colorIndexReg), Imm32(3));
FixupBranch finishZero = J_CC(CC_E, true);
if (cpu_info.bBMI2_fast) {
MOV(32, R(colorIndexReg), Imm32(0x007C7E7C));
PDEP(32, color1Reg, color1Reg, R(colorIndexReg));
PDEP(32, color2Reg, color2Reg, R(colorIndexReg));
LEA(32, resultReg, MRegSum(color1Reg, color2Reg));
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
PUSH(R12);
PUSH(R13);
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x0000F800));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000F800));
LEA(32, R12, MRegSum(resultReg, colorIndexReg));
SHR(32, R(R12), Imm8(9));
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x000007E0));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x000007E0));
LEA(32, resultReg, MRegSum(resultReg, colorIndexReg));
SHL(32, R(resultReg), Imm8(5 - 1));
OR(32, R(resultReg), R(R12));
AND(32, R(color1Reg), Imm32(0x0000001F));
AND(32, R(color2Reg), Imm32(0x0000001F));
LEA(32, colorIndexReg, MRegSum(color1Reg, color2Reg));
SHL(32, R(colorIndexReg), Imm8(16 + 2));
OR(32, R(resultReg), R(colorIndexReg));
POP(R13);
POP(R12);
}
FixupBranch finishMix50 = J(true);
SetJumpTarget(handleSimple565);
srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
MOVZX(32, 16, colorIndexReg, MComplex(srcReg, colorIndexReg, SCALE_2, 4));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
if (id.TexFmt() == GE_TFMT_DXT1)
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
if (cpu_info.bBMI2_fast) {
MOV(32, R(resultReg), Imm32(0x00F8FCF8));
PDEP(32, resultReg, colorIndexReg, R(resultReg));
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
MOV(32, R(resultReg), R(colorIndexReg));
AND(32, R(resultReg), Imm32(0x0000F800));
SHR(32, R(resultReg), Imm8(8));
MOV(32, R(color2Reg), R(colorIndexReg));
AND(32, R(color2Reg), Imm32(0x000007E0));
SHL(32, R(color2Reg), Imm8(5));
OR(32, R(resultReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000001F));
SHL(32, R(colorIndexReg), Imm8(16 + 3));
OR(32, R(resultReg), R(colorIndexReg));
}
FixupBranch finish565 = J(true);
SetJumpTarget(handleMix23);
CMP(32, R(colorIndexReg), Imm32(2));
FixupBranch skipSwap23 = J_CC(CC_E);
XCHG(32, R(color2Reg), R(color1Reg));
SetJumpTarget(skipSwap23);
if (cpu_info.bBMI2_fast) {
MOV(64, R(colorIndexReg), Imm64(0x00001F0003F0001FULL));
PDEP(64, color1Reg, color1Reg, R(colorIndexReg));
PDEP(64, color2Reg, color2Reg, R(colorIndexReg));
LEA(64, resultReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
IMUL(64, resultReg, R(resultReg), Imm32(0x00000AAB));
MOV(64, R(colorIndexReg), Imm64((0xFFULL << 10) | (0xFFULL << 31) | (0xFFULL << 50)));
PEXT(64, resultReg, resultReg, R(colorIndexReg));
BSWAP(32, resultReg);
SHR(32, R(resultReg), Imm8(8));
} else {
PUSH(R12);
PUSH(R13);
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x0000F800));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x0000F800));
LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
SHR(32, R(resultReg), Imm8(8));
IMUL(32, R12, R(resultReg), Imm32(0x0000AAAB));
SHR(32, R(R12), Imm8(17));
MOV(32, R(resultReg), R(color1Reg));
AND(32, R(resultReg), Imm32(0x000007E0));
MOV(32, R(colorIndexReg), R(color2Reg));
AND(32, R(colorIndexReg), Imm32(0x000007E0));
LEA(32, resultReg, MComplex(colorIndexReg, resultReg, SCALE_2, 0));
IMUL(32, resultReg, R(resultReg), Imm32(0x0000AAAB));
SHR(32, R(resultReg), Imm8(17 - 5));
AND(32, R(resultReg), Imm32(0x0000FF00));
OR(32, R(resultReg), R(R12));
AND(32, R(color1Reg), Imm32(0x0000001F));
AND(32, R(color2Reg), Imm32(0x0000001F));
LEA(32, colorIndexReg, MComplex(color2Reg, color1Reg, SCALE_2, 0));
IMUL(32, colorIndexReg, R(colorIndexReg), Imm32(0x0002AAAB));
AND(32, R(colorIndexReg), Imm32(0x00FF0000));
OR(32, R(resultReg), R(colorIndexReg));
POP(R13);
POP(R12);
}
regCache_.Release(colorIndexReg, RegCache::GEN_TEMP0);
regCache_.Release(color1Reg, RegCache::GEN_TEMP1);
regCache_.Release(color2Reg, RegCache::GEN_TEMP2);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
SetJumpTarget(finishMix50);
SetJumpTarget(finish565);
if (alpha != 0) {
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
OR(32, R(resultReg), Imm32(alpha << 24));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
}
SetJumpTarget(finishZero);
return true;
}
bool SamplerJitCache::Jit_ApplyDXTAlpha(const SamplerID &id) {
GETextureFormat fmt = id.TexFmt();
bool success = false;
if (fmt == GE_TFMT_DXT3) {
Describe("DXT3A");
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
if (id.linear) {
if (!cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
}
if (cpu_info.bBMI2) {
SHRX(64, srcReg, MDisp(srcReg, 8), vReg);
} else {
MOV(64, R(srcReg), MDisp(srcReg, 8));
MOV(32, R(RCX), R(vReg));
SHR(64, R(srcReg), R(CL));
}
SHL(32, R(srcReg), Imm8(28));
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
OR(32, R(resultReg), R(srcReg));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
success = true;
} else {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
if (uReg != RCX && !cpu_info.bBMI2) {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
}
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOVZX(32, 16, temp1Reg, MComplex(srcReg, vReg, SCALE_2, 8));
if (cpu_info.bBMI2) {
LEA(32, uReg, MScaled(uReg, SCALE_4, 0));
SHRX(32, temp1Reg, R(temp1Reg), uReg);
} else {
LEA(32, RCX, MScaled(uReg, SCALE_4, 0));
SHR(32, R(temp1Reg), R(CL));
}
SHL(32, R(temp1Reg), Imm8(28));
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
OR(32, R(resultReg), R(temp1Reg));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
success = true;
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRelease(RegCache::GEN_ARG_U);
}
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRelease(RegCache::GEN_ARG_V);
} else if (fmt == GE_TFMT_DXT5) {
Describe("DXT5A");
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
X64Reg alphaIndexReg = INVALID_REG;
if (id.linear) {
if (cpu_info.bBMI2) {
alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
SHRX(64, alphaIndexReg, MDisp(srcReg, 8), vReg);
} else {
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOV(64, R(alphaIndexReg), MDisp(srcReg, 8));
MOV(32, R(RCX), R(vReg));
SHR(64, R(alphaIndexReg), R(CL));
}
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRelease(RegCache::GEN_ARG_V);
} else {
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
if (uReg != RCX && !cpu_info.bBMI2)
regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
alphaIndexReg = regCache_.Alloc(RegCache::GEN_TEMP0);
LEA(32, uReg, MComplex(uReg, vReg, SCALE_4, 0));
LEA(32, uReg, MComplex(uReg, uReg, SCALE_2, 0));
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRelease(RegCache::GEN_ARG_V);
if (cpu_info.bBMI2) {
SHRX(64, alphaIndexReg, MDisp(srcReg, 8), uReg);
} else {
MOV(32, R(alphaIndexReg), R(uReg));
SHR(32, R(alphaIndexReg), Imm8(3));
AND(32, R(uReg), Imm32(7));
MOVZX(32, 16, alphaIndexReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 8));
if (uReg != RCX) {
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
MOV(32, R(RCX), R(uReg));
}
SHR(32, R(alphaIndexReg), R(CL));
}
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRelease(RegCache::GEN_ARG_U);
}
X64Reg alpha1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg alpha2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
AND(32, R(alphaIndexReg), Imm32(7));
X64Reg temp3Reg = regCache_.Alloc(RegCache::GEN_TEMP3);
CMP(32, R(alphaIndexReg), Imm32(1));
FixupBranch handleSimple = J_CC(CC_BE, true);
MOVZX(32, 8, alpha1Reg, MDisp(srcReg, 14));
MOVZX(32, 8, alpha2Reg, MDisp(srcReg, 15));
CMP(32, R(alpha1Reg), R(alpha2Reg));
FixupBranch handleLerp8 = J_CC(CC_A);
CMP(32, R(alphaIndexReg), Imm32(6));
FixupBranch finishZero = J_CC(CC_E, true);
MOV(32, R(srcReg), Imm32(0xFF));
FixupBranch finishFull = J_CC(CC_A, true);
SHL(32, R(alphaIndexReg), Imm8(8));
MOV(32, R(temp3Reg), Imm32(6 << 8));
SUB(32, R(temp3Reg), R(alphaIndexReg));
IMUL(32, alpha1Reg, R(temp3Reg));
SUB(32, R(alphaIndexReg), Imm32(1 << 8));
IMUL(32, alpha2Reg, R(alphaIndexReg));
LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 5 * 31));
IMUL(32, srcReg, Imm32(0x3334));
SHR(32, R(srcReg), Imm8(24));
FixupBranch finishLerp6 = J(true);
SetJumpTarget(handleLerp8);
SHL(32, R(alphaIndexReg), Imm8(8));
MOV(32, R(temp3Reg), Imm32(8 << 8));
SUB(32, R(temp3Reg), R(alphaIndexReg));
IMUL(32, alpha1Reg, R(temp3Reg));
SUB(32, R(alphaIndexReg), Imm32(1 << 8));
IMUL(32, alpha2Reg, R(alphaIndexReg));
LEA(32, srcReg, MComplex(alpha1Reg, alpha2Reg, SCALE_1, 7 * 31));
IMUL(32, alpha1Reg, R(srcReg), Imm32(0x124A));
SHR(32, R(alpha1Reg), Imm8(15));
SUB(32, R(srcReg), R(alpha1Reg));
SHR(32, R(srcReg), Imm8(1));
ADD(32, R(srcReg), R(alpha1Reg));
SHR(32, R(srcReg), Imm8(10));
FixupBranch finishLerp8 = J();
SetJumpTarget(handleSimple);
MOVZX(32, 8, srcReg, MComplex(srcReg, alphaIndexReg, SCALE_1, 14));
regCache_.Release(alphaIndexReg, RegCache::GEN_TEMP0);
regCache_.Release(alpha1Reg, RegCache::GEN_TEMP1);
regCache_.Release(alpha2Reg, RegCache::GEN_TEMP2);
regCache_.Release(temp3Reg, RegCache::GEN_TEMP3);
SetJumpTarget(finishFull);
SetJumpTarget(finishLerp6);
SetJumpTarget(finishLerp8);
SHL(32, R(srcReg), Imm8(24));
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
OR(32, R(resultReg), R(srcReg));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
success = true;
SetJumpTarget(finishZero);
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
}
_dbg_assert_(success);
return success;
}
bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
if (id.swizzle) {
return Jit_GetTexDataSwizzled(id, bitsPerTexel);
}
_assert_msg_(!id.linear, "Should not use this path for linear")
Describe("TexData");
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
bool success = true;
switch (bitsPerTexel) {
case 32:
case 16:
case 8:
LEA(64, temp1Reg, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
break;
case 4: {
if (cpu_info.bBMI2_fast)
MOV(32, R(temp2Reg), Imm32(0x0F));
else
XOR(32, R(temp2Reg), R(temp2Reg));
SHR(32, R(uReg), Imm8(1));
FixupBranch skip = J_CC(CC_NC);
if (cpu_info.bBMI2_fast)
SHL(32, R(temp2Reg), Imm8(4));
else
MOV(32, R(temp2Reg), Imm32(4));
SetJumpTarget(skip);
LEA(64, temp1Reg, MRegSum(srcReg, uReg));
break;
}
default:
success = false;
break;
}
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRelease(RegCache::GEN_ARG_U);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
MOV(32, R(resultReg), R(vReg));
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRelease(RegCache::GEN_ARG_V);
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
IMUL(32, resultReg, R(bufwReg));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
if (bitsPerTexel == 4 && !cpu_info.bBMI2) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_(hasRCX);
}
switch (bitsPerTexel) {
case 32:
case 16:
case 8:
MOVZX(32, bitsPerTexel, resultReg, MComplex(temp1Reg, resultReg, bitsPerTexel / 8, 0));
break;
case 4: {
SHR(32, R(resultReg), Imm8(1));
if (cpu_info.bBMI2_fast) {
MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
PEXT(32, resultReg, resultReg, R(temp2Reg));
} else if (cpu_info.bBMI2) {
SHRX(32, resultReg, MRegSum(temp1Reg, resultReg), temp2Reg);
AND(32, R(resultReg), Imm8(0x0F));
} else {
MOV(8, R(resultReg), MRegSum(temp1Reg, resultReg));
MOV(8, R(RCX), R(temp2Reg));
SHR(8, R(resultReg), R(RCX));
AND(32, R(resultReg), Imm8(0x0F));
}
break;
}
default:
success = false;
break;
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return success;
}
bool SamplerJitCache::Jit_GetTexDataSwizzled4(const SamplerID &id) {
Describe("TexDataS4");
_assert_msg_(!id.linear, "Should not use this path for linear")
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
LEA(32, temp1Reg, MScaled(uReg, SCALE_4, 0));
AND(32, R(temp1Reg), Imm8(~127));
LEA(32, temp2Reg, MScaled(vReg, SCALE_4, 0));
AND(32, R(temp2Reg), Imm8(31));
LEA(32, temp1Reg, MComplex(temp1Reg, temp2Reg, SCALE_4, 0));
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
ADD(64, R(temp1Reg), R(srcReg));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
SHR(32, R(vReg), Imm8(3));
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
LEA(32, temp2Reg, MScaled(bufwReg, SCALE_4, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
IMUL(32, temp2Reg, R(vReg));
ADD(64, R(temp1Reg), R(temp2Reg));
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRelease(RegCache::GEN_ARG_V);
AND(32, R(uReg), Imm8(31));
SHR(32, R(uReg), Imm8(1));
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
MOV(8, R(resultReg), MRegSum(temp1Reg, uReg));
FixupBranch skipNonZero = J_CC(CC_NC);
SHR(8, R(resultReg), Imm8(4));
SetJumpTarget(skipNonZero);
AND(32, R(resultReg), Imm8(0x0F));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRelease(RegCache::GEN_ARG_U);
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
return true;
}
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
if (bitsPerTexel == 4) {
return Jit_GetTexDataSwizzled4(id);
}
bool success = true;
_assert_msg_(!id.linear, "Should not use this path for linear")
Describe("TexDataS");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
X64Reg uReg = regCache_.Find(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Find(RegCache::GEN_ARG_V);
LEA(32, temp1Reg, MScaled(vReg, SCALE_4, 0));
AND(32, R(temp1Reg), Imm8(31));
AND(32, R(vReg), Imm8(~7));
MOV(32, R(temp2Reg), R(uReg));
MOV(32, R(resultReg), R(uReg));
switch (bitsPerTexel) {
case 32:
SHR(32, R(resultReg), Imm8(2));
break;
case 16:
SHR(32, R(vReg), Imm8(1));
SHR(32, R(temp2Reg), Imm8(1));
SHR(32, R(resultReg), Imm8(3));
break;
case 8:
SHR(32, R(vReg), Imm8(2));
SHR(32, R(temp2Reg), Imm8(2));
SHR(32, R(resultReg), Imm8(4));
break;
default:
success = false;
break;
}
AND(32, R(temp2Reg), Imm8(3));
SHL(32, R(resultReg), Imm8(5));
ADD(32, R(temp1Reg), R(temp2Reg));
ADD(32, R(temp1Reg), R(resultReg));
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
LEA(64, temp1Reg, MComplex(srcReg, temp1Reg, SCALE_4, 0));
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
LEA(32, resultReg, MScaled(bufwReg, SCALE_4, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
IMUL(32, resultReg, R(vReg));
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRelease(RegCache::GEN_ARG_V);
switch (bitsPerTexel) {
case 32:
MOV(bitsPerTexel, R(resultReg), MRegSum(temp1Reg, resultReg));
break;
case 16:
AND(32, R(uReg), Imm8(1));
LEA(32, resultReg, MComplex(resultReg, uReg, SCALE_2, 0));
MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
break;
case 8:
AND(32, R(uReg), Imm8(3));
ADD(32, R(resultReg), R(uReg));
MOVZX(32, bitsPerTexel, resultReg, MRegSum(temp1Reg, resultReg));
break;
default:
success = false;
break;
}
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.ForceRelease(RegCache::GEN_ARG_U);
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return success;
}
bool SamplerJitCache::Jit_GetTexelCoords(const SamplerID &id) {
Describe("Texel");
X64Reg uReg = regCache_.Alloc(RegCache::GEN_ARG_U);
X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
if (id.hasAnyMips) {
X64Reg idReg = GetSamplerID();
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
X64Reg levelReg = INVALID_REG;
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
} else {
levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
}
UNPCKLPS(sReg, R(tReg));
SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));
X64Reg sizesReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (cpu_info.bSSE4_1) {
PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
} else {
MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
X64Reg zeroReg = GetZeroVec();
PUNPCKLWD(sizesReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
PSLLD(sizesReg, 8);
CVTDQ2PS(sizesReg, R(sizesReg));
MULPS(sReg, R(sizesReg));
CVTTPS2DQ(sReg, R(sReg));
regCache_.Release(sizesReg, RegCache::VEC_TEMP0);
PSRAD(sReg, 8);
if (!cpu_info.bSSE4_1)
PSHUFD(tReg, R(sReg), _MM_SHUFFLE(3, 2, 3, 2));
auto applyClampWrap = [&](X64Reg dest, bool clamp, bool isY, bool isLevel1) {
int offset = offsetof(SamplerID, cached.sizes[0].w) + (isY ? 2 : 0) + (isLevel1 ? 4 : 0);
MOVZX(32, 16, tempReg, MComplex(idReg, levelReg, SCALE_4, offset));
if (cpu_info.bSSE4_1) {
if (isY || isLevel1)
PEXTRD(R(dest), sReg, (isY ? 1 : 0) + (isLevel1 ? 2 : 0));
else
MOVD_xmm(R(dest), sReg);
} else {
X64Reg srcReg = isLevel1 ? tReg : sReg;
MOVD_xmm(R(dest), srcReg);
if (!isY)
PSRLDQ(srcReg, 4);
}
SUB(32, R(tempReg), Imm8(1));
AND(32, R(tempReg), Imm32(0x000001FF));
if (clamp) {
CMP(32, R(dest), R(tempReg));
CMOVcc(32, dest, R(tempReg), CC_G);
XOR(32, R(tempReg), R(tempReg));
CMP(32, R(dest), R(tempReg));
CMOVcc(32, dest, R(tempReg), CC_L);
} else {
AND(32, R(dest), R(tempReg));
}
};
applyClampWrap(uReg, id.clampS, false, true);
applyClampWrap(vReg, id.clampT, true, true);
MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 0), R(uReg));
MOV(32, MDisp(RSP, stackArgPos_ + stackUV1Offset_ + 4), R(vReg));
applyClampWrap(uReg, id.clampS, false, false);
applyClampWrap(vReg, id.clampT, true, false);
UnlockSamplerID(idReg);
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
} else {
UNPCKLPS(sReg, R(tReg));
MULPS(sReg, M(constWidthHeight256f_));
CVTTPS2DQ(sReg, R(sReg));
PSRAD(sReg, 8);
bool clampApplied = false;
if (id.width0Shift == id.height0Shift) {
if (!id.clampS && !id.clampT) {
PAND(sReg, M(constWidthMinus1i_));
clampApplied = true;
} else if (id.clampS && id.clampT && cpu_info.bSSE4_1) {
X64Reg zeroReg = GetZeroVec();
PMINSD(sReg, M(constWidthMinus1i_));
PMAXSD(sReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
clampApplied = true;
}
}
MOVQ_xmm(R(uReg), sReg);
MOV(64, R(vReg), R(uReg));
SHR(64, R(vReg), Imm8(32));
AND(32, R(uReg), R(uReg));
auto applyClampWrap = [this](X64Reg dest, bool clamp, uint8_t shift) {
if (shift > 9)
shift = 9;
if (clamp) {
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
MOV(32, R(tempReg), Imm32((1 << shift) - 1));
CMP(32, R(dest), R(tempReg));
CMOVcc(32, dest, R(tempReg), CC_G);
XOR(32, R(tempReg), R(tempReg));
CMP(32, R(dest), R(tempReg));
CMOVcc(32, dest, R(tempReg), CC_L);
regCache_.Release(tempReg, RegCache::GEN_TEMP0);
} else {
AND(32, R(dest), Imm32((1 << shift) - 1));
}
};
if (!clampApplied) {
applyClampWrap(uReg, id.clampS, id.width0Shift);
applyClampWrap(vReg, id.clampT, id.height0Shift);
}
}
regCache_.Unlock(uReg, RegCache::GEN_ARG_U);
regCache_.Unlock(vReg, RegCache::GEN_ARG_V);
regCache_.ForceRetain(RegCache::GEN_ARG_U);
regCache_.ForceRetain(RegCache::GEN_ARG_V);
regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
regCache_.ForceRelease(RegCache::VEC_ARG_S);
regCache_.ForceRelease(RegCache::VEC_ARG_T);
return true;
}
bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
Describe("TexelQuad");
X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
X64Reg sizesReg = INVALID_REG;
if (id.hasAnyMips) {
X64Reg idReg = GetSamplerID();
X64Reg levelReg = INVALID_REG;
bool releaseLevelReg = !regCache_.Has(RegCache::GEN_ARG_LEVEL);
if (!releaseLevelReg) {
levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
} else {
releaseLevelReg = true;
levelReg = regCache_.Alloc(RegCache::GEN_ARG_LEVEL);
MOV(32, R(levelReg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
}
sizesReg = regCache_.Alloc(RegCache::VEC_TEMP5);
if (cpu_info.bSSE4_1) {
PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
} else {
MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
X64Reg zeroReg = GetZeroVec();
PUNPCKLWD(sizesReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
if (releaseLevelReg)
regCache_.Release(levelReg, RegCache::GEN_ARG_LEVEL);
else
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
UnlockSamplerID(idReg);
X64Reg sizes256Reg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSLLD(sizes256Reg, sizesReg, 8);
CVTDQ2PS(sizes256Reg, R(sizes256Reg));
UNPCKLPS(sReg, R(tReg));
SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));
MULPS(sReg, R(sizes256Reg));
regCache_.Release(sizes256Reg, RegCache::VEC_TEMP0);
PSUBD(sizesReg, M(constOnes32_));
PAND(sizesReg, M(constMaxTexel32_));
} else {
UNPCKLPS(sReg, R(tReg));
MULPS(sReg, M(constWidthHeight256f_));
}
CVTPS2DQ(sReg, R(sReg));
X64Reg tempXYReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PCMPEQD(tempXYReg, R(tempXYReg));
PSLLD(tempXYReg, 7);
PADDD(sReg, R(tempXYReg));
regCache_.Release(tempXYReg, RegCache::VEC_TEMP0);
X64Reg allFracReg = INVALID_REG;
if (regCache_.Has(RegCache::VEC_FRAC))
allFracReg = regCache_.Find(RegCache::VEC_FRAC);
else
allFracReg = regCache_.Alloc(RegCache::VEC_FRAC);
PSLLD(allFracReg, sReg, 24);
PSRLD(allFracReg, 28);
PACKSSDW(allFracReg, R(allFracReg));
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
regCache_.ForceRetain(RegCache::VEC_FRAC);
PSRAD(sReg, 8);
if (id.hasAnyMips) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
PSHUFD(u1Reg, R(sReg), _MM_SHUFFLE(2, 2, 2, 2));
PSHUFD(v1Reg, R(sReg), _MM_SHUFFLE(3, 3, 3, 3));
PADDD(u1Reg, M(constUNext_));
PADDD(v1Reg, M(constVNext_));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
PSHUFD(tReg, R(sReg), _MM_SHUFFLE(1, 1, 1, 1));
PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
PADDD(tReg, M(constVNext_));
PADDD(sReg, M(constUNext_));
X64Reg temp0ClampReg = regCache_.Alloc(RegCache::VEC_TEMP0);
bool temp0ClampZero = false;
auto doClamp = [&](bool clamp, X64Reg stReg, const OpArg &bound) {
if (!clamp) {
PAND(stReg, bound);
return;
}
if (!temp0ClampZero)
PXOR(temp0ClampReg, R(temp0ClampReg));
temp0ClampZero = true;
if (cpu_info.bSSE4_1) {
PMINSD(stReg, bound);
PMAXSD(stReg, R(temp0ClampReg));
} else {
temp0ClampZero = false;
PCMPGTD(temp0ClampReg, R(stReg));
PANDN(temp0ClampReg, R(stReg));
if (cpu_info.bAVX && bound.IsSimpleReg()) {
VPCMPGTD(128, stReg, bound.GetSimpleReg(), R(temp0ClampReg));
} else {
MOVDQA(stReg, bound);
PCMPGTD(stReg, R(temp0ClampReg));
}
PAND(temp0ClampReg, R(stReg));
PANDN(stReg, bound);
POR(stReg, R(temp0ClampReg));
}
};
if (id.hasAnyMips) {
X64Reg spreadSizeReg = regCache_.Alloc(RegCache::VEC_TEMP1);
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(0, 0, 0, 0));
doClamp(id.clampS, sReg, R(spreadSizeReg));
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(1, 1, 1, 1));
doClamp(id.clampT, tReg, R(spreadSizeReg));
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(2, 2, 2, 2));
doClamp(id.clampS, u1Reg, R(spreadSizeReg));
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(3, 3, 3, 3));
doClamp(id.clampT, v1Reg, R(spreadSizeReg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
regCache_.Release(spreadSizeReg, RegCache::VEC_TEMP1);
} else {
doClamp(id.clampS, sReg, M(constWidthMinus1i_));
doClamp(id.clampT, tReg, M(constHeightMinus1i_));
}
if (sizesReg != INVALID_REG)
regCache_.Release(sizesReg, RegCache::VEC_TEMP5);
regCache_.Release(temp0ClampReg, RegCache::VEC_TEMP0);
regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
regCache_.Unlock(tReg, RegCache::VEC_ARG_T);
regCache_.Change(RegCache::VEC_ARG_S, RegCache::VEC_ARG_U);
regCache_.Change(RegCache::VEC_ARG_T, RegCache::VEC_ARG_V);
return true;
}
bool SamplerJitCache::Jit_PrepareDataOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1) {
_assert_(id.linear);
bool success = true;
int bits = 0;
switch (id.TexFmt()) {
case GE_TFMT_5650:
case GE_TFMT_5551:
case GE_TFMT_4444:
case GE_TFMT_CLUT16:
bits = 16;
break;
case GE_TFMT_8888:
case GE_TFMT_CLUT32:
bits = 32;
break;
case GE_TFMT_CLUT8:
bits = 8;
break;
case GE_TFMT_CLUT4:
bits = 4;
break;
case GE_TFMT_DXT1:
bits = -8;
break;
case GE_TFMT_DXT3:
case GE_TFMT_DXT5:
bits = -16;
break;
default:
success = false;
}
if (success && bits != 0) {
if (bits < 0) {
success = Jit_PrepareDataDXTOffsets(id, uReg, vReg, level1, -bits);
} else if (id.swizzle) {
success = Jit_PrepareDataSwizzledOffsets(id, uReg, vReg, level1, bits);
} else {
success = Jit_PrepareDataDirectOffsets(id, uReg, vReg, level1, bits);
}
}
return success;
}
bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {
Describe("DataOff");
X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!id.useStandardBufw || id.hasAnyMips) {
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bSSE4_1) {
PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
} else {
PXOR(bufwVecReg, R(bufwVecReg));
PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
}
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
if (bitsPerTexel == 4)
PSRLD(bufwVecReg, 1);
else if (bitsPerTexel == 16)
PSLLD(bufwVecReg, 1);
else if (bitsPerTexel == 32)
PSLLD(bufwVecReg, 2);
}
if (id.useStandardBufw && !id.hasAnyMips) {
int amt = id.width0Shift;
if (bitsPerTexel == 4)
amt -= 1;
else if (bitsPerTexel == 16)
amt += 1;
else if (bitsPerTexel == 32)
amt += 2;
PSLLD(vReg, std::max(4, amt));
} else if (cpu_info.bSSE4_1) {
PMULLD(vReg, R(bufwVecReg));
} else {
X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(vOddLaneReg, R(vReg));
PMULUDQ(vReg, R(bufwVecReg));
PSRLDQ(vOddLaneReg, 4);
PSRLDQ(bufwVecReg, 4);
PMULUDQ(vOddLaneReg, R(bufwVecReg));
PSLLDQ(vOddLaneReg, 4);
POR(vReg, R(vOddLaneReg));
regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
}
regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);
if (bitsPerTexel == 4) {
X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVDQA(uCopyReg, R(uReg));
PSRLD(uCopyReg, 1);
PADDD(vReg, R(uCopyReg));
regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);
} else {
if (bitsPerTexel == 16)
PSLLD(uReg, 1);
else if (bitsPerTexel == 32)
PSLLD(uReg, 2);
PADDD(vReg, R(uReg));
}
return true;
}
bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCache::Reg uReg, RegCache::Reg vReg, bool level1, int bitsPerTexel) {
Describe("DataOffS");
X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!id.useStandardBufw || id.hasAnyMips) {
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bSSE4_1) {
PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
} else {
PXOR(bufwVecReg, R(bufwVecReg));
PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
}
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
}
X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);
PSRLD(vMultReg, vReg, 3);
int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);
if (id.useStandardBufw && !id.hasAnyMips) {
int amt = id.width0Shift;
amt = std::max(7 - shiftAmount, amt);
shiftAmount += amt;
} else if (cpu_info.bSSE4_1) {
PMULLD(vMultReg, R(bufwVecReg));
} else {
X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP2);
MOVDQA(vOddLaneReg, R(vMultReg));
PMULUDQ(vMultReg, R(bufwVecReg));
PSRLDQ(vOddLaneReg, 4);
PSRLDQ(bufwVecReg, 4);
PMULUDQ(vOddLaneReg, R(bufwVecReg));
PSLLDQ(vOddLaneReg, 4);
POR(vMultReg, R(vOddLaneReg));
regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP2);
}
regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);
PSLLD(vMultReg, shiftAmount);
PSLLW(vReg, 13);
PSRLD(vReg, 9);
PADDD(vReg, R(vMultReg));
regCache_.Release(vMultReg, RegCache::VEC_TEMP1);
X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSRLD(uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
PSLLD(uCopyReg, 7);
PADDD(vReg, R(uCopyReg));
if (bitsPerTexel == 4) {
PSLLW(uCopyReg, uReg, 11);
PSRLD(uCopyReg, 12);
PADDD(vReg, R(uCopyReg));
} else {
PSLLW(uReg, 32 - clz32_nonzero(bitsPerTexel - 1) + 9);
PSRLD(uReg, 12);
PADDD(vReg, R(uReg));
}
regCache_.Release(uCopyReg, RegCache::VEC_TEMP0);
return true;
}
bool SamplerJitCache::Jit_PrepareDataDXTOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, bool level1, int blockSize) {
Describe("DataOffDXT");
X64Reg baseVReg = regCache_.Find(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
PSRLD(baseVReg, vReg, 2);
PSLLD(baseVReg, blockSize == 16 ? 4 : 3);
X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!id.useStandardBufw || id.hasAnyMips) {
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bSSE4_1) {
PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
} else {
PXOR(bufwVecReg, R(bufwVecReg));
PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
}
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
PSRLD(bufwVecReg, 2);
}
if (id.useStandardBufw && !id.hasAnyMips) {
int amt = id.width0Shift - 2;
if (amt < 0)
PSRLD(baseVReg, -amt);
else if (amt > 0)
PSLLD(baseVReg, amt);
} else if (cpu_info.bSSE4_1) {
PMULLD(baseVReg, R(bufwVecReg));
} else {
X64Reg vOddLaneReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(vOddLaneReg, R(baseVReg));
PMULUDQ(baseVReg, R(bufwVecReg));
PSRLDQ(vOddLaneReg, 4);
PSRLDQ(bufwVecReg, 4);
PMULUDQ(vOddLaneReg, R(bufwVecReg));
PSLLDQ(vOddLaneReg, 4);
POR(baseVReg, R(vOddLaneReg));
regCache_.Release(vOddLaneReg, RegCache::VEC_TEMP1);
}
regCache_.Release(bufwVecReg, RegCache::VEC_TEMP0);
X64Reg baseUReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSRLD(baseUReg, uReg, 2);
PSLLD(baseUReg, blockSize == 16 ? 4 : 3);
PADDD(baseVReg, R(baseUReg));
regCache_.Release(baseUReg, RegCache::VEC_TEMP0);
regCache_.Unlock(baseVReg, level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
regCache_.ForceRetain(level1 ? RegCache::VEC_INDEX1 : RegCache::VEC_INDEX);
PSLLD(uReg, 30);
PSLLD(vReg, 30);
X64Reg alphaTempRegU = regCache_.Alloc(RegCache::VEC_TEMP0);
if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5)
PSRLD(alphaTempRegU, uReg, 30);
PSRLD(uReg, 30 - 1);
PSRLD(vReg, 30 - 3);
PADDD(uReg, R(vReg));
if (id.TexFmt() == GE_TFMT_DXT3 || id.TexFmt() == GE_TFMT_DXT5) {
PSRLD(vReg, 1);
PADDD(vReg, R(alphaTempRegU));
if (id.TexFmt() == GE_TFMT_DXT3) {
PSLLD(vReg, 2);
} else if (id.TexFmt() == GE_TFMT_DXT5) {
PSLLD(alphaTempRegU, vReg, 1);
PADDD(vReg, R(alphaTempRegU));
}
}
regCache_.Release(alphaTempRegU, RegCache::VEC_TEMP0);
return true;
}
bool SamplerJitCache::Jit_DecodeQuad(const SamplerID &id, bool level1) {
GETextureFormat decodeFmt = id.TexFmt();
switch (id.TexFmt()) {
case GE_TFMT_CLUT32:
case GE_TFMT_CLUT16:
case GE_TFMT_CLUT8:
case GE_TFMT_CLUT4:
decodeFmt = (GETextureFormat)id.ClutFmt();
break;
default:
break;
}
bool success = true;
X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
switch (decodeFmt) {
case GE_TFMT_5650:
success = Jit_Decode5650Quad(id, quadReg);
break;
case GE_TFMT_5551:
success = Jit_Decode5551Quad(id, quadReg);
break;
case GE_TFMT_4444:
success = Jit_Decode4444Quad(id, quadReg);
break;
default:
break;
}
regCache_.Unlock(quadReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
return success;
}
bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
Describe("5650Quad");
X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
PSLLD(temp1Reg, quadReg, 32 - 5);
PSRLD(temp1Reg, 24);
PSRLD(temp2Reg, quadReg, 11);
PSLLD(temp2Reg, 19);
POR(temp1Reg, R(temp2Reg));
PSLLD(temp2Reg, temp1Reg, 1);
PSRLD(quadReg, 5);
PSLLW(quadReg, 10);
POR(temp2Reg, R(quadReg));
POR(quadReg, R(temp1Reg));
PSRLD(temp2Reg, 6);
PAND(temp2Reg, M(const5650Swizzle_));
POR(quadReg, R(temp2Reg));
if (id.useTextureAlpha) {
PCMPEQD(temp2Reg, R(temp2Reg));
PSLLD(temp2Reg, 24);
POR(quadReg, R(temp2Reg));
}
regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
return true;
}
bool SamplerJitCache::Jit_Decode5650(const SamplerID &id) {
Describe("5650");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bBMI2_fast) {
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
PDEP(32, temp1Reg, resultReg, R(temp1Reg));
if (id.useTextureAlpha || id.fetch)
OR(32, R(temp1Reg), Imm32(0xFF000000));
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
PEXT(32, resultReg, resultReg, R(temp2Reg));
MOV(32, R(temp2Reg), Imm32(0x00070307));
PDEP(32, resultReg, resultReg, R(temp2Reg));
OR(32, R(resultReg), R(temp1Reg));
} else {
MOV(32, R(temp2Reg), R(resultReg));
AND(32, R(temp2Reg), Imm32(0x0000001F));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp1Reg), Imm32(0x0000F800));
SHL(32, R(temp1Reg), Imm8(5));
OR(32, R(temp2Reg), R(temp1Reg));
MOV(32, R(temp1Reg), R(temp2Reg));
SHL(32, R(temp2Reg), Imm8(3));
SHR(32, R(temp1Reg), Imm8(2));
OR(32, R(temp2Reg), R(temp1Reg));
AND(32, R(temp2Reg), Imm32(0x00FF00FF));
if (id.useTextureAlpha || id.fetch)
OR(32, R(temp2Reg), Imm32(0xFF000000));
SHL(32, R(resultReg), Imm8(3 + 2));
AND(32, R(resultReg), Imm32(0x0000FC00));
MOV(32, R(temp1Reg), R(resultReg));
SHR(32, R(temp1Reg), Imm8(2 + 4));
OR(32, R(resultReg), R(temp1Reg));
AND(32, R(resultReg), Imm32(0x0000FF00));
OR(32, R(resultReg), R(temp2Reg));
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return true;
}
bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
Describe("5551Quad");
X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
PSLLD(temp1Reg, quadReg, 32 - 5);
PSRLD(temp1Reg, 24);
PSRLD(temp2Reg, quadReg, 5);
PSLLW(temp2Reg, 11);
POR(temp1Reg, R(temp2Reg));
PSRAW(quadReg, 10);
PSLLD(quadReg, 19);
POR(quadReg, R(temp1Reg));
PSRLD(temp1Reg, quadReg, 5);
PAND(temp1Reg, M(const5551Swizzle_));
POR(quadReg, R(temp1Reg));
regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
return true;
}
bool SamplerJitCache::Jit_Decode5551(const SamplerID &id) {
Describe("5551");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (cpu_info.bBMI2_fast) {
bool keepAlpha = id.useTextureAlpha || id.fetch;
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
PDEP(32, resultReg, resultReg, R(temp1Reg));
MOV(32, R(temp2Reg), R(resultReg));
SHR(32, R(temp2Reg), Imm8(5));
AND(32, R(temp2Reg), Imm32(0x00070707));
if (keepAlpha) {
SHL(32, R(resultReg), Imm8(7));
SAR(32, R(resultReg), Imm8(7));
}
OR(32, R(resultReg), R(temp2Reg));
} else {
MOV(32, R(temp2Reg), R(resultReg));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp2Reg), Imm32(0x0000001F));
AND(32, R(temp1Reg), Imm32(0x000003E0));
SHL(32, R(temp1Reg), Imm8(3));
OR(32, R(temp2Reg), R(temp1Reg));
MOV(32, R(temp1Reg), R(resultReg));
AND(32, R(temp1Reg), Imm32(0x00007C00));
SHL(32, R(temp1Reg), Imm8(6));
OR(32, R(temp2Reg), R(temp1Reg));
MOV(32, R(temp1Reg), R(temp2Reg));
SHL(32, R(temp2Reg), Imm8(3));
SHR(32, R(temp1Reg), Imm8(2));
AND(32, R(temp1Reg), Imm32(0x00070707));
OR(32, R(temp2Reg), R(temp1Reg));
if (id.useTextureAlpha || id.fetch) {
SAR(16, R(resultReg), Imm8(15));
SHL(32, R(resultReg), Imm8(24));
OR(32, R(resultReg), R(temp2Reg));
} else {
MOV(32, R(resultReg), R(temp2Reg));
}
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return true;
}
bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCache::Reg quadReg) {
Describe("4444Quad");
X64Reg temp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
PSLLD(temp1Reg, quadReg, 28);
PSRLD(temp1Reg, 24);
PSRLD(temp2Reg, quadReg, 4);
PSLLW(temp2Reg, 12);
POR(temp1Reg, R(temp2Reg));
PSRLD(temp2Reg, quadReg, 8);
PSLLD(temp2Reg, 28);
PSRLD(temp2Reg, 8);
POR(temp1Reg, R(temp2Reg));
if (id.useTextureAlpha) {
PSRLW(quadReg, 12);
PSLLD(quadReg, 28);
POR(quadReg, R(temp1Reg));
PSRLD(temp1Reg, quadReg, 4);
POR(quadReg, R(temp1Reg));
} else {
PSRLD(quadReg, temp1Reg, 4);
POR(quadReg, R(temp1Reg));
}
regCache_.Release(temp1Reg, RegCache::VEC_TEMP1);
regCache_.Release(temp2Reg, RegCache::VEC_TEMP2);
return true;
}
alignas(16) static const u32 color4444mask[4] = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
bool SamplerJitCache::Jit_Decode4444(const SamplerID &id) {
Describe("4444");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
if (cpu_info.bBMI2_fast) {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(32, R(temp1Reg), Imm32(0xF0F0F0F0));
PDEP(32, resultReg, resultReg, R(temp1Reg));
MOV(32, R(temp1Reg), R(resultReg));
SHR(32, R(temp1Reg), Imm8(4));
OR(32, R(resultReg), R(temp1Reg));
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
} else {
X64Reg vecTemp1Reg = regCache_.Alloc(RegCache::VEC_TEMP1);
X64Reg vecTemp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
X64Reg vecTemp3Reg = regCache_.Alloc(RegCache::VEC_TEMP3);
MOVD_xmm(vecTemp1Reg, R(resultReg));
PUNPCKLBW(vecTemp1Reg, R(vecTemp1Reg));
if (RipAccessible(color4444mask)) {
PAND(vecTemp1Reg, M(color4444mask));
} else {
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(temp1Reg), ImmPtr(color4444mask));
PAND(vecTemp1Reg, MatR(temp1Reg));
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
}
MOVSS(vecTemp2Reg, R(vecTemp1Reg));
MOVSS(vecTemp3Reg, R(vecTemp1Reg));
PSRLW(vecTemp2Reg, 4);
PSLLW(vecTemp3Reg, 4);
POR(vecTemp1Reg, R(vecTemp2Reg));
POR(vecTemp1Reg, R(vecTemp3Reg));
MOVD_xmm(R(resultReg), vecTemp1Reg);
regCache_.Release(vecTemp1Reg, RegCache::VEC_TEMP1);
regCache_.Release(vecTemp2Reg, RegCache::VEC_TEMP2);
regCache_.Release(vecTemp3Reg, RegCache::VEC_TEMP3);
}
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return true;
}
bool SamplerJitCache::Jit_TransformClutIndex(const SamplerID &id, int bitsPerIndex) {
Describe("TrCLUT");
GEPaletteFormat fmt = id.ClutFmt();
if (!id.hasClutShift && !id.hasClutMask && !id.hasClutOffset) {
if (bitsPerIndex > 8) {
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
AND(32, R(resultReg), Imm32(0x000000FF));
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
}
return true;
}
if (!cpu_info.bBMI2) {
bool hasRCX = regCache_.ChangeReg(RCX, RegCache::GEN_SHIFTVAL);
_assert_msg_(hasRCX, "Could not obtain RCX, locked?");
}
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
X64Reg idReg = GetSamplerID();
MOV(32, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clutFormat)));
UnlockSamplerID(idReg);
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
int shiftedToSoFar = 0;
if (id.hasClutShift) {
SHR(32, R(temp1Reg), Imm8(2 - shiftedToSoFar));
shiftedToSoFar = 2;
if (cpu_info.bBMI2) {
SHRX(32, resultReg, R(resultReg), temp1Reg);
} else {
_assert_(regCache_.Has(RegCache::GEN_SHIFTVAL));
MOV(32, R(RCX), R(temp1Reg));
SHR(32, R(resultReg), R(RCX));
}
}
if (id.hasClutMask) {
SHR(32, R(temp1Reg), Imm8(8 - shiftedToSoFar));
shiftedToSoFar = 8;
AND(32, R(resultReg), R(temp1Reg));
}
u32 offsetMask = fmt == GE_CMODE_32BIT_ABGR8888 ? 0x00FF : 0x01FF;
if (bitsPerIndex > 8 && (!id.hasClutOffset || offsetMask != 0x00FF)) {
AND(32, R(resultReg), Imm32(0x000000FF));
}
if (id.hasClutOffset) {
SHR(32, R(temp1Reg), Imm8(16 - shiftedToSoFar));
SHL(32, R(temp1Reg), Imm8(4));
OR(32, R(resultReg), R(temp1Reg));
AND(32, R(resultReg), Imm32(offsetMask));
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
return true;
}
bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
Describe("ReadCLUT");
X64Reg resultReg = regCache_.Find(RegCache::GEN_RESULT);
_assert_msg_(!id.linear, "Should not use this path for linear");
if (!id.useSharedClut) {
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
LEA(32, temp2Reg, MScaled(levelReg, SCALE_4, 0));
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
if (id.fetch)
regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
} else {
_assert_(stackLevelOffset_ != -1);
MOV(32, R(temp2Reg), MDisp(RSP, stackArgPos_ + stackLevelOffset_));
LEA(32, temp2Reg, MScaled(temp2Reg, SCALE_4, 0));
}
LEA(64, resultReg, MComplex(resultReg, temp2Reg, SCALE_4, 0));
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
}
X64Reg idReg = GetSamplerID();
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
MOV(PTRBITS, R(temp1Reg), MDisp(idReg, offsetof(SamplerID, cached.clut)));
UnlockSamplerID(idReg);
switch (id.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650:
case GE_CMODE_16BIT_ABGR5551:
case GE_CMODE_16BIT_ABGR4444:
MOVZX(32, 16, resultReg, MComplex(temp1Reg, resultReg, SCALE_2, 0));
break;
case GE_CMODE_32BIT_ABGR8888:
MOV(32, R(resultReg), MComplex(temp1Reg, resultReg, SCALE_4, 0));
break;
}
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
regCache_.Unlock(resultReg, RegCache::GEN_RESULT);
switch (id.ClutFmt()) {
case GE_CMODE_16BIT_BGR5650:
return Jit_Decode5650(id);
case GE_CMODE_16BIT_ABGR5551:
return Jit_Decode5551(id);
case GE_CMODE_16BIT_ABGR4444:
return Jit_Decode4444(id);
case GE_CMODE_32BIT_ABGR8888:
return true;
default:
return false;
}
}
};
#endif