// Path: blob/master/Common/GPU/Vulkan/VulkanQueueRunner.cpp
// (3187 views)
#include <unordered_map>12#include "Common/GPU/DataFormat.h"3#include "Common/GPU/Vulkan/VulkanQueueRunner.h"4#include "Common/GPU/Vulkan/VulkanRenderManager.h"5#include "Common/Log.h"6#include "Common/TimeUtil.h"78using namespace PPSSPP_VK;910// Debug help: adb logcat -s DEBUG AndroidRuntime PPSSPPNativeActivity PPSSPP NativeGLView NativeRenderer NativeSurfaceView PowerSaveModeReceiver InputDeviceState PpssppActivity CameraHelper1112static void MergeRenderAreaRectInto(VkRect2D *dest, const VkRect2D &src) {13if (dest->offset.x > src.offset.x) {14dest->extent.width += (dest->offset.x - src.offset.x);15dest->offset.x = src.offset.x;16}17if (dest->offset.y > src.offset.y) {18dest->extent.height += (dest->offset.y - src.offset.y);19dest->offset.y = src.offset.y;20}21if (dest->offset.x + dest->extent.width < src.offset.x + src.extent.width) {22dest->extent.width = src.offset.x + src.extent.width - dest->offset.x;23}24if (dest->offset.y + dest->extent.height < src.offset.y + src.extent.height) {25dest->extent.height = src.offset.y + src.extent.height - dest->offset.y;26}27}2829// We need to take the "max" of the features used in the two render passes.30RenderPassType MergeRPTypes(RenderPassType a, RenderPassType b) {31// Either both are backbuffer type, or neither are.32// These can't merge with other renderpasses33if (a == RenderPassType::BACKBUFFER || b == RenderPassType::BACKBUFFER) {34_dbg_assert_(a == b);35return a;36}3738_dbg_assert_((a & RenderPassType::MULTIVIEW) == (b & RenderPassType::MULTIVIEW));3940// The rest we can just OR together to get the maximum feature set.41return (RenderPassType)((u32)a | (u32)b);42}4344void VulkanQueueRunner::CreateDeviceObjects() {45INFO_LOG(Log::G3D, "VulkanQueueRunner::CreateDeviceObjects");4647RPKey key{48VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,49VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, 
VKRRenderPassStoreAction::DONT_CARE,50};51compatibleRenderPass_ = GetRenderPass(key);5253#if 054// Just to check whether it makes sense to split some of these. drawidx is way bigger than the others...55// We should probably just move to variable-size data in a raw buffer anyway...56VkRenderData rd;57INFO_LOG(Log::G3D, "sizeof(pipeline): %d", (int)sizeof(rd.pipeline));58INFO_LOG(Log::G3D, "sizeof(draw): %d", (int)sizeof(rd.draw));59INFO_LOG(Log::G3D, "sizeof(drawidx): %d", (int)sizeof(rd.drawIndexed));60INFO_LOG(Log::G3D, "sizeof(clear): %d", (int)sizeof(rd.clear));61INFO_LOG(Log::G3D, "sizeof(viewport): %d", (int)sizeof(rd.viewport));62INFO_LOG(Log::G3D, "sizeof(scissor): %d", (int)sizeof(rd.scissor));63INFO_LOG(Log::G3D, "sizeof(blendColor): %d", (int)sizeof(rd.blendColor));64INFO_LOG(Log::G3D, "sizeof(push): %d", (int)sizeof(rd.push));65#endif66}6768void VulkanQueueRunner::DestroyDeviceObjects() {69INFO_LOG(Log::G3D, "VulkanQueueRunner::DestroyDeviceObjects");7071syncReadback_.Destroy(vulkan_);7273renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {74_dbg_assert_(rp);75rp->Destroy(vulkan_);76delete rp;77});78renderPasses_.Clear();79}8081bool VulkanQueueRunner::InitBackbufferFramebuffers(int width, int height, FrameDataShared &frameDataShared) {82VkResult res;83// We share the same depth buffer but have multiple color buffers, see the loop below.84VkImageView attachments[2] = { VK_NULL_HANDLE, depth_.view };8586VkFramebufferCreateInfo fb_info = { VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };87fb_info.renderPass = GetCompatibleRenderPass()->Get(vulkan_, RenderPassType::BACKBUFFER, VK_SAMPLE_COUNT_1_BIT);88fb_info.attachmentCount = 2;89fb_info.pAttachments = attachments;90fb_info.width = width;91fb_info.height = height;92fb_info.layers = 1;9394framebuffers_.resize(frameDataShared.swapchainImageCount_);9596for (uint32_t i = 0; i < frameDataShared.swapchainImageCount_; i++) {97attachments[0] = frameDataShared.swapchainImages_[i].view;98res = 
vkCreateFramebuffer(vulkan_->GetDevice(), &fb_info, nullptr, &framebuffers_[i]);99_dbg_assert_(res == VK_SUCCESS);100if (res != VK_SUCCESS) {101framebuffers_.clear();102return false;103}104}105106return true;107}108109bool VulkanQueueRunner::InitDepthStencilBuffer(VkCommandBuffer cmd, VulkanBarrierBatch *barriers) {110const VkFormat depth_format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat;111int aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;112VkImageCreateInfo image_info = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };113image_info.imageType = VK_IMAGE_TYPE_2D;114image_info.format = depth_format;115image_info.extent.width = vulkan_->GetBackbufferWidth();116image_info.extent.height = vulkan_->GetBackbufferHeight();117image_info.extent.depth = 1;118image_info.mipLevels = 1;119image_info.arrayLayers = 1;120image_info.samples = VK_SAMPLE_COUNT_1_BIT;121image_info.queueFamilyIndexCount = 0;122image_info.pQueueFamilyIndices = nullptr;123image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;124image_info.usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT;125image_info.flags = 0;126127depth_.format = depth_format;128129VmaAllocationCreateInfo allocCreateInfo{};130VmaAllocationInfo allocInfo{};131132allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;133134VkResult res = vmaCreateImage(vulkan_->Allocator(), &image_info, &allocCreateInfo, &depth_.image, &depth_.alloc, &allocInfo);135_dbg_assert_(res == VK_SUCCESS);136if (res != VK_SUCCESS)137return false;138139vulkan_->SetDebugName(depth_.image, VK_OBJECT_TYPE_IMAGE, "BackbufferDepth");140141VkImageMemoryBarrier *barrier = barriers->Add(depth_.image,142VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,143VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, 0);144barrier->subresourceRange.aspectMask = aspectMask;145barrier->oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;146barrier->newLayout = 
VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;147barrier->srcAccessMask = 0;148barrier->dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;149150VkImageViewCreateInfo depth_view_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO };151depth_view_info.image = depth_.image;152depth_view_info.format = depth_format;153depth_view_info.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;154depth_view_info.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;155depth_view_info.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;156depth_view_info.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;157depth_view_info.subresourceRange.aspectMask = aspectMask;158depth_view_info.subresourceRange.baseMipLevel = 0;159depth_view_info.subresourceRange.levelCount = 1;160depth_view_info.subresourceRange.baseArrayLayer = 0;161depth_view_info.subresourceRange.layerCount = 1;162depth_view_info.viewType = VK_IMAGE_VIEW_TYPE_2D;163depth_view_info.flags = 0;164165VkDevice device = vulkan_->GetDevice();166167res = vkCreateImageView(device, &depth_view_info, NULL, &depth_.view);168vulkan_->SetDebugName(depth_.view, VK_OBJECT_TYPE_IMAGE_VIEW, "depth_stencil_backbuffer");169_dbg_assert_(res == VK_SUCCESS);170if (res != VK_SUCCESS)171return false;172173return true;174}175176177void VulkanQueueRunner::DestroyBackBuffers() {178if (depth_.view) {179vulkan_->Delete().QueueDeleteImageView(depth_.view);180}181if (depth_.image) {182_dbg_assert_(depth_.alloc);183vulkan_->Delete().QueueDeleteImageAllocation(depth_.image, depth_.alloc);184}185depth_ = {};186for (uint32_t i = 0; i < framebuffers_.size(); i++) {187_dbg_assert_(framebuffers_[i] != VK_NULL_HANDLE);188vulkan_->Delete().QueueDeleteFramebuffer(framebuffers_[i]);189}190framebuffers_.clear();191192INFO_LOG(Log::G3D, "Backbuffers destroyed");193}194195// Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827196// Also see 
https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies197VKRRenderPass *VulkanQueueRunner::GetRenderPass(const RPKey &key) {198VKRRenderPass *foundPass;199if (renderPasses_.Get(key, &foundPass)) {200return foundPass;201}202203VKRRenderPass *pass = new VKRRenderPass(key);204renderPasses_.Insert(key, pass);205return pass;206}207208void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {209// Optimizes renderpasses, then sequences them.210// Planned optimizations:211// * Create copies of render target that are rendered to multiple times and textured from in sequence, and push those render passes212// as early as possible in the frame (Wipeout billboards). This will require taking over more of descriptor management so we can213// substitute descriptors, alternatively using texture array layers creatively.214215for (int j = 0; j < (int)steps.size(); j++) {216if (steps[j]->stepType == VKRStepType::RENDER &&217steps[j]->render.framebuffer) {218if (steps[j]->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {219steps[j]->render.finalColorLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;220}221if (steps[j]->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {222steps[j]->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;223}224}225}226227for (int j = 0; j < (int)steps.size() - 1; j++) {228// Push down empty "Clear/Store" renderpasses, and merge them with the first "Load/Store" to the same framebuffer.229if (steps.size() > 1 && steps[j]->stepType == VKRStepType::RENDER &&230steps[j]->render.numDraws == 0 &&231steps[j]->render.numReads == 0 &&232steps[j]->render.colorLoad == VKRRenderPassLoadAction::CLEAR &&233steps[j]->render.stencilLoad == VKRRenderPassLoadAction::CLEAR &&234steps[j]->render.depthLoad == VKRRenderPassLoadAction::CLEAR) {235236// Drop the clear step, and merge it into the next step that touches the same 
framebuffer.237for (int i = j + 1; i < (int)steps.size(); i++) {238if (steps[i]->stepType == VKRStepType::RENDER &&239steps[i]->render.framebuffer == steps[j]->render.framebuffer) {240if (steps[i]->render.colorLoad != VKRRenderPassLoadAction::CLEAR) {241steps[i]->render.colorLoad = VKRRenderPassLoadAction::CLEAR;242steps[i]->render.clearColor = steps[j]->render.clearColor;243}244if (steps[i]->render.depthLoad != VKRRenderPassLoadAction::CLEAR) {245steps[i]->render.depthLoad = VKRRenderPassLoadAction::CLEAR;246steps[i]->render.clearDepth = steps[j]->render.clearDepth;247}248if (steps[i]->render.stencilLoad != VKRRenderPassLoadAction::CLEAR) {249steps[i]->render.stencilLoad = VKRRenderPassLoadAction::CLEAR;250steps[i]->render.clearStencil = steps[j]->render.clearStencil;251}252MergeRenderAreaRectInto(&steps[i]->render.renderArea, steps[j]->render.renderArea);253steps[i]->render.renderPassType = MergeRPTypes(steps[i]->render.renderPassType, steps[j]->render.renderPassType);254steps[i]->render.numDraws += steps[j]->render.numDraws;255steps[i]->render.numReads += steps[j]->render.numReads;256// Cheaply skip the first step.257steps[j]->stepType = VKRStepType::RENDER_SKIP;258break;259} else if (steps[i]->stepType == VKRStepType::COPY &&260steps[i]->copy.src == steps[j]->render.framebuffer) {261// Can't eliminate the clear if a game copies from it before it's262// rendered to. 
However this should be rare.263// TODO: This should never happen when we check numReads now.264break;265}266}267}268}269270// Queue hacks.271if (hacksEnabled_) {272if (hacksEnabled_ & QUEUE_HACK_MGS2_ACID) {273// Massive speedup due to re-ordering.274ApplyMGSHack(steps);275}276if (hacksEnabled_ & QUEUE_HACK_SONIC) {277ApplySonicHack(steps);278}279if (hacksEnabled_ & QUEUE_HACK_RENDERPASS_MERGE) {280ApplyRenderPassMerge(steps);281}282}283}284285void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, int curFrame, FrameData &frameData, FrameDataShared &frameDataShared, bool keepSteps) {286QueueProfileContext *profile = frameData.profile.enabled ? &frameData.profile : nullptr;287288if (profile)289profile->cpuStartTime = time_now_d();290291bool emitLabels = vulkan_->Extensions().EXT_debug_utils;292293VkCommandBuffer cmd = frameData.hasPresentCommands ? frameData.presentCmd : frameData.mainCmd;294295for (size_t i = 0; i < steps.size(); i++) {296const VKRStep &step = *steps[i];297if (emitLabels) {298VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };299char temp[128];300if (step.stepType == VKRStepType::RENDER && step.render.framebuffer) {301snprintf(temp, sizeof(temp), "%s: %s", step.tag, step.render.framebuffer->Tag());302labelInfo.pLabelName = temp;303} else {304labelInfo.pLabelName = step.tag;305}306vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);307}308309switch (step.stepType) {310case VKRStepType::RENDER:311{312bool perform = true;313if (!step.render.framebuffer) {314if (emitLabels) {315vkCmdEndDebugUtilsLabelEXT(cmd);316}317frameData.Submit(vulkan_, FrameSubmitType::Pending, frameDataShared);318319// If the window is minimized and we don't have a swap chain, don't bother.320if (frameDataShared.swapchainImageCount_ > 0) {321// When stepping in the GE debugger, we can end up here multiple times in a "frame".322// So only acquire once.323if (!frameData.hasAcquired) 
{324frameData.AcquireNextImage(vulkan_);325SetBackbuffer(framebuffers_[frameData.curSwapchainImage], frameDataShared.swapchainImages_[frameData.curSwapchainImage].image);326}327328if (!frameData.hasPresentCommands) {329// A RENDER step rendering to the backbuffer is normally the last step that happens in a frame,330// unless taking a screenshot, in which case there might be a READBACK_IMAGE after it.331// This is why we have to switch cmd to presentCmd, in this case.332VkCommandBufferBeginInfo begin{VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};333begin.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;334vkBeginCommandBuffer(frameData.presentCmd, &begin);335frameData.hasPresentCommands = true;336}337cmd = frameData.presentCmd;338if (emitLabels) {339VkDebugUtilsLabelEXT labelInfo{VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT};340labelInfo.pLabelName = "present";341vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);342}343} else {344perform = false;345}346}347if (perform) {348PerformRenderPass(step, cmd, curFrame, frameData.profile);349} else {350frameData.skipSwap = true;351}352break;353}354case VKRStepType::COPY:355PerformCopy(step, cmd);356break;357case VKRStepType::BLIT:358PerformBlit(step, cmd);359break;360case VKRStepType::READBACK:361PerformReadback(step, cmd, frameData);362break;363case VKRStepType::READBACK_IMAGE:364PerformReadbackImage(step, cmd);365break;366case VKRStepType::RENDER_SKIP:367break;368}369370if (profile && profile->timestampsEnabled && profile->timestampDescriptions.size() + 1 < MAX_TIMESTAMP_QUERIES) {371vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile->queryPool, (uint32_t)profile->timestampDescriptions.size());372profile->timestampDescriptions.push_back(StepToString(vulkan_, step));373}374375if (emitLabels) {376vkCmdEndDebugUtilsLabelEXT(cmd);377}378}379380// Deleting all in one go should be easier on the instruction cache than deleting381// them as we go - and easier to debug because we can look backwards in the frame.382if 
(!keepSteps) {383for (auto step : steps) {384delete step;385}386steps.clear();387}388389if (profile)390profile->cpuEndTime = time_now_d();391}392393void VulkanQueueRunner::ApplyMGSHack(std::vector<VKRStep *> &steps) {394// Really need a sane way to express transforms of steps.395396// We want to turn a sequence of copy,render(1),copy,render(1),copy,render(1) to copy,copy,copy,render(n).397398// TODO: Where does this first part trigger? The below depal part triggers reliably in Acid2.399400for (int i = 0; i < (int)steps.size() - 3; i++) {401int last = -1;402if (!(steps[i]->stepType == VKRStepType::COPY &&403steps[i + 1]->stepType == VKRStepType::RENDER &&404steps[i + 2]->stepType == VKRStepType::COPY &&405steps[i + 1]->render.numDraws == 1 &&406steps[i]->copy.dst == steps[i + 2]->copy.dst))407continue;408// Looks promising! Let's start by finding the last one.409for (int j = i; j < (int)steps.size(); j++) {410switch (steps[j]->stepType) {411case VKRStepType::RENDER:412if (steps[j]->render.numDraws > 1)413last = j - 1;414// should really also check descriptor sets...415if (steps[j]->commands.size()) {416const VkRenderData &cmd = steps[j]->commands.back();417if (cmd.cmd == VKRRenderCommand::DRAW_INDEXED && cmd.draw.count != 6)418last = j - 1;419}420break;421case VKRStepType::COPY:422if (steps[j]->copy.dst != steps[i]->copy.dst)423last = j - 1;424break;425default:426break;427}428if (last != -1)429break;430}431432if (last != -1) {433// We've got a sequence from i to last that needs reordering.434// First, let's sort it, keeping the same length.435std::vector<VKRStep *> copies;436std::vector<VKRStep *> renders;437copies.reserve((last - i) / 2);438renders.reserve((last - i) / 2);439for (int n = i; n <= last; n++) {440if (steps[n]->stepType == VKRStepType::COPY)441copies.push_back(steps[n]);442else if (steps[n]->stepType == VKRStepType::RENDER)443renders.push_back(steps[n]);444}445// Write the copies back. 
TODO: Combine them too.446for (int j = 0; j < (int)copies.size(); j++) {447steps[i + j] = copies[j];448}449450const int firstRender = i + (int)copies.size();451452// Write the renders back (so they will be deleted properly).453for (int j = 0; j < (int)renders.size(); j++) {454steps[firstRender + j] = renders[j];455}456_assert_(steps[firstRender]->stepType == VKRStepType::RENDER);457// Combine the renders.458for (int j = 1; j < (int)renders.size(); j++) {459steps[firstRender]->commands.reserve(renders[j]->commands.size());460for (int k = 0; k < (int)renders[j]->commands.size(); k++) {461steps[firstRender]->commands.push_back(renders[j]->commands[k]);462}463MergeRenderAreaRectInto(&steps[firstRender]->render.renderArea, renders[j]->render.renderArea);464// Easier than removing them from the list, though that might be the better option.465steps[firstRender + j]->stepType = VKRStepType::RENDER_SKIP;466steps[firstRender + j]->commands.clear();467}468// We're done.469// INFO_LOG(Log::G3D, "MGS HACK part 1: copies: %d renders: %d", (int)copies.size(), (int)renders.size());470break;471}472}473474// There's also a post processing effect using depals that's just brutal in some parts475// of the game.476for (int i = 0; i < (int)steps.size() - 3; i++) {477int last = -1;478if (!(steps[i]->stepType == VKRStepType::RENDER &&479steps[i + 1]->stepType == VKRStepType::RENDER &&480steps[i + 2]->stepType == VKRStepType::RENDER &&481steps[i]->render.numDraws == 1 &&482steps[i + 1]->render.numDraws == 1 &&483steps[i + 2]->render.numDraws == 1 &&484steps[i]->render.colorLoad == VKRRenderPassLoadAction::DONT_CARE &&485steps[i + 1]->render.colorLoad == VKRRenderPassLoadAction::KEEP &&486steps[i + 2]->render.colorLoad == VKRRenderPassLoadAction::DONT_CARE)) {487continue;488}489VKRFramebuffer *depalFramebuffer = steps[i]->render.framebuffer;490VKRFramebuffer *targetFramebuffer = steps[i + 1]->render.framebuffer;491// OK, found the start of a post-process sequence. 
Let's scan until we find the end.492for (int j = i; j < (int)steps.size() - 3; j++) {493if (((j - i) & 1) == 0) {494// This should be a depal draw.495if (steps[j]->render.numDraws != 1)496break;497if (steps[j]->commands.size() > 5) // TODO: Not the greatest heuristic! This may change if we merge commands.498break;499if (steps[j]->render.colorLoad != VKRRenderPassLoadAction::DONT_CARE)500break;501if (steps[j]->render.framebuffer != depalFramebuffer)502break;503last = j;504} else {505// This should be a target draw.506if (steps[j]->render.numDraws != 1)507break;508if (steps[j]->commands.size() > 5) // TODO: Not the greatest heuristic! This may change if we merge commands.509break;510if (steps[j]->render.colorLoad != VKRRenderPassLoadAction::KEEP)511break;512if (steps[j]->render.framebuffer != targetFramebuffer)513break;514last = j;515}516}517518if (last == -1)519continue;520521if (last > 479) {522// Avoid some problems with the hack (oil slick crash). Some additional commands get added there that523// confuses this merging. NOTE: This is not really a solution! See #20306.524last = 479;525}526527int minScissorX = 10000;528int minScissorY = 10000;529int maxScissorX = 0;530int maxScissorY = 0;531532// Combine the depal renders. 
Also record scissor bounds.533for (int j = i + 2; j <= last + 1; j += 2) {534for (int k = 0; k < (int)steps[j]->commands.size(); k++) {535switch (steps[j]->commands[k].cmd) {536case VKRRenderCommand::DRAW:537case VKRRenderCommand::DRAW_INDEXED:538steps[i]->commands.push_back(steps[j]->commands[k]);539break;540case VKRRenderCommand::SCISSOR:541{542// TODO: Merge scissor rectangles.543const auto &rc = steps[j]->commands[k].scissor.scissor;544if (rc.offset.x < minScissorX) {545minScissorX = rc.offset.x;546}547if (rc.offset.y < minScissorY) {548minScissorY = rc.offset.y;549}550if (rc.offset.x + rc.extent.width > maxScissorX) {551maxScissorX = rc.offset.x + rc.extent.width;552}553if (rc.offset.y + rc.extent.height > maxScissorY) {554maxScissorY = rc.offset.y + rc.extent.height;555}556break;557}558default:559break;560}561}562MergeRenderAreaRectInto(&steps[i]->render.renderArea, steps[j]->render.renderArea);563steps[j]->stepType = VKRStepType::RENDER_SKIP;564}565566// Update the scissor in the first draw.567minScissorX = std::max(0, minScissorX);568minScissorY = std::max(0, minScissorY);569if (maxScissorX > minScissorX && maxScissorY > minScissorY) {570for (int k = 0; k < steps[i]->commands.size(); k++) {571if (steps[i]->commands[k].cmd == VKRRenderCommand::SCISSOR) {572auto &rc = steps[i]->commands[k].scissor.scissor;573rc.offset.x = minScissorX;574rc.offset.y = minScissorY;575rc.extent.width = maxScissorX - minScissorX;576rc.extent.height = maxScissorY - minScissorY;577break;578}579}580}581582// Combine the target renders.583for (int j = i + 3; j <= last; j += 2) {584for (int k = 0; k < (int)steps[j]->commands.size(); k++) {585switch (steps[j]->commands[k].cmd) {586case VKRRenderCommand::DRAW:587case VKRRenderCommand::DRAW_INDEXED:588steps[i + 1]->commands.push_back(steps[j]->commands[k]);589break;590default:591break;592}593}594MergeRenderAreaRectInto(&steps[i + 1]->render.renderArea, steps[j]->render.renderArea);595steps[j]->stepType = 
VKRStepType::RENDER_SKIP;596}597598// INFO_LOG(Log::G3D, "MGS HACK part 2: %d-%d : %d (total steps: %d)", i, last, (last - i), (int)steps.size());599600// We're done - we only expect one of these sequences per frame.601break;602}603}604605void VulkanQueueRunner::ApplySonicHack(std::vector<VKRStep *> &steps) {606// We want to turn a sequence of render(3),render(1),render(6),render(1),render(6),render(1),render(3) to607// render(1), render(1), render(1), render(6), render(6), render(6)608609for (int i = 0; i < (int)steps.size() - 4; i++) {610int last = -1;611if (!(steps[i]->stepType == VKRStepType::RENDER &&612steps[i + 1]->stepType == VKRStepType::RENDER &&613steps[i + 2]->stepType == VKRStepType::RENDER &&614steps[i + 3]->stepType == VKRStepType::RENDER &&615steps[i]->render.numDraws == 3 &&616steps[i + 1]->render.numDraws == 1 &&617steps[i + 2]->render.numDraws == 6 &&618steps[i + 3]->render.numDraws == 1 &&619steps[i]->render.framebuffer == steps[i + 2]->render.framebuffer &&620steps[i + 1]->render.framebuffer == steps[i + 3]->render.framebuffer))621continue;622// Looks promising! 
Let's start by finding the last one.623for (int j = i; j < (int)steps.size(); j++) {624switch (steps[j]->stepType) {625case VKRStepType::RENDER:626if ((j - i) & 1) {627if (steps[j]->render.framebuffer != steps[i + 1]->render.framebuffer)628last = j - 1;629if (steps[j]->render.numDraws != 1)630last = j - 1;631} else {632if (steps[j]->render.framebuffer != steps[i]->render.framebuffer)633last = j - 1;634if (steps[j]->render.numDraws != 3 && steps[j]->render.numDraws != 6)635last = j - 1;636}637break;638default:639break;640}641if (last != -1)642break;643}644645if (last != -1) {646// We've got a sequence from i to last that needs reordering.647// First, let's sort it, keeping the same length.648std::vector<VKRStep *> type1;649std::vector<VKRStep *> type2;650type1.reserve((last - i) / 2);651type2.reserve((last - i) / 2);652for (int n = i; n <= last; n++) {653if (steps[n]->render.framebuffer == steps[i]->render.framebuffer)654type1.push_back(steps[n]);655else656type2.push_back(steps[n]);657}658659// Write the renders back in order. 
Same amount, so deletion will work fine.660for (int j = 0; j < (int)type1.size(); j++) {661steps[i + j] = type1[j];662}663for (int j = 0; j < (int)type2.size(); j++) {664steps[i + j + type1.size()] = type2[j];665}666667// Combine the renders.668for (int j = 1; j < (int)type1.size(); j++) {669for (int k = 0; k < (int)type1[j]->commands.size(); k++) {670steps[i]->commands.push_back(type1[j]->commands[k]);671}672steps[i + j]->stepType = VKRStepType::RENDER_SKIP;673}674for (int j = 1; j < (int)type2.size(); j++) {675for (int k = 0; k < (int)type2[j]->commands.size(); k++) {676steps[i + type1.size()]->commands.push_back(type2[j]->commands[k]);677}678// Technically, should merge render area here, but they're all the same so not needed.679steps[i + type1.size() + j]->stepType = VKRStepType::RENDER_SKIP;680}681// We're done.682break;683}684}685}686687const char *AspectToString(VkImageAspectFlags aspect) {688switch (aspect) {689case VK_IMAGE_ASPECT_COLOR_BIT: return "COLOR";690case VK_IMAGE_ASPECT_DEPTH_BIT: return "DEPTH";691case VK_IMAGE_ASPECT_STENCIL_BIT: return "STENCIL";692case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: return "DEPTHSTENCIL";693default: return "UNUSUAL";694}695}696697std::string VulkanQueueRunner::StepToString(VulkanContext *vulkan, const VKRStep &step) {698char buffer[256];699switch (step.stepType) {700case VKRStepType::RENDER:701{702int w = step.render.framebuffer ? step.render.framebuffer->width : vulkan->GetBackbufferWidth();703int h = step.render.framebuffer ? step.render.framebuffer->height : vulkan->GetBackbufferHeight();704int actual_w = step.render.renderArea.extent.width;705int actual_h = step.render.renderArea.extent.height;706const char *renderCmd = GetRPTypeName(step.render.renderPassType);707snprintf(buffer, sizeof(buffer), "%s %s %s (draws: %d, %dx%d/%dx%d)", renderCmd, step.tag, step.render.framebuffer ? 
step.render.framebuffer->Tag() : "", step.render.numDraws, actual_w, actual_h, w, h);708break;709}710case VKRStepType::COPY:711snprintf(buffer, sizeof(buffer), "COPY '%s' %s -> %s (%dx%d, %s)", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.copy.srcRect.extent.width, step.copy.srcRect.extent.height, AspectToString(step.copy.aspectMask));712break;713case VKRStepType::BLIT:714snprintf(buffer, sizeof(buffer), "BLIT '%s' %s -> %s (%dx%d->%dx%d, %s)", step.tag, step.copy.src->Tag(), step.copy.dst->Tag(), step.blit.srcRect.extent.width, step.blit.srcRect.extent.height, step.blit.dstRect.extent.width, step.blit.dstRect.extent.height, AspectToString(step.blit.aspectMask));715break;716case VKRStepType::READBACK:717snprintf(buffer, sizeof(buffer), "READBACK '%s' %s (%dx%d, %s)", step.tag, step.readback.src ? step.readback.src->Tag() : "(backbuffer)", step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, AspectToString(step.readback.aspectMask));718break;719case VKRStepType::READBACK_IMAGE:720snprintf(buffer, sizeof(buffer), "READBACK_IMAGE '%s' (%dx%d)", step.tag, step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height);721break;722case VKRStepType::RENDER_SKIP:723snprintf(buffer, sizeof(buffer), "(RENDER_SKIP) %s", step.tag);724break;725default:726buffer[0] = 0;727break;728}729return std::string(buffer);730}731732// Ideally, this should be cheap enough to be applied to all games. At least on mobile, it's pretty733// much a guaranteed neutral or win in terms of GPU power. However, dependency calculation really734// must be perfect!735void VulkanQueueRunner::ApplyRenderPassMerge(std::vector<VKRStep *> &steps) {736// First let's count how many times each framebuffer is rendered to.737// If it's more than one, let's do our best to merge them. 
This can help God of War quite a bit.738std::unordered_map<VKRFramebuffer *, int> counts;739for (int i = 0; i < (int)steps.size(); i++) {740if (steps[i]->stepType == VKRStepType::RENDER) {741counts[steps[i]->render.framebuffer]++;742}743}744745auto mergeRenderSteps = [](VKRStep *dst, VKRStep *src) {746// OK. Now, if it's a render, slurp up all the commands and kill the step.747// Also slurp up any pretransitions.748dst->preTransitions.append(src->preTransitions);749dst->commands.insert(dst->commands.end(), src->commands.begin(), src->commands.end());750MergeRenderAreaRectInto(&dst->render.renderArea, src->render.renderArea);751// So we don't consider it for other things, maybe doesn't matter.752src->dependencies.clear();753src->stepType = VKRStepType::RENDER_SKIP;754dst->render.numDraws += src->render.numDraws;755dst->render.numReads += src->render.numReads;756dst->render.pipelineFlags |= src->render.pipelineFlags;757dst->render.renderPassType = MergeRPTypes(dst->render.renderPassType, src->render.renderPassType);758};759auto renderHasClear = [](const VKRStep *step) {760const auto &r = step->render;761return r.colorLoad == VKRRenderPassLoadAction::CLEAR || r.depthLoad == VKRRenderPassLoadAction::CLEAR || r.stencilLoad == VKRRenderPassLoadAction::CLEAR;762};763764// Now, let's go through the steps. 
If we find one that is rendered to more than once,765// we'll scan forward and slurp up any rendering that can be merged across.766for (int i = 0; i < (int)steps.size(); i++) {767if (steps[i]->stepType == VKRStepType::RENDER && counts[steps[i]->render.framebuffer] > 1) {768auto fb = steps[i]->render.framebuffer;769TinySet<VKRFramebuffer *, 8> touchedFramebuffers; // must be the same fast-size as the dependencies TinySet for annoying reasons.770for (int j = i + 1; j < (int)steps.size(); j++) {771// If any other passes are reading from this framebuffer as-is, we cancel the scan.772if (steps[j]->dependencies.contains(fb)) {773// Reading from itself means a KEEP, which is okay.774if (steps[j]->stepType != VKRStepType::RENDER || steps[j]->render.framebuffer != fb)775break;776}777switch (steps[j]->stepType) {778case VKRStepType::RENDER:779if (steps[j]->render.framebuffer == fb) {780// Prevent Unknown's example case from https://github.com/hrydgard/ppsspp/pull/12242781if (renderHasClear(steps[j]) || steps[j]->dependencies.contains(touchedFramebuffers)) {782goto done_fb;783} else {784// Safe to merge, great.785mergeRenderSteps(steps[i], steps[j]);786}787} else {788// Remember the framebuffer this wrote to. 
We can't merge with later passes that depend on these.789touchedFramebuffers.insert(steps[j]->render.framebuffer);790}791break;792case VKRStepType::COPY:793if (steps[j]->copy.dst == fb) {794// Without framebuffer "renaming", we can't merge past a clobbered fb.795goto done_fb;796}797touchedFramebuffers.insert(steps[j]->copy.dst);798break;799case VKRStepType::BLIT:800if (steps[j]->blit.dst == fb) {801// Without framebuffer "renaming", we can't merge past a clobbered fb.802goto done_fb;803}804touchedFramebuffers.insert(steps[j]->blit.dst);805break;806case VKRStepType::READBACK:807// Not sure this has much effect, when executed READBACK is always the last step808// since we stall the GPU and wait immediately after.809break;810case VKRStepType::RENDER_SKIP:811case VKRStepType::READBACK_IMAGE:812break;813default:814// We added a new step? Might be unsafe.815goto done_fb;816}817}818done_fb:819;820}821}822}823824void VulkanQueueRunner::LogSteps(const std::vector<VKRStep *> &steps, bool verbose) {825INFO_LOG(Log::G3D, "=================== FRAME ====================");826for (size_t i = 0; i < steps.size(); i++) {827const VKRStep &step = *steps[i];828switch (step.stepType) {829case VKRStepType::RENDER:830LogRenderPass(step, verbose);831break;832case VKRStepType::COPY:833LogCopy(step);834break;835case VKRStepType::BLIT:836LogBlit(step);837break;838case VKRStepType::READBACK:839LogReadback(step);840break;841case VKRStepType::READBACK_IMAGE:842LogReadbackImage(step);843break;844case VKRStepType::RENDER_SKIP:845INFO_LOG(Log::G3D, "(skipped render pass)");846break;847}848}849INFO_LOG(Log::G3D, "------------------- SUBMIT ------------------");850}851852const char *RenderPassActionName(VKRRenderPassLoadAction a) {853switch (a) {854case VKRRenderPassLoadAction::CLEAR:855return "CLEAR";856case VKRRenderPassLoadAction::DONT_CARE:857return "DONT_CARE";858case VKRRenderPassLoadAction::KEEP:859return "KEEP";860}861return "?";862}863864const char *ImageLayoutToString(VkImageLayout layout) 
{865switch (layout) {866case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: return "COLOR_ATTACHMENT";867case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: return "DEPTH_STENCIL_ATTACHMENT";868case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: return "SHADER_READ_ONLY";869case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: return "TRANSFER_SRC";870case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: return "TRANSFER_DST";871case VK_IMAGE_LAYOUT_GENERAL: return "GENERAL";872case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: return "PRESENT_SRC_KHR";873case VK_IMAGE_LAYOUT_UNDEFINED: return "UNDEFINED";874default: return "(unknown)";875}876}877878void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {879const auto &r = pass.render;880const char *framebuf = r.framebuffer ? r.framebuffer->Tag() : "backbuffer";881int w = r.framebuffer ? r.framebuffer->width : vulkan_->GetBackbufferWidth();882int h = r.framebuffer ? r.framebuffer->height : vulkan_->GetBackbufferHeight();883884INFO_LOG(Log::G3D, "RENDER %s Begin(%s, draws: %d, %dx%d, %s, %s, %s)", pass.tag, framebuf, r.numDraws, w, h, RenderPassActionName(r.colorLoad), RenderPassActionName(r.depthLoad), RenderPassActionName(r.stencilLoad));885// TODO: Log these in detail.886for (int i = 0; i < (int)pass.preTransitions.size(); i++) {887INFO_LOG(Log::G3D, " PRETRANSITION: %s %s -> %s", pass.preTransitions[i].fb->Tag(), AspectToString(pass.preTransitions[i].aspect), ImageLayoutToString(pass.preTransitions[i].targetLayout));888}889890if (verbose) {891for (auto &cmd : pass.commands) {892switch (cmd.cmd) {893case VKRRenderCommand::REMOVED:894INFO_LOG(Log::G3D, " (Removed)");895break;896case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:897INFO_LOG(Log::G3D, " BindGraphicsPipeline(%x)", (int)(intptr_t)cmd.graphics_pipeline.pipeline);898break;899case VKRRenderCommand::BLEND:900INFO_LOG(Log::G3D, " BlendColor(%08x)", cmd.blendColor.color);901break;902case VKRRenderCommand::CLEAR:903INFO_LOG(Log::G3D, " Clear");904break;905case 
VKRRenderCommand::DRAW:906INFO_LOG(Log::G3D, " Draw(%d)", cmd.draw.count);907break;908case VKRRenderCommand::DRAW_INDEXED:909INFO_LOG(Log::G3D, " DrawIndexed(%d)", cmd.drawIndexed.count);910break;911case VKRRenderCommand::SCISSOR:912INFO_LOG(Log::G3D, " Scissor(%d, %d, %d, %d)", (int)cmd.scissor.scissor.offset.x, (int)cmd.scissor.scissor.offset.y, (int)cmd.scissor.scissor.extent.width, (int)cmd.scissor.scissor.extent.height);913break;914case VKRRenderCommand::STENCIL:915INFO_LOG(Log::G3D, " Stencil(ref=%d, compare=%d, write=%d)", cmd.stencil.stencilRef, cmd.stencil.stencilCompareMask, cmd.stencil.stencilWriteMask);916break;917case VKRRenderCommand::VIEWPORT:918INFO_LOG(Log::G3D, " Viewport(%f, %f, %f, %f, %f, %f)", cmd.viewport.vp.x, cmd.viewport.vp.y, cmd.viewport.vp.width, cmd.viewport.vp.height, cmd.viewport.vp.minDepth, cmd.viewport.vp.maxDepth);919break;920case VKRRenderCommand::PUSH_CONSTANTS:921INFO_LOG(Log::G3D, " PushConstants(%d)", cmd.push.size);922break;923case VKRRenderCommand::DEBUG_ANNOTATION:924INFO_LOG(Log::G3D, " DebugAnnotation(%s)", cmd.debugAnnotation.annotation);925break;926927case VKRRenderCommand::NUM_RENDER_COMMANDS:928break;929}930}931}932933INFO_LOG(Log::G3D, " Final: %s %s", ImageLayoutToString(pass.render.finalColorLayout), ImageLayoutToString(pass.render.finalDepthStencilLayout));934INFO_LOG(Log::G3D, "RENDER End(%s) - %d commands executed", framebuf, (int)pass.commands.size());935}936937void VulkanQueueRunner::LogCopy(const VKRStep &step) {938INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());939}940941void VulkanQueueRunner::LogBlit(const VKRStep &step) {942INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());943}944945void VulkanQueueRunner::LogReadback(const VKRStep &step) {946INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());947}948949void VulkanQueueRunner::LogReadbackImage(const VKRStep &step) {950INFO_LOG(Log::G3D, "%s", StepToString(vulkan_, step).c_str());951}952953void 
VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd, int curFrame, QueueProfileContext &profile) {954for (size_t i = 0; i < step.preTransitions.size(); i++) {955const TransitionRequest &iter = step.preTransitions[i];956if (iter.aspect == VK_IMAGE_ASPECT_COLOR_BIT && iter.fb->color.layout != iter.targetLayout) {957recordBarrier_.TransitionColorImageAuto(958&iter.fb->color,959iter.targetLayout960);961} else if (iter.fb->depth.image != VK_NULL_HANDLE && (iter.aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) && iter.fb->depth.layout != iter.targetLayout) {962recordBarrier_.TransitionDepthStencilImageAuto(963&iter.fb->depth,964iter.targetLayout965);966}967}968969// Don't execute empty renderpasses that keep the contents.970if (step.commands.empty() && step.render.colorLoad == VKRRenderPassLoadAction::KEEP && step.render.depthLoad == VKRRenderPassLoadAction::KEEP && step.render.stencilLoad == VKRRenderPassLoadAction::KEEP) {971// Flush the pending barrier972recordBarrier_.Flush(cmd);973// Nothing to do.974// TODO: Though - a later step might have used this step's finalColorLayout etc to get things in a layout it expects.975// Should we just do a barrier? Or just let the later step deal with not having things in its preferred layout, like now?976return;977}978979// Write-after-write hazards. Fixed flicker in God of War on ARM (before we added another fix that removed these).980// NOTE: These are commented out because the normal barriers no longer check for equality, effectively generating these981// barriers automatically. 
This is safe, but sometimes I think can be improved on.982/*983if (step.render.framebuffer) {984int n = 0;985int stage = 0;986987if (step.render.framebuffer->color.layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {988recordBarrier_.TransitionImage(989step.render.framebuffer->color.image,9900,9911,992step.render.framebuffer->numLayers,993VK_IMAGE_ASPECT_COLOR_BIT,994VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,995VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,996VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,997VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT,998VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,999VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT1000);1001}1002if (step.render.framebuffer->depth.image != VK_NULL_HANDLE && step.render.framebuffer->depth.layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) {1003recordBarrier_.TransitionImage(1004step.render.framebuffer->depth.image,10050,10061,1007step.render.framebuffer->numLayers,1008VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,1009VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,1010VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,1011VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,1012VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT,1013VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,1014VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,1015);1016}1017}*/10181019// This chooses a render pass according to the load/store attachment state. We no longer transition1020// image layouts as part of the passes.1021//1022// NOTE: Unconditionally flushes recordBarrier_.1023VKRRenderPass *renderPass = PerformBindFramebufferAsRenderTarget(step, cmd);10241025int curWidth = step.render.framebuffer ? step.render.framebuffer->width : vulkan_->GetBackbufferWidth();1026int curHeight = step.render.framebuffer ? 
step.render.framebuffer->height : vulkan_->GetBackbufferHeight();10271028VKRFramebuffer *fb = step.render.framebuffer;10291030VKRGraphicsPipeline *lastGraphicsPipeline = nullptr;1031VKRComputePipeline *lastComputePipeline = nullptr;10321033const auto &commands = step.commands;10341035// We can do a little bit of state tracking here to eliminate some calls into the driver.1036// The stencil ones are very commonly mostly redundant so let's eliminate them where possible.1037// Might also want to consider scissor and viewport.1038VkPipeline lastPipeline = VK_NULL_HANDLE;1039FastVec<PendingDescSet> *descSets = nullptr;1040VkPipelineLayout pipelineLayout = VK_NULL_HANDLE;10411042bool pipelineOK = false;10431044int lastStencilWriteMask = -1;1045int lastStencilCompareMask = -1;1046int lastStencilReference = -1;10471048const RenderPassType rpType = step.render.renderPassType;10491050for (size_t i = 0; i < commands.size(); i++) {1051const VkRenderData &c = commands[i];1052#ifdef _DEBUG1053if (profile.enabled) {1054if ((size_t)step.stepType < ARRAY_SIZE(profile.commandCounts)) {1055profile.commandCounts[(size_t)c.cmd]++;1056}1057}1058#endif1059switch (c.cmd) {1060case VKRRenderCommand::REMOVED:1061break;10621063case VKRRenderCommand::BIND_GRAPHICS_PIPELINE:1064{1065VKRGraphicsPipeline *graphicsPipeline = c.graphics_pipeline.pipeline;1066if (graphicsPipeline != lastGraphicsPipeline) {1067VkSampleCountFlagBits fbSampleCount = fb ? 
fb->sampleCount : VK_SAMPLE_COUNT_1_BIT;10681069if (RenderPassTypeHasMultisample(rpType) && fbSampleCount != graphicsPipeline->SampleCount()) {1070// should have been invalidated.1071_assert_msg_(graphicsPipeline->SampleCount() == VK_SAMPLE_COUNT_FLAG_BITS_MAX_ENUM,1072"expected %d sample count, got %d", fbSampleCount, graphicsPipeline->SampleCount());1073}10741075VkPipeline pipeline;10761077{1078std::lock_guard<std::mutex> lock(graphicsPipeline->mutex_);1079if (!graphicsPipeline->pipeline[(size_t)rpType]) {1080// NOTE: If render steps got merged, it can happen that, as they ended during recording,1081// they didn't know their final render pass type so they created the wrong pipelines in EndCurRenderStep().1082// Unfortunately I don't know if we can fix it in any more sensible place than here.1083// Maybe a middle pass. But let's try to just block and compile here for now, this doesn't1084// happen all that much.1085graphicsPipeline->pipeline[(size_t)rpType] = Promise<VkPipeline>::CreateEmpty();1086graphicsPipeline->Create(vulkan_, renderPass->Get(vulkan_, rpType, fbSampleCount), rpType, fbSampleCount, time_now_d(), -1);1087}1088pipeline = graphicsPipeline->pipeline[(size_t)rpType]->BlockUntilReady();1089}10901091if (pipeline != VK_NULL_HANDLE) {1092vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);1093descSets = &c.graphics_pipeline.pipelineLayout->frameData[curFrame].descSets_;1094pipelineLayout = c.graphics_pipeline.pipelineLayout->pipelineLayout;1095_dbg_assert_(pipelineLayout != VK_NULL_HANDLE);1096lastGraphicsPipeline = graphicsPipeline;1097pipelineOK = true;1098} else {1099pipelineOK = false;1100}11011102// Reset dynamic state so it gets refreshed with the new pipeline.1103lastStencilWriteMask = -1;1104lastStencilCompareMask = -1;1105lastStencilReference = -1;1106}1107break;1108}11091110case VKRRenderCommand::VIEWPORT:1111if (fb != nullptr) {1112vkCmdSetViewport(cmd, 0, 1, &c.viewport.vp);1113} else {1114const VkViewport &vp = 
c.viewport.vp;1115DisplayRect<float> rc{ vp.x, vp.y, vp.width, vp.height };1116RotateRectToDisplay(rc, (float)vulkan_->GetBackbufferWidth(), (float)vulkan_->GetBackbufferHeight());1117VkViewport final_vp;1118final_vp.x = rc.x;1119final_vp.y = rc.y;1120final_vp.width = rc.w;1121final_vp.height = rc.h;1122final_vp.maxDepth = vp.maxDepth;1123final_vp.minDepth = vp.minDepth;1124vkCmdSetViewport(cmd, 0, 1, &final_vp);1125}1126break;11271128case VKRRenderCommand::SCISSOR:1129{1130if (fb != nullptr) {1131vkCmdSetScissor(cmd, 0, 1, &c.scissor.scissor);1132} else {1133// Rendering to backbuffer. Might need to rotate.1134const VkRect2D &rc = c.scissor.scissor;1135DisplayRect<int> rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height };1136RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());1137_dbg_assert_(rotated_rc.x >= 0);1138_dbg_assert_(rotated_rc.y >= 0);1139VkRect2D finalRect = VkRect2D{ { rotated_rc.x, rotated_rc.y }, { (uint32_t)rotated_rc.w, (uint32_t)rotated_rc.h} };1140vkCmdSetScissor(cmd, 0, 1, &finalRect);1141}1142break;1143}11441145case VKRRenderCommand::BLEND:1146{1147float bc[4];1148Uint8x4ToFloat4(bc, c.blendColor.color);1149vkCmdSetBlendConstants(cmd, bc);1150break;1151}11521153case VKRRenderCommand::PUSH_CONSTANTS:1154if (pipelineOK) {1155vkCmdPushConstants(cmd, pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data);1156}1157break;11581159case VKRRenderCommand::STENCIL:1160if (lastStencilWriteMask != c.stencil.stencilWriteMask) {1161lastStencilWriteMask = (int)c.stencil.stencilWriteMask;1162vkCmdSetStencilWriteMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilWriteMask);1163}1164if (lastStencilCompareMask != c.stencil.stencilCompareMask) {1165lastStencilCompareMask = c.stencil.stencilCompareMask;1166vkCmdSetStencilCompareMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilCompareMask);1167}1168if (lastStencilReference != c.stencil.stencilRef) 
{1169lastStencilReference = c.stencil.stencilRef;1170vkCmdSetStencilReference(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilRef);1171}1172break;11731174case VKRRenderCommand::DRAW_INDEXED:1175if (pipelineOK) {1176VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;1177_dbg_assert_(set != VK_NULL_HANDLE);1178vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.drawIndexed.numUboOffsets, c.drawIndexed.uboOffsets);1179vkCmdBindIndexBuffer(cmd, c.drawIndexed.ibuffer, c.drawIndexed.ioffset, VK_INDEX_TYPE_UINT16);1180VkDeviceSize voffset = c.drawIndexed.voffset;1181vkCmdBindVertexBuffers(cmd, 0, 1, &c.drawIndexed.vbuffer, &voffset);1182vkCmdDrawIndexed(cmd, c.drawIndexed.count, c.drawIndexed.instances, 0, 0, 0);1183}1184break;11851186case VKRRenderCommand::DRAW:1187if (pipelineOK) {1188VkDescriptorSet set = (*descSets)[c.drawIndexed.descSetIndex].set;1189_dbg_assert_(set != VK_NULL_HANDLE);1190vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &set, c.draw.numUboOffsets, c.draw.uboOffsets);1191if (c.draw.vbuffer) {1192vkCmdBindVertexBuffers(cmd, 0, 1, &c.draw.vbuffer, &c.draw.voffset);1193}1194vkCmdDraw(cmd, c.draw.count, 1, c.draw.offset, 0);1195}1196break;11971198case VKRRenderCommand::CLEAR:1199{1200// If we get here, we failed to merge a clear into a render pass load op. 
This is bad for perf.1201int numAttachments = 0;1202VkClearRect rc{};1203rc.baseArrayLayer = 0;1204rc.layerCount = 1; // In multiview mode, 1 means to replicate to all the active layers.1205rc.rect.extent.width = (uint32_t)curWidth;1206rc.rect.extent.height = (uint32_t)curHeight;1207VkClearAttachment attachments[2]{};1208if (c.clear.clearMask & VK_IMAGE_ASPECT_COLOR_BIT) {1209VkClearAttachment &attachment = attachments[numAttachments++];1210attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;1211attachment.colorAttachment = 0;1212Uint8x4ToFloat4(attachment.clearValue.color.float32, c.clear.clearColor);1213}1214if (c.clear.clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1215VkClearAttachment &attachment = attachments[numAttachments++];1216attachment.aspectMask = 0;1217if (c.clear.clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) {1218attachment.clearValue.depthStencil.depth = c.clear.clearZ;1219attachment.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;1220}1221if (c.clear.clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) {1222attachment.clearValue.depthStencil.stencil = (uint32_t)c.clear.clearStencil;1223attachment.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;1224}1225}1226if (numAttachments) {1227vkCmdClearAttachments(cmd, numAttachments, attachments, 1, &rc);1228}1229break;1230}12311232case VKRRenderCommand::DEBUG_ANNOTATION:1233if (vulkan_->Extensions().EXT_debug_utils) {1234VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };1235labelInfo.pLabelName = c.debugAnnotation.annotation;1236vkCmdInsertDebugUtilsLabelEXT(cmd, &labelInfo);1237}1238break;12391240default:1241ERROR_LOG(Log::G3D, "Unimpl queue command");1242break;1243}1244}1245vkCmdEndRenderPass(cmd);12461247_dbg_assert_(recordBarrier_.empty());12481249if (fb) {1250// If the desired final layout aren't the optimal layout needed next, early-transition the image.1251if (step.render.finalColorLayout != fb->color.layout) {1252recordBarrier_.TransitionColorImageAuto(&fb->color, 
step.render.finalColorLayout);1253}1254if (fb->depth.image && step.render.finalDepthStencilLayout != fb->depth.layout) {1255recordBarrier_.TransitionDepthStencilImageAuto(&fb->depth, step.render.finalDepthStencilLayout);1256}1257}1258}12591260VKRRenderPass *VulkanQueueRunner::PerformBindFramebufferAsRenderTarget(const VKRStep &step, VkCommandBuffer cmd) {1261VKRRenderPass *renderPass;1262int numClearVals = 0;1263VkClearValue clearVal[4]{};1264VkFramebuffer framebuf;1265int w;1266int h;12671268bool hasDepth = RenderPassTypeHasDepth(step.render.renderPassType);12691270VkSampleCountFlagBits sampleCount;12711272// Can be used to separate the final*Layout barrier from the rest for debugging in renderdoc.1273// recordBarrier_.Flush(cmd);12741275if (step.render.framebuffer) {1276_dbg_assert_(step.render.finalColorLayout != VK_IMAGE_LAYOUT_UNDEFINED);1277_dbg_assert_(step.render.finalDepthStencilLayout != VK_IMAGE_LAYOUT_UNDEFINED);12781279RPKey key{1280step.render.colorLoad, step.render.depthLoad, step.render.stencilLoad,1281step.render.colorStore, step.render.depthStore, step.render.stencilStore,1282};1283renderPass = GetRenderPass(key);12841285VKRFramebuffer *fb = step.render.framebuffer;1286framebuf = fb->Get(renderPass, step.render.renderPassType);1287sampleCount = fb->sampleCount;1288_dbg_assert_(framebuf != VK_NULL_HANDLE);1289w = fb->width;1290h = fb->height;12911292// Mali driver on S8 (Android O) and S9 mishandles renderpasses that do just a clear1293// and then no draw calls. 
Memory transaction elimination gets mis-flagged or something.1294// To avoid this, we transition to GENERAL and back in this case (ARM-approved workaround).1295// See pull request #10723.1296bool maliBugWorkaround = step.render.numDraws == 0 &&1297step.render.colorLoad == VKRRenderPassLoadAction::CLEAR &&1298vulkan_->GetPhysicalDeviceProperties().properties.driverVersion == 0xaa9c4b29;1299if (maliBugWorkaround) {1300// A little suboptimal but let's go for maximum safety here.1301recordBarrier_.TransitionImage(fb->color.image, 0, 1, fb->numLayers, VK_IMAGE_ASPECT_COLOR_BIT,1302fb->color.layout, VK_IMAGE_LAYOUT_GENERAL,1303VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,1304VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,1305VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);1306fb->color.layout = VK_IMAGE_LAYOUT_GENERAL;1307}13081309recordBarrier_.TransitionColorImageAuto(&fb->color, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);13101311// If the render pass doesn't touch depth, we can avoid a layout transition of the depth buffer.1312if (fb->depth.image && RenderPassTypeHasDepth(step.render.renderPassType)) {1313recordBarrier_.TransitionDepthStencilImageAuto(&fb->depth, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL);1314}13151316// The transition from the optimal format happens after EndRenderPass, now that we don't1317// do it as part of the renderpass itself anymore.13181319if (sampleCount != VK_SAMPLE_COUNT_1_BIT) {1320// We don't initialize values for these.1321numClearVals = hasDepth ? 
2 : 1; // Skip the resolve buffers, don't need to clear those.1322}1323if (step.render.colorLoad == VKRRenderPassLoadAction::CLEAR) {1324Uint8x4ToFloat4(clearVal[numClearVals].color.float32, step.render.clearColor);1325}1326numClearVals++;1327if (hasDepth) {1328if (step.render.depthLoad == VKRRenderPassLoadAction::CLEAR || step.render.stencilLoad == VKRRenderPassLoadAction::CLEAR) {1329clearVal[numClearVals].depthStencil.depth = step.render.clearDepth;1330clearVal[numClearVals].depthStencil.stencil = step.render.clearStencil;1331}1332numClearVals++;1333}1334_dbg_assert_(numClearVals != 3);1335} else {1336RPKey key{1337VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR, VKRRenderPassLoadAction::CLEAR,1338VKRRenderPassStoreAction::STORE, VKRRenderPassStoreAction::DONT_CARE, VKRRenderPassStoreAction::DONT_CARE,1339};1340renderPass = GetRenderPass(key);1341framebuf = backbuffer_;13421343// Raw, rotated backbuffer size.1344w = vulkan_->GetBackbufferWidth();1345h = vulkan_->GetBackbufferHeight();13461347Uint8x4ToFloat4(clearVal[0].color.float32, step.render.clearColor);1348numClearVals = hasDepth ? 
2 : 1; // We might do depth-less backbuffer in the future, though doubtful of the value.1349clearVal[1].depthStencil.depth = 0.0f;1350clearVal[1].depthStencil.stencil = 0;1351sampleCount = VK_SAMPLE_COUNT_1_BIT;1352}13531354VkRenderPassBeginInfo rp_begin = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO };1355rp_begin.renderPass = renderPass->Get(vulkan_, step.render.renderPassType, sampleCount);1356rp_begin.framebuffer = framebuf;13571358VkRect2D rc = step.render.renderArea;1359if (!step.render.framebuffer) {1360// Rendering to backbuffer, must rotate, just like scissors.1361DisplayRect<int> rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height };1362RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight());13631364rc.offset.x = rotated_rc.x;1365rc.offset.y = rotated_rc.y;1366rc.extent.width = rotated_rc.w;1367rc.extent.height = rotated_rc.h;1368}13691370recordBarrier_.Flush(cmd);13711372rp_begin.renderArea = rc;1373rp_begin.clearValueCount = numClearVals;1374rp_begin.pClearValues = numClearVals ? clearVal : nullptr;1375vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);13761377return renderPass;1378}13791380void VulkanQueueRunner::PerformCopy(const VKRStep &step, VkCommandBuffer cmd) {1381// The barrier code doesn't handle this case. 
We'd need to transition to GENERAL to do an intra-image copy.1382_dbg_assert_(step.copy.src != step.copy.dst);13831384VKRFramebuffer *src = step.copy.src;1385VKRFramebuffer *dst = step.copy.dst;13861387int layerCount = std::min(step.copy.src->numLayers, step.copy.dst->numLayers);1388_dbg_assert_(step.copy.src->numLayers >= step.copy.dst->numLayers);13891390// TODO: If dst covers exactly the whole destination, we can set up a UNDEFINED->TRANSFER_DST_OPTIMAL transition,1391// which can potentially be more efficient.13921393if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1394recordBarrier_.TransitionColorImageAuto(&src->color, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1395recordBarrier_.TransitionColorImageAuto(&dst->color, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1396}13971398// We can't copy only depth or only stencil unfortunately - or can we?.1399if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1400_dbg_assert_(src->depth.image != VK_NULL_HANDLE);14011402recordBarrier_.TransitionDepthStencilImageAuto(&src->depth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1403if (dst->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {1404recordBarrier_.TransitionDepthStencilImageAuto(&dst->depth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1405} else {1406// Kingdom Hearts: Subsequent copies twice to the same depth buffer without any other use.1407// Not super sure how that happens, but we need a barrier to pass sync validation.1408SetupTransferDstWriteAfterWrite(dst->depth, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, &recordBarrier_);1409}1410}14111412bool multisampled = src->sampleCount != VK_SAMPLE_COUNT_1_BIT && dst->sampleCount != VK_SAMPLE_COUNT_1_BIT;1413if (multisampled) {1414// If both the targets are multisampled, copy the msaa targets too.1415// For that, we need to transition them from their normally permanent VK_*_ATTACHMENT_OPTIMAL layouts, and then back.1416if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) 
{1417recordBarrier_.TransitionColorImageAuto(&src->msaaColor, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1418recordBarrier_.TransitionColorImageAuto(&dst->msaaColor, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1419}1420if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1421// Kingdom Hearts: Subsequent copies to the same depth buffer without any other use.1422// Not super sure how that happens, but we need a barrier to pass sync validation.1423recordBarrier_.TransitionDepthStencilImageAuto(&src->msaaDepth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1424recordBarrier_.TransitionDepthStencilImageAuto(&dst->msaaDepth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1425}1426}14271428recordBarrier_.Flush(cmd);14291430VkImageCopy copy{};1431copy.srcOffset.x = step.copy.srcRect.offset.x;1432copy.srcOffset.y = step.copy.srcRect.offset.y;1433copy.srcOffset.z = 0;1434copy.srcSubresource.mipLevel = 0;1435copy.srcSubresource.layerCount = layerCount;1436copy.dstOffset.x = step.copy.dstPos.x;1437copy.dstOffset.y = step.copy.dstPos.y;1438copy.dstOffset.z = 0;1439copy.dstSubresource.mipLevel = 0;1440copy.dstSubresource.layerCount = layerCount;1441copy.extent.width = step.copy.srcRect.extent.width;1442copy.extent.height = step.copy.srcRect.extent.height;1443copy.extent.depth = 1;14441445if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1446copy.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;1447copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;1448vkCmdCopyImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, ©);14491450if (multisampled) {1451vkCmdCopyImage(cmd, src->msaaColor.image, src->msaaColor.layout, dst->msaaColor.image, dst->msaaColor.layout, 1, ©);1452}1453}1454if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1455_dbg_assert_(src->depth.image != VK_NULL_HANDLE);1456_dbg_assert_(dst->depth.image != VK_NULL_HANDLE);1457copy.srcSubresource.aspectMask = step.copy.aspectMask 
& (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);1458copy.dstSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);1459vkCmdCopyImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, ©);14601461if (multisampled) {1462vkCmdCopyImage(cmd, src->msaaDepth.image, src->msaaDepth.layout, dst->msaaDepth.image, dst->msaaDepth.layout, 1, ©);1463}1464}14651466if (multisampled) {1467// Transition the MSAA surfaces back to optimal.1468if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1469recordBarrier_.TransitionImage(1470src->msaaColor.image,14710,14721,1473src->msaaColor.numLayers,1474VK_IMAGE_ASPECT_COLOR_BIT,1475VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,1476VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,1477VK_ACCESS_TRANSFER_READ_BIT,1478VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,1479VK_PIPELINE_STAGE_TRANSFER_BIT,1480VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT1481);1482src->msaaColor.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;1483recordBarrier_.TransitionImage(1484dst->msaaColor.image,14850,14861,1487dst->msaaColor.numLayers,1488VK_IMAGE_ASPECT_COLOR_BIT,1489VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,1490VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,1491VK_ACCESS_TRANSFER_WRITE_BIT,1492VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,1493VK_PIPELINE_STAGE_TRANSFER_BIT,1494VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT1495);1496dst->msaaColor.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;1497}1498if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1499recordBarrier_.TransitionImage(1500src->msaaDepth.image,15010,15021,1503src->msaaDepth.numLayers,1504VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,1505VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,1506VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,1507VK_ACCESS_TRANSFER_READ_BIT,1508VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | 
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,1509VK_PIPELINE_STAGE_TRANSFER_BIT,1510VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT1511);1512src->msaaDepth.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;1513recordBarrier_.TransitionImage(1514dst->msaaDepth.image,15150,15161,1517dst->msaaDepth.numLayers,1518VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,1519VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,1520VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,1521VK_ACCESS_TRANSFER_WRITE_BIT,1522VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,1523VK_PIPELINE_STAGE_TRANSFER_BIT,1524VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT1525);1526dst->msaaDepth.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;1527}1528// Probably not necessary.1529recordBarrier_.Flush(cmd);1530}1531}15321533void VulkanQueueRunner::PerformBlit(const VKRStep &step, VkCommandBuffer cmd) {1534// The barrier code doesn't handle this case. We'd need to transition to GENERAL to do an intra-image copy.1535_dbg_assert_(step.blit.src != step.blit.dst);15361537int layerCount = std::min(step.blit.src->numLayers, step.blit.dst->numLayers);1538_dbg_assert_(step.blit.src->numLayers >= step.blit.dst->numLayers);15391540// Blitting is not allowed for multisample images. 
You're suppose to use vkCmdResolveImage but it only goes in one direction (multi to single).1541_dbg_assert_(step.blit.src->sampleCount == VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT);1542_dbg_assert_(step.blit.dst->sampleCount == VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT);15431544VKRFramebuffer *src = step.blit.src;1545VKRFramebuffer *dst = step.blit.dst;15461547// First source barriers.1548if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1549recordBarrier_.TransitionColorImageAuto(&src->color, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1550recordBarrier_.TransitionColorImageAuto(&dst->color, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1551}15521553// We can't copy only depth or only stencil unfortunately.1554if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1555_assert_(src->depth.image != VK_NULL_HANDLE);1556_assert_(dst->depth.image != VK_NULL_HANDLE);1557recordBarrier_.TransitionDepthStencilImageAuto(&src->depth, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1558recordBarrier_.TransitionDepthStencilImageAuto(&dst->depth, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1559}15601561recordBarrier_.Flush(cmd);15621563// If any validation needs to be performed here, it should probably have been done1564// already when the blit was queued. 
So don't validate here.1565VkImageBlit blit{};1566blit.srcOffsets[0].x = step.blit.srcRect.offset.x;1567blit.srcOffsets[0].y = step.blit.srcRect.offset.y;1568blit.srcOffsets[0].z = 0;1569blit.srcOffsets[1].x = step.blit.srcRect.offset.x + step.blit.srcRect.extent.width;1570blit.srcOffsets[1].y = step.blit.srcRect.offset.y + step.blit.srcRect.extent.height;1571blit.srcOffsets[1].z = 1;1572blit.srcSubresource.mipLevel = 0;1573blit.srcSubresource.layerCount = layerCount;1574blit.dstOffsets[0].x = step.blit.dstRect.offset.x;1575blit.dstOffsets[0].y = step.blit.dstRect.offset.y;1576blit.dstOffsets[0].z = 0;1577blit.dstOffsets[1].x = step.blit.dstRect.offset.x + step.blit.dstRect.extent.width;1578blit.dstOffsets[1].y = step.blit.dstRect.offset.y + step.blit.dstRect.extent.height;1579blit.dstOffsets[1].z = 1;1580blit.dstSubresource.mipLevel = 0;1581blit.dstSubresource.layerCount = layerCount;15821583if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1584blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;1585blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;1586vkCmdBlitImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &blit, step.blit.filter);1587}15881589// TODO: Need to check if the depth format is blittable.1590// Actually, we should probably almost always use copies rather than blits for depth buffers.1591if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1592blit.srcSubresource.aspectMask = 0;1593blit.dstSubresource.aspectMask = 0;1594if (step.blit.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {1595blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;1596blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;1597}1598if (step.blit.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {1599blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;1600blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;1601}1602vkCmdBlitImage(cmd, src->depth.image, src->depth.layout, 
dst->depth.image, dst->depth.layout, 1, &blit, step.blit.filter);1603}1604}16051606void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrierBatch *recordBarrier) {1607VkImageAspectFlags imageAspect = aspect;1608VkAccessFlags srcAccessMask = 0;1609VkPipelineStageFlags srcStageMask = 0;1610if (img.format == VK_FORMAT_D16_UNORM_S8_UINT || img.format == VK_FORMAT_D24_UNORM_S8_UINT || img.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {1611// Barrier must specify both for combined depth/stencil buffers.1612imageAspect = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;1613} else {1614imageAspect = aspect;1615}1616_dbg_assert_(img.layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);1617srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;1618srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;1619recordBarrier->TransitionImage(1620img.image,16210,16221,1623img.numLayers,1624aspect,1625VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,1626VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,1627VK_ACCESS_TRANSFER_WRITE_BIT,1628VK_ACCESS_TRANSFER_WRITE_BIT,1629VK_PIPELINE_STAGE_TRANSFER_BIT,1630VK_PIPELINE_STAGE_TRANSFER_BIT1631);1632}16331634void VulkanQueueRunner::ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize) {1635if (readback->buffer && requiredSize <= readback->bufferSize) {1636return;1637}16381639if (readback->buffer) {1640vulkan_->Delete().QueueDeleteBufferAllocation(readback->buffer, readback->allocation);1641}16421643readback->bufferSize = requiredSize;16441645VkDevice device = vulkan_->GetDevice();16461647VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };1648buf.size = readback->bufferSize;1649buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;16501651VmaAllocationCreateInfo allocCreateInfo{};1652allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;1653VmaAllocationInfo allocInfo{};16541655VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readback->buffer, &readback->allocation, 
&allocInfo);1656_assert_(res == VK_SUCCESS);16571658const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];1659readback->isCoherent = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;1660}16611662void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd, FrameData &frameData) {1663VkImage image;1664VkImageLayout copyLayout;1665// Special case for backbuffer readbacks.1666if (step.readback.src == nullptr) {1667// We only take screenshots after the main render pass (anything else would be stupid) so we need to transition out of PRESENT,1668// and then back into it.1669// Regarding layers, backbuffer currently only has one layer.1670recordBarrier_.TransitionImage(backbufferImage_, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT,1671VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,16720, VK_ACCESS_TRANSFER_READ_BIT,1673VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);1674copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;1675image = backbufferImage_;1676} else {1677VKRImage *srcImage;1678if (step.readback.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {1679srcImage = &step.readback.src->color;1680recordBarrier_.TransitionColorImageAuto(srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1681} else if (step.readback.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {1682srcImage = &step.readback.src->depth;1683recordBarrier_.TransitionDepthStencilImageAuto(srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);1684_dbg_assert_(srcImage->image != VK_NULL_HANDLE);1685} else {1686_dbg_assert_msg_(false, "No image aspect to readback?");1687return;1688}1689image = srcImage->image;1690copyLayout = srcImage->layout;1691}16921693recordBarrier_.Flush(cmd);16941695// TODO: Handle different readback formats!1696u32 readbackSizeInBytes = sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height;16971698CachedReadback 
*cached = nullptr;16991700if (step.readback.delayed) {1701ReadbackKey key;1702key.framebuf = step.readback.src;1703key.width = step.readback.srcRect.extent.width;1704key.height = step.readback.srcRect.extent.height;17051706// See if there's already a buffer we can reuse1707if (!frameData.readbacks_.Get(key, &cached)) {1708cached = new CachedReadback();1709cached->bufferSize = 0;1710frameData.readbacks_.Insert(key, cached);1711}1712} else {1713cached = &syncReadback_;1714}17151716ResizeReadbackBuffer(cached, readbackSizeInBytes);17171718VkBufferImageCopy region{};1719region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };1720region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };1721region.imageSubresource.aspectMask = step.readback.aspectMask;1722region.imageSubresource.layerCount = 1;1723region.bufferOffset = 0;1724region.bufferRowLength = step.readback.srcRect.extent.width;1725region.bufferImageHeight = step.readback.srcRect.extent.height;17261727vkCmdCopyImageToBuffer(cmd, image, copyLayout, cached->buffer, 1, ®ion);17281729// NOTE: Can't read the buffer using the CPU here - need to sync first.17301731// If we copied from the backbuffer, transition it back.1732if (step.readback.src == nullptr) {1733// We only take screenshots after the main render pass (anything else would be stupid) so we need to transition out of PRESENT,1734// and then back into it.1735// Regarding layers, backbuffer currently only has one layer.1736recordBarrier_.TransitionImage(backbufferImage_, 0, 1, 1, VK_IMAGE_ASPECT_COLOR_BIT,1737VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,1738VK_ACCESS_TRANSFER_READ_BIT, 0,1739VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);1740recordBarrier_.Flush(cmd); // probably not needed1741copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;1742}1743}17441745void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffer 
cmd) {1746// TODO: Clean this up - just reusing `SetupTransitionToTransferSrc`.1747VkImageLayout layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;1748recordBarrier_.TransitionColorImageAuto(step.readback_image.image, &layout, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, 0, 1, 1);1749recordBarrier_.Flush(cmd);17501751ResizeReadbackBuffer(&syncReadback_, sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);17521753VkBufferImageCopy region{};1754region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };1755region.imageExtent = { step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height, 1 };1756region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;1757region.imageSubresource.layerCount = 1;1758region.imageSubresource.mipLevel = step.readback_image.mipLevel;1759region.bufferOffset = 0;1760region.bufferRowLength = step.readback_image.srcRect.extent.width;1761region.bufferImageHeight = step.readback_image.srcRect.extent.height;1762vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, syncReadback_.buffer, 1, ®ion);17631764// Now transfer it back to a texture.1765recordBarrier_.TransitionImage(step.readback_image.image, 0, 1, 1, // I don't think we have any multilayer cases for regular textures. 
Above in PerformReadback, though..1766VK_IMAGE_ASPECT_COLOR_BIT,1767VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,1768VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT,1769VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);1770recordBarrier_.Flush(cmd); // probably not needed17711772// NOTE: Can't read the buffer using the CPU here - need to sync first.1773// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.1774}17751776bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {1777CachedReadback *readback = &syncReadback_;17781779// Look up in readback cache.1780if (src) {1781ReadbackKey key;1782key.framebuf = src;1783key.width = width;1784key.height = height;1785CachedReadback *cached;1786if (frameData.readbacks_.Get(key, &cached)) {1787readback = cached;1788} else {1789// Didn't have a cached image ready yet1790return false;1791}1792}17931794if (!readback->buffer)1795return false; // Didn't find anything in cache, or something has gone really wrong.17961797// Read back to the requested address in ram from buffer.1798void *mappedData;1799const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);1800VkResult res = vmaMapMemory(vulkan_->Allocator(), readback->allocation, &mappedData);18011802if (res != VK_SUCCESS) {1803ERROR_LOG(Log::G3D, "CopyReadbackBuffer: vkMapMemory failed! 
result=%d", (int)res);1804return false;1805}18061807if (!readback->isCoherent) {1808vmaInvalidateAllocation(vulkan_->Allocator(), readback->allocation, 0, width * height * srcPixelSize);1809}18101811// TODO: Perform these conversions in a compute shader on the GPU.1812if (srcFormat == Draw::DataFormat::R8G8B8A8_UNORM) {1813ConvertFromRGBA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);1814} else if (srcFormat == Draw::DataFormat::B8G8R8A8_UNORM) {1815ConvertFromBGRA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);1816} else if (srcFormat == destFormat) {1817// Can just memcpy when it matches no matter the format!1818uint8_t *dst = pixels;1819const uint8_t *src = (const uint8_t *)mappedData;1820for (int y = 0; y < height; ++y) {1821memcpy(dst, src, width * srcPixelSize);1822src += width * srcPixelSize;1823dst += pixelStride * srcPixelSize;1824}1825} else if (destFormat == Draw::DataFormat::D32F) {1826ConvertToD32F(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);1827} else if (destFormat == Draw::DataFormat::D16) {1828ConvertToD16(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);1829} else {1830// TODO: Maybe a depth conversion or something?1831ERROR_LOG(Log::G3D, "CopyReadbackBuffer: Unknown format");1832_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);1833}18341835vmaUnmapMemory(vulkan_->Allocator(), readback->allocation);1836return true;1837}18381839const char *VKRRenderCommandToString(VKRRenderCommand cmd) {1840const char * const str[] = {1841"REMOVED",1842"BIND_GRAPHICS_PIPELINE", // async1843"STENCIL",1844"BLEND",1845"VIEWPORT",1846"SCISSOR",1847"CLEAR",1848"DRAW",1849"DRAW_INDEXED",1850"PUSH_CONSTANTS",1851"DEBUG_ANNOTATION",1852};1853if ((int)cmd < ARRAY_SIZE(str)) {1854return str[(int)cmd];1855} else {1856return "N/A";1857}1858}185918601861