// Copyright (c) 2017- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>
#include <mutex>
#include <condition_variable>
#include <vector>
#include <thread>
#include <snappy-c.h>
#include <zstd.h>

#include "Common/Profiler/Profiler.h"
#include "Common/CommonTypes.h"
#include "Common/Log.h"
#include "Common/Thread/ThreadUtil.h"
#include "Common/System/Request.h"
#include "Core/Config.h"
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/Debugger/MemBlockInfo.h"
#include "Core/ELF/ParamSFO.h"
#include "Core/FileSystems/MetaFileSystem.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/sceDisplay.h"
#include "Core/HLE/sceKernelMemory.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSCodeUtils.h"
#include "Core/System.h"
#include "Core/Util/GameDB.h"
#include "GPU/GPUCommon.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Debugger/Playback.h"
#include "GPU/Debugger/Record.h"
#include "GPU/Debugger/RecordFormat.h"

namespace GPURecord {

// Provide the illusion of synchronous execution, although the playback is actually running on a different thread.
enum class OpType {
	None,
	UpdateStallAddr,
	EnqueueList,
	ListSync,
	ReapplyGfxState,
	Done,
};

static const char *OpTypeToString(OpType type) {
	switch (type) {
	case OpType::None: return "None";
	case OpType::UpdateStallAddr: return "UpdateStallAddr";
	case OpType::EnqueueList: return "EnqueueList";
	case OpType::ListSync: return "ListSync";
	case OpType::ReapplyGfxState: return "ReapplyGfxState";
	case OpType::Done: return "Done";
	default: return "N/A";
	}
}

struct Operation {
	OpType type;
	u32 listID;  // also listPC in EnqueueList
	u32 param;   // stallAddr generally
};

static std::string lastExecFilename;
static uint32_t lastExecVersion;
static std::vector<Command> lastExecCommands;
static std::vector<u8> lastExecPushbuf;

// This thread is restarted every frame (dump execution) for simplicity. TODO: Make persistent?
// Alternatively, get rid of it, but the code is written in a way that makes it difficult (you'll see if you try).
static std::thread replayThread;

static std::mutex opStartLock;
static std::condition_variable g_condOpStartWait;

static std::mutex opFinishLock;
static std::condition_variable opFinishWait;

static Operation g_opToExec;
static u32 g_retVal;
static bool g_opDone = true;
static bool g_cancelled = false;

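// How the cross-thread handshake works: the replay thread publishes an
// Operation and signals g_condOpStartWait; the main thread, inside
// RunMountedReplay below, wakes up and performs it against the real GPU, then
// sets g_opDone and signals opFinishWait so the replay thread can continue.
// Only one operation is ever in flight at a time.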
// Runs on the replay thread: hands the operation to the main thread and
// blocks until it has been executed (or playback is cancelled).
u32 ExecuteOnMain(Operation opToExec) {
	{
		std::unique_lock<std::mutex> startLock(opStartLock);
		g_opToExec = opToExec;
		g_retVal = 0;
		g_opDone = false;
		g_condOpStartWait.notify_one();
	}

	// Now wait for completion. At that point, no one cares about g_opToExec anymore,
	// and we can safely overwrite it next time.
	{
		std::unique_lock<std::mutex> lock(opFinishLock);
		opFinishWait.wait(lock, []() { return g_opDone || g_cancelled; });
	}
	return g_retVal;
}

// This class maps pushbuffer (dump data) sections to PSP memory.
// Dumps can be larger than available PSP memory, because they include generated data too.
//
// If possible, it maps to dynamically allocated "slabs" so nearby access is fast.
// Otherwise it uses "extra" allocations to manage sections that straddle two slabs.
// Slabs are managed with LRU, extra buffers are round-robin.
class BufMapping {
public:
	BufMapping(const std::vector<u8> &pushbuf) : pushbuf_(pushbuf) {
	}

	// Returns a pointer to contiguous memory for this access, or else 0 (failure).
	u32 Map(u32 bufpos, u32 sz, const std::function<void()> &flush);

	// Clear and reset allocations made.
	void Reset() {
		slabGeneration_ = 0;
		extraOffset_ = 0;
		for (int i = 0; i < SLAB_COUNT; ++i) {
			slabs_[i].Free();
		}
		for (int i = 0; i < EXTRA_COUNT; ++i) {
			extra_[i].Free();
		}
	}

protected:
	u32 MapSlab(u32 bufpos, const std::function<void()> &flush);
	u32 MapExtra(u32 bufpos, u32 sz, const std::function<void()> &flush);

	enum {
		// These numbers kept low because we only have 24 MB of user memory to map into.
		SLAB_SIZE = 1 * 1024 * 1024,
		// 10 is the number of texture units + verts + inds.
		// In the worst case, we could concurrently need 10 slabs/extras at the same time.
		SLAB_COUNT = 10,
		EXTRA_COUNT = 10,
	};

	// The current "generation". Static simply as a convenience for access.
	// This increments on every allocation, for a simple LRU.
	static int slabGeneration_;

	// An aligned large mapping of the pushbuffer in PSP RAM.
	struct SlabInfo {
		u32 psp_pointer_ = 0;
		u32 buf_pointer_ = 0;
		int last_used_ = 0;

		bool Matches(u32 bufpos) const {
			// We check psp_pointer_ because bufpos = 0 is valid, and the initial value.
			return buf_pointer_ == bufpos && psp_pointer_ != 0;
		}

		// Automatically marks used for LRU purposes.
		u32 Ptr(u32 bufpos) {
			last_used_ = slabGeneration_;
			return psp_pointer_ + (bufpos - buf_pointer_);
		}

		int Age() const {
			// If not allocated, it's as expired as it's gonna get.
			if (psp_pointer_ == 0)
				return std::numeric_limits<int>::max();
			return slabGeneration_ - last_used_;
		}

		bool Alloc();
		void Free();
		bool Setup(u32 bufpos, const std::vector<u8> &pushbuf_);
	};

	// An ad hoc mapping of the pushbuffer (either larger than a slab or straddling slabs.)
	// Remember: texture data, verts, etc. must be contiguous.
	struct ExtraInfo {
		u32 psp_pointer_ = 0;
		u32 buf_pointer_ = 0;
		u32 size_ = 0;

		bool Matches(u32 bufpos, u32 sz) const {
			// We check psp_pointer_ because bufpos = 0 is valid, and the initial value.
			return buf_pointer_ == bufpos && psp_pointer_ != 0 && size_ >= sz;
		}

		u32 Ptr() const {
			return psp_pointer_;
		}

		bool Alloc(u32 bufpos, u32 sz, const std::vector<u8> &pushbuf_);
		void Free();
	};

	SlabInfo slabs_[SLAB_COUNT]{};
	u32 lastSlab_ = 0;
	u32 extraOffset_ = 0;
	ExtraInfo extra_[EXTRA_COUNT]{};

	const std::vector<u8> &pushbuf_;
};

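// Map() has two paths: an access contained in a single slab reuses (or
// LRU-evicts into) one of the 1 MB slab mappings, while an access that
// straddles a slab boundary gets its own "extra" allocation, since callers
// need the whole range contiguous in PSP RAM. flush is invoked before any
// new allocation so the GPU stalls rather than reading memory we overwrite.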
u32 BufMapping::Map(u32 bufpos, u32 sz, const std::function<void()> &flush) {
	int slab1 = bufpos / SLAB_SIZE;
	int slab2 = (bufpos + sz - 1) / SLAB_SIZE;

	if (slab1 == slab2) {
		// Shortcut in case it's simply the most recent slab.
		if (slabs_[lastSlab_].Matches(slab1 * SLAB_SIZE))
			return slabs_[lastSlab_].Ptr(bufpos);
		// Doesn't straddle, so we can just map to a slab.
		return MapSlab(bufpos, flush);
	} else {
		// We need contiguous, so we'll just allocate separately.
		return MapExtra(bufpos, sz, flush);
	}
}

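// Reuses a slab that already holds this position if possible; otherwise
// evicts the oldest slab by generation age and copies the pushbuffer data in.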
u32 BufMapping::MapSlab(u32 bufpos, const std::function<void()> &flush) {
	u32 slab_pos = (bufpos / SLAB_SIZE) * SLAB_SIZE;

	int best = 0;
	for (int i = 0; i < SLAB_COUNT; ++i) {
		if (slabs_[i].Matches(slab_pos)) {
			return slabs_[i].Ptr(bufpos);
		}

		if (slabs_[i].Age() > slabs_[best].Age()) {
			best = i;
		}
	}

	// Stall before mapping a new slab.
	flush();

	// Okay, we need to allocate.
	if (!slabs_[best].Setup(slab_pos, pushbuf_)) {
		return 0;
	}
	lastSlab_ = best;
	return slabs_[best].Ptr(bufpos);
}

u32 BufMapping::MapExtra(u32 bufpos, u32 sz, const std::function<void()> &flush) {
	for (int i = 0; i < EXTRA_COUNT; ++i) {
		// Might be likely to reuse larger buffers straddling slabs.
		if (extra_[i].Matches(bufpos, sz)) {
			return extra_[i].Ptr();
		}
	}

	// Stall first, so we don't stomp existing RAM.
	flush();

	int i = extraOffset_;
	extraOffset_ = (extraOffset_ + 1) % EXTRA_COUNT;

	if (!extra_[i].Alloc(bufpos, sz, pushbuf_)) {
		// Let's try to power through - hopefully none of these are still in use.
		for (int j = 0; j < EXTRA_COUNT; ++j) {
			extra_[j].Free();
		}
		if (!extra_[i].Alloc(bufpos, sz, pushbuf_)) {
			return 0;
		}
	}
	return extra_[i].Ptr();
}

bool BufMapping::SlabInfo::Alloc() {
	u32 sz = SLAB_SIZE;
	psp_pointer_ = userMemory.Alloc(sz, false, "Slab");
	if (psp_pointer_ == -1) {
		psp_pointer_ = 0;
	}
	return psp_pointer_ != 0;
}

void BufMapping::SlabInfo::Free() {
	if (psp_pointer_) {
		userMemory.Free(psp_pointer_);
		psp_pointer_ = 0;
		buf_pointer_ = 0;
		last_used_ = 0;
	}
}

bool BufMapping::ExtraInfo::Alloc(u32 bufpos, u32 sz, const std::vector<u8> &pushbuf_) {
	// Make sure we've freed any previous allocation first.
	Free();

	u32 allocSize = sz;
	psp_pointer_ = userMemory.Alloc(allocSize, false, "Straddle extra");
	if (psp_pointer_ == -1) {
		psp_pointer_ = 0;
	}
	if (psp_pointer_ == 0) {
		return false;
	}

	buf_pointer_ = bufpos;
	size_ = sz;
	Memory::MemcpyUnchecked(psp_pointer_, pushbuf_.data() + bufpos, sz);
	return true;
}

void BufMapping::ExtraInfo::Free() {
	if (psp_pointer_) {
		userMemory.Free(psp_pointer_);
		psp_pointer_ = 0;
		buf_pointer_ = 0;
	}
}

bool BufMapping::SlabInfo::Setup(u32 bufpos, const std::vector<u8> &pushbuf_) {
	// If it already has RAM, we're simply taking it over. Slabs come only in one size.
	if (psp_pointer_ == 0) {
		if (!Alloc()) {
			return false;
		}
	}

	buf_pointer_ = bufpos;
	u32 sz = std::min((u32)SLAB_SIZE, (u32)pushbuf_.size() - bufpos);
	Memory::MemcpyUnchecked(psp_pointer_, pushbuf_.data() + bufpos, sz);

	slabGeneration_++;
	last_used_ = slabGeneration_;
	return true;
}

int BufMapping::slabGeneration_ = 0;

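// Executes a loaded dump. Run() walks the recorded commands once, building a
// real display list in PSP RAM (see Registers()) and mapping referenced
// pushbuffer data (vertices, indices, textures, CLUTs) just before each use.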
class DumpExecute {
public:
	DumpExecute(const std::vector<u8> &pushbuf, const std::vector<Command> &commands, uint32_t version)
		: pushbuf_(pushbuf), commands_(commands), mapping_(pushbuf), version_(version) {
	}
	~DumpExecute();

	ReplayResult Run();

private:
	void SyncStall();
	void SubmitListEnd();

	void Init(u32 ptr, u32 sz);
	void Registers(u32 ptr, u32 sz);
	void Vertices(u32 ptr, u32 sz);
	void Indices(u32 ptr, u32 sz);
	void ClutAddr(u32 ptr, u32 sz);
	void Clut(u32 ptr, u32 sz);
	void TransferSrc(u32 ptr, u32 sz);
	void Memset(u32 ptr, u32 sz);
	void MemcpyDest(u32 ptr, u32 sz);
	void Memcpy(u32 ptr, u32 sz);
	void Texture(int level, u32 ptr, u32 sz);
	void Framebuf(int level, u32 ptr, u32 sz);
	void Display(u32 ptr, u32 sz, bool allowFlip);
	void EdramTrans(u32 ptr, u32 sz);

	u32 execMemcpyDest = 0;
	u32 execClutAddr = 0;
	u32 execClutFlags = 0;
	u32 execListBuf = 0;
	u32 execListPos = 0;
	u32 execListID = 0;
	const int LIST_BUF_SIZE = 256 * 1024;
	std::vector<u32> execListQueue;
	u16 lastBufw_[8]{};
	u32 lastTex_[8]{};
	u32 lastBase_ = 0;

	const std::vector<u8> &pushbuf_;
	const std::vector<Command> &commands_;
	BufMapping mapping_;
	uint32_t version_ = 0;

	int resumeIndex_ = -1;
};

void DumpExecute::SyncStall() {
	if (execListBuf == 0) {
		VERBOSE_LOG(Log::GeDebugger, "SyncStall: No active display list");
		return;
	}

	ExecuteOnMain(Operation{ OpType::UpdateStallAddr, execListID, execListPos });

	s64 listTicks = gpu->GetListTicks(execListID);
	if (listTicks != -1) {
		s64 nowTicks = CoreTiming::GetTicks();
		if (listTicks > nowTicks) {
			currentMIPS->downcount -= listTicks - nowTicks;
		}
	}

	// Make sure downcount doesn't overflow. (can this even happen?)
	// Also this doesn't do anything in this context, we don't reschedule... or at least
	// aren't supposed to.
	// CoreTiming::ForceCheck();
}

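// Appends a block of recorded GE commands to the display list, first flushing
// any queued BASE/VADDR/IADDR/texture setup from execListQueue. If the list
// buffer is about to overflow, it writes BASE+JUMP back to the start of the
// buffer and stalls so the GPU catches up before older data is overwritten.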
void DumpExecute::Registers(u32 ptr, u32 sz) {
	if (execListBuf == 0) {
		u32 allocSize = LIST_BUF_SIZE;
		execListBuf = userMemory.Alloc(allocSize, true, "List buf");
		if (execListBuf == -1) {
			execListBuf = 0;
		}
		if (execListBuf == 0) {
			ERROR_LOG(Log::GeDebugger, "Unable to allocate for display list");
			return;
		}

		execListPos = execListBuf;
		Memory::Write_U32(GE_CMD_NOP << 24, execListPos);
		execListPos += 4;

		// TODO: Why do we disable interrupts here?
		gpu->EnableInterrupts(false);
		execListID = ExecuteOnMain(Operation{ OpType::EnqueueList, execListBuf, execListPos });
		gpu->EnableInterrupts(true);
	}

	u32 pendingSize = (u32)execListQueue.size() * sizeof(u32);
	// Validate space for jump.
	u32 allocSize = pendingSize + sz + 8;
	if (execListPos + allocSize >= execListBuf + LIST_BUF_SIZE) {
		Memory::Write_U32((GE_CMD_BASE << 24) | ((execListBuf >> 8) & 0x00FF0000), execListPos);
		Memory::Write_U32((GE_CMD_JUMP << 24) | (execListBuf & 0x00FFFFFF), execListPos + 4);

		execListPos = execListBuf;
		lastBase_ = execListBuf & 0xFF000000;

		// Don't continue until we've stalled.
		// TODO: Is this really needed? It seems fine without it.
		SyncStall();
	}

	Memory::MemcpyUnchecked(execListPos, execListQueue.data(), pendingSize);
	execListPos += pendingSize;
	u32 writePos = execListPos;
	void *srcData = (void *)(pushbuf_.data() + ptr);
	Memory::MemcpyUnchecked(execListPos, srcData, sz);
	execListPos += sz;

	// TODO: Unfortunate. Maybe Texture commands should contain the bufw instead.
	// The goal here is to realistically combine prims in dumps. Stalling for the bufw flushes.
	u32_le *ops = (u32_le *)Memory::GetPointerUnchecked(writePos);

	u32 lastTexHigh[8]{};
	for (int i = 0; i < 8; ++i)
		lastTexHigh[i] = ((lastTex_[i] & 0xFF000000) >> 8) | ((GE_CMD_TEXBUFWIDTH0 + i) << 24);

	for (u32 i = 0; i < sz / 4; ++i) {
		u32 cmd = ops[i] >> 24;
		if (cmd >= GE_CMD_TEXBUFWIDTH0 && cmd <= GE_CMD_TEXBUFWIDTH7) {
			int level = cmd - GE_CMD_TEXBUFWIDTH0;
			u16 bufw = ops[i] & 0xFFFF;

			// NOP the address part of the command to avoid a flush too.
			if (bufw == lastBufw_[level])
				ops[i] = GE_CMD_NOP << 24;
			else
				ops[i] = lastTexHigh[level] | bufw;
			lastBufw_[level] = bufw;
		}

		// Since we're here anyway, also NOP out texture addresses.
		// This makes Step Tex not hit phantom textures, but we rely on it for lastTex_[].
		if (cmd >= GE_CMD_TEXADDR0 && cmd <= GE_CMD_TEXADDR7) {
			ops[i] = GE_CMD_NOP << 24;
		}
		if (cmd == GE_CMD_SIGNAL || cmd == GE_CMD_BASE) {
			lastBase_ = 0xFFFFFFFF;
		}
	}

	execListQueue.clear();
}

void DumpExecute::SubmitListEnd() {
	if (execListPos == 0 || g_cancelled) {
		return;
	}

	// There's always space for the end, same size as a jump.
	Memory::Write_U32(GE_CMD_FINISH << 24, execListPos);
	Memory::Write_U32(GE_CMD_END << 24, execListPos + 4);
	execListPos += 8;

	for (int i = 0; i < 8; ++i)
		lastTex_[i] = 0;
	lastBase_ = 0xFFFFFFFF;

	SyncStall();
	ExecuteOnMain(Operation{ OpType::ListSync, execListID });
}

void DumpExecute::Init(u32 ptr, u32 sz) {
	gstate.Restore((u32_le *)(pushbuf_.data() + ptr));
	ExecuteOnMain(Operation{ OpType::ReapplyGfxState });

	for (int i = 0; i < 8; ++i) {
		lastBufw_[i] = 0;
		lastTex_[i] = 0;
	}
	lastBase_ = 0xFFFFFFFF;
}

void DumpExecute::Vertices(u32 ptr, u32 sz) {
	u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
	if (psp == 0) {
		ERROR_LOG(Log::GeDebugger, "Unable to allocate for vertices");
		return;
	}

	if (lastBase_ != (psp & 0xFF000000)) {
		execListQueue.push_back((GE_CMD_BASE << 24) | ((psp >> 8) & 0x00FF0000));
		lastBase_ = psp & 0xFF000000;
	}
	execListQueue.push_back((GE_CMD_VADDR << 24) | (psp & 0x00FFFFFF));
}

void DumpExecute::Indices(u32 ptr, u32 sz) {
	u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
	if (psp == 0) {
		ERROR_LOG(Log::GeDebugger, "Unable to allocate for indices");
		return;
	}

	if (lastBase_ != (psp & 0xFF000000)) {
		execListQueue.push_back((GE_CMD_BASE << 24) | ((psp >> 8) & 0x00FF0000));
		lastBase_ = psp & 0xFF000000;
	}
	execListQueue.push_back((GE_CMD_IADDR << 24) | (psp & 0x00FFFFFF));
}

void DumpExecute::ClutAddr(u32 ptr, u32 sz) {
	struct ClutAddrData {
		u32 addr;
		u32 flags;
	};
	const ClutAddrData *data = (const ClutAddrData *)(pushbuf_.data() + ptr);
	execClutAddr = data->addr;
	execClutFlags = data->flags;
}

void DumpExecute::Clut(u32 ptr, u32 sz) {
	// This is always run when we have the actual address set.
	if (execClutAddr != 0) {
		const bool isTarget = (execClutFlags & 1) != 0;

		// Could potentially always skip if !isTarget, but playing it safe for offset texture behavior.
		if (Memory::IsValidRange(execClutAddr, sz) && (!isTarget || !g_Config.bSoftwareRendering)) {
			// Intentionally don't trigger an upload here.
			Memory::MemcpyUnchecked(execClutAddr, pushbuf_.data() + ptr, sz);
			NotifyMemInfo(MemBlockFlags::WRITE, execClutAddr, sz, "ReplayClut");
		}

		execClutAddr = 0;
	} else {
		u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
		if (psp == 0) {
			ERROR_LOG(Log::GeDebugger, "Unable to allocate for clut");
			return;
		}

		execListQueue.push_back((GE_CMD_CLUTADDRUPPER << 24) | ((psp >> 8) & 0x00FF0000));
		execListQueue.push_back((GE_CMD_CLUTADDR << 24) | (psp & 0x00FFFFFF));
	}
}

void DumpExecute::TransferSrc(u32 ptr, u32 sz) {
	u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
	if (psp == 0) {
		ERROR_LOG(Log::GeDebugger, "Unable to allocate for transfer");
		return;
	}

	// Need to sync in order to access gstate.transfersrcw.
	SyncStall();

	execListQueue.push_back((gstate.transfersrcw & 0xFF00FFFF) | ((psp >> 8) & 0x00FF0000));
	execListQueue.push_back(((GE_CMD_TRANSFERSRC) << 24) | (psp & 0x00FFFFFF));
}

void DumpExecute::Memset(u32 ptr, u32 sz) {
	PROFILE_THIS_SCOPE("ReplayMemset");
	struct MemsetCommand {
		u32 dest;
		int value;
		u32 sz;
	};

	const MemsetCommand *data = (const MemsetCommand *)(pushbuf_.data() + ptr);

	if (Memory::IsVRAMAddress(data->dest)) {
		SyncStall();
		// TODO: should probably do this as an operation.
		gpu->PerformMemorySet(data->dest, (u8)data->value, data->sz);
	}
}

void DumpExecute::MemcpyDest(u32 ptr, u32 sz) {
	execMemcpyDest = *(const u32 *)(pushbuf_.data() + ptr);
}

void DumpExecute::Memcpy(u32 ptr, u32 sz) {
	PROFILE_THIS_SCOPE("ReplayMemcpy");
	if (Memory::IsVRAMAddress(execMemcpyDest)) {
		SyncStall();
		Memory::MemcpyUnchecked(execMemcpyDest, pushbuf_.data() + ptr, sz);
		NotifyMemInfo(MemBlockFlags::WRITE, execMemcpyDest, sz, "ReplayMemcpy");
		gpu->PerformWriteColorFromMemory(execMemcpyDest, sz);
	}
}

void DumpExecute::Texture(int level, u32 ptr, u32 sz) {
	u32 psp = mapping_.Map(ptr, sz, std::bind(&DumpExecute::SyncStall, this));
	if (psp == 0) {
		ERROR_LOG(Log::GeDebugger, "Unable to allocate for texture");
		return;
	}

	if (lastTex_[level] != psp) {
		u32 bufwCmd = GE_CMD_TEXBUFWIDTH0 + level;
		u32 addrCmd = GE_CMD_TEXADDR0 + level;
		execListQueue.push_back((bufwCmd << 24) | ((psp >> 8) & 0x00FF0000) | lastBufw_[level]);
		execListQueue.push_back((addrCmd << 24) | (psp & 0x00FFFFFF));
		lastTex_[level] = psp;
	}
}

void DumpExecute::Framebuf(int level, u32 ptr, u32 sz) {
	PROFILE_THIS_SCOPE("ReplayFramebuf");
	struct FramebufData {
		u32 addr;
		int bufw;
		u32 flags;
		u32 pad;
	};

	FramebufData *framebuf = (FramebufData *)(pushbuf_.data() + ptr);

	if (lastTex_[level] != framebuf->addr || lastBufw_[level] != framebuf->bufw) {
		u32 bufwCmd = GE_CMD_TEXBUFWIDTH0 + level;
		u32 addrCmd = GE_CMD_TEXADDR0 + level;
		execListQueue.push_back((bufwCmd << 24) | ((framebuf->addr >> 8) & 0x00FF0000) | framebuf->bufw);
		execListQueue.push_back((addrCmd << 24) | (framebuf->addr & 0x00FFFFFF));
		lastTex_[level] = framebuf->addr;
		lastBufw_[level] = framebuf->bufw;
	}

	// And now also copy the data into VRAM (in case it wasn't actually rendered.)
	u32 headerSize = (u32)sizeof(FramebufData);
	u32 pspSize = sz - headerSize;
	const bool isTarget = (framebuf->flags & 1) != 0;
	const bool unchangedVRAM = version_ >= 6 && (framebuf->flags & 2) != 0;
	// TODO: Could use drawnVRAM flag, but it can be wrong.
	// Could potentially always skip if !isTarget, but playing it safe for offset texture behavior.
	if (Memory::IsValidRange(framebuf->addr, pspSize) && !unchangedVRAM && (!isTarget || !g_Config.bSoftwareRendering)) {
		// Intentionally don't trigger an upload here.
		Memory::MemcpyUnchecked(framebuf->addr, pushbuf_.data() + ptr + headerSize, pspSize);
		NotifyMemInfo(MemBlockFlags::WRITE, framebuf->addr, pspSize, "ReplayTex");
	}
}

void DumpExecute::Display(u32 ptr, u32 sz, bool allowFlip) {
	struct DisplayBufData {
		PSPPointer<u8> topaddr;
		int linesize, pixelFormat;
	};

	DisplayBufData *disp = (DisplayBufData *)(pushbuf_.data() + ptr);

	// Sync up drawing.
	SyncStall();

	__DisplaySetFramebuf(disp->topaddr.ptr, disp->linesize, disp->pixelFormat, 1);
	if (allowFlip) {
		__DisplaySetFramebuf(disp->topaddr.ptr, disp->linesize, disp->pixelFormat, 0);
	}
}

void DumpExecute::EdramTrans(u32 ptr, u32 sz) {
	uint32_t value;
	memcpy(&value, pushbuf_.data() + ptr, 4);

	// Sync up drawing.
	SyncStall();

	if (gpu)
		gpu->SetAddrTranslation(value);
}

DumpExecute::~DumpExecute() {
	execMemcpyDest = 0;
	if (execListBuf) {
		userMemory.Free(execListBuf);
		execListBuf = 0;
	}
	execListPos = 0;
	mapping_.Reset();
}

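// resumeIndex_ stays -1 for a straight run; a non-negative value makes Run()
// pick up partway through the command list (re-syncing the stall first).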
ReplayResult DumpExecute::Run() {
	// Start with the default value.
	if (gpu)
		gpu->SetAddrTranslation(0x400);

	if (resumeIndex_ >= 0) {
		SyncStall();
	}

	int start = resumeIndex_ >= 0 ? resumeIndex_ : 0;
	for (size_t i = start; i < commands_.size(); i++) {
		if (g_cancelled) {
			break;
		}

		const Command &cmd = commands_[i];
		switch (cmd.type) {
		case CommandType::INIT:
			Init(cmd.ptr, cmd.sz);
			break;

		case CommandType::REGISTERS:
			Registers(cmd.ptr, cmd.sz);
			break;

		case CommandType::VERTICES:
			Vertices(cmd.ptr, cmd.sz);
			break;

		case CommandType::INDICES:
			Indices(cmd.ptr, cmd.sz);
			break;

		case CommandType::CLUTADDR:
			ClutAddr(cmd.ptr, cmd.sz);
			break;

		case CommandType::CLUT:
			Clut(cmd.ptr, cmd.sz);
			break;

		case CommandType::TRANSFERSRC:
			TransferSrc(cmd.ptr, cmd.sz);
			break;

		case CommandType::MEMSET:
			Memset(cmd.ptr, cmd.sz);
			break;

		case CommandType::MEMCPYDEST:
			MemcpyDest(cmd.ptr, cmd.sz);
			break;

		case CommandType::MEMCPYDATA:
			Memcpy(cmd.ptr, cmd.sz);
			break;

		case CommandType::EDRAMTRANS:
			EdramTrans(cmd.ptr, cmd.sz);
			break;

		case CommandType::TEXTURE0:
		case CommandType::TEXTURE1:
		case CommandType::TEXTURE2:
		case CommandType::TEXTURE3:
		case CommandType::TEXTURE4:
		case CommandType::TEXTURE5:
		case CommandType::TEXTURE6:
		case CommandType::TEXTURE7:
			Texture((int)cmd.type - (int)CommandType::TEXTURE0, cmd.ptr, cmd.sz);
			break;

		case CommandType::FRAMEBUF0:
		case CommandType::FRAMEBUF1:
		case CommandType::FRAMEBUF2:
		case CommandType::FRAMEBUF3:
		case CommandType::FRAMEBUF4:
		case CommandType::FRAMEBUF5:
		case CommandType::FRAMEBUF6:
		case CommandType::FRAMEBUF7:
			Framebuf((int)cmd.type - (int)CommandType::FRAMEBUF0, cmd.ptr, cmd.sz);
			break;

		case CommandType::DISPLAY:
			Display(cmd.ptr, cmd.sz, i == commands_.size() - 1);
			break;

		default:
			ERROR_LOG(Log::GeDebugger, "Unsupported GE dump command: %d", (int)cmd.type);
			return ReplayResult::Error;
		}
	}

	SubmitListEnd();
	return ReplayResult::Done;
}

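// On-disk format of each compressed block: a u32 byte length followed by the
// compressed payload. Dumps before version 5 used snappy, later ones zstd.
// Returns false if the block doesn't decompress to exactly the expected size.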
static bool ReadCompressed(u32 fp, void *dest, size_t sz, uint32_t version) {
	u32 compressed_size = 0;
	if (pspFileSystem.ReadFile(fp, (u8 *)&compressed_size, sizeof(compressed_size)) != sizeof(compressed_size)) {
		return false;
	}

	u8 *compressed = new u8[compressed_size];
	if (pspFileSystem.ReadFile(fp, compressed, compressed_size) != compressed_size) {
		delete[] compressed;
		return false;
	}

	size_t real_size = sz;
	if (version < 5)
		snappy_uncompress((const char *)compressed, compressed_size, (char *)dest, &real_size);
	else
		real_size = ZSTD_decompress(dest, real_size, compressed, compressed_size);
	delete[] compressed;

	return real_size == sz;
}

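// Reads a dump into lastExecCommands/lastExecPushbuf. Layout: Header, then a
// u32 command count and u32 pushbuffer size, then two compressed blocks (see
// ReadCompressed above). Returns the dump's version, or 0 on failure.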
static u32 LoadReplay(const std::string &filename) {
	PROFILE_THIS_SCOPE("ReplayLoad");

	NOTICE_LOG(Log::GeDebugger, "LoadReplay %s", filename.c_str());

	g_cancelled = false;

	u32 fp = pspFileSystem.OpenFile(filename, FILEACCESS_READ);
	Header header;
	pspFileSystem.ReadFile(fp, (u8 *)&header, sizeof(header));
	u32 version = header.version;

	if (memcmp(header.magic, HEADER_MAGIC, sizeof(header.magic)) != 0 || header.version > VERSION || header.version < MIN_VERSION) {
		ERROR_LOG(Log::GeDebugger, "Invalid GE dump or unsupported version");
		pspFileSystem.CloseFile(fp);
		return 0;
	}
	if (header.version <= 3) {
		pspFileSystem.SeekFile(fp, 12, FILEMOVE_BEGIN);
		memset(header.gameID, 0, sizeof(header.gameID));
	}

	size_t gameIDLength = strnlen(header.gameID, sizeof(header.gameID));
	if (gameIDLength != 0) {
		g_paramSFO.SetValue("DISC_ID", std::string(header.gameID, gameIDLength), (int)sizeof(header.gameID));
		std::vector<GameDBInfo> info;
		std::string gameTitle = "(unknown title)";
#if !defined(__LIBRETRO__)
		if (g_gameDB.GetGameInfos(header.gameID, &info)) {
			gameTitle = info[0].title;
			g_paramSFO.SetValue("TITLE", gameTitle, (int)gameTitle.size());
		}
#endif
		System_SetWindowTitle(g_paramSFO.GetValueString("DISC_ID") + " : " + gameTitle + " (GE frame dump)");
	} else {
		System_SetWindowTitle("(GE frame dump: old format, missing DISC_ID)");
	}

	u32 sz = 0;
	pspFileSystem.ReadFile(fp, (u8 *)&sz, sizeof(sz));
	u32 bufsz = 0;
	pspFileSystem.ReadFile(fp, (u8 *)&bufsz, sizeof(bufsz));

	lastExecCommands.resize(sz);
	lastExecPushbuf.resize(bufsz);

	bool truncated = false;
	truncated = truncated || !ReadCompressed(fp, lastExecCommands.data(), sizeof(Command) * sz, header.version);
	truncated = truncated || !ReadCompressed(fp, lastExecPushbuf.data(), bufsz, header.version);

	pspFileSystem.CloseFile(fp);

	if (truncated) {
		ERROR_LOG(Log::GeDebugger, "Truncated GE dump detected - can't replay");
		return 0;
	}

	lastExecFilename = filename;
	lastExecVersion = version;
	return version;
}

void Replay_Unload() {
	// We might be paused inside a replay - in this case, the thread is still running and we need to tell it to stop.
	if (replayThread.joinable()) {
		{
			// We just finish processing the commands until done.
			g_cancelled = true;

			std::unique_lock<std::mutex> lock(opFinishLock);
			opFinishWait.notify_one();
		}
		replayThread.join();
	}

	_dbg_assert_(!replayThread.joinable());

	lastExecFilename.clear();
	lastExecVersion = 0;
	lastExecCommands.clear();
	lastExecPushbuf.clear();

	g_opDone = true;
	g_retVal = 0;
}

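// Writes a small MIPS bootstrap loop at codeStart that acts as the "game" for
// a mounted dump: it calls __KernelGPUReplay, re-enters immediately while the
// replay requests another go (nonzero v0), and waits for vblank between frames.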
void WriteRunDumpCode(u32 codeStart) {
	// NOTE: Not static, since parts are run-time computed (MIPS_MAKE_SYSCALL etc)
	const u32 runDumpCode[] = {
		// Save the filename.
		MIPS_MAKE_ORI(MIPS_REG_S0, MIPS_REG_A0, 0),
		MIPS_MAKE_ORI(MIPS_REG_S1, MIPS_REG_A1, 0),
		// Call the actual render. Jump here to start over.
		MIPS_MAKE_SYSCALL("FakeSysCalls", "__KernelGPUReplay"),
		MIPS_MAKE_NOP(),
		// Re-run immediately if requested by the return value from __KernelGPUReplay
		MIPS_MAKE_BNEZ(codeStart + 4 * 4, codeStart + 8, MIPS_REG_V0),
		MIPS_MAKE_NOP(),
		// When done (__KernelGPUReplay returned 0), make sure we don't get out of sync (is this needed?)
		MIPS_MAKE_LUI(MIPS_REG_A0, 0),
		MIPS_MAKE_SYSCALL("sceGe_user", "sceGeDrawSync"),
		MIPS_MAKE_NOP(),
		// Wait for the next vblank to render again, then (through the delay slot) jump right back up to __KernelGPUReplay.
		MIPS_MAKE_SYSCALL("sceDisplay", "sceDisplayWaitVblankStart"),
		MIPS_MAKE_NOP(),
		MIPS_MAKE_J(codeStart + 8),
		MIPS_MAKE_NOP(),
		// This never gets reached, just here to be "safe".
		MIPS_MAKE_BREAK(0),
	};
	for (size_t i = 0; i < ARRAY_SIZE(runDumpCode); ++i) {
		Memory::WriteUnchecked_U32(runDumpCode[i], codeStart + (u32)i * sizeof(u32_le));
	}
}

// This is called by the syscall. It spawns a "replayThread" which parses the file and sends the commands.
// A long term goal is inversion of control here, but it's tricky for a number of reasons that you'll find
// out if you try.
ReplayResult RunMountedReplay(const std::string &filename) {
	_assert_msg_(!gpuDebug->GetRecorder()->IsActivePending(), "Cannot run replay while recording.");

	uint32_t version = lastExecVersion;
	if (lastExecFilename != filename) {
		// Does this ever happen? Can the filename change, without going through core shutdown/startup?
		if (replayThread.joinable()) {
			replayThread.join();
		}
		version = LoadReplay(filename);
		if (!version) {
			ERROR_LOG(Log::GeDebugger, "bad version %08x", version);
			return ReplayResult::Error;
		}
	}

	if (g_opToExec.type != OpType::None) {
		std::unique_lock<std::mutex> waitLock(opFinishLock);
		g_opDone = true;
		g_opToExec = Operation{ OpType::None };
		opFinishWait.notify_one();
	}

	if (!replayThread.joinable()) {
		_dbg_assert_(g_opToExec.type == OpType::None);
		g_opToExec = Operation{ OpType::None };
		replayThread = std::thread([version]() {
			SetCurrentThreadName("Replay");
			DumpExecute executor(lastExecPushbuf, lastExecCommands, version);
			GPURecord::ReplayResult retval = executor.Run();
			// Finish up
			ExecuteOnMain(Operation{ OpType::Done });
		});
	}

	// OK, now wait for and perform the desired action.
	{
		std::unique_lock<std::mutex> lock(opStartLock);
		g_condOpStartWait.wait(lock, []() { return g_opToExec.type != OpType::None; });
	}

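	// Note that the cases returning ReplayResult::Break don't mark the operation
	// done here; we first return to the MIPS loop so the GE list can actually
	// run, and completion is signaled on the next entry into this function (the
	// g_opToExec check above).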
	switch (g_opToExec.type) {
	case OpType::UpdateStallAddr:
	{
		bool runList;
		hleEatCycles(190);
		hleCoreTimingForceCheck();
		gpu->UpdateStall(g_opToExec.listID, g_opToExec.param, &runList);
		if (runList) {
			hleSplitSyscallOverGe();
		}
		// We're not done yet, request another go.
		return ReplayResult::Break;
	}
	case OpType::EnqueueList:
	{
		bool runList;
		u32 listPC = g_opToExec.listID;
		u32 execListPos = g_opToExec.param;
		auto optParam = PSPPointer<PspGeListArgs>::Create(0);
		g_retVal = gpu->EnqueueList(listPC, execListPos, -1, optParam, false, &runList);
		if (runList) {
			hleSplitSyscallOverGe();
		}
		// We're not done yet, request another go.
		hleEatCycles(490);
		hleCoreTimingForceCheck();
		return ReplayResult::Break;
	}
	case OpType::ReapplyGfxState:
	{
		// Try again, but no need to split the syscall.
		gpu->ReapplyGfxState();
		return ReplayResult::Break;
	}
	case OpType::ListSync:
	{
		u32 execListID = g_opToExec.listID;
		u32 mode = g_opToExec.param;
		// Try again, but no need to split the syscall.
		hleEatCycles(220);
		gpu->ListSync(execListID, mode);
		return ReplayResult::Break;
	}
	case OpType::Done:
	{
		_dbg_assert_(replayThread.joinable());
		{
			std::unique_lock<std::mutex> lock(opFinishLock);
			g_opDone = true;
			opFinishWait.notify_one();
		}
		replayThread.join();
		g_opToExec = { OpType::None };
		break;
	}
	case OpType::None:
		break;
	}
	return ReplayResult::Done;
}

} // namespace GPURecord