/* Path: blob/master/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c (29285 views) */
// SPDX-License-Identifier: GPL-2.01/*2* Copyright 2025 Advanced Micro Devices, Inc.3*4* Permission is hereby granted, free of charge, to any person obtaining a5* copy of this software and associated documentation files (the "Software"),6* to deal in the Software without restriction, including without limitation7* the rights to use, copy, modify, merge, publish, distribute, sublicense,8* and/or sell copies of the Software, and to permit persons to whom the9* Software is furnished to do so, subject to the following conditions:10*11* The above copyright notice and this permission notice shall be included in12* all copies or substantial portions of the Software.13*14* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR15* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,16* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL17* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR18* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,19* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR20* OTHER DEALINGS IN THE SOFTWARE.21*22*/23#include <linux/list.h>24#include "amdgpu.h"2526static const guid_t MCE = CPER_NOTIFY_MCE;27static const guid_t CMC = CPER_NOTIFY_CMC;28static const guid_t BOOT = BOOT_TYPE;2930static const guid_t CRASHDUMP = AMD_CRASHDUMP;31static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR;3233static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)34{35hdr->record_length += size;36}3738static void amdgpu_cper_get_timestamp(struct cper_timestamp *timestamp)39{40struct tm tm;41time64_t now = ktime_get_real_seconds();4243time64_to_tm(now, 0, &tm);44timestamp->seconds = tm.tm_sec;45timestamp->minutes = tm.tm_min;46timestamp->hours = tm.tm_hour;47timestamp->flag = 0;48timestamp->day = tm.tm_mday;49timestamp->month = 1 + tm.tm_mon;50timestamp->year = (1900 + tm.tm_year) % 100;51timestamp->century = (1900 + tm.tm_year) / 
100;52}5354void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,55struct cper_hdr *hdr,56enum amdgpu_cper_type type,57enum cper_error_severity sev)58{59char record_id[16];6061hdr->signature[0] = 'C';62hdr->signature[1] = 'P';63hdr->signature[2] = 'E';64hdr->signature[3] = 'R';65hdr->revision = CPER_HDR_REV_1;66hdr->signature_end = 0xFFFFFFFF;67hdr->error_severity = sev;6869hdr->valid_bits.platform_id = 1;70hdr->valid_bits.timestamp = 1;7172amdgpu_cper_get_timestamp(&hdr->timestamp);7374snprintf(record_id, 9, "%d:%X",75(adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?76adev->smuio.funcs->get_socket_id(adev) :770,78atomic_inc_return(&adev->cper.unique_id));79memcpy(hdr->record_id, record_id, 8);8081snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",82adev->pdev->vendor, adev->pdev->device);83/* pmfw version should be part of creator_id according to CPER spec */84snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);8586switch (type) {87case AMDGPU_CPER_TYPE_BOOT:88hdr->notify_type = BOOT;89break;90case AMDGPU_CPER_TYPE_FATAL:91case AMDGPU_CPER_TYPE_BP_THRESHOLD:92hdr->notify_type = MCE;93break;94case AMDGPU_CPER_TYPE_RUNTIME:95if (sev == CPER_SEV_NON_FATAL_CORRECTED)96hdr->notify_type = CMC;97else98hdr->notify_type = MCE;99break;100default:101dev_err(adev->dev, "Unknown CPER Type\n");102break;103}104105__inc_entry_length(hdr, HDR_LEN);106}107108static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,109struct cper_sec_desc *section_desc,110bool bp_threshold,111bool poison,112enum cper_error_severity sev,113guid_t sec_type,114uint32_t section_length,115uint32_t section_offset)116{117section_desc->revision_minor = CPER_SEC_MINOR_REV_1;118section_desc->revision_major = CPER_SEC_MAJOR_REV_22;119section_desc->sec_offset = section_offset;120section_desc->sec_length = section_length;121section_desc->valid_bits.fru_text = 1;122section_desc->flag_bits.primary = 1;123section_desc->severity = sev;124section_desc->sec_type = 
sec_type;125126snprintf(section_desc->fru_text, 20, "OAM%d",127(adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?128adev->smuio.funcs->get_socket_id(adev) :1290);130131if (bp_threshold)132section_desc->flag_bits.exceed_err_threshold = 1;133if (poison)134section_desc->flag_bits.latent_err = 1;135136return 0;137}138139int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,140struct cper_hdr *hdr,141uint32_t idx,142struct cper_sec_crashdump_reg_data reg_data)143{144struct cper_sec_desc *section_desc;145struct cper_sec_crashdump_fatal *section;146147section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));148section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +149FATAL_SEC_OFFSET(hdr->sec_cnt, idx));150151amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,152CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,153FATAL_SEC_OFFSET(hdr->sec_cnt, idx));154155section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;156section->body.reg_arr_size = sizeof(reg_data);157section->body.data = reg_data;158159__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);160161return 0;162}163164int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,165struct cper_hdr *hdr,166uint32_t idx,167enum cper_error_severity sev,168uint32_t *reg_dump,169uint32_t reg_count)170{171struct cper_sec_desc *section_desc;172struct cper_sec_nonstd_err *section;173bool poison;174175poison = sev != CPER_SEV_NON_FATAL_CORRECTED;176section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));177section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +178NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));179180amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,181sev, RUNTIME, NONSTD_SEC_LEN,182NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));183184reg_count = umin(reg_count, CPER_ACA_REG_COUNT);185186section->hdr.valid_bits.err_info_cnt = 1;187section->hdr.valid_bits.err_context_cnt = 1;188189section->info.error_type = 
RUNTIME;190section->info.ms_chk_bits.err_type_valid = 1;191section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;192section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);193194memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));195196__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);197198return 0;199}200201int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,202struct cper_hdr *hdr,203uint32_t idx)204{205struct cper_sec_desc *section_desc;206struct cper_sec_nonstd_err *section;207uint32_t socket_id;208209section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));210section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +211NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));212213amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,214CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,215NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));216217section->hdr.valid_bits.err_info_cnt = 1;218section->hdr.valid_bits.err_context_cnt = 1;219220section->info.error_type = RUNTIME;221section->info.valid_bits.ms_chk = 1;222section->info.ms_chk_bits.err_type_valid = 1;223section->info.ms_chk_bits.err_type = 1;224section->info.ms_chk_bits.pcc = 1;225section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;226section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);227228/* Hardcoded Reg dump for bad page threshold CPER */229socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?230adev->smuio.funcs->get_socket_id(adev) :2310;232section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1;233section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0;234section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;235section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;236section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO] = 0x0;237section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI] = 0x0;238section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO] = 0x0;239section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0;240section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 
0x2;241section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;242section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = (socket_id / 4) & 0x01;243section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x096 | (((socket_id % 4) & 0x3) << 12);244section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0;245section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0;246247__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);248249return 0;250}251252struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,253enum amdgpu_cper_type type,254uint16_t section_count)255{256struct cper_hdr *hdr;257uint32_t size = 0;258259size += HDR_LEN;260size += (SEC_DESC_LEN * section_count);261262switch (type) {263case AMDGPU_CPER_TYPE_RUNTIME:264case AMDGPU_CPER_TYPE_BP_THRESHOLD:265size += (NONSTD_SEC_LEN * section_count);266break;267case AMDGPU_CPER_TYPE_FATAL:268size += (FATAL_SEC_LEN * section_count);269break;270case AMDGPU_CPER_TYPE_BOOT:271size += (BOOT_SEC_LEN * section_count);272break;273default:274dev_err(adev->dev, "Unknown CPER Type!\n");275return NULL;276}277278hdr = kzalloc(size, GFP_KERNEL);279if (!hdr)280return NULL;281282/* Save this early */283hdr->sec_cnt = section_count;284285return hdr;286}287288int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,289struct aca_bank *bank)290{291struct cper_hdr *fatal = NULL;292struct cper_sec_crashdump_reg_data reg_data = { 0 };293struct amdgpu_ring *ring = &adev->cper.ring_buf;294int ret;295296fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);297if (!fatal) {298dev_err(adev->dev, "fail to alloc cper entry for ue record\n");299return -ENOMEM;300}301302reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);303reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);304reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);305reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);306reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);307reg_data.ipid_hi = 
upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);308reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);309reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);310311amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);312ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);313if (ret)314return ret;315316amdgpu_cper_ring_write(ring, fatal, fatal->record_length);317kfree(fatal);318319return 0;320}321322int amdgpu_cper_generate_bp_threshold_record(struct amdgpu_device *adev)323{324struct cper_hdr *bp_threshold = NULL;325struct amdgpu_ring *ring = &adev->cper.ring_buf;326int ret;327328bp_threshold = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_BP_THRESHOLD, 1);329if (!bp_threshold) {330dev_err(adev->dev, "fail to alloc cper entry for bad page threshold record\n");331return -ENOMEM;332}333334amdgpu_cper_entry_fill_hdr(adev, bp_threshold,335AMDGPU_CPER_TYPE_BP_THRESHOLD,336CPER_SEV_FATAL);337ret = amdgpu_cper_entry_fill_bad_page_threshold_section(adev, bp_threshold, 0);338if (ret)339return ret;340341amdgpu_cper_ring_write(ring, bp_threshold, bp_threshold->record_length);342kfree(bp_threshold);343344return 0;345}346347static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,348enum aca_error_type aca_err_type)349{350switch (aca_err_type) {351case ACA_ERROR_TYPE_UE:352return CPER_SEV_FATAL;353case ACA_ERROR_TYPE_CE:354return CPER_SEV_NON_FATAL_CORRECTED;355case ACA_ERROR_TYPE_DEFERRED:356return CPER_SEV_NON_FATAL_UNCORRECTED;357default:358dev_err(adev->dev, "Unknown ACA error type!\n");359return CPER_SEV_FATAL;360}361}362363int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,364struct aca_banks *banks,365uint16_t bank_count)366{367struct cper_hdr *corrected = NULL;368enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;369struct amdgpu_ring *ring = &adev->cper.ring_buf;370uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };371struct aca_bank_node *node;372struct 
aca_bank *bank;373uint32_t i = 0;374int ret;375376corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);377if (!corrected) {378dev_err(adev->dev, "fail to allocate cper entry for ce records\n");379return -ENOMEM;380}381382/* Raise severity if any DE is detected in the ACA bank list */383list_for_each_entry(node, &banks->list, node) {384bank = &node->bank;385if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {386sev = CPER_SEV_NON_FATAL_UNCORRECTED;387break;388}389}390391amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);392393/* Combine CE and DE in cper record */394list_for_each_entry(node, &banks->list, node) {395bank = &node->bank;396reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);397reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);398reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);399reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);400reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);401reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);402reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);403reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);404reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);405reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);406reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);407reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);408reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);409reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);410411ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,412amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),413reg_data, CPER_ACA_REG_COUNT);414if 
(ret)415return ret;416}417418amdgpu_cper_ring_write(ring, corrected, corrected->record_length);419kfree(corrected);420421return 0;422}423424static bool amdgpu_cper_is_hdr(struct amdgpu_ring *ring, u64 pos)425{426struct cper_hdr *chdr;427428chdr = (struct cper_hdr *)&(ring->ring[pos]);429return strcmp(chdr->signature, "CPER") ? false : true;430}431432static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)433{434struct cper_hdr *chdr;435u64 p;436u32 chunk, rec_len = 0;437438chdr = (struct cper_hdr *)&(ring->ring[pos]);439chunk = ring->ring_size - (pos << 2);440441if (!strcmp(chdr->signature, "CPER")) {442rec_len = chdr->record_length;443goto calc;444}445446/* ring buffer is not full, no cper data after ring->wptr */447if (ring->count_dw)448goto calc;449450for (p = pos + 1; p <= ring->buf_mask; p++) {451chdr = (struct cper_hdr *)&(ring->ring[p]);452if (!strcmp(chdr->signature, "CPER")) {453rec_len = (p - pos) << 2;454goto calc;455}456}457458calc:459if (!rec_len)460return chunk;461else462return umin(rec_len, chunk);463}464465void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)466{467u64 pos, wptr_old, rptr;468int rec_cnt_dw = count >> 2;469u32 chunk, ent_sz;470u8 *s = (u8 *)src;471472if (count >= ring->ring_size - 4) {473dev_err(ring->adev->dev,474"CPER data size(%d) is larger than ring size(%d)\n",475count, ring->ring_size - 4);476477return;478}479480mutex_lock(&ring->adev->cper.ring_lock);481482wptr_old = ring->wptr;483rptr = *ring->rptr_cpu_addr & ring->ptr_mask;484485while (count) {486ent_sz = amdgpu_cper_ring_get_ent_sz(ring, ring->wptr);487chunk = umin(ent_sz, count);488489memcpy(&ring->ring[ring->wptr], s, chunk);490491ring->wptr += (chunk >> 2);492ring->wptr &= ring->ptr_mask;493count -= chunk;494s += chunk;495}496497if (ring->count_dw < rec_cnt_dw)498ring->count_dw = 0;499500/* the buffer is overflow, adjust rptr */501if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||502((ring->wptr < wptr_old) && (wptr_old < rptr)) 
||503((rptr <= ring->wptr) && (ring->wptr < wptr_old))) {504pos = (ring->wptr + 1) & ring->ptr_mask;505506do {507ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);508509rptr += (ent_sz >> 2);510rptr &= ring->ptr_mask;511*ring->rptr_cpu_addr = rptr;512513pos = rptr;514} while (!amdgpu_cper_is_hdr(ring, rptr));515}516517if (ring->count_dw >= rec_cnt_dw)518ring->count_dw -= rec_cnt_dw;519mutex_unlock(&ring->adev->cper.ring_lock);520}521522static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)523{524return *(ring->rptr_cpu_addr);525}526527static u64 amdgpu_cper_ring_get_wptr(struct amdgpu_ring *ring)528{529return ring->wptr;530}531532static const struct amdgpu_ring_funcs cper_ring_funcs = {533.type = AMDGPU_RING_TYPE_CPER,534.align_mask = 0xff,535.support_64bit_ptrs = false,536.get_rptr = amdgpu_cper_ring_get_rptr,537.get_wptr = amdgpu_cper_ring_get_wptr,538};539540static int amdgpu_cper_ring_init(struct amdgpu_device *adev)541{542struct amdgpu_ring *ring = &(adev->cper.ring_buf);543544mutex_init(&adev->cper.ring_lock);545546ring->adev = NULL;547ring->ring_obj = NULL;548ring->use_doorbell = false;549ring->no_scheduler = true;550ring->funcs = &cper_ring_funcs;551552sprintf(ring->name, "cper");553return amdgpu_ring_init(adev, ring, CPER_MAX_RING_SIZE, NULL, 0,554AMDGPU_RING_PRIO_DEFAULT, NULL);555}556557int amdgpu_cper_init(struct amdgpu_device *adev)558{559int r;560561if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))562return 0;563564r = amdgpu_cper_ring_init(adev);565if (r) {566dev_err(adev->dev, "failed to initialize cper ring, r = %d\n", r);567return r;568}569570mutex_init(&adev->cper.cper_lock);571572adev->cper.enabled = true;573adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;574575return 0;576}577578int amdgpu_cper_fini(struct amdgpu_device *adev)579{580if (!amdgpu_aca_is_enabled(adev) && !amdgpu_sriov_ras_cper_en(adev))581return 0;582583adev->cper.enabled = false;584585amdgpu_ring_fini(&(adev->cper.ring_buf));586adev->cper.count = 
0;587adev->cper.wptr = 0;588589return 0;590}591592593