1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 #include <linux/list.h> 25 #include "amdgpu.h" 26 27 static const guid_t MCE = CPER_NOTIFY_MCE; 28 static const guid_t CMC = CPER_NOTIFY_CMC; 29 static const guid_t BOOT = BOOT_TYPE; 30 31 static const guid_t CRASHDUMP = AMD_CRASHDUMP; 32 static const guid_t RUNTIME = AMD_GPU_NONSTANDARD_ERROR; 33 34 static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size) 35 { 36 hdr->record_length += size; 37 } 38 39 void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, 40 struct cper_hdr *hdr, 41 enum amdgpu_cper_type type, 42 enum cper_error_severity sev) 43 { 44 hdr->signature[0] = 'C'; 45 hdr->signature[1] = 'P'; 46 hdr->signature[2] = 'E'; 47 hdr->signature[3] = 'R'; 48 hdr->revision = CPER_HDR_REV_1; 49 hdr->signature_end = 0xFFFFFFFF; 50 hdr->error_severity = sev; 51 52 hdr->valid_bits.platform_id = 1; 53 hdr->valid_bits.partition_id = 1; 54 hdr->valid_bits.timestamp = 1; 55 /*TODO need to initialize hdr->timestamp */ 56 57 snprintf(hdr->record_id, 8, "%d", atomic_inc_return(&adev->cper.unique_id)); 58 snprintf(hdr->platform_id, 16, "0x%04X:0x%04X", 59 adev->pdev->vendor, adev->pdev->device); 60 /* pmfw version should be part of creator_id according to CPER spec */ 61 snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU); 62 63 switch (type) { 64 case AMDGPU_CPER_TYPE_BOOT: 65 hdr->notify_type = BOOT; 66 break; 67 case AMDGPU_CPER_TYPE_FATAL: 68 case AMDGPU_CPER_TYPE_BP_THRESHOLD: 69 hdr->notify_type = MCE; 70 break; 71 case AMDGPU_CPER_TYPE_RUNTIME: 72 if (sev == CPER_SEV_NON_FATAL_CORRECTED) 73 hdr->notify_type = CMC; 74 else 75 hdr->notify_type = MCE; 76 break; 77 default: 78 dev_err(adev->dev, "Unknown CPER Type\n"); 79 break; 80 } 81 82 __inc_entry_length(hdr, HDR_LEN); 83 } 84 85 static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev, 86 struct cper_sec_desc *section_desc, 87 bool bp_threshold, 88 bool poison, 89 enum cper_error_severity sev, 90 guid_t sec_type, 91 uint32_t section_length, 92 uint32_t section_offset) 93 { 94 section_desc->revision_minor = CPER_SEC_MINOR_REV_1; 95 section_desc->revision_major = CPER_SEC_MAJOR_REV_22; 96 section_desc->sec_offset = section_offset; 97 section_desc->sec_length = section_length; 98 section_desc->valid_bits.fru_id = 1; 99 section_desc->valid_bits.fru_text = 1; 100 section_desc->flag_bits.primary = 1; 101 section_desc->severity = sev; 102 section_desc->sec_type = sec_type; 103 104 if (adev->smuio.funcs && 105 adev->smuio.funcs->get_socket_id) 106 snprintf(section_desc->fru_text, 20, "OAM%d", 107 adev->smuio.funcs->get_socket_id(adev)); 108 /* TODO: fru_id is 16 bytes in CPER spec, but driver defines it as 20 bytes */ 109 snprintf(section_desc->fru_id, 16, "%llx", adev->unique_id); 110 111 if (bp_threshold) 112 section_desc->flag_bits.exceed_err_threshold = 1; 113 if (poison) 114 section_desc->flag_bits.latent_err = 1; 115 116 return 0; 117 } 118 119 int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev, 120 struct cper_hdr *hdr, 121 uint32_t idx, 122 struct cper_sec_crashdump_reg_data reg_data) 123 { 124 struct cper_sec_desc *section_desc; 125 struct cper_sec_crashdump_fatal *section; 126 127 section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); 128 section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr + 129 FATAL_SEC_OFFSET(hdr->sec_cnt, idx)); 130 131 amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false, 132 CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN, 133 FATAL_SEC_OFFSET(hdr->sec_cnt, idx)); 134 135 section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH; 136 section->body.reg_arr_size = sizeof(reg_data); 137 section->body.data = reg_data; 138 139 __inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN); 140 141 return 0; 142 } 143 144 int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev, 145 struct cper_hdr *hdr, 146 uint32_t idx, 147 enum cper_error_severity sev, 148 uint32_t *reg_dump, 149 uint32_t reg_count) 150 { 151 struct cper_sec_desc *section_desc; 152 struct cper_sec_nonstd_err *section; 153 bool poison; 154 155 poison = (sev == CPER_SEV_NON_FATAL_CORRECTED) ? false : true; 156 section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); 157 section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr + 158 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); 159 160 amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison, 161 sev, RUNTIME, NONSTD_SEC_LEN, 162 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); 163 164 reg_count = min(reg_count, CPER_ACA_REG_COUNT); 165 166 section->hdr.valid_bits.err_info_cnt = 1; 167 section->hdr.valid_bits.err_context_cnt = 1; 168 169 section->info.error_type = RUNTIME; 170 section->info.ms_chk_bits.err_type_valid = 1; 171 section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH; 172 section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump); 173 174 memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t)); 175 176 __inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN); 177 178 return 0; 179 } 180 181 int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev, 182 struct cper_hdr *hdr, 183 uint32_t idx) 184 { 185 struct cper_sec_desc *section_desc; 186 struct cper_sec_nonstd_err *section; 187 188 section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); 189 section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr + 190 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); 191 192 amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false, 193 CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN, 194 NONSTD_SEC_OFFSET(hdr->sec_cnt, idx)); 195 196 section->hdr.valid_bits.err_info_cnt = 1; 197 section->hdr.valid_bits.err_context_cnt = 1; 198 199 section->info.error_type = RUNTIME; 200 section->info.ms_chk_bits.err_type_valid = 1; 201 section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH; 202 section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump); 203 204 /* Hardcoded Reg dump for bad page threshold CPER */ 205 section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1; 206 section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0; 207 section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137; 208 section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000; 209 section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO] = 0x0; 210 section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI] = 0x0; 211 section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO] = 0x0; 212 section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0; 213 section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2; 214 section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff; 215 section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = 0x0; 216 section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x96; 217 section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0; 218 section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0; 219 220 __inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN); 221 222 return 0; 223 } 224 225 struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev, 226 enum amdgpu_cper_type type, 227 uint16_t section_count) 228 { 229 struct cper_hdr *hdr; 230 uint32_t size = 0; 231 232 size += HDR_LEN; 233 size += (SEC_DESC_LEN * section_count); 234 235 switch (type) { 236 case AMDGPU_CPER_TYPE_RUNTIME: 237 case AMDGPU_CPER_TYPE_BP_THRESHOLD: 238 size += (NONSTD_SEC_LEN * section_count); 239 break; 240 case AMDGPU_CPER_TYPE_FATAL: 241 size += (FATAL_SEC_LEN * section_count); 242 break; 243 case AMDGPU_CPER_TYPE_BOOT: 244 size += (BOOT_SEC_LEN * section_count); 245 break; 246 default: 247 dev_err(adev->dev, "Unknown CPER Type!\n"); 248 return NULL; 249 } 250 251 hdr = kzalloc(size, GFP_KERNEL); 252 if (!hdr) 253 return NULL; 254 255 /* Save this early */ 256 hdr->sec_cnt = section_count; 257 258 return hdr; 259 } 260 261 int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev, 262 struct aca_bank *bank) 263 { 264 struct cper_hdr *fatal = NULL; 265 struct cper_sec_crashdump_reg_data reg_data = { 0 }; 266 int ret; 267 268 fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1); 269 if (!fatal) { 270 dev_err(adev->dev, "fail to alloc cper entry for ue record\n"); 271 return -ENOMEM; 272 } 273 274 reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]); 275 reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]); 276 reg_data.addr_lo = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]); 277 reg_data.addr_hi = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]); 278 reg_data.ipid_lo = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]); 279 reg_data.ipid_hi = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]); 280 reg_data.synd_lo = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]); 281 reg_data.synd_hi = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]); 282 283 amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL); 284 ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data); 285 if (ret) 286 return ret; 287 288 /*TODO: commit the cper entry to cper ring */ 289 290 return 0; 291 } 292 293 static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev, 294 enum aca_error_type aca_err_type) 295 { 296 switch (aca_err_type) { 297 case ACA_ERROR_TYPE_UE: 298 return CPER_SEV_FATAL; 299 case ACA_ERROR_TYPE_CE: 300 return CPER_SEV_NON_FATAL_CORRECTED; 301 case ACA_ERROR_TYPE_DEFERRED: 302 return CPER_SEV_NON_FATAL_UNCORRECTED; 303 default: 304 dev_err(adev->dev, "Unknown ACA error type!\n"); 305 return CPER_SEV_FATAL; 306 } 307 } 308 309 int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev, 310 struct aca_banks *banks, 311 uint16_t bank_count) 312 { 313 struct cper_hdr *corrected = NULL; 314 enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED; 315 uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 }; 316 struct aca_bank_node *node; 317 struct aca_bank *bank; 318 uint32_t i = 0; 319 int ret; 320 321 corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count); 322 if (!corrected) { 323 dev_err(adev->dev, "fail to allocate cper entry for ce records\n"); 324 return -ENOMEM; 325 } 326 327 /* Raise severity if any DE is detected in the ACA bank list */ 328 list_for_each_entry(node, &banks->list, node) { 329 bank = &node->bank; 330 if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) { 331 sev = CPER_SEV_NON_FATAL_UNCORRECTED; 332 break; 333 } 334 } 335 336 amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev); 337 338 /* Combine CE and UE in cper record */ 339 list_for_each_entry(node, &banks->list, node) { 340 bank = &node->bank; 341 reg_data[CPER_ACA_REG_CTL_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]); 342 reg_data[CPER_ACA_REG_CTL_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]); 343 reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]); 344 reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]); 345 reg_data[CPER_ACA_REG_ADDR_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]); 346 reg_data[CPER_ACA_REG_ADDR_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]); 347 reg_data[CPER_ACA_REG_MISC0_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]); 348 reg_data[CPER_ACA_REG_MISC0_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]); 349 reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]); 350 reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]); 351 reg_data[CPER_ACA_REG_IPID_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]); 352 reg_data[CPER_ACA_REG_IPID_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]); 353 reg_data[CPER_ACA_REG_SYND_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]); 354 reg_data[CPER_ACA_REG_SYND_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]); 355 356 ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++, 357 amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type), 358 reg_data, CPER_ACA_REG_COUNT); 359 if (ret) 360 return ret; 361 } 362 363 /*TODO: commit the cper entry to cper ring */ 364 365 return 0; 366 } 367 368 int amdgpu_cper_init(struct amdgpu_device *adev) 369 { 370 mutex_init(&adev->cper.cper_lock); 371 372 adev->cper.enabled = true; 373 adev->cper.max_count = CPER_MAX_ALLOWED_COUNT; 374 375 /*TODO: initialize cper ring*/ 376 377 return 0; 378 } 379 380 int amdgpu_cper_fini(struct amdgpu_device *adev) 381 { 382 adev->cper.enabled = false; 383 384 /*TODO: free cper ring */ 385 adev->cper.count = 0; 386 adev->cper.wptr = 0; 387 388 return 0; 389 } 390