1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright 2025 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 #include <linux/list.h>
25 #include "amdgpu.h"
26 
27 static const guid_t MCE			= CPER_NOTIFY_MCE;
28 static const guid_t CMC			= CPER_NOTIFY_CMC;
29 static const guid_t BOOT		= BOOT_TYPE;
30 
31 static const guid_t CRASHDUMP		= AMD_CRASHDUMP;
32 static const guid_t RUNTIME		= AMD_GPU_NONSTANDARD_ERROR;
33 
34 static void __inc_entry_length(struct cper_hdr *hdr, uint32_t size)
35 {
36 	hdr->record_length += size;
37 }
38 
39 void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
40 				struct cper_hdr *hdr,
41 				enum amdgpu_cper_type type,
42 				enum cper_error_severity sev)
43 {
44 	hdr->signature[0]		= 'C';
45 	hdr->signature[1]		= 'P';
46 	hdr->signature[2]		= 'E';
47 	hdr->signature[3]		= 'R';
48 	hdr->revision			= CPER_HDR_REV_1;
49 	hdr->signature_end		= 0xFFFFFFFF;
50 	hdr->error_severity		= sev;
51 
52 	hdr->valid_bits.platform_id	= 1;
53 	hdr->valid_bits.partition_id	= 1;
54 	hdr->valid_bits.timestamp	= 1;
55 	/*TODO need to initialize hdr->timestamp */
56 
57 	snprintf(hdr->record_id, 8, "%d", atomic_inc_return(&adev->cper.unique_id));
58 	snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
59 		 adev->pdev->vendor, adev->pdev->device);
60 	/* pmfw version should be part of creator_id according to CPER spec */
61 	snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID_AMDGPU);
62 
63 	switch (type) {
64 	case AMDGPU_CPER_TYPE_BOOT:
65 		hdr->notify_type = BOOT;
66 		break;
67 	case AMDGPU_CPER_TYPE_FATAL:
68 	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
69 		hdr->notify_type = MCE;
70 		break;
71 	case AMDGPU_CPER_TYPE_RUNTIME:
72 		if (sev == CPER_SEV_NON_FATAL_CORRECTED)
73 			hdr->notify_type = CMC;
74 		else
75 			hdr->notify_type = MCE;
76 		break;
77 	default:
78 		dev_err(adev->dev, "Unknown CPER Type\n");
79 		break;
80 	}
81 
82 	__inc_entry_length(hdr, HDR_LEN);
83 }
84 
85 static int amdgpu_cper_entry_fill_section_desc(struct amdgpu_device *adev,
86 					       struct cper_sec_desc *section_desc,
87 					       bool bp_threshold,
88 					       bool poison,
89 					       enum cper_error_severity sev,
90 					       guid_t sec_type,
91 					       uint32_t section_length,
92 					       uint32_t section_offset)
93 {
94 	section_desc->revision_minor		= CPER_SEC_MINOR_REV_1;
95 	section_desc->revision_major		= CPER_SEC_MAJOR_REV_22;
96 	section_desc->sec_offset		= section_offset;
97 	section_desc->sec_length		= section_length;
98 	section_desc->valid_bits.fru_id		= 1;
99 	section_desc->valid_bits.fru_text	= 1;
100 	section_desc->flag_bits.primary		= 1;
101 	section_desc->severity			= sev;
102 	section_desc->sec_type			= sec_type;
103 
104 	if (adev->smuio.funcs &&
105 	    adev->smuio.funcs->get_socket_id)
106 		snprintf(section_desc->fru_text, 20, "OAM%d",
107 			 adev->smuio.funcs->get_socket_id(adev));
108 	/* TODO: fru_id is 16 bytes in CPER spec, but driver defines it as 20 bytes */
109 	snprintf(section_desc->fru_id, 16, "%llx", adev->unique_id);
110 
111 	if (bp_threshold)
112 		section_desc->flag_bits.exceed_err_threshold = 1;
113 	if (poison)
114 		section_desc->flag_bits.latent_err = 1;
115 
116 	return 0;
117 }
118 
119 int amdgpu_cper_entry_fill_fatal_section(struct amdgpu_device *adev,
120 					 struct cper_hdr *hdr,
121 					 uint32_t idx,
122 					 struct cper_sec_crashdump_reg_data reg_data)
123 {
124 	struct cper_sec_desc *section_desc;
125 	struct cper_sec_crashdump_fatal *section;
126 
127 	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
128 	section = (struct cper_sec_crashdump_fatal *)((uint8_t *)hdr +
129 		   FATAL_SEC_OFFSET(hdr->sec_cnt, idx));
130 
131 	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, false,
132 					    CPER_SEV_FATAL, CRASHDUMP, FATAL_SEC_LEN,
133 					    FATAL_SEC_OFFSET(hdr->sec_cnt, idx));
134 
135 	section->body.reg_ctx_type = CPER_CTX_TYPE_CRASH;
136 	section->body.reg_arr_size = sizeof(reg_data);
137 	section->body.data = reg_data;
138 
139 	__inc_entry_length(hdr, SEC_DESC_LEN + FATAL_SEC_LEN);
140 
141 	return 0;
142 }
143 
144 int amdgpu_cper_entry_fill_runtime_section(struct amdgpu_device *adev,
145 					   struct cper_hdr *hdr,
146 					   uint32_t idx,
147 					   enum cper_error_severity sev,
148 					   uint32_t *reg_dump,
149 					   uint32_t reg_count)
150 {
151 	struct cper_sec_desc *section_desc;
152 	struct cper_sec_nonstd_err *section;
153 	bool poison;
154 
155 	poison = (sev == CPER_SEV_NON_FATAL_CORRECTED) ? false : true;
156 	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
157 	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
158 		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));
159 
160 	amdgpu_cper_entry_fill_section_desc(adev, section_desc, false, poison,
161 					    sev, RUNTIME, NONSTD_SEC_LEN,
162 					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));
163 
164 	reg_count = min(reg_count, CPER_ACA_REG_COUNT);
165 
166 	section->hdr.valid_bits.err_info_cnt = 1;
167 	section->hdr.valid_bits.err_context_cnt = 1;
168 
169 	section->info.error_type = RUNTIME;
170 	section->info.ms_chk_bits.err_type_valid = 1;
171 	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
172 	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);
173 
174 	memcpy(section->ctx.reg_dump, reg_dump, reg_count * sizeof(uint32_t));
175 
176 	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);
177 
178 	return 0;
179 }
180 
181 int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev,
182 						      struct cper_hdr *hdr,
183 						      uint32_t idx)
184 {
185 	struct cper_sec_desc *section_desc;
186 	struct cper_sec_nonstd_err *section;
187 
188 	section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
189 	section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
190 		   NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));
191 
192 	amdgpu_cper_entry_fill_section_desc(adev, section_desc, true, false,
193 					    CPER_SEV_FATAL, RUNTIME, NONSTD_SEC_LEN,
194 					    NONSTD_SEC_OFFSET(hdr->sec_cnt, idx));
195 
196 	section->hdr.valid_bits.err_info_cnt = 1;
197 	section->hdr.valid_bits.err_context_cnt = 1;
198 
199 	section->info.error_type = RUNTIME;
200 	section->info.ms_chk_bits.err_type_valid = 1;
201 	section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
202 	section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);
203 
204 	/* Hardcoded Reg dump for bad page threshold CPER */
205 	section->ctx.reg_dump[CPER_ACA_REG_CTL_LO]    = 0x1;
206 	section->ctx.reg_dump[CPER_ACA_REG_CTL_HI]    = 0x0;
207 	section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
208 	section->ctx.reg_dump[CPER_ACA_REG_STATUS_HI] = 0xB0000000;
209 	section->ctx.reg_dump[CPER_ACA_REG_ADDR_LO]   = 0x0;
210 	section->ctx.reg_dump[CPER_ACA_REG_ADDR_HI]   = 0x0;
211 	section->ctx.reg_dump[CPER_ACA_REG_MISC0_LO]  = 0x0;
212 	section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI]  = 0x0;
213 	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
214 	section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
215 	section->ctx.reg_dump[CPER_ACA_REG_IPID_LO]   = 0x0;
216 	section->ctx.reg_dump[CPER_ACA_REG_IPID_HI]   = 0x96;
217 	section->ctx.reg_dump[CPER_ACA_REG_SYND_LO]   = 0x0;
218 	section->ctx.reg_dump[CPER_ACA_REG_SYND_HI]   = 0x0;
219 
220 	__inc_entry_length(hdr, SEC_DESC_LEN + NONSTD_SEC_LEN);
221 
222 	return 0;
223 }
224 
225 struct cper_hdr *amdgpu_cper_alloc_entry(struct amdgpu_device *adev,
226 					 enum amdgpu_cper_type type,
227 					 uint16_t section_count)
228 {
229 	struct cper_hdr *hdr;
230 	uint32_t size = 0;
231 
232 	size += HDR_LEN;
233 	size += (SEC_DESC_LEN * section_count);
234 
235 	switch (type) {
236 	case AMDGPU_CPER_TYPE_RUNTIME:
237 	case AMDGPU_CPER_TYPE_BP_THRESHOLD:
238 		size += (NONSTD_SEC_LEN * section_count);
239 		break;
240 	case AMDGPU_CPER_TYPE_FATAL:
241 		size += (FATAL_SEC_LEN * section_count);
242 		break;
243 	case AMDGPU_CPER_TYPE_BOOT:
244 		size += (BOOT_SEC_LEN * section_count);
245 		break;
246 	default:
247 		dev_err(adev->dev, "Unknown CPER Type!\n");
248 		return NULL;
249 	}
250 
251 	hdr = kzalloc(size, GFP_KERNEL);
252 	if (!hdr)
253 		return NULL;
254 
255 	/* Save this early */
256 	hdr->sec_cnt = section_count;
257 
258 	return hdr;
259 }
260 
261 int amdgpu_cper_generate_ue_record(struct amdgpu_device *adev,
262 				   struct aca_bank *bank)
263 {
264 	struct cper_hdr *fatal = NULL;
265 	struct cper_sec_crashdump_reg_data reg_data = { 0 };
266 	int ret;
267 
268 	fatal = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_FATAL, 1);
269 	if (!fatal) {
270 		dev_err(adev->dev, "fail to alloc cper entry for ue record\n");
271 		return -ENOMEM;
272 	}
273 
274 	reg_data.status_lo = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
275 	reg_data.status_hi = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
276 	reg_data.addr_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
277 	reg_data.addr_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
278 	reg_data.ipid_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
279 	reg_data.ipid_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
280 	reg_data.synd_lo   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
281 	reg_data.synd_hi   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
282 
283 	amdgpu_cper_entry_fill_hdr(adev, fatal, AMDGPU_CPER_TYPE_FATAL, CPER_SEV_FATAL);
284 	ret = amdgpu_cper_entry_fill_fatal_section(adev, fatal, 0, reg_data);
285 	if (ret)
286 		return ret;
287 
288 	/*TODO: commit the cper entry to cper ring */
289 
290 	return 0;
291 }
292 
293 static enum cper_error_severity amdgpu_aca_err_type_to_cper_sev(struct amdgpu_device *adev,
294 								enum aca_error_type aca_err_type)
295 {
296 	switch (aca_err_type) {
297 	case ACA_ERROR_TYPE_UE:
298 		return CPER_SEV_FATAL;
299 	case ACA_ERROR_TYPE_CE:
300 		return CPER_SEV_NON_FATAL_CORRECTED;
301 	case ACA_ERROR_TYPE_DEFERRED:
302 		return CPER_SEV_NON_FATAL_UNCORRECTED;
303 	default:
304 		dev_err(adev->dev, "Unknown ACA error type!\n");
305 		return CPER_SEV_FATAL;
306 	}
307 }
308 
309 int amdgpu_cper_generate_ce_records(struct amdgpu_device *adev,
310 				    struct aca_banks *banks,
311 				    uint16_t bank_count)
312 {
313 	struct cper_hdr *corrected = NULL;
314 	enum cper_error_severity sev = CPER_SEV_NON_FATAL_CORRECTED;
315 	uint32_t reg_data[CPER_ACA_REG_COUNT] = { 0 };
316 	struct aca_bank_node *node;
317 	struct aca_bank *bank;
318 	uint32_t i = 0;
319 	int ret;
320 
321 	corrected = amdgpu_cper_alloc_entry(adev, AMDGPU_CPER_TYPE_RUNTIME, bank_count);
322 	if (!corrected) {
323 		dev_err(adev->dev, "fail to allocate cper entry for ce records\n");
324 		return -ENOMEM;
325 	}
326 
327 	/* Raise severity if any DE is detected in the ACA bank list */
328 	list_for_each_entry(node, &banks->list, node) {
329 		bank = &node->bank;
330 		if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
331 			sev = CPER_SEV_NON_FATAL_UNCORRECTED;
332 			break;
333 		}
334 	}
335 
336 	amdgpu_cper_entry_fill_hdr(adev, corrected, AMDGPU_CPER_TYPE_RUNTIME, sev);
337 
338 	/* Combine CE and UE in cper record */
339 	list_for_each_entry(node, &banks->list, node) {
340 		bank = &node->bank;
341 		reg_data[CPER_ACA_REG_CTL_LO]    = lower_32_bits(bank->regs[ACA_REG_IDX_CTL]);
342 		reg_data[CPER_ACA_REG_CTL_HI]    = upper_32_bits(bank->regs[ACA_REG_IDX_CTL]);
343 		reg_data[CPER_ACA_REG_STATUS_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
344 		reg_data[CPER_ACA_REG_STATUS_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_STATUS]);
345 		reg_data[CPER_ACA_REG_ADDR_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
346 		reg_data[CPER_ACA_REG_ADDR_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_ADDR]);
347 		reg_data[CPER_ACA_REG_MISC0_LO]  = lower_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
348 		reg_data[CPER_ACA_REG_MISC0_HI]  = upper_32_bits(bank->regs[ACA_REG_IDX_MISC0]);
349 		reg_data[CPER_ACA_REG_CONFIG_LO] = lower_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
350 		reg_data[CPER_ACA_REG_CONFIG_HI] = upper_32_bits(bank->regs[ACA_REG_IDX_CONFIG]);
351 		reg_data[CPER_ACA_REG_IPID_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_IPID]);
352 		reg_data[CPER_ACA_REG_IPID_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_IPID]);
353 		reg_data[CPER_ACA_REG_SYND_LO]   = lower_32_bits(bank->regs[ACA_REG_IDX_SYND]);
354 		reg_data[CPER_ACA_REG_SYND_HI]   = upper_32_bits(bank->regs[ACA_REG_IDX_SYND]);
355 
356 		ret = amdgpu_cper_entry_fill_runtime_section(adev, corrected, i++,
357 				amdgpu_aca_err_type_to_cper_sev(adev, bank->aca_err_type),
358 				reg_data, CPER_ACA_REG_COUNT);
359 		if (ret)
360 			return ret;
361 	}
362 
363 	/*TODO: commit the cper entry to cper ring */
364 
365 	return 0;
366 }
367 
368 int amdgpu_cper_init(struct amdgpu_device *adev)
369 {
370 	mutex_init(&adev->cper.cper_lock);
371 
372 	adev->cper.enabled = true;
373 	adev->cper.max_count = CPER_MAX_ALLOWED_COUNT;
374 
375 	/*TODO: initialize cper ring*/
376 
377 	return 0;
378 }
379 
380 int amdgpu_cper_fini(struct amdgpu_device *adev)
381 {
382 	adev->cper.enabled = false;
383 
384 	/*TODO: free cper ring */
385 	adev->cper.count = 0;
386 	adev->cper.wptr = 0;
387 
388 	return 0;
389 }
390